diff --git a/dream-server/.env.schema.json b/dream-server/.env.schema.json index d1af609b..94c4e5f0 100644 --- a/dream-server/.env.schema.json +++ b/dream-server/.env.schema.json @@ -434,6 +434,38 @@ "type": "string", "description": "Enable image generation in Open WebUI (requires ComfyUI)", "default": "true" + }, + "GPU_ASSIGNMENT_JSON_B64": { + "type": "string", + "description": "Base64-encoded GPU assignment JSON" + }, + "LLAMA_SERVER_GPU_UUIDS": { + "type": "string", + "description": "GPU UUIDs assigned to llama-server (comma-separated, used by NVIDIA_VISIBLE_DEVICES)" + }, + "LLAMA_ARG_SPLIT_MODE": { + "type": "string", + "description": "llama.cpp split mode (LLAMA_ARG_SPLIT_MODE): none | layer (pipeline) | row (tensor/hybrid)" + }, + "LLAMA_ARG_TENSOR_SPLIT": { + "type": "string", + "description": "llama.cpp tensor split weights (LLAMA_ARG_TENSOR_SPLIT): comma-separated proportions e.g. 3,1" + }, + "EMBEDDINGS_GPU_UUID": { + "type": "string", + "description": "GPU UUID assigned to embeddings service" + }, + "COMFYUI_GPU_UUID": { + "type": "string", + "description": "GPU UUID assigned to ComfyUI" + }, + "WHISPER_GPU_UUID": { + "type": "string", + "description": "GPU UUID assigned to Whisper" + }, + "LLM_MODEL_SIZE_MB": { + "type": "integer", + "description": "Approximate model file size in MB (used for multi-GPU memory planning)" } } } diff --git a/dream-server/README.md b/dream-server/README.md index dfd10c1c..03158529 100644 --- a/dream-server/README.md +++ b/dream-server/README.md @@ -37,6 +37,8 @@ Known-good version baselines: [`docs/KNOWN-GOOD-VERSIONS.md`](docs/KNOWN-GOOD-VE ## 5-Minute Quickstart (Linux) +> **Prerequisites:** `curl` and `jq` must be installed. The installer will auto-install `jq` if missing, but `curl` is required to fetch the installer itself. 
+ ```bash # One-line install (Linux — NVIDIA or AMD) curl -fsSL https://raw.githubusercontent.com/Light-Heart-Labs/DreamServer/v2.4.0/get-dream-server.sh | bash diff --git a/dream-server/docker-compose.multigpu.yml b/dream-server/docker-compose.multigpu.yml new file mode 100644 index 00000000..750920f8 --- /dev/null +++ b/dream-server/docker-compose.multigpu.yml @@ -0,0 +1,12 @@ +services: + llama-server: + environment: + NVIDIA_VISIBLE_DEVICES: "${LLAMA_SERVER_GPU_UUIDS:-all}" + LLAMA_ARG_SPLIT_MODE: "${LLAMA_ARG_SPLIT_MODE:-none}" + LLAMA_ARG_TENSOR_SPLIT: "${LLAMA_ARG_TENSOR_SPLIT:-}" + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: [gpu] diff --git a/dream-server/extensions/services/comfyui/compose.multigpu.yaml b/dream-server/extensions/services/comfyui/compose.multigpu.yaml new file mode 100644 index 00000000..47c9f51b --- /dev/null +++ b/dream-server/extensions/services/comfyui/compose.multigpu.yaml @@ -0,0 +1,9 @@ +services: + comfyui: + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${COMFYUI_GPU_UUID}"] + capabilities: [gpu] diff --git a/dream-server/extensions/services/embeddings/compose.multigpu.yaml b/dream-server/extensions/services/embeddings/compose.multigpu.yaml new file mode 100644 index 00000000..e749bdfd --- /dev/null +++ b/dream-server/extensions/services/embeddings/compose.multigpu.yaml @@ -0,0 +1,9 @@ +services: + embeddings: + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${EMBEDDINGS_GPU_UUID}"] + capabilities: [gpu] diff --git a/dream-server/extensions/services/whisper/compose.multigpu.yaml b/dream-server/extensions/services/whisper/compose.multigpu.yaml new file mode 100644 index 00000000..51939a30 --- /dev/null +++ b/dream-server/extensions/services/whisper/compose.multigpu.yaml @@ -0,0 +1,9 @@ +services: + whisper: + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${WHISPER_GPU_UUID}"] + 
capabilities: [gpu] diff --git a/dream-server/installers/lib/compose-select.sh b/dream-server/installers/lib/compose-select.sh index cd317d0d..be23e51b 100755 --- a/dream-server/installers/lib/compose-select.sh +++ b/dream-server/installers/lib/compose-select.sh @@ -7,7 +7,7 @@ # GPU backend, and capability profile # # Expects: SCRIPT_DIR, TIER, GPU_BACKEND, CAP_COMPOSE_OVERLAYS, LOG_FILE, -# log(), warn() +# GPU_COUNT, log(), warn() # Provides: resolve_compose_config() → sets COMPOSE_FILE, COMPOSE_FLAGS # # Modder notes: @@ -91,6 +91,7 @@ resolve_compose_config() { --tier "$TIER" \ --gpu-backend "$GPU_BACKEND" \ --profile-overlays "${CAP_COMPOSE_OVERLAYS:-}" \ + --gpu-count "${GPU_COUNT:-1}" \ --env 2>>"$LOG_FILE")" load_env_from_output <<< "$COMPOSE_ENV" fi diff --git a/dream-server/installers/lib/constants.sh b/dream-server/installers/lib/constants.sh index ff2177ab..5617e7b4 100755 --- a/dream-server/installers/lib/constants.sh +++ b/dream-server/installers/lib/constants.sh @@ -49,6 +49,7 @@ BGRN='\033[1;32m' # Bright green — emphasis, success, headings DGRN='\033[2;32m' # Dim green — secondary text, lore AMB='\033[0;33m' # Amber — warnings, ETA labels WHT='\033[1;37m' # White — key URLs +DIM='\033[2;37m' # Dim white NC='\033[0m' # Reset CURSOR='█' # Block cursor for typing diff --git a/dream-server/installers/lib/nvidia-topo.sh b/dream-server/installers/lib/nvidia-topo.sh new file mode 100644 index 00000000..7b3b5037 --- /dev/null +++ b/dream-server/installers/lib/nvidia-topo.sh @@ -0,0 +1,171 @@ +#!/usr/bin/env bash +# ============================================================================ +# Dream Server Installer — NVIDIA GPU Topology Detection +# ============================================================================ +# Part of: installers/lib/ +# Purpose: Detect NVIDIA Multi-GPU topology as well as basic GPU info +# and return as JSON. Sourced by detection.sh and 03-features.sh. 
+# +# Expects: nvidia-smi, warn(), err(), LINK_RANK +# Provides: parse_nvidia_topo_matrix(), detect_nvidia_topo(), link_rank(), +# link_label(), get_rank() +# +# Modder notes: +# This script handles NVIDIA-specific topology detection including NVLink, +# PCIe, and NUMA relationships. It outputs structured JSON for consumption +# by the multi-GPU strategy selection logic. +# ============================================================================ + +link_rank() { + case "$1" in + NV4 | NV6 | NV8 | NV12 | NV18) echo 100 ;; # NVLink gen2/3 + XGMI | XGMI2) echo 90 ;; # AMD Infinity Fabric + NV1 | NV2 | NV3) echo 80 ;; # NVLink gen1 + MIG) echo 70 ;; # MIG instance, same die + PIX) echo 50 ;; # Same PCIe switch + PXB) echo 40 ;; # Multiple PCIe switches, same CPU + PHB) echo 30 ;; # PCIe host bridge + NODE) echo 20 ;; # Same NUMA, no direct bridge + SYS | SOC) echo 10 ;; # Cross-NUMA (SOC = old name for SYS) + *) echo 0 ;; + esac +} + +link_label() { + case "$1" in + NV*) echo "NVLink" ;; + XGMI*) echo "InfinityFabric" ;; + MIG) echo "MIG-SameDie" ;; + PIX) echo "PCIe-SameSwitch" ;; + PXB) echo "PCIe-CrossSwitch" ;; + PHB) echo "PCIe-HostBridge" ;; + NODE) echo "SameNUMA-NoBridge" ;; + SYS | SOC) echo "CrossNUMA" ;; + X) echo "Self" ;; + *) echo "Unknown" ;; + esac +} +parse_nvidia_topo_matrix() { + # Returns JSON array of {gpu_a, gpu_b, link_type, link_label, rank} + local matrix + matrix=$(nvidia-smi topo -m 2>/dev/null) || { + warn "nvidia-smi topo -m failed" + echo "[]" + return + } + + local header_line headers=() + header_line=$(echo "$matrix" | grep -E '^\s+GPU[0-9]' | head -1) + read -ra headers <<<"$header_line" + + # Collect pairs as TSV, then convert to JSON via jq + local pairs_tsv="" + + while IFS= read -r line; do + [[ "$line" =~ ^(GPU[0-9]+|NIC[0-9]+) ]] || continue + local row_label + row_label=$(echo "$line" | awk '{print $1}') + [[ "$row_label" =~ ^GPU ]] || continue # only GPU rows + local gpu_a="${row_label#GPU}" + local cells=() + read -ra cells 
<<<"$line" + # cells[0] = row label, cells[1..] = columns + for col_idx in "${!headers[@]}"; do + local col_header="${headers[$col_idx]}" + [[ "$col_header" =~ ^GPU ]] || continue + local gpu_b="${col_header#GPU}" + [[ "$gpu_a" == "$gpu_b" ]] && continue # skip self + [[ "$gpu_a" -ge "$gpu_b" ]] && continue # dedup (only A/dev/null) || { + err "nvidia-smi query failed" + return 1 + } + + # Parse CSV into JSON array via jq + local gpus_json + gpus_json=$(echo "$gpu_list" | jq -Rn '[inputs | split(",") | map(gsub("^\\s+|\\s+$"; "")) | { + index: (.[0] | tonumber), + name: .[1], + memory_gb: ((.[2] | tonumber) / 1024 * 10 | round / 10), + pcie_gen: .[3], + pcie_width: .[4], + uuid: .[5] + }]') + + local gpu_count + gpu_count=$(echo "$gpus_json" | jq 'length') + + # MIG detection + local mig_mode="false" + if nvidia-smi -q 2>/dev/null | grep -q "MIG Mode.*Enabled"; then + mig_mode="true" + fi + + # Driver version + local driver_ver + driver_ver=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 | xargs) + + # Topology matrix + local topo_pairs + topo_pairs=$(parse_nvidia_topo_matrix) + + # NUMA info + local numa_json="{}" + if command -v numactl &>/dev/null; then + local numa_nodes + numa_nodes=$(numactl --hardware 2>/dev/null | grep "^node [0-9]* cpus" | wc -l) + numa_json=$(jq -n --argjson n "$numa_nodes" '{nodes: $n}') + fi + + # Compose final JSON + jq -n \ + --arg vendor "nvidia" \ + --argjson gpu_count "$gpu_count" \ + --arg driver "$driver_ver" \ + --argjson mig "$mig_mode" \ + --argjson numa "$numa_json" \ + --argjson gpus "$gpus_json" \ + --argjson links "$topo_pairs" \ + '{ + vendor: $vendor, + gpu_count: $gpu_count, + driver_version: $driver, + mig_enabled: $mig, + numa: $numa, + gpus: $gpus, + links: $links + }' +} + +# ============================================================================ +# Topology lookup helpers (used by 03-features.sh custom assignment path) +# 
============================================================================ + +get_rank() { echo "${LINK_RANK["$1,$2"]:-0}"; } diff --git a/dream-server/installers/lib/tier-map.sh b/dream-server/installers/lib/tier-map.sh index 91f127a8..1d346210 100755 --- a/dream-server/installers/lib/tier-map.sh +++ b/dream-server/installers/lib/tier-map.sh @@ -23,6 +23,7 @@ resolve_tier_config() { GGUF_URL="" GGUF_SHA256="" MAX_CONTEXT=200000 + LLM_MODEL_SIZE_MB=0 ;; ARC) # Intel Arc A770 (16 GB) and future Arc B-series (≥12 GB VRAM) @@ -33,6 +34,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf" GGUF_SHA256="03b74727a860a56338e042c4420bb3f04b2fec5734175f4cb9fa853daf52b7e8" MAX_CONTEXT=32768 + LLM_MODEL_SIZE_MB=5760 # Qwen3.5-9B-Q4_K_M (5.68 GB) GPU_BACKEND="sycl" N_GPU_LAYERS=99 ;; @@ -45,6 +47,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf" GGUF_SHA256="00fe7986ff5f6b463e62455821146049db6f9313603938a70800d1fb69ef11a4" MAX_CONTEXT=16384 + LLM_MODEL_SIZE_MB=2870 # Qwen3.5-4B-Q4_K_M (2.74 GB) GPU_BACKEND="sycl" N_GPU_LAYERS=99 ;; @@ -55,6 +58,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3-Coder-Next-GGUF/resolve/main/Qwen3-Coder-Next-Q4_K_M.gguf" GGUF_SHA256="9e6032d2f3b50a60f17ce8bf5a1d85c71af9b53b89c7978020ae7c660f29b090" MAX_CONTEXT=131072 + LLM_MODEL_SIZE_MB=48500 # 48.5 GB per HF file listing ;; SH_LARGE) TIER_NAME="Strix Halo 90+" @@ -63,6 +67,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3-Coder-Next-GGUF/resolve/main/Qwen3-Coder-Next-Q4_K_M.gguf" GGUF_SHA256="9e6032d2f3b50a60f17ce8bf5a1d85c71af9b53b89c7978020ae7c660f29b090" MAX_CONTEXT=131072 + LLM_MODEL_SIZE_MB=48500 # 48.5 GB per HF file listing ;; SH_COMPACT) TIER_NAME="Strix Halo Compact" @@ -71,6 +76,7 @@ resolve_tier_config() { 
GGUF_URL="https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF/resolve/main/Qwen3-30B-A3B-Q4_K_M.gguf" GGUF_SHA256="9f1a24700a339b09c06009b729b5c809e0b64c213b8af5b711b3dbdfd0c5ba48" MAX_CONTEXT=131072 + LLM_MODEL_SIZE_MB=18600 # 18.6 GB per HF file listing ;; 0) TIER_NAME="Lightweight" @@ -79,6 +85,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf" GGUF_SHA256="" MAX_CONTEXT=8192 + LLM_MODEL_SIZE_MB=1500 # Qwen3.5-2B-Q4_K_M (1.28 GB) ;; 1) TIER_NAME="Entry Level" @@ -87,6 +94,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf" GGUF_SHA256="03b74727a860a56338e042c4420bb3f04b2fec5734175f4cb9fa853daf52b7e8" MAX_CONTEXT=16384 + LLM_MODEL_SIZE_MB=5760 # Qwen3.5-9B-Q4_K_M (5.68 GB) ;; 2) TIER_NAME="Prosumer" @@ -95,6 +103,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-9B-GGUF/resolve/main/Qwen3.5-9B-Q4_K_M.gguf" GGUF_SHA256="03b74727a860a56338e042c4420bb3f04b2fec5734175f4cb9fa853daf52b7e8" MAX_CONTEXT=32768 + LLM_MODEL_SIZE_MB=5760 # Qwen3.5-9B-Q4_K_M (5.68 GB) ;; 3) TIER_NAME="Pro" @@ -103,6 +112,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/Qwen3.5-27B-Q4_K_M.gguf" GGUF_SHA256="84b5f7f112156d63836a01a69dc3f11a6ba63b10a23b8ca7a7efaf52d5a2d806" MAX_CONTEXT=32768 + LLM_MODEL_SIZE_MB=16400 # Qwen3.5-27B-Q4_K_M (16.7 GB) ;; 4) TIER_NAME="Enterprise" @@ -111,6 +121,7 @@ resolve_tier_config() { GGUF_URL="https://huggingface.co/unsloth/Qwen3-30B-A3B-GGUF/resolve/main/Qwen3-30B-A3B-Q4_K_M.gguf" GGUF_SHA256="9f1a24700a339b09c06009b729b5c809e0b64c213b8af5b711b3dbdfd0c5ba48" MAX_CONTEXT=131072 + LLM_MODEL_SIZE_MB=18600 # 18.6 GB per HF file listing ;; *) error "Invalid tier: $TIER. 
Valid tiers: 0, 1, 2, 3, 4, CLOUD, NV_ULTRA, SH_LARGE, SH_COMPACT, ARC, ARC_LITE" diff --git a/dream-server/installers/phases/01-preflight.sh b/dream-server/installers/phases/01-preflight.sh index 3645760c..a90ec363 100755 --- a/dream-server/installers/phases/01-preflight.sh +++ b/dream-server/installers/phases/01-preflight.sh @@ -42,11 +42,21 @@ if ! command -v curl &> /dev/null; then fi log "curl: $(curl --version 2>/dev/null | sed -n '1p')" -# Check optional tools (warn but don't fail) -OPTIONAL_TOOLS_MISSING="" if ! command -v jq &> /dev/null; then - OPTIONAL_TOOLS_MISSING="$OPTIONAL_TOOLS_MISSING jq" + log "jq not found - attempting auto-install..." + case "$PKG_MANAGER" in + dnf) sudo dnf install -y jq ;; + pacman) sudo pacman -S --noconfirm jq ;; + zypper) sudo zypper install -y jq ;; + apk) sudo apk add jq ;; + *) sudo apt-get install -y jq ;; + esac + command -v jq &> /dev/null || error "Failed to install jq automatically. Install it manually and re-run." fi +log "jq: $(jq --version 2>/dev/null)" + +# Check optional tools (warn but don't fail) +OPTIONAL_TOOLS_MISSING="" if ! command -v rsync &> /dev/null; then OPTIONAL_TOOLS_MISSING="$OPTIONAL_TOOLS_MISSING rsync" fi diff --git a/dream-server/installers/phases/02-detection.sh b/dream-server/installers/phases/02-detection.sh index 28f3cd44..ddb16133 100755 --- a/dream-server/installers/phases/02-detection.sh +++ b/dream-server/installers/phases/02-detection.sh @@ -16,7 +16,9 @@ # TIER, TIER_NAME, LLM_MODEL, GGUF_FILE, GGUF_URL, MAX_CONTEXT, # COMPOSE_FILE, COMPOSE_FLAGS, RAM_GB, DISK_AVAIL, BACKEND_ID, # LLM_HEALTHCHECK_URL, LLM_PUBLIC_API_PORT, -# OPENCLAW_PROVIDER_NAME_DEFAULT, OPENCLAW_PROVIDER_URL_DEFAULT +# OPENCLAW_PROVIDER_NAME_DEFAULT, OPENCLAW_PROVIDER_URL_DEFAULT, +# GPU_TOPOLOGY_JSON, GPU_HAS_NVLINK, GPU_TOTAL_VRAM, +# LLM_MODEL_SIZE_MB # # Modder notes: # Change tier auto-detection thresholds or add new hardware classes here. 
@@ -280,6 +282,41 @@ if [[ $GPU_COUNT -gt 0 && "$GPU_BACKEND" == "intel" ]]; then log "Intel Arc backend: GPU_BACKEND=intel, VRAM=${GPU_VRAM}MB, Level Zero=${_level_zero_ok}" fi +# ----------------------------------------------------------------------------- +# NVIDIA Multi-GPU Topology Detection +# ----------------------------------------------------------------------------- +GPU_TOPOLOGY_JSON="{}" +GPU_HAS_NVLINK="false" +GPU_TOTAL_VRAM=0 +if [[ $GPU_COUNT -gt 1 && "$GPU_BACKEND" == "nvidia" ]]; then + ai "Detecting multi-GPU topology..." + if [[ -f "$SCRIPT_DIR/installers/lib/nvidia-topo.sh" ]]; then + # Source the topology detection script + source "$SCRIPT_DIR/installers/lib/nvidia-topo.sh" + + # Run topology detection and capture JSON output + GPU_TOPOLOGY_JSON=$(detect_nvidia_topo 2>>"$LOG_FILE") || { + warn "Multi-GPU topology detection failed — multi-GPU configuration disabled" + ai_warn "Could not detect GPU topology. Multi-GPU features will be skipped." + ai_warn "Check $LOG_FILE for details. You can re-run the installer after fixing the issue." 
+ GPU_TOPOLOGY_JSON="{}" + } + + # Extract key topology information for tier assignment + if [[ -n "$GPU_TOPOLOGY_JSON" && "$GPU_TOPOLOGY_JSON" != "{}" ]]; then + GPU_HAS_NVLINK=$(echo "$GPU_TOPOLOGY_JSON" | jq -r '[.links[] | select(.link_type | startswith("NV"))] | length > 0') + GPU_TOTAL_VRAM=$(echo "$GPU_TOPOLOGY_JSON" | jq -r '[.gpus[].memory_gb] | add * 1024 | floor') + log "Multi-GPU topology: NVLink=$GPU_HAS_NVLINK, Total VRAM=${GPU_TOTAL_VRAM}MB" + else + log "topology detection returned empty, using basic GPU info" + GPU_TOTAL_VRAM=$((GPU_VRAM * GPU_COUNT)) + fi + else + log "NVIDIA topology detection script not found, skipping detailed topology analysis" + GPU_TOTAL_VRAM=$((GPU_VRAM * GPU_COUNT)) + fi +fi + # Auto-detect tier if not specified if [[ -z "$TIER" ]]; then PROFILE_TIER="$(normalize_profile_tier "${CAP_RECOMMENDED_TIER:-}")" @@ -306,7 +343,26 @@ if [[ -z "$TIER" ]]; then fi elif [[ $GPU_VRAM -ge 90000 ]]; then TIER="NV_ULTRA" - elif [[ $GPU_COUNT -ge 2 ]] || [[ $GPU_VRAM -ge 40000 ]]; then + elif [[ $GPU_COUNT -ge 2 ]]; then + # Enhanced multi-GPU tier assignment based on topology + if [[ "$GPU_HAS_NVLINK" == "true" ]]; then + # High-bandwidth interconnect (NVLink) + if [[ $GPU_COUNT -ge 4 || $GPU_TOTAL_VRAM -ge 90000 ]]; then + TIER="NV_ULTRA" + else + TIER=4 + fi + else + # PCIe or other interconnect + if [[ $GPU_COUNT -ge 4 ]]; then + TIER=4 + elif [[ $GPU_TOTAL_VRAM -ge 40000 ]]; then + TIER=4 + else + TIER=3 + fi + fi + elif [[ $GPU_VRAM -ge 40000 ]]; then TIER=4 elif [[ $GPU_VRAM -ge 20000 ]] || [[ $RAM_GB -ge 96 ]]; then TIER=3 diff --git a/dream-server/installers/phases/03-features.sh b/dream-server/installers/phases/03-features.sh index 11d6a244..fb676e7b 100755 --- a/dream-server/installers/phases/03-features.sh +++ b/dream-server/installers/phases/03-features.sh @@ -6,10 +6,15 @@ # Purpose: Interactive feature selection menu # # Expects: INTERACTIVE, DRY_RUN, TIER, ENABLE_VOICE, ENABLE_WORKFLOWS, -# ENABLE_RAG, ENABLE_OPENCLAW, 
show_phase(), show_install_menu(), -# log(), warn(), signal() +# ENABLE_RAG, ENABLE_OPENCLAW, GPU_COUNT, GPU_BACKEND, +# GPU_TOPOLOGY_JSON, LLM_MODEL_SIZE_MB, SCRIPT_DIR, VERBOSE, DEBUG, +# GPU_INDICES, GPU_UUIDS (arrays from topology), +# show_phase(), show_install_menu(), chapter(), bootline(), +# success(), log(), warn(), error(), signal() # Provides: ENABLE_VOICE, ENABLE_WORKFLOWS, ENABLE_RAG, ENABLE_OPENCLAW, -# OPENCLAW_CONFIG +# OPENCLAW_CONFIG, GPU_ASSIGNMENT_JSON, +# LLAMA_SERVER_GPU_UUIDS, WHISPER_GPU_UUID, COMFYUI_GPU_UUID, +# EMBEDDINGS_GPU_UUID, LLAMA_ARG_SPLIT_MODE, LLAMA_ARG_TENSOR_SPLIT # # Modder notes: # Add new optional features to the Custom menu here. @@ -84,3 +89,279 @@ if [[ "$ENABLE_OPENCLAW" == "true" ]]; then fi log "All services enabled (core install)" + +# Early return if single gpu +if [[ "$GPU_COUNT" -le 1 ]]; then + log "Single GPU detected — skipping multi-GPU configuration." + return +fi + +# Multi-GPU Configuration + +# write $GPU_TOPOLOGY_JSON into a tmpfile to use by the commands +TOPOLOGY_FILE=$(mktemp /tmp/ds_gpu_topology.XXXXXX.json) +trap "rm -f $TOPOLOGY_FILE" EXIT +echo "$GPU_TOPOLOGY_JSON" > "$TOPOLOGY_FILE" + +ASSIGN_GPUS_SCRIPT="$SCRIPT_DIR/scripts/assign_gpus.py" + +# Validate topology gpu_count matches installer's GPU_COUNT (don't overwrite the canonical value) +_topo_gpu_count=$(jq '.gpu_count // 0' "$TOPOLOGY_FILE") +if [[ "$_topo_gpu_count" != "$GPU_COUNT" ]]; then + warn "Topology gpu_count ($_topo_gpu_count) differs from detected GPU_COUNT ($GPU_COUNT) — using detected value" +fi +VENDOR=$(jq -r '.vendor' "$TOPOLOGY_FILE") + +# Build GPU arrays keyed by actual GPU index +# This ensures GPU_UUIDS[$idx] always maps to the correct GPU even if +# nvidia-smi returns GPUs out of index order. 
+declare -a GPU_INDICES=() +declare -A GPU_NAMES=() +declare -A GPU_VRAMS_GB=() +declare -A GPU_UUIDS=() +while IFS=$'\t' read -r _idx _name _mem _uuid; do + GPU_INDICES+=("$_idx") + GPU_NAMES["$_idx"]="$_name" + GPU_VRAMS_GB["$_idx"]="$_mem" + GPU_UUIDS["$_idx"]="$_uuid" +done < <(jq -r '.gpus[] | [.index, .name, .memory_gb, .uuid] | @tsv' "$TOPOLOGY_FILE") + +declare -A LINK_RANK +declare -A LINK_TYPE +while IFS=$'\t' read -r a b rank ltype; do + LINK_RANK["$a,$b"]=$rank + LINK_RANK["$b,$a"]=$rank + LINK_TYPE["$a,$b"]=$ltype + LINK_TYPE["$b,$a"]=$ltype +done < <(jq -r '.links[] | [.gpu_a, .gpu_b, .rank, .link_type] | @tsv' "$TOPOLOGY_FILE") + +# Automatic assignment +run_automatic() { + echo "" + chapter "AUTOMATIC GPU ASSIGNMENT" + echo -e " ${GRN}Running topology-aware assignment...${NC}" + echo "" + + local result + result=$(python3 "$ASSIGN_GPUS_SCRIPT" \ + --topology "$TOPOLOGY_FILE" --model-size "$LLM_MODEL_SIZE_MB" 2>&1) || { + echo -e " ${RED}Assignment failed:${NC}\n $result" + error "GPU assignment failed: $result" + } + + local strategy mode tp pp mem_util + strategy=$(echo "$result" | jq -r '.gpu_assignment.strategy') + mode=$(echo "$result" | jq -r '.gpu_assignment.services.llama_server.parallelism.mode') + tp=$(echo "$result" | jq -r '.gpu_assignment.services.llama_server.parallelism.tensor_parallel_size') + pp=$(echo "$result" | jq -r '.gpu_assignment.services.llama_server.parallelism.pipeline_parallel_size') + mem_util=$(echo "$result" | jq -r '.gpu_assignment.services.llama_server.parallelism.gpu_memory_utilization') + + GPU_ASSIGNMENT_JSON="$result" + success "Assignment complete" + echo "" + echo -e " ${WHT}Strategy:${NC} ${BGRN}${strategy}${NC}" + echo -e " ${WHT}Llama mode:${NC} ${BGRN}${mode}${NC}" + echo "" + echo -e " ${WHT}Service assignments:${NC}" + + for svc in llama_server whisper comfyui embeddings; do + local labels="" + while IFS= read -r uuid; do + for i in "${GPU_INDICES[@]}"; do + [[ "${GPU_UUIDS[$i]}" == "$uuid" ]] && 
labels+="GPU${i} " + done + done < <(echo "$result" | jq -r ".gpu_assignment.services.${svc}.gpus[]" 2>/dev/null) + [[ -n "$labels" ]] && printf " ${AMB}*${NC} %-16s ${BGRN}%s${NC}\n" "$svc" "$labels" + done + + _show_json "$result" +} + +# Custom assignment +run_custom() { + [[ "$INTERACTIVE" == "true" ]] || { warn "run_custom called in non-interactive mode — skipping."; return; } + echo "" + chapter "CUSTOM GPU ASSIGNMENT" + echo -e " ${GRN}Assign GPUs to each service manually.${NC}" + echo -e " ${DIM}whisper / comfyui / embeddings: 1 GPU each. llama_server: 1 or more.${NC}" + echo "" + + declare -A CUSTOM_ASSIGNMENT + for svc in whisper comfyui embeddings; do + local valid=false + while ! $valid; do + read -rp " GPU for ${WHT}${svc}${NC} (0-$((GPU_COUNT-1))): " chosen + if [[ "$chosen" =~ ^[0-9]+$ ]] && [[ $chosen -ge 0 ]] && [[ $chosen -lt $GPU_COUNT ]]; then + CUSTOM_ASSIGNMENT[$svc]=$chosen; valid=true + else + warn " Invalid -- enter a number between 0 and $((GPU_COUNT-1))." + fi + done + done + + echo "" + local used=("${CUSTOM_ASSIGNMENT[whisper]}" "${CUSTOM_ASSIGNMENT[comfyui]}" "${CUSTOM_ASSIGNMENT[embeddings]}") + local default_llama="" + for idx in "${GPU_INDICES[@]}"; do + local found=false + for u in "${used[@]}"; do [[ "$u" == "$idx" ]] && found=true; done + $found || default_llama+="${idx}," + done + default_llama="${default_llama%,}" + + read -rp " GPUs for ${WHT}llama_server${NC} [${default_llama}]: " llama_input + llama_input="${llama_input:-$default_llama}" + IFS=',' read -ra LLAMA_GPUS_CUSTOM <<< "$llama_input" + for g in "${LLAMA_GPUS_CUSTOM[@]}"; do + [[ "$g" =~ ^[0-9]+$ ]] && [[ $g -lt $GPU_COUNT ]] || error "Invalid GPU index '$g'" + done + + echo "" + echo -e " ${WHT}Assignment:${NC}" + printf " ${AMB}*${NC} %-16s ${BGRN}" "llama_server" + for g in "${LLAMA_GPUS_CUSTOM[@]}"; do printf "GPU%s " "$g"; done + printf "${NC}\n" + for svc in whisper comfyui embeddings; do + printf " ${AMB}*${NC} %-16s ${BGRN}GPU%s${NC}\n" "$svc" 
"${CUSTOM_ASSIGNMENT[$svc]}" + done + + local all_assigned=("${LLAMA_GPUS_CUSTOM[@]}" "${CUSTOM_ASSIGNMENT[whisper]}" \ + "${CUSTOM_ASSIGNMENT[comfyui]}" "${CUSTOM_ASSIGNMENT[embeddings]}") + local unique; unique=$(printf '%s\n' "${all_assigned[@]}" | sort -u | wc -l) + local strategy="dedicated" + [[ $unique -lt ${#all_assigned[@]} ]] && strategy="colocated" + [[ $GPU_COUNT -eq 1 ]] && strategy="single" + + local n=${#LLAMA_GPUS_CUSTOM[@]} + local min_rank=100 + if [[ $n -gt 1 ]]; then + for ((x=0; x 0 + then $ts | map(tostring) | join(",") + else ($svc.gpus | length) as $n | + if $n > 1 then [range($n) | 1] | map(tostring) | join(",") + else "1" + end + end') + +rm -f "$TOPOLOGY_FILE" diff --git a/dream-server/installers/phases/04-requirements.sh b/dream-server/installers/phases/04-requirements.sh index 20c2673d..17c38a01 100755 --- a/dream-server/installers/phases/04-requirements.sh +++ b/dream-server/installers/phases/04-requirements.sh @@ -20,6 +20,7 @@ dream_progress 25 "requirements" "Checking system requirements" chapter "REQUIREMENTS CHECK" [[ -f "${SCRIPT_DIR:-}/lib/safe-env.sh" ]] && . "${SCRIPT_DIR}/lib/safe-env.sh" +[[ -f "$SCRIPT_DIR/lib/service-registry.sh" ]] && . 
"$SCRIPT_DIR/lib/service-registry.sh" REQUIREMENTS_MET=true TIER_RANK="$(tier_rank "$TIER")" diff --git a/dream-server/installers/phases/06-directories.sh b/dream-server/installers/phases/06-directories.sh index 14f89a9f..e44a1f8b 100755 --- a/dream-server/installers/phases/06-directories.sh +++ b/dream-server/installers/phases/06-directories.sh @@ -11,12 +11,14 @@ # LLM_MODEL, MAX_CONTEXT, GGUF_FILE, COMPOSE_FLAGS, # ENABLE_VOICE, ENABLE_WORKFLOWS, ENABLE_RAG, ENABLE_OPENCLAW, # OPENCLAW_CONFIG, OPENCLAW_PROVIDER_NAME_DEFAULT, -# OPENCLAW_PROVIDER_URL_DEFAULT, +# OPENCLAW_PROVIDER_URL_DEFAULT, GPU_ASSIGNMENT_JSON, +# COMFYUI_GPU_UUID, WHISPER_GPU_UUID, EMBEDDINGS_GPU_UUID, +# LLAMA_SERVER_GPU_UUIDS, LLAMA_ARG_SPLIT_MODE, LLAMA_ARG_TENSOR_SPLIT, # chapter(), ai(), ai_ok(), ai_warn(), log(), warn(), error() # Provides: WEBUI_SECRET, N8N_PASS, LITELLM_KEY, LIVEKIT_SECRET, # DASHBOARD_API_KEY, OPENCODE_SERVER_PASSWORD, OPENCLAW_TOKEN, # OPENCLAW_PROVIDER_NAME, OPENCLAW_PROVIDER_URL, OPENCLAW_MODEL, -# OPENCLAW_CONTEXT +# OPENCLAW_CONTEXT, GPU_ASSIGNMENT_JSON_B64 (in .env) # # Modder notes: # This is the largest phase. Modify .env generation, add new config files, @@ -287,6 +289,12 @@ MODELS_EOF ANTHROPIC_API_KEY=$(_env_get ANTHROPIC_API_KEY "${ANTHROPIC_API_KEY:-}") OPENAI_API_KEY=$(_env_get OPENAI_API_KEY "${OPENAI_API_KEY:-}") TOGETHER_API_KEY=$(_env_get TOGETHER_API_KEY "${TOGETHER_API_KEY:-}") + # Base64-encode GPU assignment JSON for safe .env storage + if [[ -n "$GPU_ASSIGNMENT_JSON" && "$GPU_ASSIGNMENT_JSON" != "{}" ]]; then + GPU_ASSIGNMENT_JSON_B64=$(echo "$GPU_ASSIGNMENT_JSON" | jq -c '.' 
| base64 -w0) + else + GPU_ASSIGNMENT_JSON_B64="" + fi # Generate .env file cat > "$INSTALL_DIR/.env" << ENV_EOF @@ -400,6 +408,16 @@ LANGFUSE_INIT_USER_PASSWORD=${LANGFUSE_INIT_USER_PASSWORD} # ── Image Generation ── ENABLE_IMAGE_GENERATION=${ENABLE_COMFYUI:-true} + +#=== Multi-GPU Settings === +GPU_ASSIGNMENT_JSON_B64=${GPU_ASSIGNMENT_JSON_B64:-} +COMFYUI_GPU_UUID=${COMFYUI_GPU_UUID:-} +WHISPER_GPU_UUID=${WHISPER_GPU_UUID:-} +EMBEDDINGS_GPU_UUID=${EMBEDDINGS_GPU_UUID:-} +LLAMA_SERVER_GPU_UUIDS=${LLAMA_SERVER_GPU_UUIDS:-} +LLAMA_ARG_SPLIT_MODE=${LLAMA_ARG_SPLIT_MODE:-none} +LLAMA_ARG_TENSOR_SPLIT=${LLAMA_ARG_TENSOR_SPLIT:-} + ENV_EOF chmod 600 "$INSTALL_DIR/.env" # Secure secrets file diff --git a/dream-server/scripts/assign_gpus.py b/dream-server/scripts/assign_gpus.py new file mode 100644 index 00000000..bd8e40a0 --- /dev/null +++ b/dream-server/scripts/assign_gpus.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python3 +""" +assign_gpus.py — GPU assignment algorithm for DreamServer + +Usage: + python3 assign_gpus.py --topology topo.json --model-size 70000 + python3 assign_gpus.py --topology topo.json --model-size 70000 --enabled-services llama_server,whisper + +Output: gpu_assignment JSON to stdout +Errors: to stderr, exit code 1 +""" + +import argparse +import json +import math +import sys +from dataclasses import dataclass, field +from itertools import combinations +from typing import Optional + + +# Constants + +HIGH_BW_THRESHOLD = 80 # min rank for NVLink / XGMI +DEFAULT_SERVICES = ["llama_server", "whisper", "comfyui", "embeddings"] +NON_LLAMA = ["whisper", "comfyui", "embeddings"] + + +# Data Models + +@dataclass +class GPU: + index: int + uuid: str + name: str + memory_mb: float + +@dataclass +class Link: + gpu_a: int + gpu_b: int + link_type: str + link_label: str + rank: int + +@dataclass +class Subset: + gpus: list + min_link_rank: int + total_vram_mb: float + all_pairs_highbw: bool + +@dataclass +class LlamaParallelism: + mode: str + tensor_parallel_size: int + 
pipeline_parallel_size: int + gpu_memory_utilization: float + tensor_split: Optional[list] = None + +@dataclass +class ServiceAssignment: + gpus: list + parallelism: Optional[LlamaParallelism] = None + +@dataclass +class AssignmentResult: + strategy: str + services: dict + + +# Phase 1: Topology Analysis + +def parse_gpus(topology: dict) -> list: + gpus = [] + for g in topology["gpus"]: + gpus.append(GPU( + index=g["index"], + uuid=g["uuid"], + name=g["name"], + memory_mb=g["memory_gb"] * 1024, + )) + return gpus + + +def parse_links(topology: dict) -> list: + links = [] + for link in topology.get("links", []): + links.append(Link( + gpu_a=link["gpu_a"], + gpu_b=link["gpu_b"], + link_type=link["link_type"], + link_label=link["link_label"], + rank=link["rank"], + )) + return links + + +def build_rank_matrix(links: list) -> dict: + """ + rank_matrix[(min_idx, max_idx)] = rank + Pairs not in links default to 0. + """ + matrix = {} + for link in links: + key = (min(link.gpu_a, link.gpu_b), max(link.gpu_a, link.gpu_b)) + matrix[key] = link.rank + return matrix + + +def get_rank(rank_matrix: dict, a: int, b: int) -> int: + return rank_matrix.get((min(a, b), max(a, b)), 0) + + +def compute_subset(gpus: list, rank_matrix: dict) -> Subset: + """ + Compute a Subset from a list of GPUs. + Single GPU: min_link_rank=0, all_pairs_highbw=True (no links needed). + """ + if len(gpus) == 1: + return Subset( + gpus=gpus, + min_link_rank=0, + total_vram_mb=gpus[0].memory_mb, + all_pairs_highbw=True, + ) + + indices = [g.index for g in gpus] + ranks = [get_rank(rank_matrix, a, b) for a, b in combinations(indices, 2)] + min_rank = min(ranks) + + return Subset( + gpus=gpus, + min_link_rank=min_rank, + total_vram_mb=sum(g.memory_mb for g in gpus), + all_pairs_highbw=(min_rank >= HIGH_BW_THRESHOLD), + ) + + +def enumerate_subsets(gpus: list, rank_matrix: dict) -> list: + """ + Generate all non-empty subsets of GPUs, ordered by: + 1. min_link_rank DESC (topology quality) + 2. 
subset size ASC (prefer fewer GPUs, leave more for services) + 3. total_vram DESC (tiebreaker) + """ + all_subsets = [] + for size in range(1, len(gpus) + 1): + for combo in combinations(gpus, size): + all_subsets.append(compute_subset(list(combo), rank_matrix)) + + return sorted( + all_subsets, + key=lambda s: (s.min_link_rank, -len(s.gpus), s.total_vram_mb), + reverse=True, + ) + + +# Phase 2: GPU Assignment + +def find_llama_subset(ordered_subsets: list, model_size_mb: float) -> Subset: + """ + Pick the best-ranked subset whose total VRAM covers model_size_mb. + Returns the first match (best topology, smallest size, most VRAM). + """ + for subset in ordered_subsets: + if subset.total_vram_mb >= model_size_mb: + return subset + return None + + +def span_subsets(all_gpus: list, rank_matrix: dict, model_size_mb: float, ordered_subsets: list) -> Subset: + """ + No single subset covers model_size_mb. + Take the best subset, then greedily add GPUs from the remaining pool + (ordered by memory_mb DESC) until VRAM is covered. + Recomputes min_link_rank on the combined set. + """ + best = ordered_subsets[0] + accumulated = list(best.gpus) + used = {g.index for g in accumulated} + + remaining = sorted( + [g for g in all_gpus if g.index not in used], + key=lambda g: g.memory_mb, + reverse=True, + ) + + for gpu in remaining: + accumulated.append(gpu) + candidate = compute_subset(accumulated, rank_matrix) + if candidate.total_vram_mb >= model_size_mb: + return candidate + + raise ValueError( + f"Model size {model_size_mb:.0f}MB exceeds total available VRAM " + f"({sum(g.memory_mb for g in all_gpus):.0f}MB across all GPUs)." + ) + + +def assign_services(all_gpus: list, llama_gpus: list, rank_matrix: dict, enabled_services: list) -> tuple: + """ + Assign remaining GPUs to non-llama services. + Returns (service_assignments dict, final_llama_gpus list, strategy str). 
+ + Rules: + remaining == 0 → all 3 services share llama's last GPU → colocated + remaining == 1 → all 3 services share remaining[0] → colocated + remaining == 2 → whisper → [0], comfyui+embeddings → [1] → colocated + remaining >= 3 → whisper → [0], comfyui → [1], emb → [2] → dedicated + remaining[3:] → back to llama + """ + llama_indices = {g.index for g in llama_gpus} + remaining = sorted( + [g for g in all_gpus if g.index not in llama_indices], + key=lambda g: g.memory_mb, + reverse=True, + ) + + active_non_llama = [s for s in NON_LLAMA if s in enabled_services] + assignments = {} + final_llama_gpus = list(llama_gpus) + + if len(remaining) == 0: + fallback = llama_gpus[-1] + for s in active_non_llama: + assignments[s] = ServiceAssignment(gpus=[fallback]) + strategy = "colocated" + + elif len(remaining) == 1: + for s in active_non_llama: + assignments[s] = ServiceAssignment(gpus=[remaining[0]]) + strategy = "colocated" + + elif len(remaining) == 2: + if "whisper" in enabled_services: assignments["whisper"] = ServiceAssignment(gpus=[remaining[0]]) + if "comfyui" in enabled_services: assignments["comfyui"] = ServiceAssignment(gpus=[remaining[1]]) + if "embeddings" in enabled_services: assignments["embeddings"] = ServiceAssignment(gpus=[remaining[1]]) + strategy = "colocated" + + else: + if "whisper" in enabled_services: assignments["whisper"] = ServiceAssignment(gpus=[remaining[0]]) + if "comfyui" in enabled_services: assignments["comfyui"] = ServiceAssignment(gpus=[remaining[1]]) + if "embeddings" in enabled_services: assignments["embeddings"] = ServiceAssignment(gpus=[remaining[2]]) + # Push extras back to llama so no GPU sits idle + if len(remaining) > 3: + final_llama_gpus = final_llama_gpus + remaining[3:] + strategy = "dedicated" + + assignments["llama_server"] = ServiceAssignment(gpus=final_llama_gpus) + return assignments, final_llama_gpus, strategy + + +# Phase 3: Llama Parallelism + +def largest_pow2_divisor(n: int) -> int: + """ + Find the largest power 
of 2 p such that: + - p divides n evenly + - p <= sqrt(n) (keeps tensor_size <= pipeline_size for balance) + Minimum return value is 2 (hybrid requires at least 2 tensor groups). + """ + p = 1 + while True: + candidate = p * 2 + if candidate > n or n % candidate != 0: + break + if candidate > math.sqrt(n): + break + p = candidate + return max(2, p) + + +def is_heterogeneous(gpus: list) -> bool: + vrams = [g.memory_mb for g in gpus] + return max(vrams) != min(vrams) + + +def compute_tensor_split(gpus: list) -> list: + """Proportional VRAM weights, rounded to 4 decimal places.""" + total = sum(g.memory_mb for g in gpus) + return [round(g.memory_mb / total, 4) for g in gpus] + + +def select_parallelism(subset: Subset) -> LlamaParallelism: + """ + Select parallelism mode based on GPU count and min_link_rank. + + Thresholds: + rank >= 80 → NVLink / XGMI → tensor or hybrid + rank 11-79 → same-NUMA PCIe → pipeline, or hybrid if rank >= 40 and >= 4 GPUs + rank <= 10 → cross-NUMA → pipeline only + """ + gpus = subset.gpus + n = len(gpus) + rank = subset.min_link_rank + split = compute_tensor_split(gpus) if is_heterogeneous(gpus) else None + + # Single GPU + if n == 1: + return LlamaParallelism( + mode="none", + tensor_parallel_size=1, + pipeline_parallel_size=1, + gpu_memory_utilization=0.95, + ) + + # High-bandwidth (NVLink / XGMI) + if rank >= HIGH_BW_THRESHOLD: + if n <= 3: + return LlamaParallelism( + mode="tensor", + tensor_parallel_size=n, + pipeline_parallel_size=1, + gpu_memory_utilization=0.92, + tensor_split=split, + ) + else: + tp = largest_pow2_divisor(n) + pp = n // tp + return LlamaParallelism( + mode="hybrid", + tensor_parallel_size=tp, + pipeline_parallel_size=pp, + gpu_memory_utilization=0.93, + tensor_split=split, + ) + + # Cross-NUMA PCIe + if rank <= 10: + return LlamaParallelism( + mode="pipeline", + tensor_parallel_size=1, + pipeline_parallel_size=n, + gpu_memory_utilization=0.95, + ) + + # Same-NUMA PCIe (rank 11-79) + if n <= 3: + return 
LlamaParallelism( + mode="pipeline", + tensor_parallel_size=1, + pipeline_parallel_size=n, + gpu_memory_utilization=0.95, + ) + else: + if rank >= 40: + tp = largest_pow2_divisor(n) + pp = n // tp + return LlamaParallelism( + mode="hybrid", + tensor_parallel_size=tp, + pipeline_parallel_size=pp, + gpu_memory_utilization=0.93, + tensor_split=split, + ) + else: + return LlamaParallelism( + mode="pipeline", + tensor_parallel_size=1, + pipeline_parallel_size=n, + gpu_memory_utilization=0.95, + ) + + +# Phase 4: Build Output JSON + +def build_output(result: AssignmentResult) -> dict: + services = {} + + for name, assignment in result.services.items(): + entry = {"gpus": [g.uuid for g in assignment.gpus]} + + if assignment.parallelism: + p = assignment.parallelism + para = { + "mode": p.mode, + "tensor_parallel_size": p.tensor_parallel_size, + "pipeline_parallel_size": p.pipeline_parallel_size, + "gpu_memory_utilization": p.gpu_memory_utilization, + } + if p.tensor_split is not None: + para["tensor_split"] = p.tensor_split + entry["parallelism"] = para + + services[name] = entry + + return { + "gpu_assignment": { + "version": "1.0", + "strategy": result.strategy, + "services": services, + } + } + + +# Entry Point + +def main(): + parser = argparse.ArgumentParser(description="GPU assignment algorithm for DreamServer") + parser.add_argument("--topology", required=True, help="Path to topology JSON file") + parser.add_argument("--model-size", required=True, type=float, help="Model size in MB") + parser.add_argument("--enabled-services", default=",".join(DEFAULT_SERVICES), + help="Comma-separated list of enabled services") + args = parser.parse_args() + + # Load topology + try: + with open(args.topology) as f: + topology = json.load(f) + except FileNotFoundError: + print(f"ERROR: topology file not found: {args.topology}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"ERROR: invalid JSON in topology file: {e}", file=sys.stderr) + sys.exit(1) + + 
enabled_services = [s.strip() for s in args.enabled_services.split(",")] + model_size_mb = args.model_size + gpu_count = topology.get("gpu_count", 0) + + if gpu_count == 0: + print("ERROR: no GPUs found in topology", file=sys.stderr) + sys.exit(1) + + # Early exit: single GPU + if gpu_count == 1: + gpu = parse_gpus(topology)[0] + if model_size_mb > gpu.memory_mb: + print( + f"ERROR: Model size {model_size_mb:.0f}MB exceeds total available VRAM " + f"({gpu.memory_mb:.0f}MB across all GPUs).", + file=sys.stderr, + ) + sys.exit(1) + parallelism = LlamaParallelism( + mode="none", + tensor_parallel_size=1, + pipeline_parallel_size=1, + gpu_memory_utilization=0.95, + ) + services = {} + for s in enabled_services: + services[s] = ServiceAssignment(gpus=[gpu]) + services["llama_server"].parallelism = parallelism + result = AssignmentResult(strategy="single", services=services) + print(json.dumps(build_output(result), indent=2)) + return + + # Phase 1: Topology analysis + gpus = parse_gpus(topology) + links = parse_links(topology) + rank_matrix = build_rank_matrix(links) + ordered = enumerate_subsets(gpus, rank_matrix) + + # Phase 2: GPU assignment + try: + llama_subset = find_llama_subset(ordered, model_size_mb) + if llama_subset is None: + llama_subset = span_subsets(gpus, rank_matrix, model_size_mb, ordered) + except ValueError as e: + print(f"ERROR: {e}", file=sys.stderr) + sys.exit(1) + + service_assignments, final_llama_gpus, strategy = assign_services( + gpus, llama_subset.gpus, rank_matrix, enabled_services + ) + + # Phase 3: Llama parallelism + final_subset = compute_subset(final_llama_gpus, rank_matrix) + parallelism = select_parallelism(final_subset) + service_assignments["llama_server"].parallelism = parallelism + + # Phase 4: Emit JSON + result = AssignmentResult(strategy=strategy, services=service_assignments) + print(json.dumps(build_output(result), indent=2)) + + +if __name__ == "__main__": + main() diff --git 
a/dream-server/scripts/build-capability-profile.sh b/dream-server/scripts/build-capability-profile.sh index 0c9906b6..6578a4cf 100755 --- a/dream-server/scripts/build-capability-profile.sh +++ b/dream-server/scripts/build-capability-profile.sh @@ -98,7 +98,7 @@ gpu_type = (gpu.get("type") or "none").lower() gpu_name = gpu.get("name") or "None" memory_type = (gpu.get("memory_type") or "none").lower() vram_mb = int(gpu.get("vram_mb") or 0) -gpu_count = 1 if gpu_type not in {"none", ""} else 0 +gpu_count = int(gpu.get("count") or (1 if gpu_type not in {"none", ""} else 0)) llm_health_url = f"http://localhost:{llm_port}{llm_health}" llm_api_port = llm_port diff --git a/dream-server/scripts/detect-hardware.sh b/dream-server/scripts/detect-hardware.sh index 7276a9e3..dc763892 100755 --- a/dream-server/scripts/detect-hardware.sh +++ b/dream-server/scripts/detect-hardware.sh @@ -161,6 +161,15 @@ parse_nvidia_vram_mb() { as_int "$mb" } +# Count NVIDIA GPUs +count_nvidia_gpus() { + if command -v nvidia-smi &>/dev/null; then + nvidia-smi --query-gpu=name --format=csv,noheader,nounits 2>/dev/null | wc -l | tr -d ' ' + else + echo "0" + fi +} + # Detect AMD GPU via sysfs (works without ROCm installed) # Returns: gpu_name|vram_bytes|gtt_bytes|is_apu|gpu_busy|temp|power|vulkan|rocm|driver|device_id|subsystem_device|revision detect_amd_sysfs() { @@ -255,6 +264,19 @@ detect_amd_sysfs() { return 1 } +# Count AMD GPUs via sysfs +count_amd_gpus() { + local count=0 + for card_dir in /sys/class/drm/card*/device; do + [[ -d "$card_dir" ]] || continue + local vendor + vendor=$(cat "$card_dir/vendor" 2>/dev/null) || continue + # (( 0++ )) returns exit 1 in bash, so || true prevents pipefail abort + [[ "$vendor" == "0x1002" ]] && (( count++ )) || true + done + echo "$count" +} + # Detect AMD GPU (legacy ROCm-only path) detect_amd() { # Try sysfs first (works without ROCm) @@ -429,6 +451,7 @@ main() { local gpu_name="" local gpu_vram_mb=0 + local gpu_count=0 local gpu_type="none" local 
gpu_architecture="" local memory_type="discrete" @@ -452,6 +475,7 @@ main() { if [[ -n "$nvidia_out" ]]; then gpu_name=$(echo "$nvidia_out" | awk -F',' '{gsub(/^[ \t]+|[ \t]+$/,"",$1); print $1}' | xargs || true) gpu_vram_mb=$(parse_nvidia_vram_mb "$nvidia_out") + gpu_count=$(count_nvidia_gpus) gpu_type="nvidia" gpu_architecture="cuda" memory_type="discrete" @@ -470,6 +494,7 @@ main() { gtt_bytes=$(as_int "$gtt_bytes") gpu_vram_mb=$(( vram_bytes / 1048576 )) + gpu_count=$(count_amd_gpus) gpu_type="amd" gpu_temp=$(as_int "$temp") gpu_power=$(as_int "$power") @@ -503,6 +528,7 @@ main() { if [[ -n "$apple_out" ]]; then gpu_name="Apple Silicon (Unified Memory)" gpu_vram_mb=$((ram * 1024)) + gpu_count=1 gpu_type="apple" gpu_architecture="apple-unified" memory_type="unified" @@ -546,6 +572,7 @@ main() { "name": "$esc_gpu", "architecture": "$(json_escape "$gpu_architecture")", "memory_type": "$(json_escape "$memory_type")", + "count": $gpu_count, "vram_mb": $gpu_vram_mb, "vram_gb": $gpu_vram_gb, "device_id": "$(json_escape "$device_id")", diff --git a/dream-server/scripts/resolve-compose-stack.sh b/dream-server/scripts/resolve-compose-stack.sh index a4b0af99..b277a555 100755 --- a/dream-server/scripts/resolve-compose-stack.sh +++ b/dream-server/scripts/resolve-compose-stack.sh @@ -7,6 +7,7 @@ GPU_BACKEND="nvidia" PROFILE_OVERLAYS="" ENV_MODE="false" SKIP_BROKEN="false" +GPU_COUNT="1" while [[ $# -gt 0 ]]; do case "$1" in @@ -34,6 +35,10 @@ while [[ $# -gt 0 ]]; do ENV_MODE="true" shift ;; + --gpu-count) + GPU_COUNT="${2:-$GPU_COUNT}" + shift 2 + ;; *) echo "Unknown argument: $1" >&2 exit 1 @@ -50,7 +55,7 @@ elif command -v python >/dev/null 2>&1; then PYTHON_CMD="python" fi -"$PYTHON_CMD" - "$SCRIPT_DIR" "$TIER" "$GPU_BACKEND" "$PROFILE_OVERLAYS" "$ENV_MODE" "$SKIP_BROKEN" <<'PY' +"$PYTHON_CMD" - "$SCRIPT_DIR" "$TIER" "$GPU_BACKEND" "$PROFILE_OVERLAYS" "$ENV_MODE" "$SKIP_BROKEN" "$GPU_COUNT" <<'PY' import os import pathlib import sys @@ -63,6 +68,7 @@ profile_overlays = 
[x.strip() for x in (sys.argv[4] or "").split(",") if x.strip env_mode = (sys.argv[5] or "false").lower() == "true" skip_broken = (sys.argv[6] or "false").lower() == "true" dream_mode = os.environ.get("DREAM_MODE", "local").lower() +gpu_count = int(sys.argv[7] or "1") def existing(overlays): return all((script_dir / f).exists() for f in overlays) @@ -113,6 +119,10 @@ else: if not resolved: resolved = [primary] +# Multi-GPU overlay if we have more than 1 GPU. +if gpu_count > 1 and (script_dir / "docker-compose.multigpu.yml").exists(): + resolved.append("docker-compose.multigpu.yml") + # Discover enabled extension compose fragments via manifests ext_dir = script_dir / "extensions" / "services" if ext_dir.exists(): @@ -161,11 +171,19 @@ if ext_dir.exists(): gpu_overlay = service_dir / f"compose.{gpu_backend}.yaml" if gpu_overlay.exists(): resolved.append(str(gpu_overlay.relative_to(script_dir))) + # Mode-specific overlay — depends_on for local/hybrid mode only if dream_mode in ("local", "hybrid", "lemonade"): local_mode_overlay = service_dir / "compose.local.yaml" if local_mode_overlay.exists(): resolved.append(str(local_mode_overlay.relative_to(script_dir))) + + # Multi-GPU overlay if we have more than 1 GPU + if gpu_count > 1: + multi_gpu_overlay = service_dir / "compose.multigpu.yaml" + if multi_gpu_overlay.exists(): + resolved.append(str(multi_gpu_overlay.relative_to(script_dir))) + except Exception as e: # Narrow exception handling to specific parse/structure errors yaml_error = yaml_available and hasattr(yaml, 'YAMLError') and isinstance(e, yaml.YAMLError) diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_1gpu_pcie.json b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_1gpu_pcie.json new file mode 100644 index 00000000..5c491a2f --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_1gpu_pcie.json @@ -0,0 +1,18 @@ +{ + "vendor": "nvidia", + "gpu_count": 1, + "driver_version": 
"535.129.03", + "mig_enabled": false, + "numa": {}, + "gpus": [ + { + "index": 0, + "name": "NVIDIA RTX 4090", + "memory_gb": 24.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-12345678-1234-1234-1234-123456789012" + } + ], + "links": [] +} diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_2gpus_phb_coloc.json b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_2gpus_phb_coloc.json new file mode 100644 index 00000000..199cf6eb --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_2gpus_phb_coloc.json @@ -0,0 +1,34 @@ +{ + "vendor": "nvidia", + "gpu_count": 2, + "driver_version": "535.129.03", + "mig_enabled": false, + "numa": {}, + "gpus": [ + { + "index": 0, + "name": "NVIDIA RTX 4090", + "memory_gb": 24.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-00000000-0000-0000-0000-000000000000" + }, + { + "index": 1, + "name": "NVIDIA RTX 4090", + "memory_gb": 24.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-11111111-1111-1111-1111-111111111111" + } + ], + "links": [ + { + "gpu_a": 0, + "gpu_b": 1, + "link_type": "PHB", + "link_label": "PCIe-HostBridge", + "rank": 30 + } + ] +} diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_4gpus_soc.json b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_4gpus_soc.json new file mode 100644 index 00000000..ab9d7c2d --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_4gpus_soc.json @@ -0,0 +1,85 @@ +{ + "vendor": "nvidia", + "gpu_count": 4, + "driver_version": "535.129.03", + "mig_enabled": false, + "numa": {}, + "gpus": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-00000000-0000-0000-0000-000000000000" + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": 
"GPU-11111111-1111-1111-1111-111111111111" + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-22222222-2222-2222-2222-222222222222" + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-33333333-3333-3333-3333-333333333333" + } + ], + "links": [ + { + "gpu_a": 0, + "gpu_b": 1, + "link_type": "PHB", + "link_label": "PCIe-HostBridge", + "rank": 30 + }, + { + "gpu_a": 0, + "gpu_b": 2, + "link_type": "SOC", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 0, + "gpu_b": 3, + "link_type": "SOC", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 1, + "gpu_b": 2, + "link_type": "SOC", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 1, + "gpu_b": 3, + "link_type": "SOC", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 2, + "gpu_b": 3, + "link_type": "PHB", + "link_label": "PCIe-HostBridge", + "rank": 30 + } + ] +} diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.json b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.json new file mode 100644 index 00000000..cf29a92e --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.json @@ -0,0 +1,85 @@ +{ + "vendor": "nvidia", + "gpu_count": 4, + "driver_version": "535.129.03", + "mig_enabled": false, + "numa": {}, + "gpus": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-00000000-0000-0000-0000-000000000000" + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-11111111-1111-1111-1111-111111111111" + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + 
"pcie_width": "16", + "uuid": "GPU-22222222-2222-2222-2222-222222222222" + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-33333333-3333-3333-3333-333333333333" + } + ], + "links": [ + { + "gpu_a": 0, + "gpu_b": 1, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 2, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 0, + "gpu_b": 3, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 1, + "gpu_b": 2, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 1, + "gpu_b": 3, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 2, + "gpu_b": 3, + "link_type": "NODE", + "link_label": "SameNUMA-NoBridge", + "rank": 20 + } + ] +} diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.json b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.json new file mode 100644 index 00000000..ed17f2a6 --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.json @@ -0,0 +1,58 @@ +{ + "vendor": "nvidia", + "gpu_count": 5, + "driver_version": "535.129.03", + "mig_enabled": false, + "numa": {}, + "gpus": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-00000000-0000-0000-0000-000000000000" + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-11111111-1111-1111-1111-111111111111" + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-22222222-2222-2222-2222-222222222222" + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + 
"pcie_width": "16", + "uuid": "GPU-33333333-3333-3333-3333-333333333333" + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-44444444-4444-4444-4444-444444444444" + } + ], + "links": [ + { + "gpu_a": 0, + "gpu_b": 1, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + } + ] +} diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json new file mode 100644 index 00000000..58f45ab4 --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json @@ -0,0 +1,271 @@ +{ + "vendor": "nvidia", + "gpu_count": 8, + "driver_version": "535.129.03", + "mig_enabled": false, + "numa": {}, + "gpus": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-00000000-0000-0000-0000-000000000000" + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-11111111-1111-1111-1111-111111111111" + }, + { + "index": 2, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-22222222-2222-2222-2222-222222222222" + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-33333333-3333-3333-3333-333333333333" + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-44444444-4444-4444-4444-444444444444" + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-55555555-5555-5555-5555-555555555555" + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": 
"4", + "pcie_width": "16", + "uuid": "GPU-66666666-6666-6666-6666-666666666666" + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-77777777-7777-7777-7777-777777777777" + } + ], + "links": [ + { + "gpu_a": 0, + "gpu_b": 1, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 2, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 3, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 4, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 2, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 3, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 4, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 3, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 4, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + 
"gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 4, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 4, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 4, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 4, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 5, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 5, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 6, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + } + ] +} diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.json b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.json new file mode 100644 index 00000000..63074173 --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.json @@ -0,0 +1,271 @@ +{ + "vendor": "nvidia", + "gpu_count": 8, + "driver_version": "535.129.03", + "mig_enabled": false, + "numa": { "nodes": 2 }, + "gpus": [ + { + "index": 0, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-00000000-0000-0000-0000-000000000000" + }, + { + "index": 1, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-11111111-1111-1111-1111-111111111111" + }, + { + "index": 2, 
+ "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-22222222-2222-2222-2222-222222222222" + }, + { + "index": 3, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-33333333-3333-3333-3333-333333333333" + }, + { + "index": 4, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-44444444-4444-4444-4444-444444444444" + }, + { + "index": 5, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-55555555-5555-5555-5555-555555555555" + }, + { + "index": 6, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-66666666-6666-6666-6666-666666666666" + }, + { + "index": 7, + "name": "NVIDIA A100-SXM4-80GB", + "memory_gb": 80.0, + "pcie_gen": "4", + "pcie_width": "16", + "uuid": "GPU-77777777-7777-7777-7777-777777777777" + } + ], + "links": [ + { + "gpu_a": 0, + "gpu_b": 1, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 2, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 3, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 4, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 0, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 2, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 3, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 4, + "link_type": "NV12", + 
"link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 1, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 3, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 4, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 2, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 4, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 3, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 4, + "gpu_b": 5, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 4, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 4, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 5, + "gpu_b": 6, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 5, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + }, + { + "gpu_a": 6, + "gpu_b": 7, + "link_type": "NV12", + "link_label": "NVLink", + "rank": 100 + } + ] +} diff --git a/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.json 
b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.json new file mode 100644 index 00000000..949ff1a8 --- /dev/null +++ b/dream-server/tests/fixtures/topology_json/nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.json @@ -0,0 +1,271 @@ +{ + "vendor": "nvidia", + "gpu_count": 8, + "driver_version": "535.129.03", + "mig_enabled": false, + "numa": { "nodes": 2 }, + "gpus": [ + { + "index": 0, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-00000000-0000-0000-0000-000000000000" + }, + { + "index": 1, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-11111111-1111-1111-1111-111111111111" + }, + { + "index": 2, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-22222222-2222-2222-2222-222222222222" + }, + { + "index": 3, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-33333333-3333-3333-3333-333333333333" + }, + { + "index": 4, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-44444444-4444-4444-4444-444444444444" + }, + { + "index": 5, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-55555555-5555-5555-5555-555555555555" + }, + { + "index": 6, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-66666666-6666-6666-6666-666666666666" + }, + { + "index": 7, + "name": "NVIDIA V100-SXM2-32GB", + "memory_gb": 32.0, + "pcie_gen": "3", + "pcie_width": "16", + "uuid": "GPU-77777777-7777-7777-7777-777777777777" + } + ], + "links": [ + { + "gpu_a": 0, + "gpu_b": 1, + "link_type": "NV1", + "link_label": "NVLink", + "rank": 0 + }, + { + "gpu_a": 0, + "gpu_b": 2, + "link_type": "NV1", + "link_label": "NVLink", + 
"rank": 0 + }, + { + "gpu_a": 0, + "gpu_b": 3, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + }, + { + "gpu_a": 0, + "gpu_b": 4, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + }, + { + "gpu_a": 0, + "gpu_b": 5, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 0, + "gpu_b": 6, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 0, + "gpu_b": 7, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 1, + "gpu_b": 2, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + }, + { + "gpu_a": 1, + "gpu_b": 3, + "link_type": "NV1", + "link_label": "NVLink", + "rank": 0 + }, + { + "gpu_a": 1, + "gpu_b": 4, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 1, + "gpu_b": 5, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + }, + { + "gpu_a": 1, + "gpu_b": 6, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 1, + "gpu_b": 7, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 2, + "gpu_b": 3, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + }, + { + "gpu_a": 2, + "gpu_b": 4, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 2, + "gpu_b": 5, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 2, + "gpu_b": 6, + "link_type": "NV1", + "link_label": "NVLink", + "rank": 0 + }, + { + "gpu_a": 2, + "gpu_b": 7, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 3, + "gpu_b": 4, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 3, + "gpu_b": 5, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 3, + "gpu_b": 6, + "link_type": "SYS", + "link_label": "CrossNUMA", + "rank": 10 + }, + { + "gpu_a": 3, + "gpu_b": 7, + "link_type": "NV1", + "link_label": "NVLink", + 
"rank": 0 + }, + { + "gpu_a": 4, + "gpu_b": 5, + "link_type": "NV1", + "link_label": "NVLink", + "rank": 0 + }, + { + "gpu_a": 4, + "gpu_b": 6, + "link_type": "NV1", + "link_label": "NVLink", + "rank": 0 + }, + { + "gpu_a": 4, + "gpu_b": 7, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + }, + { + "gpu_a": 5, + "gpu_b": 6, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + }, + { + "gpu_a": 5, + "gpu_b": 7, + "link_type": "NV1", + "link_label": "NVLink", + "rank": 0 + }, + { + "gpu_a": 6, + "gpu_b": 7, + "link_type": "NV2", + "link_label": "NVLink", + "rank": 80 + } + ] +} diff --git a/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_1gpu_pcie.txt b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_1gpu_pcie.txt new file mode 100644 index 00000000..b8233177 --- /dev/null +++ b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_1gpu_pcie.txt @@ -0,0 +1,11 @@ + GPU0 CPU Affinity +GPU0 X 0-7 + +Legend: + + X = Self + SOC = Connection traversing PCIe as well as the SMP link between CPU sockets(e.g. QPI) + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks diff --git a/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_4gpus_soc.txt b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_4gpus_soc.txt new file mode 100644 index 00000000..553de1cd --- /dev/null +++ b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_4gpus_soc.txt @@ -0,0 +1,14 @@ + GPU0 GPU1 GPU2 GPU3 CPU Affinity +GPU0 X PHB SOC SOC 0-9,20-29 +GPU1 PHB X SOC SOC 0-9,20-29 +GPU2 SOC SOC X PHB 10-19,30-39 +GPU3 SOC SOC PHB X 10-19,30-39 + +Legend: + + X = Self + SOC = Connection traversing PCIe as well as the SMP link between CPU sockets(e.g. 
QPI) + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks diff --git a/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.txt b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.txt new file mode 100644 index 00000000..25e0cd9d --- /dev/null +++ b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.txt @@ -0,0 +1,17 @@ + GPU0 GPU1 GPU2 GPU3 mlx5_0 mlx5_1 CPU Affinity NUMA Affinity +GPU0 X NV12 SYS SYS NODE NODE 0-27,56-83 0 +GPU1 NV12 X SYS SYS NODE NODE 0-27,56-83 0 +GPU2 SYS SYS X NODE SYS SYS 28-55,84-111 1 +GPU3 SYS SYS NODE X SYS SYS 28-55,84-111 1 +mlx5_0 NODE NODE SYS SYS X PIX +mlx5_1 NODE NODE SYS SYS PIX X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks diff --git a/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.txt b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.txt new file mode 100644 index 00000000..6e662f1c --- /dev/null +++ b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.txt @@ -0,0 +1,16 @@ + GPU0 GPU1 mlx5_0 mlx5_1 mlx5_2 CPU Affinity NUMA Affinity +GPU0 X NV12 SYS SYS NODE 64-127,192-255 1 +GPU1 
NV12 X SYS SYS NODE 64-127,192-255 1 +mlx5_0 SYS SYS X NODE SYS +mlx5_1 SYS SYS NODE X SYS +mlx5_2 NODE NODE SYS SYS X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks diff --git a/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.txt b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.txt new file mode 100644 index 00000000..66f3fde6 --- /dev/null +++ b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.txt @@ -0,0 +1,21 @@ + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1 CPU Affinity NUMA Affinity +GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 PXB PXB 48-63,176-191 3 +GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 PXB PXB 48-63,176-191 3 +GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 SYS SYS 16-31,144-159 1 +GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 SYS SYS 16-31,144-159 1 +GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 SYS SYS 112-127,240-255 7 +GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 SYS SYS 112-127,240-255 7 +GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 SYS SYS 80-95,208-223 5 +GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X SYS SYS 80-95,208-223 5 +NIC0 PXB PXB SYS SYS SYS SYS SYS SYS X PIX +NIC1 PXB PXB SYS SYS SYS SYS SYS SYS PIX X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well 
as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks diff --git a/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.txt b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.txt new file mode 100644 index 00000000..aa0ba290 --- /dev/null +++ b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.txt @@ -0,0 +1,19 @@ + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1 NIC2 NIC3 NIC4 NIC5 NIC6 NIC7 NIC8 NIC9 NIC10 NIC11 CPU Affinity NUMA Affinity GPU NUMA ID +GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 PXB PXB SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS 48-63,176-191 3 N/A +GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 PXB PXB SYS SYS SYS SYS SYS SYS SYS SYS SYS SYS 48-63,176-191 3 N/A +GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 SYS SYS PXB PXB SYS SYS SYS SYS SYS SYS SYS SYS 16-31,144-159 1 N/A +GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 SYS SYS PXB PXB SYS SYS SYS SYS SYS SYS SYS SYS 16-31,144-159 1 N/A +GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 SYS SYS SYS SYS SYS SYS PXB PXB SYS SYS SYS SYS 112-127,240-255 7 N/A +GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 SYS SYS SYS SYS SYS SYS PXB PXB SYS SYS SYS SYS 112-127,240-255 7 N/A +GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 SYS SYS SYS SYS SYS SYS SYS SYS PXB PXB SYS SYS 80-95,208-223 5 N/A +GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X SYS SYS SYS SYS SYS SYS SYS SYS PXB PXB SYS SYS 80-95,208-223 5 N/A + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge 
(typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks diff --git a/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.txt b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.txt new file mode 100644 index 00000000..7675dd1a --- /dev/null +++ b/dream-server/tests/fixtures/topology_matrix/nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.txt @@ -0,0 +1,23 @@ + GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 mlx5_0 mlx5_2 mlx5_1 mlx5_3 CPU Affinity +GPU0 X NV1 NV1 NV2 NV2 SYS SYS SYS PIX SYS PHB SYS 0-19,40-59 +GPU1 NV1 X NV2 NV1 SYS NV2 SYS SYS PIX SYS PHB SYS 0-19,40-59 +GPU2 NV1 NV2 X NV2 SYS SYS NV1 SYS PHB SYS PIX SYS 0-19,40-59 +GPU3 NV2 NV1 NV2 X SYS SYS SYS NV1 PHB SYS PIX SYS 0-19,40-59 +GPU4 NV2 SYS SYS SYS X NV1 NV1 NV2 SYS PIX SYS PHB 20-39,60-79 +GPU5 SYS NV2 SYS SYS NV1 X NV2 NV1 SYS PIX SYS PHB 20-39,60-79 +GPU6 SYS SYS NV1 SYS NV1 NV2 X NV2 SYS PHB SYS PIX 20-39,60-79 +GPU7 SYS SYS SYS NV1 NV2 NV1 NV2 X SYS PHB SYS PIX 20-39,60-79 +mlx5_0 PIX PIX PHB PHB SYS SYS SYS SYS X SYS PHB SYS +mlx5_2 SYS SYS SYS SYS PIX PIX PHB PHB SYS X SYS PHB +mlx5_1 PHB PHB PIX PIX SYS SYS SYS SYS PHB SYS X SYS +mlx5_3 SYS SYS SYS SYS PHB PHB PIX PIX SYS PHB SYS X + +Legend: + + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe switches (without traversing the PCIe Host Bridge) + PIX = Connection traversing a single PCIe switch + NV# = Connection traversing a bonded set of # NVLinks diff --git a/dream-server/tests/test-assign-gpus.py 
b/dream-server/tests/test-assign-gpus.py new file mode 100755 index 00000000..9b0f4f57 --- /dev/null +++ b/dream-server/tests/test-assign-gpus.py @@ -0,0 +1,561 @@ +import json +import os +import subprocess +import sys +import pytest + +SCRIPT = os.path.join(os.path.dirname(__file__), "../scripts/assign_gpus.py") +FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures/topology_json") + +def fixture_path(name): + return os.path.join(FIXTURES_DIR, name) + +def run(topology_path, model_size_mb): + result = subprocess.run( + [sys.executable, SCRIPT, "--topology", topology_path, "--model-size", str(model_size_mb)], + capture_output=True, text=True, + ) + output = None + if result.returncode == 0: + output = json.loads(result.stdout)["gpu_assignment"] + return result.returncode, output, result.stderr + +def all_assigned_uuids(output): + uuids = set() + for svc in output["services"].values(): + uuids.update(svc["gpus"]) + return uuids + +def llama(output): + return output["services"]["llama_server"] + +def parallelism(output): + return llama(output)["parallelism"] + + +# ── 1 GPU — single ──────────────────────────────────────────────────────────── + +class TestSingleGpu: + TOPO = fixture_path("nvidia_smi_topo_matrix_1gpu_pcie.json") + UUID = "GPU-12345678-1234-1234-1234-123456789012" + + def test_strategy_is_single(self): + _, out, _ = run(self.TOPO, 20000) + assert out["strategy"] == "single" + + def test_all_services_share_only_gpu(self): + _, out, _ = run(self.TOPO, 20000) + for svc in out["services"].values(): + assert svc["gpus"] == [self.UUID] + + def test_llama_mode_none(self): + _, out, _ = run(self.TOPO, 20000) + p = parallelism(out) + assert p["mode"] == "none" + assert p["tensor_parallel_size"] == 1 + assert p["pipeline_parallel_size"] == 1 + + def test_model_too_large_errors(self): + rc, _, stderr = run(self.TOPO, 30000) + assert rc == 1 + assert "exceeds" in stderr.lower() + + def test_model_exactly_fits(self): + rc, out, _ = run(self.TOPO, 24576) 
+ assert rc == 0 + assert out["strategy"] == "single" + + def test_no_topology_analysis_needed(self): + rc, out, _ = run(self.TOPO, 10000) + assert rc == 0 + + +# ── 2 GPU — rank-first means PHB pair always wins over single GPU ───────────── + +class TestTwoGpuColoc: + TOPO = fixture_path("nvidia_smi_topo_matrix_2gpus_phb_coloc.json") + GPU0 = "GPU-00000000-0000-0000-0000-000000000000" + GPU1 = "GPU-11111111-1111-1111-1111-111111111111" + + def test_model_fits_one_gpu_rank_first_takes_pair(self): + # rank-first: PHB pair rank=30 beats single rank=0, + # so llama always gets both GPUs when there are only 2 + _, out, _ = run(self.TOPO, 20000) + assert set(llama(out)["gpus"]) == {self.GPU0, self.GPU1} + + def test_model_fits_one_gpu_strategy_colocated(self): + # remaining=0 after llama takes both → colocated + _, out, _ = run(self.TOPO, 20000) + assert out["strategy"] == "colocated" + + def test_model_fits_one_gpu_services_share_last(self): + _, out, _ = run(self.TOPO, 20000) + for name in ("whisper", "comfyui", "embeddings"): + assert out["services"][name]["gpus"] == [self.GPU1] + + def test_model_fits_one_gpu_pipeline(self): + # PHB rank=30, n=2 → pipeline + _, out, _ = run(self.TOPO, 20000) + p = parallelism(out) + assert p["mode"] == "pipeline" + assert p["tensor_parallel_size"] == 1 + assert p["pipeline_parallel_size"] == 2 + + def test_model_needs_both_gpus_strategy_colocated(self): + _, out, _ = run(self.TOPO, 30000) + assert out["strategy"] == "colocated" + + def test_model_needs_both_gpus_llama_gets_both(self): + _, out, _ = run(self.TOPO, 30000) + assert set(llama(out)["gpus"]) == {self.GPU0, self.GPU1} + + def test_model_needs_both_gpus_services_share_llamas_last(self): + _, out, _ = run(self.TOPO, 30000) + for name in ("whisper", "comfyui", "embeddings"): + assert out["services"][name]["gpus"] == [self.GPU1] + + def test_model_needs_both_gpus_llama_pipeline(self): + _, out, _ = run(self.TOPO, 30000) + p = parallelism(out) + assert p["mode"] == "pipeline" + 
assert p["tensor_parallel_size"] == 1 + assert p["pipeline_parallel_size"] == 2 + + def test_no_gpu_idle(self): + for model_size in (20000, 30000): + _, out, _ = run(self.TOPO, model_size) + assert all_assigned_uuids(out) == {self.GPU0, self.GPU1} + + +# ── 4 GPU — SOC / cross-NUMA PCIe ──────────────────────────────────────────── + +class TestFourGpuSoc: + """4x A100-80GB. GPUs 0-1 and 2-3 are PHB pairs rank=30, cross pairs SOC rank=10.""" + TOPO = fixture_path("nvidia_smi_topo_matrix_4gpus_soc.json") + UUIDS = [ + "GPU-00000000-0000-0000-0000-000000000000", + "GPU-11111111-1111-1111-1111-111111111111", + "GPU-22222222-2222-2222-2222-222222222222", + "GPU-33333333-3333-3333-3333-333333333333", + ] + + def test_model_fits_one_gpu_picks_phb_pair(self): + # rank-first: PHB pair rank=30 beats single rank=0 + _, out, _ = run(self.TOPO, 70000) + llama_uuids = set(llama(out)["gpus"]) + phb_pair_a = {self.UUIDS[0], self.UUIDS[1]} + phb_pair_b = {self.UUIDS[2], self.UUIDS[3]} + assert llama_uuids in (phb_pair_a, phb_pair_b) + + def test_model_fits_one_gpu_colocated(self): + # remaining=2 after PHB pair → colocated + _, out, _ = run(self.TOPO, 70000) + assert out["strategy"] == "colocated" + + def test_model_fits_one_gpu_pipeline(self): + # PHB rank=30, n=2 → pipeline + _, out, _ = run(self.TOPO, 70000) + p = parallelism(out) + assert p["mode"] == "pipeline" + assert p["pipeline_parallel_size"] == 2 + + def test_model_fits_one_gpu_no_gpu_idle(self): + _, out, _ = run(self.TOPO, 70000) + assert all_assigned_uuids(out) == set(self.UUIDS) + + def test_model_needs_two_gpus_colocated(self): + _, out, _ = run(self.TOPO, 100000) + assert out["strategy"] == "colocated" + + def test_model_needs_two_gpus_picks_phb_pair(self): + _, out, _ = run(self.TOPO, 100000) + llama_uuids = set(llama(out)["gpus"]) + phb_pair_a = {self.UUIDS[0], self.UUIDS[1]} + phb_pair_b = {self.UUIDS[2], self.UUIDS[3]} + assert llama_uuids in (phb_pair_a, phb_pair_b) + + def 
test_model_needs_two_gpus_pipeline(self): + _, out, _ = run(self.TOPO, 100000) + p = parallelism(out) + assert p["mode"] == "pipeline" + assert p["pipeline_parallel_size"] == 2 + + def test_model_needs_three_gpus_colocated(self): + _, out, _ = run(self.TOPO, 200000) + assert out["strategy"] == "colocated" + + def test_model_needs_three_gpus_pipeline_cross_numa(self): + _, out, _ = run(self.TOPO, 200000) + p = parallelism(out) + assert p["mode"] == "pipeline" + assert p["pipeline_parallel_size"] == 3 + + def test_model_too_large_errors(self): + rc, _, stderr = run(self.TOPO, 400000) + assert rc == 1 + assert "exceeds" in stderr.lower() + + +# ── 4 GPU — SYS-separated NVLink pairs ─────────────────────────────────────── + +class TestFourGpuSysNvPairs: + """4x A100-80GB. GPU 0-1 NVLink rank=100, GPU 2-3 NODE rank=20, cross SYS rank=10.""" + TOPO = fixture_path("nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.json") + UUIDS = [ + "GPU-00000000-0000-0000-0000-000000000000", + "GPU-11111111-1111-1111-1111-111111111111", + "GPU-22222222-2222-2222-2222-222222222222", + "GPU-33333333-3333-3333-3333-333333333333", + ] + + def test_model_fits_one_gpu_picks_nvlink_pair(self): + # rank-first: NVLink pair rank=100 always wins + _, out, _ = run(self.TOPO, 70000) + assert set(llama(out)["gpus"]) == {self.UUIDS[0], self.UUIDS[1]} + + def test_model_fits_one_gpu_colocated(self): + # remaining=2 → colocated + _, out, _ = run(self.TOPO, 70000) + assert out["strategy"] == "colocated" + + def test_model_fits_one_gpu_tensor(self): + # NVLink rank=100, n=2 → tensor + _, out, _ = run(self.TOPO, 70000) + p = parallelism(out) + assert p["mode"] == "tensor" + assert p["tensor_parallel_size"] == 2 + assert p["pipeline_parallel_size"] == 1 + assert p["gpu_memory_utilization"] == 0.92 + + def test_model_needs_two_gpus_picks_nvlink_pair(self): + _, out, _ = run(self.TOPO, 100000) + assert set(llama(out)["gpus"]) == {self.UUIDS[0], self.UUIDS[1]} + + def test_model_needs_two_gpus_tensor(self): 
+ _, out, _ = run(self.TOPO, 100000) + p = parallelism(out) + assert p["mode"] == "tensor" + assert p["tensor_parallel_size"] == 2 + assert p["pipeline_parallel_size"] == 1 + assert p["gpu_memory_utilization"] == 0.92 + + def test_model_needs_two_gpus_colocated(self): + _, out, _ = run(self.TOPO, 100000) + assert out["strategy"] == "colocated" + + def test_model_needs_three_gpus_cross_numa_pipeline(self): + _, out, _ = run(self.TOPO, 200000) + p = parallelism(out) + assert p["mode"] == "pipeline" + assert p["pipeline_parallel_size"] == 3 + + def test_no_gpu_idle(self): + for model_size in (70000, 100000, 200000): + _, out, _ = run(self.TOPO, model_size) + assert all_assigned_uuids(out) == set(self.UUIDS) + + +# ── 5 GPU — NV12 pair + 3 unlinked ─────────────────────────────────────────── + +class TestFiveGpuNv12WithMlx5: + """5x A100-80GB. Only GPU 0-1 NV12 rank=100. All others rank=0.""" + TOPO = fixture_path("nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.json") + UUIDS = [ + "GPU-00000000-0000-0000-0000-000000000000", + "GPU-11111111-1111-1111-1111-111111111111", + "GPU-22222222-2222-2222-2222-222222222222", + "GPU-33333333-3333-3333-3333-333333333333", + "GPU-44444444-4444-4444-4444-444444444444", + ] + + def test_model_fits_one_gpu_picks_nvlink_pair(self): + # rank-first: NVLink pair rank=100 always wins + _, out, _ = run(self.TOPO, 70000) + assert set(llama(out)["gpus"]) == {self.UUIDS[0], self.UUIDS[1]} + + def test_model_fits_one_gpu_dedicated(self): + # remaining=3 exactly → dedicated, no extras back to llama + _, out, _ = run(self.TOPO, 70000) + assert out["strategy"] == "dedicated" + + def test_model_fits_one_gpu_llama_stays_2gpus(self): + # remaining=3 → services each get 1, no extras push back + _, out, _ = run(self.TOPO, 70000) + assert len(llama(out)["gpus"]) == 2 + + def test_model_fits_one_gpu_tensor(self): + # NVLink rank=100, n=2 → tensor (no extra GPU degrading to pipeline) + _, out, _ = run(self.TOPO, 70000) + p = parallelism(out) + assert 
p["mode"] == "tensor" + assert p["tensor_parallel_size"] == 2 + assert p["pipeline_parallel_size"] == 1 + + def test_model_fits_one_gpu_services_get_dedicated_gpus(self): + _, out, _ = run(self.TOPO, 70000) + svcs = out["services"] + for name in ("whisper", "comfyui", "embeddings"): + assert len(svcs[name]["gpus"]) == 1 + service_uuids = [svcs[n]["gpus"][0] for n in ("whisper", "comfyui", "embeddings")] + assert len(set(service_uuids)) == 3 + + def test_model_needs_nvlink_pair_tensor(self): + _, out, _ = run(self.TOPO, 100000) + assert set(llama(out)["gpus"]) == {self.UUIDS[0], self.UUIDS[1]} + p = parallelism(out) + assert p["mode"] == "tensor" + assert p["tensor_parallel_size"] == 2 + assert p["pipeline_parallel_size"] == 1 + + def test_model_needs_nvlink_pair_dedicated(self): + _, out, _ = run(self.TOPO, 100000) + assert out["strategy"] == "dedicated" + + def test_model_needs_nvlink_pair_no_extras_back(self): + _, out, _ = run(self.TOPO, 100000) + assert len(llama(out)["gpus"]) == 2 + + def test_no_gpu_idle(self): + for model_size in (70000, 100000): + _, out, _ = run(self.TOPO, model_size) + assert all_assigned_uuids(out) == set(self.UUIDS) + + +# ── 8 GPU — NV1/NV2 partial mesh ───────────────────────────────────────────── + +class TestEightGpuPartialMesh: + """8x V100-32GB. 
NV1=rank 0, NV2=rank 80, SYS=rank 10.""" + TOPO = fixture_path("nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.json") + ALL_UUIDS = {f"GPU-{str(i)*8}-{str(i)*4}-{str(i)*4}-{str(i)*4}-{str(i)*12}" for i in range(8)} + NV2_PAIRS = [ + {"GPU-00000000-0000-0000-0000-000000000000", "GPU-33333333-3333-3333-3333-333333333333"}, + {"GPU-00000000-0000-0000-0000-000000000000", "GPU-44444444-4444-4444-4444-444444444444"}, + {"GPU-11111111-1111-1111-1111-111111111111", "GPU-22222222-2222-2222-2222-222222222222"}, + {"GPU-11111111-1111-1111-1111-111111111111", "GPU-55555555-5555-5555-5555-555555555555"}, + {"GPU-22222222-2222-2222-2222-222222222222", "GPU-33333333-3333-3333-3333-333333333333"}, + {"GPU-44444444-4444-4444-4444-444444444444", "GPU-77777777-7777-7777-7777-777777777777"}, + {"GPU-55555555-5555-5555-5555-555555555555", "GPU-66666666-6666-6666-6666-666666666666"}, + {"GPU-66666666-6666-6666-6666-666666666666", "GPU-77777777-7777-7777-7777-777777777777"}, + ] + + def test_model_fits_one_gpu_dedicated(self): + # remaining=6 → dedicated (3 to services, 3 extras back to llama) + _, out, _ = run(self.TOPO, 20000) + assert out["strategy"] == "dedicated" + + def test_model_fits_one_gpu_picks_nv2_pair(self): + _, out, _ = run(self.TOPO, 20000) + initial_pair = set(llama(out)["gpus"][:2]) + assert any(initial_pair == p for p in self.NV2_PAIRS) + + def test_model_fits_one_gpu_extras_back_to_llama(self): + # remaining=6: services get 3, extras 3 → llama total=5 + _, out, _ = run(self.TOPO, 20000) + assert len(llama(out)["gpus"]) == 5 + + def test_model_fits_one_gpu_pipeline(self): + # extras degrade min_rank → pipeline + _, out, _ = run(self.TOPO, 20000) + assert parallelism(out)["mode"] == "pipeline" + + def test_model_needs_nv2_pair_picks_nv2_pair(self): + _, out, _ = run(self.TOPO, 50000) + initial_pair = set(llama(out)["gpus"][:2]) + assert any(initial_pair == p for p in self.NV2_PAIRS) + + def test_model_needs_nv2_pair_extras_make_pipeline(self): + _, out, _ = 
run(self.TOPO, 50000) + assert parallelism(out)["mode"] == "pipeline" + + def test_no_gpu_idle(self): + for model_size in (20000, 50000): + _, out, _ = run(self.TOPO, model_size) + assert all_assigned_uuids(out) == self.ALL_UUIDS + + def test_model_too_large_errors(self): + rc, _, stderr = run(self.TOPO, 300000) + assert rc == 1 + assert "exceeds" in stderr.lower() + + +# ── 8 GPU — NV12 full mesh ──────────────────────────────────────────────────── + +class TestEightGpuNv12FullMesh: + """8x A100-80GB. All pairs NV12 rank=100.""" + TOPO = fixture_path("nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json") + ALL_UUIDS = {f"GPU-{str(i)*8}-{str(i)*4}-{str(i)*4}-{str(i)*4}-{str(i)*12}" for i in range(8)} + + def test_model_fits_one_gpu_dedicated(self): + _, out, _ = run(self.TOPO, 70000) + assert out["strategy"] == "dedicated" + + def test_model_fits_one_gpu_services_get_dedicated_gpus(self): + _, out, _ = run(self.TOPO, 70000) + svcs = out["services"] + uuids = [svcs[n]["gpus"][0] for n in ("whisper", "comfyui", "embeddings")] + assert len(set(uuids)) == 3 + + def test_model_fits_one_gpu_extras_back_to_llama_nvlink(self): + # NVLink pair wins, remaining=6: 3 to services, 3 extras → llama=5 GPUs, hybrid + _, out, _ = run(self.TOPO, 70000) + p = parallelism(out) + assert p["mode"] == "hybrid" + assert p["gpu_memory_utilization"] == 0.93 + + def test_model_fits_one_gpu_llama_5gpus(self): + _, out, _ = run(self.TOPO, 70000) + assert len(llama(out)["gpus"]) == 5 + + def test_model_needs_two_gpus_extras_back_to_llama(self): + _, out, _ = run(self.TOPO, 100000) + assert len(llama(out)["gpus"]) == 5 + + def test_model_needs_two_gpus_hybrid_nvlink(self): + _, out, _ = run(self.TOPO, 100000) + p = parallelism(out) + assert p["mode"] == "hybrid" + assert p["tensor_parallel_size"] == 2 + assert p["pipeline_parallel_size"] == 2 + assert p["gpu_memory_utilization"] == 0.93 + + def test_model_needs_five_gpus_no_extras(self): + # 350GB needs 5 GPUs. 
remaining=3 exactly → no extras → llama has 5 GPUs + _, out, _ = run(self.TOPO, 350000) + assert len(llama(out)["gpus"]) == 5 + assert out["strategy"] == "dedicated" + + def test_model_needs_five_gpus_hybrid(self): + _, out, _ = run(self.TOPO, 350000) + p = parallelism(out) + assert p["mode"] == "hybrid" + assert p["tensor_parallel_size"] == 2 + assert p["pipeline_parallel_size"] == 2 + + def test_model_needs_five_gpus_services_dedicated(self): + _, out, _ = run(self.TOPO, 350000) + svcs = out["services"] + uuids = [svcs[n]["gpus"][0] for n in ("whisper", "comfyui", "embeddings")] + assert len(set(uuids)) == 3 + + def test_no_gpu_idle(self): + for model_size in (70000, 100000, 350000): + _, out, _ = run(self.TOPO, model_size) + assert all_assigned_uuids(out) == self.ALL_UUIDS + + def test_model_too_large_errors(self): + rc, _, stderr = run(self.TOPO, 700000) + assert rc == 1 + assert "exceeds" in stderr.lower() + + +# ── 8 GPU — NV12 full mesh with NUMA annotation ─────────────────────────────── + +class TestEightGpuNv12FullMeshWithNuma: + """NUMA annotation should not affect results.""" + TOPO_WITH_NUMA = fixture_path("nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.json") + TOPO_WITHOUT_NUMA = fixture_path("nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json") + + def test_numa_annotation_does_not_affect_strategy(self): + _, out_numa, _ = run(self.TOPO_WITH_NUMA, 100000) + _, out_no_numa, _ = run(self.TOPO_WITHOUT_NUMA, 100000) + assert out_numa["strategy"] == out_no_numa["strategy"] + + def test_numa_annotation_does_not_affect_parallelism_mode(self): + _, out_numa, _ = run(self.TOPO_WITH_NUMA, 100000) + _, out_no_numa, _ = run(self.TOPO_WITHOUT_NUMA, 100000) + assert parallelism(out_numa)["mode"] == parallelism(out_no_numa)["mode"] + + def test_numa_annotation_does_not_affect_llama_gpu_count(self): + _, out_numa, _ = run(self.TOPO_WITH_NUMA, 100000) + _, out_no_numa, _ = run(self.TOPO_WITHOUT_NUMA, 100000) + assert len(llama(out_numa)["gpus"]) == 
len(llama(out_no_numa)["gpus"]) + + +# ── Output schema ───────────────────────────────────────────────────────────── + +class TestOutputSchema: + TOPO = fixture_path("nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json") + + def test_version_field_present(self): + _, out, _ = run(self.TOPO, 100000) + assert out["version"] == "1.0" + + def test_strategy_field_present(self): + _, out, _ = run(self.TOPO, 100000) + assert out["strategy"] in ("single", "dedicated", "colocated", "user-defined") + + def test_all_four_services_present(self): + _, out, _ = run(self.TOPO, 100000) + for svc in ("llama_server", "whisper", "comfyui", "embeddings"): + assert svc in out["services"] + + def test_gpus_always_a_list(self): + _, out, _ = run(self.TOPO, 100000) + for svc in out["services"].values(): + assert isinstance(svc["gpus"], list) + assert len(svc["gpus"]) >= 1 + + def test_non_llama_services_have_no_parallelism_block(self): + _, out, _ = run(self.TOPO, 100000) + for name in ("whisper", "comfyui", "embeddings"): + assert "parallelism" not in out["services"][name] + + def test_llama_always_has_parallelism_block(self): + _, out, _ = run(self.TOPO, 100000) + p = parallelism(out) + assert "mode" in p + assert "tensor_parallel_size" in p + assert "pipeline_parallel_size" in p + assert "gpu_memory_utilization" in p + + def test_tensor_split_absent_when_homogeneous(self): + _, out, _ = run(self.TOPO, 100000) + assert "tensor_split" not in parallelism(out) + + def test_gpu_uuids_are_strings(self): + _, out, _ = run(self.TOPO, 100000) + for svc in out["services"].values(): + for uuid in svc["gpus"]: + assert isinstance(uuid, str) + assert uuid.startswith("GPU-") + + +# ── Parallelism mode selection ──────────────────────────────────────────────── + +class TestParallelismModeSelection: + + def test_nvlink_two_gpus_tensor(self): + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.json"), 100000) + assert parallelism(out)["mode"] == "tensor" + + def 
test_pcie_phb_two_gpus_pipeline(self): + # PHB rank=30 → pipeline + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_4gpus_soc.json"), 100000) + assert parallelism(out)["mode"] == "pipeline" + + def test_cross_numa_three_gpus_pipeline(self): + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_4gpus_soc.json"), 200000) + p = parallelism(out) + assert p["mode"] == "pipeline" + assert p["pipeline_parallel_size"] == 3 + + def test_nvlink_full_mesh_five_gpus_hybrid(self): + # NV12 full mesh, extras push back → 5 GPUs → hybrid + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json"), 100000) + assert parallelism(out)["mode"] == "hybrid" + + def test_mem_util_none_is_095(self): + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_1gpu_pcie.json"), 20000) + assert parallelism(out)["gpu_memory_utilization"] == 0.95 + + def test_mem_util_tensor_is_092(self): + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.json"), 100000) + assert parallelism(out)["gpu_memory_utilization"] == 0.92 + + def test_mem_util_hybrid_is_093(self): + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.json"), 100000) + assert parallelism(out)["gpu_memory_utilization"] == 0.93 + + def test_mem_util_pipeline_is_095(self): + _, out, _ = run(fixture_path("nvidia_smi_topo_matrix_4gpus_soc.json"), 100000) + assert parallelism(out)["gpu_memory_utilization"] == 0.95 \ No newline at end of file diff --git a/dream-server/tests/test-nvidia-topo.sh b/dream-server/tests/test-nvidia-topo.sh new file mode 100755 index 00000000..3de6a43b --- /dev/null +++ b/dream-server/tests/test-nvidia-topo.sh @@ -0,0 +1,307 @@ +#!/usr/bin/env bash +# ============================================================================ +# Dream Server — NVIDIA Topology Detection Test +# ============================================================================ +# Part of: tests/ +# Purpose: Test NVIDIA topology detection against fixture files +# 
+# Usage: ./test-nvidia-topo.sh
+# ============================================================================
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+FIXTURES_DIR="$SCRIPT_DIR/fixtures/topology_matrix"
+TOPO_SCRIPT="$SCRIPT_DIR/../installers/lib/nvidia-topo.sh"
+
+# Provides the color constants (RED/GRN/BLU/MAG/NC) used below.
+source "$SCRIPT_DIR/../installers/lib/constants.sh"
+
+# Counters
+TESTS_RUN=0
+TESTS_PASSED=0
+TESTS_FAILED=0
+
+# Check dependencies
+if ! command -v jq &>/dev/null; then
+    echo -e "${RED}ERROR: jq is required but not installed${NC}"
+    exit 1
+fi
+
+# Each test below installs a shell-function mock named `nvidia-smi` (shell
+# functions shadow the real binary for the sourced detector), sources the
+# topology script, and asserts on the JSON emitted by detect_nvidia_topo.
+# NOTE: declaration and command substitution are kept separate (SC2155) so a
+# failing detector is reported as a test FAIL instead of being masked.
+
+# Test fixture: nvidia_smi_topo_matrix_1gpu_pcie.txt
+# Single consumer GPU on PCIe: expect 1 GPU and an empty links array.
+test_1gpu_pcie() {
+    echo -e "${BLU}Testing: nvidia_smi_topo_matrix_1gpu_pcie.txt${NC}"
+    TESTS_RUN=$((TESTS_RUN + 1))
+
+    nvidia-smi() {
+        if [[ "$1" == "topo" && "$2" == "-m" ]]; then
+            cat "$FIXTURES_DIR/nvidia_smi_topo_matrix_1gpu_pcie.txt"
+        elif [[ "$*" == "--query-gpu=index,name,memory.total,pcie.link.gen.current,pcie.link.width.current,uuid --format=csv,noheader,nounits" ]]; then
+            echo "0, NVIDIA RTX 4090, 24564, 4, 16, GPU-12345678-1234-1234-1234-123456789012"
+        elif [[ "$*" == "--query-gpu=driver_version --format=csv,noheader" ]]; then
+            echo "535.129.03"
+        elif [[ "$1" == "-q" ]]; then
+            echo "MIG Mode: Disabled"
+        fi
+    }
+
+    source "$TOPO_SCRIPT"
+    local result
+    if ! result=$(detect_nvidia_topo); then
+        echo -e "${RED}✗ FAIL: detect_nvidia_topo exited non-zero${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+        return
+    fi
+
+    # Assertions
+    local gpu_count links_count
+    gpu_count=$(echo "$result" | jq -r '.gpu_count')
+    links_count=$(echo "$result" | jq -r '.links | length')
+
+    if [[ "$gpu_count" == "1" ]] && [[ "$links_count" == "0" ]]; then
+        echo -e "${GRN}✓ PASS: 1 GPU, 0 links${NC}"
+        TESTS_PASSED=$((TESTS_PASSED + 1))
+    else
+        echo -e "${RED}✗ FAIL: Expected 1 GPU and 0 links, got $gpu_count GPUs and $links_count links${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+    fi
+}
+
+# Test fixture: nvidia_smi_topo_matrix_4gpus_soc.txt
+# Four GPUs connected through the SoC fabric: expect SOC link entries.
+test_4gpus_soc() {
+    echo -e "${BLU}Testing: nvidia_smi_topo_matrix_4gpus_soc.txt${NC}"
+    TESTS_RUN=$((TESTS_RUN + 1))
+
+    nvidia-smi() {
+        if [[ "$1" == "topo" && "$2" == "-m" ]]; then
+            cat "$FIXTURES_DIR/nvidia_smi_topo_matrix_4gpus_soc.txt"
+        elif [[ "$*" == "--query-gpu=index,name,memory.total,pcie.link.gen.current,pcie.link.width.current,uuid --format=csv,noheader,nounits" ]]; then
+            echo "0, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-00000000-0000-0000-0000-000000000000"
+            echo "1, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-11111111-1111-1111-1111-111111111111"
+            echo "2, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-22222222-2222-2222-2222-222222222222"
+            echo "3, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-33333333-3333-3333-3333-333333333333"
+        elif [[ "$*" == "--query-gpu=driver_version --format=csv,noheader" ]]; then
+            echo "535.129.03"
+        elif [[ "$1" == "-q" ]]; then
+            echo "MIG Mode: Disabled"
+        fi
+    }
+
+    source "$TOPO_SCRIPT"
+    local result
+    if ! result=$(detect_nvidia_topo); then
+        echo -e "${RED}✗ FAIL: detect_nvidia_topo exited non-zero${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+        return
+    fi
+
+    # Assertions
+    local gpu_count links_count has_soc
+    gpu_count=$(echo "$result" | jq -r '.gpu_count')
+    links_count=$(echo "$result" | jq -r '.links | length')
+    has_soc=$(echo "$result" | jq -r '.links[] | select(.link_type == "SOC") | .link_type' | head -1)
+
+    if [[ "$gpu_count" == "4" ]] && [[ "$links_count" -gt "0" ]] && [[ "$has_soc" == "SOC" ]]; then
+        echo -e "${GRN}✓ PASS: 4 GPUs, $links_count links, SOC topology detected${NC}"
+        TESTS_PASSED=$((TESTS_PASSED + 1))
+    else
+        echo -e "${RED}✗ FAIL: Expected 4 GPUs with SOC links, got $gpu_count GPUs, $links_count links${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+    fi
+}
+
+# Test fixture: nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.txt
+# Two NVLink pairs separated across the system bus: expect NV* link entries.
+test_4gpus_sys_separated_nv_pairs() {
+    echo -e "${BLU}Testing: nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.txt${NC}"
+    TESTS_RUN=$((TESTS_RUN + 1))
+
+    nvidia-smi() {
+        if [[ "$1" == "topo" && "$2" == "-m" ]]; then
+            cat "$FIXTURES_DIR/nvidia_smi_topo_matrix_4gpus_sys_separated_nv_pairs.txt"
+        elif [[ "$*" == "--query-gpu=index,name,memory.total,pcie.link.gen.current,pcie.link.width.current,uuid --format=csv,noheader,nounits" ]]; then
+            echo "0, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-00000000-0000-0000-0000-000000000000"
+            echo "1, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-11111111-1111-1111-1111-111111111111"
+            echo "2, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-22222222-2222-2222-2222-222222222222"
+            echo "3, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-33333333-3333-3333-3333-333333333333"
+        elif [[ "$*" == "--query-gpu=driver_version --format=csv,noheader" ]]; then
+            echo "535.129.03"
+        elif [[ "$1" == "-q" ]]; then
+            echo "MIG Mode: Disabled"
+        fi
+    }
+
+    source "$TOPO_SCRIPT"
+    local result
+    if ! result=$(detect_nvidia_topo); then
+        echo -e "${RED}✗ FAIL: detect_nvidia_topo exited non-zero${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+        return
+    fi
+
+    # Assertions
+    local gpu_count nvlink_count
+    gpu_count=$(echo "$result" | jq -r '.gpu_count')
+    nvlink_count=$(echo "$result" | jq -r '[.links[] | select(.link_type | startswith("NV"))] | length')
+
+    if [[ "$gpu_count" == "4" ]] && [[ "$nvlink_count" -gt "0" ]]; then
+        echo -e "${GRN}✓ PASS: 4 GPUs, $nvlink_count NVLink connections${NC}"
+        TESTS_PASSED=$((TESTS_PASSED + 1))
+    else
+        echo -e "${RED}✗ FAIL: Expected 4 GPUs with NVLink, got $gpu_count GPUs, $nvlink_count NVLinks${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+    fi
+}
+
+# Test fixture: nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.txt
+# Five GPUs with NV12 links plus mlx5 NICs in the matrix: NICs must be ignored.
+test_5gpus_nv12_with_mlx5() {
+    echo -e "${BLU}Testing: nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.txt${NC}"
+    TESTS_RUN=$((TESTS_RUN + 1))
+
+    nvidia-smi() {
+        if [[ "$1" == "topo" && "$2" == "-m" ]]; then
+            cat "$FIXTURES_DIR/nvidia_smi_topo_matrix_5gpus_nv12_with_mlx5.txt"
+        elif [[ "$*" == "--query-gpu=index,name,memory.total,pcie.link.gen.current,pcie.link.width.current,uuid --format=csv,noheader,nounits" ]]; then
+            echo "0, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-00000000-0000-0000-0000-000000000000"
+            echo "1, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-11111111-1111-1111-1111-111111111111"
+            echo "2, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-22222222-2222-2222-2222-222222222222"
+            echo "3, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-33333333-3333-3333-3333-333333333333"
+            echo "4, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-44444444-4444-4444-4444-444444444444"
+        elif [[ "$*" == "--query-gpu=driver_version --format=csv,noheader" ]]; then
+            echo "535.129.03"
+        elif [[ "$1" == "-q" ]]; then
+            echo "MIG Mode: Disabled"
+        fi
+    }
+
+    source "$TOPO_SCRIPT"
+    local result
+    if ! result=$(detect_nvidia_topo); then
+        echo -e "${RED}✗ FAIL: detect_nvidia_topo exited non-zero${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+        return
+    fi
+
+    # Assertions
+    local gpu_count nv12_count
+    gpu_count=$(echo "$result" | jq -r '.gpu_count')
+    nv12_count=$(echo "$result" | jq -r '[.links[] | select(.link_type == "NV12")] | length')
+
+    if [[ "$gpu_count" == "5" ]] && [[ "$nv12_count" -gt "0" ]]; then
+        echo -e "${GRN}✓ PASS: 5 GPUs, $nv12_count NV12 connections${NC}"
+        TESTS_PASSED=$((TESTS_PASSED + 1))
+    else
+        echo -e "${RED}✗ FAIL: Expected 5 GPUs with NV12, got $gpu_count GPUs, $nv12_count NV12 links${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+    fi
+}
+
+# Test fixture: nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.txt
+# Eight GPUs in a full NV12 mesh: 8*7/2 = 28 pairwise links expected.
+test_8gpus_nv12_full_mesh() {
+    echo -e "${BLU}Testing: nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.txt${NC}"
+    TESTS_RUN=$((TESTS_RUN + 1))
+
+    nvidia-smi() {
+        if [[ "$1" == "topo" && "$2" == "-m" ]]; then
+            cat "$FIXTURES_DIR/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh.txt"
+        elif [[ "$*" == "--query-gpu=index,name,memory.total,pcie.link.gen.current,pcie.link.width.current,uuid --format=csv,noheader,nounits" ]]; then
+            # Synthesize one distinct, well-formed UUID per GPU index.
+            for i in {0..7}; do
+                echo "$i, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-${i}${i}${i}${i}${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}"
+            done
+        elif [[ "$*" == "--query-gpu=driver_version --format=csv,noheader" ]]; then
+            echo "535.129.03"
+        elif [[ "$1" == "-q" ]]; then
+            echo "MIG Mode: Disabled"
+        fi
+    }
+
+    source "$TOPO_SCRIPT"
+    local result
+    if ! result=$(detect_nvidia_topo); then
+        echo -e "${RED}✗ FAIL: detect_nvidia_topo exited non-zero${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+        return
+    fi
+
+    # Assertions
+    local gpu_count nv12_count
+    gpu_count=$(echo "$result" | jq -r '.gpu_count')
+    nv12_count=$(echo "$result" | jq -r '[.links[] | select(.link_type == "NV12")] | length')
+
+    # Full mesh of 8 GPUs should have 28 links (8*7/2)
+    if [[ "$gpu_count" == "8" ]] && [[ "$nv12_count" -gt "20" ]]; then
+        echo -e "${GRN}✓ PASS: 8 GPUs, $nv12_count NV12 connections (full mesh)${NC}"
+        TESTS_PASSED=$((TESTS_PASSED + 1))
+    else
+        echo -e "${RED}✗ FAIL: Expected 8 GPUs with full mesh NV12, got $gpu_count GPUs, $nv12_count NV12 links${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+    fi
+}
+
+# Test fixture: nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.txt
+# Same full mesh, but the matrix carries NUMA IDs and numactl reports 2 nodes.
+test_8gpus_nv12_full_mesh_with_numa() {
+    echo -e "${BLU}Testing: nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.txt${NC}"
+    TESTS_RUN=$((TESTS_RUN + 1))
+
+    nvidia-smi() {
+        if [[ "$1" == "topo" && "$2" == "-m" ]]; then
+            cat "$FIXTURES_DIR/nvidia_smi_topo_matrix_8gpus_nv12_full_mesh_with_numa_id.txt"
+        elif [[ "$*" == "--query-gpu=index,name,memory.total,pcie.link.gen.current,pcie.link.width.current,uuid --format=csv,noheader,nounits" ]]; then
+            for i in {0..7}; do
+                echo "$i, NVIDIA A100-SXM4-80GB, 81920, 4, 16, GPU-${i}${i}${i}${i}${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}"
+            done
+        elif [[ "$*" == "--query-gpu=driver_version --format=csv,noheader" ]]; then
+            echo "535.129.03"
+        elif [[ "$1" == "-q" ]]; then
+            echo "MIG Mode: Disabled"
+        fi
+    }
+
+    # Mock a 2-node NUMA layout for this test only.
+    numactl() {
+        if [[ "$1" == "--hardware" ]]; then
+            echo "node 0 cpus: 0 1 2 3"
+            echo "node 1 cpus: 4 5 6 7"
+        fi
+    }
+
+    source "$TOPO_SCRIPT"
+    local result rc=0
+    result=$(detect_nvidia_topo) || rc=$?
+    # Bash functions are global: drop the numactl mock immediately so it
+    # cannot leak into later tests and skew their NUMA detection.
+    unset -f numactl
+    if [[ "$rc" -ne 0 ]]; then
+        echo -e "${RED}✗ FAIL: detect_nvidia_topo exited non-zero${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+        return
+    fi
+
+    # Assertions
+    local gpu_count numa_nodes nv12_count
+    gpu_count=$(echo "$result" | jq -r '.gpu_count')
+    numa_nodes=$(echo "$result" | jq -r '.numa.nodes')
+    nv12_count=$(echo "$result" | jq -r '[.links[] | select(.link_type == "NV12")] | length')
+
+    if [[ "$gpu_count" == "8" ]] && [[ "$numa_nodes" == "2" ]] && [[ "$nv12_count" -gt "20" ]]; then
+        echo -e "${GRN}✓ PASS: 8 GPUs, 2 NUMA nodes, $nv12_count NV12 connections${NC}"
+        TESTS_PASSED=$((TESTS_PASSED + 1))
+    else
+        echo -e "${RED}✗ FAIL: Expected 8 GPUs, 2 NUMA nodes with NV12, got $gpu_count GPUs, $numa_nodes NUMA nodes, $nv12_count NV12 links${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+    fi
+}
+
+# Test fixture: nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.txt
+# Eight V100s with a partial NV1/NV2 mesh: expect some NV* links.
+test_8gpus_nv1_nv2_partial_mesh() {
+    echo -e "${BLU}Testing: nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.txt${NC}"
+    TESTS_RUN=$((TESTS_RUN + 1))
+
+    nvidia-smi() {
+        if [[ "$1" == "topo" && "$2" == "-m" ]]; then
+            cat "$FIXTURES_DIR/nvidia_smi_topo_matrix_8gpus_nv1_nv2_partial_mesh.txt"
+        elif [[ "$*" == "--query-gpu=index,name,memory.total,pcie.link.gen.current,pcie.link.width.current,uuid --format=csv,noheader,nounits" ]]; then
+            for i in {0..7}; do
+                echo "$i, NVIDIA V100-SXM2-32GB, 32768, 3, 16, GPU-${i}${i}${i}${i}${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}-${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}${i}"
+            done
+        elif [[ "$*" == "--query-gpu=driver_version --format=csv,noheader" ]]; then
+            echo "535.129.03"
+        elif [[ "$1" == "-q" ]]; then
+            echo "MIG Mode: Disabled"
+        fi
+    }
+
+    source "$TOPO_SCRIPT"
+    local result
+    if ! result=$(detect_nvidia_topo); then
+        echo -e "${RED}✗ FAIL: detect_nvidia_topo exited non-zero${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+        return
+    fi
+
+    # Assertions
+    local gpu_count nvlink_count
+    gpu_count=$(echo "$result" | jq -r '.gpu_count')
+    nvlink_count=$(echo "$result" | jq -r '[.links[] | select(.link_type | startswith("NV"))] | length')
+
+    if [[ "$gpu_count" == "8" ]] && [[ "$nvlink_count" -gt "0" ]]; then
+        echo -e "${GRN}✓ PASS: 8 GPUs, $nvlink_count NVLink connections (partial mesh)${NC}"
+        TESTS_PASSED=$((TESTS_PASSED + 1))
+    else
+        echo -e "${RED}✗ FAIL: Expected 8 GPUs with NVLink, got $gpu_count GPUs, $nvlink_count NVLink connections${NC}"
+        TESTS_FAILED=$((TESTS_FAILED + 1))
+    fi
+}
+
+# Main test runner
+echo -e "${MAG}=== NVIDIA Topology Detection Tests ===${NC}\n"
+
+test_1gpu_pcie
+test_4gpus_soc
+test_4gpus_sys_separated_nv_pairs
+test_5gpus_nv12_with_mlx5
+test_8gpus_nv12_full_mesh
+test_8gpus_nv12_full_mesh_with_numa
+test_8gpus_nv1_nv2_partial_mesh
+
+echo -e "\n${MAG}=== Test Summary ===${NC}"
+echo -e "Tests run: $TESTS_RUN"
+echo -e "${GRN}Tests passed: $TESTS_PASSED${NC}"
+if [[ $TESTS_FAILED -gt 0 ]]; then
+    echo -e "${RED}Tests failed: $TESTS_FAILED${NC}"
+    exit 1
+else
+    echo -e "${GRN}All tests passed!${NC}"
+    exit 0
+fi