Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

router = APIRouter(tags=["updates"])

_VALID_ACTIONS = {"check", "backup", "update"}

_GITHUB_HEADERS = {"Accept": "application/vnd.github.v3+json"}


Expand Down
24 changes: 12 additions & 12 deletions dream-server/scripts/assign_gpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
from typing import Optional


# Constants
# Constants

HIGH_BW_THRESHOLD = 80 # min rank for NVLink / XGMI
DEFAULT_SERVICES = ["llama_server", "whisper", "comfyui", "embeddings"]
NON_LLAMA = ["whisper", "comfyui", "embeddings"]


# Data Models
# Data Models

@dataclass
class GPU:
Expand Down Expand Up @@ -69,7 +69,7 @@ class AssignmentResult:
services: dict


# Phase 1: Topology Analysis
# Phase 1: Topology Analysis

def parse_gpus(topology: dict) -> list:
gpus = []
Expand Down Expand Up @@ -156,7 +156,7 @@ def enumerate_subsets(gpus: list, rank_matrix: dict) -> list:
)


# Phase 2: GPU Assignment
# Phase 2: GPU Assignment

def find_llama_subset(ordered_subsets: list, model_size_mb: float) -> Subset:
"""
Expand Down Expand Up @@ -257,7 +257,7 @@ def assign_services(all_gpus: list, llama_gpus: list, rank_matrix: dict, enabled
return assignments, final_llama_gpus, strategy


# Phase 3: Llama Parallelism
# Phase 3: Llama Parallelism

def largest_pow2_divisor(n: int) -> int:
"""
Expand Down Expand Up @@ -369,7 +369,7 @@ def select_parallelism(subset: Subset) -> LlamaParallelism:
)


# Phase 4: Build Output JSON
# Phase 4: Build Output JSON

def build_output(result: AssignmentResult) -> dict:
services = {}
Expand Down Expand Up @@ -400,7 +400,7 @@ def build_output(result: AssignmentResult) -> dict:
}


# Entry Point
# Entry Point

def main():
parser = argparse.ArgumentParser(description="GPU assignment algorithm for DreamServer")
Expand Down Expand Up @@ -429,7 +429,7 @@ def main():
print("ERROR: no GPUs found in topology", file=sys.stderr)
sys.exit(1)

# Early exit: single GPU
# Early exit: single GPU
if gpu_count == 1:
gpu = parse_gpus(topology)[0]
if model_size_mb > gpu.memory_mb:
Expand All @@ -453,13 +453,13 @@ def main():
print(json.dumps(build_output(result), indent=2))
return

# Phase 1: Topology analysis
# Phase 1: Topology analysis
gpus = parse_gpus(topology)
links = parse_links(topology)
rank_matrix = build_rank_matrix(links)
ordered = enumerate_subsets(gpus, rank_matrix)

# Phase 2: GPU assignment
# Phase 2: GPU assignment
try:
llama_subset = find_llama_subset(ordered, model_size_mb)
if llama_subset is None:
Expand All @@ -472,12 +472,12 @@ def main():
gpus, llama_subset.gpus, rank_matrix, enabled_services
)

# Phase 3: Llama parallelism
# Phase 3: Llama parallelism
final_subset = compute_subset(final_llama_gpus, rank_matrix)
parallelism = select_parallelism(final_subset)
service_assignments["llama_server"].parallelism = parallelism

# Phase 4: Emit JSON
# Phase 4: Emit JSON
result = AssignmentResult(strategy=strategy, services=service_assignments)
print(json.dumps(build_output(result), indent=2))

Expand Down
28 changes: 14 additions & 14 deletions dream-server/tests/bats-tests/tier-map.bats
Original file line number Diff line number Diff line change
Expand Up @@ -20,29 +20,29 @@ setup() {

# ── resolve_tier_config ─────────────────────────────────────────────────────

@test "resolve_tier_config: tier 1 sets Entry Level with qwen3-8b" {
@test "resolve_tier_config: tier 1 sets Entry Level with qwen3.5-9b" {
TIER=1
resolve_tier_config
assert_equal "$TIER_NAME" "Entry Level"
assert_equal "$LLM_MODEL" "qwen3-8b"
assert_equal "$GGUF_FILE" "Qwen3-8B-Q4_K_M.gguf"
assert_equal "$LLM_MODEL" "qwen3.5-9b"
assert_equal "$GGUF_FILE" "Qwen3.5-9B-Q4_K_M.gguf"
assert_equal "$MAX_CONTEXT" "16384"
}

@test "resolve_tier_config: tier 2 sets Prosumer with qwen3-8b" {
@test "resolve_tier_config: tier 2 sets Prosumer with qwen3.5-9b" {
TIER=2
resolve_tier_config
assert_equal "$TIER_NAME" "Prosumer"
assert_equal "$LLM_MODEL" "qwen3-8b"
assert_equal "$LLM_MODEL" "qwen3.5-9b"
assert_equal "$MAX_CONTEXT" "32768"
}

@test "resolve_tier_config: tier 3 sets Pro with qwen3-14b" {
@test "resolve_tier_config: tier 3 sets Pro with qwen3.5-27b" {
TIER=3
resolve_tier_config
assert_equal "$TIER_NAME" "Pro"
assert_equal "$LLM_MODEL" "qwen3-14b"
assert_equal "$GGUF_FILE" "Qwen3-14B-Q4_K_M.gguf"
assert_equal "$LLM_MODEL" "qwen3.5-27b"
assert_equal "$GGUF_FILE" "Qwen3.5-27B-Q4_K_M.gguf"
assert_equal "$MAX_CONTEXT" "32768"
}

Expand Down Expand Up @@ -101,27 +101,27 @@ setup() {

@test "tier_to_model: maps all numeric tiers correctly" {
run tier_to_model 1
assert_output "qwen3-8b"
assert_output "qwen3.5-9b"

run tier_to_model 2
assert_output "qwen3-8b"
assert_output "qwen3.5-9b"

run tier_to_model 3
assert_output "qwen3-14b"
assert_output "qwen3.5-27b"

run tier_to_model 4
assert_output "qwen3-30b-a3b"
}

@test "tier_to_model: maps T-prefix aliases correctly" {
run tier_to_model T1
assert_output "qwen3-8b"
assert_output "qwen3.5-9b"

run tier_to_model T2
assert_output "qwen3-8b"
assert_output "qwen3.5-9b"

run tier_to_model T3
assert_output "qwen3-14b"
assert_output "qwen3.5-27b"

run tier_to_model T4
assert_output "qwen3-30b-a3b"
Expand Down
2 changes: 1 addition & 1 deletion dream-server/tests/test-assign-gpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,4 +557,4 @@ def test_mem_util_hybrid_is_093(self):

def test_mem_util_pipeline_is_095(self):
_, out, _ = run(fixture_path("nvidia_smi_topo_matrix_4gpus_soc.json"), 100000)
assert parallelism(out)["gpu_memory_utilization"] == 0.95
assert parallelism(out)["gpu_memory_utilization"] == 0.95
Loading