Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion dream-server/extensions/services/dashboard-api/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
# poll cycle and prevents file-descriptor exhaustion.

_aio_session: Optional[aiohttp.ClientSession] = None
_HEALTH_TIMEOUT = aiohttp.ClientTimeout(total=5)
_HEALTH_TIMEOUT = aiohttp.ClientTimeout(total=30)


async def _get_aio_session() -> aiohttp.ClientSession:
Expand Down Expand Up @@ -167,6 +167,25 @@ async def get_llama_context_size(model_hint: Optional[str] = None) -> Optional[i
return None


# --- Service Health Cache ---
# Written by background poll loop in main.py, read by API endpoints.
# Keeps health checking decoupled from request handling so slow DNS
# lookups (Docker Desktop) never block API responses.

_services_cache: Optional[list] = None # list[ServiceStatus], set by poll loop


def set_services_cache(statuses: list) -> None:
    """Record the most recent health-check results.

    Called by the background poll loop in main.py; API handlers read the
    value back via get_cached_services().
    """
    global _services_cache
    _services_cache = statuses


def get_cached_services() -> Optional[list]:
    """Return the last poll's results, or None until the first poll completes."""
    return _services_cache


# --- Service Health ---

async def check_service_health(service_id: str, config: dict) -> ServiceStatus:
Expand Down Expand Up @@ -218,6 +237,8 @@ async def _check_host_service_health(service_id: str, config: dict) -> ServiceSt
async with session.get(url) as resp:
response_time = (asyncio.get_event_loop().time() - start) * 1000
status = "healthy" if resp.status < 400 else "unhealthy"
except asyncio.TimeoutError:
status = "down"
except aiohttp.ClientConnectorError:
status = "down"
except (aiohttp.ClientError, OSError) as e:
Expand Down
42 changes: 37 additions & 5 deletions dream-server/extensions/services/dashboard-api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from security import verify_api_key
from gpu import get_gpu_info
from helpers import (
get_all_services,
get_all_services, get_cached_services, set_services_cache,
get_disk_usage, get_model_info, get_bootstrap_status,
get_uptime, get_cpu_metrics, get_ram_metrics,
get_llama_metrics, get_loaded_model, get_llama_context_size,
Expand Down Expand Up @@ -74,6 +74,7 @@ def set(self, key: str, value: object, ttl: float):
_GPU_CACHE_TTL = 3.0
_STATUS_CACHE_TTL = 2.0
_STORAGE_CACHE_TTL = 30.0
_SERVICE_POLL_INTERVAL = 10.0 # background health check interval

# --- Router imports ---
from routers import workflows, features, setup, updates, agents, privacy
Expand Down Expand Up @@ -243,7 +244,10 @@ async def gpu(api_key: str = Depends(verify_api_key)):

@app.get("/services", response_model=list[ServiceStatus])
async def services(api_key: str = Depends(verify_api_key)):
    """Get all service health statuses (from background poll cache).

    Delegates to _get_services() so the cache-then-live-check fallback
    logic lives in exactly one place instead of being duplicated here.
    """
    return await _get_services()


Expand All @@ -266,7 +270,7 @@ async def bootstrap(api_key: str = Depends(verify_api_key)):
async def status(api_key: str = Depends(verify_api_key)):
"""Get full system status. Runs sync helpers in thread pool concurrently."""
service_statuses, gpu_info, disk_info, model_info, bootstrap_info, uptime = await asyncio.gather(
get_all_services(),
_get_services(),
asyncio.to_thread(get_gpu_info),
asyncio.to_thread(get_disk_usage),
asyncio.to_thread(get_model_info),
Expand Down Expand Up @@ -324,7 +328,7 @@ async def _build_api_status() -> dict:
asyncio.to_thread(get_uptime),
asyncio.to_thread(get_cpu_metrics),
asyncio.to_thread(get_ram_metrics),
get_all_services(),
_get_services(),
get_loaded_model(),
)

Expand Down Expand Up @@ -483,12 +487,40 @@ def dir_size_gb(path: Path) -> float:
return result


# --- Service Health Polling ---

async def _get_services() -> list[ServiceStatus]:
    """Return cached service health, falling back to live check.

    The cache is populated by the background poll task; before the first
    poll completes it is None, in which case we run the check inline.
    """
    statuses = get_cached_services()
    return statuses if statuses is not None else await get_all_services()


async def _poll_service_health():
    """Background task: refresh service health on a fixed timer.

    Stores each round of results via set_services_cache() so API
    endpoints read the cache instead of running live checks; the poll
    itself has no callers waiting on it and may take as long as needed.
    """
    # Give services a moment to come up before the first check.
    await asyncio.sleep(2)
    while True:
        try:
            set_services_cache(await get_all_services())
        except Exception:
            # Keep polling even if one round fails; log with traceback.
            logger.exception("Service health poll failed")
        await asyncio.sleep(_SERVICE_POLL_INTERVAL)


# --- Startup ---

# Strong references to background tasks: asyncio.create_task() only keeps a
# weak reference, so a task with no other reference can be garbage-collected
# before it finishes (see the note in the asyncio create_task docs).
_background_tasks: set = set()


@app.on_event("startup")
async def startup_event():
    """Start background tasks (metrics collection and service health polling)."""
    for coro in (collect_metrics(), _poll_service_health()):
        task = asyncio.create_task(coro)
        _background_tasks.add(task)
        # Drop the reference once the task completes so the set doesn't grow.
        task.add_done_callback(_background_tasks.discard)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ def calculate_feature_status(feature: dict, services: list, gpu_info: Optional[G
async def api_features(api_key: str = Depends(verify_api_key)):
"""Get feature discovery data."""
import asyncio
from helpers import get_all_services
gpu_info, service_list = await asyncio.gather(
asyncio.to_thread(get_gpu_info),
get_all_services(),
)
from helpers import get_all_services, get_cached_services
service_list = get_cached_services()
if service_list is None:
service_list = await get_all_services()
gpu_info = await asyncio.to_thread(get_gpu_info)

feature_statuses = [calculate_feature_status(f, service_list, gpu_info) for f in FEATURES]
feature_statuses.sort(key=lambda x: x["priority"])
Expand Down
Loading