Merged
8 changes: 4 additions & 4 deletions ci/gpu_ci_run_skyrl_train.sh
@@ -32,7 +32,7 @@ uv run --directory . --isolated --extra dev --extra fsdp pytest -s tests/backend

# Run tests for new inference layer
# TODO (sumanthrh): Migrate the remaining tests: test_verifiers_generator.py , test_pause_and_continue_generation.py
-_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp --extra vllm-router pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py
-_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp --extra vllm-router pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_engine_generation.py
-_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp --extra vllm-router pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_skyrl_gym_generator.py
-_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp --extra vllm-router pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_lora.py
+_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_policy_local_engines_e2e.py
+_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_engine_generation.py
+_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_skyrl_gym_generator.py
+_SKYRL_USE_NEW_INFERENCE=1 uv run --isolated --extra dev --extra fsdp pytest -s tests/backends/skyrl_train/gpu/gpu_ci/test_lora.py
7 changes: 3 additions & 4 deletions pyproject.toml
@@ -44,10 +44,6 @@ tinker = [
"psycopg2-binary",
]

-vllm-router = [
-"vllm-router",
-]
-
aws = [
"cloudpathlib[s3]",
]
@@ -98,13 +94,15 @@ skyrl-train = [
"s3fs",
"fastapi",
"uvicorn",
+"vllm-router; sys_platform == 'linux'",
"pybind11",
"setuptools",
]

fsdp = [
"skyrl[skyrl-train]",
"vllm==0.18.0; sys_platform == 'linux'",
+"vllm-router; sys_platform == 'linux'",
# TODO (aaron): Once PyTorch 2.10 is officially supported (stable PyPI torch + matching
# flash-attn wheels), drop the custom wheel URL
"flash-attn @ https://github.com/lesj0610/flash-attention/releases/download/v2.8.3-cu12-torch2.10-cp312/flash_attn-2.8.3%2Bcu12torch2.10cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; sys_platform == 'linux' and python_version == '3.12' and platform_machine == 'x86_64'",
@@ -123,6 +121,7 @@ megatron = [
"flash-attn @ https://github.com/lesj0610/flash-attention/releases/download/v2.8.3-cu12-torch2.10-cp312/flash_attn-2.8.3%2Bcu12torch2.10cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; sys_platform == 'linux' and python_version == '3.12' and platform_machine == 'x86_64'",
"flash-attn==2.8.3; sys_platform == 'linux' and (python_version != '3.12' or platform_machine != 'x86_64')",
"vllm==0.18.0; sys_platform == 'linux'",
+"vllm-router; sys_platform == 'linux'",
"torch==2.10.0; sys_platform == 'linux'",
"flashinfer-python==0.6.6; sys_platform == 'linux' and platform_machine == 'x86_64'",
"torchvision; sys_platform == 'linux'",
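The `; sys_platform == 'linux'` suffix on the new `vllm-router` entries is a PEP 508 environment marker: the dependency is only installed on Linux. As a minimal sketch of how such markers evaluate (using the third-party `packaging` library, which is an assumption here and not shown as a dependency in this diff):

```python
from packaging.markers import Marker

# PEP 508 environment marker, as used on the vllm-router entries above.
marker = Marker("sys_platform == 'linux'")

# Evaluating against an explicit environment (instead of the running
# interpreter) makes the result deterministic.
print(marker.evaluate({"sys_platform": "linux"}))   # installed on Linux
print(marker.evaluate({"sys_platform": "darwin"}))  # skipped elsewhere
```

Installers such as pip and uv perform this evaluation at resolve time, which is why no `--extra vllm-router` flag is needed on non-Linux platforms.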
@@ -11,7 +11,7 @@

1. Data Plane (routed through proxy_url):
- generate, chat_completion, completion, tokenize, detokenize, render
-- Uses proxy_url which points to a router (vllm-router, sglang-router, InferenceRouter)
+- Uses proxy_url which points to a router (VLLMRouter or external)
- Router handles load balancing and session-aware routing

2. Control Plane (fan-out to all server_urls):
Expand Down Expand Up @@ -118,7 +118,7 @@ class RemoteInferenceClient:
- server_urls: List of backend URLs for control plane operations (fan-out)

The router (proxy_url) is expected to be a data-plane-only router (like
-vllm-router, sglang-router, or InferenceRouter). Control plane operations
+VLLMRouter or an external router). Control plane operations
are always fanned out to all backends directly by this client.

Usage:
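The docstring above describes a two-plane design: data-plane requests go through a single router endpoint, while control-plane operations bypass the router and hit every backend. A minimal sketch of that split (all names here are hypothetical, not the actual `RemoteInferenceClient` implementation):

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable

class FanOutClient:
    """Hypothetical sketch of the data-plane / control-plane split."""

    def __init__(self, proxy_url: str, server_urls: list[str]):
        self.proxy_url = proxy_url      # data plane: one router endpoint
        self.server_urls = server_urls  # control plane: every backend

    def data_plane(self, call: Callable[[str], Any]) -> Any:
        # Data-plane requests (generate, tokenize, ...) go through the
        # router, which handles load balancing and session-aware routing.
        return call(self.proxy_url)

    def control_plane(self, call: Callable[[str], Any]) -> list[Any]:
        # Control-plane operations are fanned out to all backends
        # concurrently, bypassing the router entirely.
        with ThreadPoolExecutor(max_workers=len(self.server_urls)) as pool:
            return list(pool.map(call, self.server_urls))
```

The key design point is that the router never needs to understand control-plane endpoints: it can stay a simple data-plane proxy, which is what makes swapping `InferenceRouter` for the external `vllm-router` binary feasible.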
255 changes: 0 additions & 255 deletions skyrl/backends/skyrl_train/inference_servers/router.py

This file was deleted.

@@ -2,7 +2,7 @@
VLLMRouter - Subprocess wrapper for vllm-router (data plane only).
Spawns the vllm-router binary as a subprocess with consistent_hash policy,
-providing the same interface as InferenceRouter. Requires ``pip install vllm-router``.
+providing session-aware routing via consistent hashing.
"""

import logging
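The `VLLMRouter` docstring describes spawning the `vllm-router` binary as a subprocess with a consistent-hash policy. A rough sketch of that pattern is below; the CLI flag names are placeholders, not the binary's documented interface, and `build_router_command`/`spawn_router` are illustrative names rather than the module's real API:

```python
import subprocess

def build_router_command(port: int, backend_urls: list[str]) -> list[str]:
    # Placeholder flags: vllm-router's real CLI is not shown in this diff.
    return [
        "vllm-router",
        "--port", str(port),
        "--routing-policy", "consistent_hash",  # session-aware routing
        "--backends", ",".join(backend_urls),
    ]

def spawn_router(port: int, backend_urls: list[str]) -> subprocess.Popen:
    # Data plane only: the subprocess proxies generation traffic, while
    # control-plane calls still go directly to each backend URL.
    return subprocess.Popen(build_router_command(port, backend_urls))
```

Consistent hashing keeps requests for the same session pinned to the same backend, which preserves prefix-cache locality without the wrapper having to track sessions itself.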