2 changes: 1 addition & 1 deletion deploy/inference-gateway/README.md
@@ -53,7 +53,7 @@ b. Install the Inference Extension CRDs (Inference Model and Inference Pool CRDs

```bash
INFERENCE_EXTENSION_VERSION=v0.5.1
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$INFERENCE_EXTENSION_VERSION/manifests.yaml -n my-model
kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$INFERENCE_EXTENSION_VERSION/manifests.yaml
```
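
(Optional) To confirm the CRDs registered, check for them by name; these are the same CRD names that the recipes' `gaie_checks.sh` preflight script verifies:

```bash
kubectl get crd inferencemodels.inference.networking.x-k8s.io \
  inferencepools.inference.networking.x-k8s.io
```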

c. Install `kgateway` CRDs and kgateway.
31 changes: 22 additions & 9 deletions recipes/README.md
@@ -1,12 +1,12 @@
# Dynamo model serving recipes

| Model family | Backend | Mode | GPU | Deployment | Benchmark |
|---------------|---------|---------------------|-------|------------|-----------|
| llama-3-70b | vllm | agg | H100, H200 | ✓ | |
| llama-3-70b | vllm | disagg-multi-node | H100, H200 | ✓ | |
| llama-3-70b | vllm | disagg-single-node | H100, H200 | ✓ | |
| DeepSeek-R1 | sglang | disaggregated | H200 | ✓ | 🚧 |
| oss-gpt | trtllm | aggregated | GB200 | | |
| Model family | Backend | Mode | Deployment | Benchmark | GAIE-integration |
|---------------|---------|---------------------|------------|-----------|------------------|
| llama-3-70b | vllm | agg | | ✓ | |
| llama-3-70b | vllm | disagg-multi-node | | ✓ | |
| llama-3-70b | vllm | disagg-single-node | | ✓ | |
| oss-gpt | trtllm | aggregated | ✓ | ✓ | |
| DeepSeek-R1 | sglang | disaggregated | | 🚧 | |


## Prerequisites
@@ -32,8 +32,7 @@ Update the `hf-token-secret.yaml` file with your HuggingFace token.
kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE}
```

6. (Optional) Create a shared model cache pvc to store the model weights.
Choose a storage class to create the model cache pvc. You'll need to use this storage class name to update the `storageClass` field in the model-cache/model-cache.yaml file.
6. Choose a storage class for the model cache PVC and use its name to update the `storageClass` field in the model-cache/model-cache.yaml file. (Optional) Create the shared model cache PVC that stores the model weights yourself; if you don't, the script below creates it for you. If you created it manually, pass `--skip-model-cache` to the script.

```bash
kubectl get storageclass
@@ -70,6 +69,20 @@ Example:
./run.sh --model llama-3-70b --framework vllm --deployment-type agg
```

## Deploying with the Gateway API Inference Extension (GAIE)

1. Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE.

2. Apply the GAIE manifests by running the deployment script.

```bash
# Match the block size to the CLI value in your deployment file deploy.yaml: - "python3 -m dynamo.vllm ... --block-size 128"
export DYNAMO_KV_BLOCK_SIZE=128
export EPP_IMAGE=nvcr.io/you/epp:tag
# Add the --gaie argument to the script, e.g.:
./run.sh --model llama-3-70b --framework vllm --gaie agg
```
The script will perform gateway checks and apply the manifests.
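
3. (Optional) Sanity-check the route end to end once the script finishes. The sketch below is illustrative, not part of the recipe: it assumes kgateway exposes a Service named `inference-gateway` on port 80 in `${NAMESPACE}`, and that your deployment serves the OpenAI-compatible `/v1/chat/completions` endpoint (the same path the EPP config points at). Adjust the service name, port, and model name to your setup.

```bash
# Forward the gateway listener locally; the service name and port are assumptions -- check `kubectl get svc -n ${NAMESPACE}`
kubectl -n ${NAMESPACE} port-forward svc/inference-gateway 8080:80 &

# Send a test request through the gateway; replace <served-model-name> with the model your deployment serves
curl -s http://localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "<served-model-name>", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 16}'
```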

## Dry run mode

87 changes: 87 additions & 0 deletions recipes/gaie_checks.sh
@@ -0,0 +1,87 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -Eeuo pipefail

# Preflight checks for deploying a recipe behind the Gateway API Inference Extension (GAIE).
# Usage: NAMESPACE=<deploy-namespace> [KGW_NS=<kgateway-namespace>] ./gaie_checks.sh

# ===== Config (env overridable) =====
: "${NAMESPACE:=dynamo}"
KGW_NS="${KGW_NS:-kgateway-system}"

ok() { printf "✅ %s\n" "$*"; }
fail(){ printf "❌ %s\n" "$*" >&2; exit 1; }
info(){ printf "ℹ️ %s\n" "$*"; }

need() { command -v "$1" >/dev/null 2>&1 || fail "'$1' is required"; }

# ===== Pre-flight checks =====
need kubectl
need helm

# ===== Namespace ensure =====
if ! kubectl get ns "$NAMESPACE" >/dev/null 2>&1; then
  kubectl create namespace "$NAMESPACE"
fi

GATEWAY_CRDS=(
  gateways.gateway.networking.k8s.io
  gatewayclasses.gateway.networking.k8s.io
  httproutes.gateway.networking.k8s.io
  referencegrants.gateway.networking.k8s.io
)
info "Checking Gateway API CRDs…"
for c in "${GATEWAY_CRDS[@]}"; do
  kubectl get crd "$c" >/dev/null 2>&1 || fail "Missing CRD: $c (run step a)"
  kubectl wait --for=condition=Established "crd/$c" --timeout=60s >/dev/null || fail "CRD not Established: $c"
done
ok "Gateway API CRDs present & Established"

GAIE_CRDS=(
  inferencemodels.inference.networking.x-k8s.io
  inferencepools.inference.networking.x-k8s.io
)

info "Checking GAIE (Inference Extension) CRDs…"
for c in "${GAIE_CRDS[@]}"; do
  kubectl get crd "$c" >/dev/null 2>&1 || fail "Missing CRD: $c (run step b install of inference extension)"
  kubectl wait --for=condition=Established "crd/$c" --timeout=60s >/dev/null || fail "CRD not Established: $c"
done
ok "GAIE CRDs present & Established"

info "Checking Kgateway controller in namespace '$KGW_NS'…"
# namespace must exist
kubectl get ns "$KGW_NS" >/dev/null 2>&1 || fail "Namespace '$KGW_NS' not found (run step c Helm installs)"

# pods should be running (a label selector that matches nothing still exits 0, so check the output instead)
PODS=$(kubectl get pods -n "$KGW_NS" -l app.kubernetes.io/name=kgateway -o name 2>/dev/null || true)
if [[ -z "${PODS:-}" ]]; then
  # fallback label (charts sometimes label differently)
  PODS=$(kubectl get pods -n "$KGW_NS" -o name | grep -E 'kgateway|gateway' || true)
  if [[ -z "${PODS:-}" ]]; then
    fail "Kgateway pods not found in '$KGW_NS'"
  fi
fi
for p in $PODS; do
  kubectl wait -n "$KGW_NS" --for=condition=Ready "$p" --timeout=180s >/dev/null || fail "Pod not Ready: $p"
done
ok "Kgateway controller pods Ready"

kubectl get gateway.gateway.networking.k8s.io inference-gateway -n "$NAMESPACE" >/dev/null 2>&1 || fail "Gateway 'inference-gateway' not found in $NAMESPACE (apply step d manifest)"

ok "GAIE is installed and the gateway is up in namespace '$NAMESPACE'."


@@ -0,0 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: No namespace is set here; pass -n <namespace> to kubectl apply (or add metadata.namespace)
apiVersion: v1
kind: ConfigMap
metadata:
  name: epp-config
  labels:
    app.kubernetes.io/name: dynamo-gaie
    app.kubernetes.io/instance: llama3-70b-agg
data:
  epp-config-dynamo.yaml: |
    apiVersion: inference.networking.x-k8s.io/v1alpha1
    kind: EndpointPickerConfig
    plugins:
      # Required: tells EPP which profile to use (even if you only have one)
      - type: single-profile-handler

      # Picker: chooses the final endpoint after scoring
      - name: picker
        type: max-score-picker
      - name: dyn-pre
        type: dynamo-inject-workerid
        parameters: {}
      - name: dyn-kv
        type: kv-aware-scorer
        parameters:
          frontendURL: http://127.0.0.1:8000/v1/chat/completions
          timeoutMS: 10000
    schedulingProfiles:
      - name: default
        plugins:
          - pluginRef: dyn-kv
            weight: 1
          - pluginRef: picker
109 changes: 109 additions & 0 deletions recipes/llama-3-70b/vllm/agg/gaie/k8s-manifests/epp/deployment.yaml
@@ -0,0 +1,109 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: Add a metadata.namespace field (or pass -n to kubectl apply) to match your deployment namespace
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama3-70b-agg-epp
  labels:
    app: llama3-70b-agg-epp
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama3-70b-agg-epp
  template:
    metadata:
      labels:
        app: llama3-70b-agg-epp
    spec:
      serviceAccountName: epp-sa
      terminationGracePeriodSeconds: 130

      imagePullSecrets:
        - name: docker-imagepullsecret

      containers:
        - name: epp
          image: "gitlab-master.nvidia.com:5005/dl/ai-dynamo/dynamo/epp-inference-extension-dynamo:etcdless-2"
          imagePullPolicy: IfNotPresent
          resources:
            requests:
              memory: "1Gi"
              cpu: "1"
            limits:
              memory: "2Gi"
              cpu: "2"
          command: ["/bin/sh", "-c"]
          args:
            - >
              exec /epp
              -poolName "llama3-70b-agg-pool"
              -poolNamespace "$POD_NAMESPACE"
              -v 4 --zap-encoder json
              -grpcPort 9002 -grpcHealthPort 9003
              -configFile /etc/epp/epp-config-dynamo.yaml

          volumeMounts:
            - name: epp-config
              mountPath: /etc/epp
              readOnly: true

          env:
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: PLATFORM_NAMESPACE
              value: "$(POD_NAMESPACE)" # set to your dynamo platform namespace if different
            - name: ETCD_ENDPOINTS
              value: "dynamo-platform-etcd.$(PLATFORM_NAMESPACE):2379"
            - name: NATS_SERVER
              value: "nats://dynamo-platform-nats.$(PLATFORM_NAMESPACE):4222"
            - name: DYN_NAMESPACE
              value: "llama3-70b-agg"
            - name: DYNAMO_COMPONENT
              value: "backend"
            - name: DYNAMO_KV_BLOCK_SIZE
              value: "128" # UPDATE to match the --block-size in your deploy.yaml engine command
            - name: USE_STREAMING
              value: "true"

          ports:
            - containerPort: 9002
            - containerPort: 9003
            - name: metrics
              containerPort: 9090
          livenessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            grpc:
              port: 9003
              service: inference-extension
            initialDelaySeconds: 5
            periodSeconds: 10

      volumes:
        - name: epp-config
          configMap:
            name: epp-config
            items:
              - key: epp-config-dynamo.yaml
                path: epp-config-dynamo.yaml
@@ -0,0 +1,39 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: No metadata.namespace is set here; pass -n <namespace> to kubectl apply (or add one)
# If the InferencePool lives in a different namespace, add backendRefs[].namespace to match it
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
  name: llama3-70b-agg-route
spec:
  parentRefs:
    - group: gateway.networking.k8s.io
      kind: Gateway
      name: inference-gateway
  rules:
    - backendRefs:
        - group: inference.networking.x-k8s.io
          kind: InferencePool
          name: llama3-70b-agg-pool
          port: 8000
          weight: 1
      matches:
        - path:
            type: PathPrefix
            value: /
      timeouts:
        request: 300s
@@ -0,0 +1,30 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# NOTE: Add a metadata.namespace field (or pass -n to kubectl apply) to match your deployment namespace
apiVersion: v1
kind: Service
metadata:
  name: llama3-70b-agg-epp
spec:
  selector:
    app: llama3-70b-agg-epp # must match the EPP Deployment's pod label
  ports:
    - protocol: TCP
      port: 9002
      targetPort: 9002
      appProtocol: http2
  type: ClusterIP
