kubernetes-sigs · davidbreitgand · Nov 13, 2025 · Nov 25, 2025 · Nov 25, 2025 · Nov 30, 2025
diff --git a/config/manifests/bbr-example/httproute_bbr_lora.yaml b/config/manifests/bbr-example/httproute_bbr_lora.yaml
@@ -0,0 +1,91 @@
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-llama-route  # sends requests for the base Llama model to its pool
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:  # path and headers of one match entry must both be satisfied
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        name: X-Gateway-Model-Name
+        value: 'meta-llama/Llama-3.1-8B-Instruct'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-deepseek-route  # Give this HTTPRoute any name that helps you group and track the matchers.
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-deepseek-r1
+    matches:  # a request satisfying any one entry below goes to the pool above
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        name: X-Gateway-Model-Name
+        value: 'deepseek/vllm-deepseek-r1'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
-          #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
-          #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
+        name: X-Gateway-Model-Name
+        value: 'food-review'
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
-          #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
-          #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
+        name: X-Gateway-Model-Name
+        value: 'movie-critique'
+    timeouts:
+      request: 300s
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: vllm-llama3-8b-instruct-lora-food-review-1  # Give this HTTPRoute any name that helps you group and track the routes.
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.k8s.io
+      kind: InferencePool
+      name: vllm-llama3-8b-instruct
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+      headers:
+      - type: Exact
+        # Body-Based Routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) copies the model name from the request body into this header.
-          #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
-          #Body-Based routing(https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
+        name: X-Gateway-Model-Name
+        value: 'food-review-1'  # This is the name of the LoRA as defined in the vLLM deployment.
+    timeouts:
+      request: 300s
diff --git a/config/manifests/vllm/sim-deployment-1.yaml b/config/manifests/vllm/sim-deployment-1.yaml
@@ -0,0 +1,44 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-deepseek-r1
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-deepseek-r1  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: vllm-deepseek-r1
+    spec:
+      containers:
+      - name: vllm-sim
+        image: ghcr.io/llm-d/llm-d-inference-sim:v0.4.0
+        imagePullPolicy: Always
+        args:
+        - --model
+        - deepseek/vllm-deepseek-r1
+        - --port
+        - "8000"  # quoted so the argument stays a string; same number as containerPort
+        - --max-loras
+        - "2"
+        - --lora-modules  # one JSON string per LoRA module follows
+        - '{"name": "food-review"}'
+        - '{"name": "movie-critique"}'
+        env:
+        - name: POD_NAME  # filled from the pod's own metadata.name
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: NAMESPACE  # filled from the pod's own metadata.namespace
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        ports:
+        - containerPort: 8000
+          name: http
+          protocol: TCP
+        resources:
+          requests:
+            cpu: 10m