diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index ed4617bd9c..56a7f60841 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -1,58 +1,58 @@ -name: 'Docker Build' -description: 'Build Dynamo container images' +name: "Docker Build" +description: "Build Dynamo container images" inputs: framework: - description: 'Framework to build' + description: "Framework to build" required: true - default: 'vllm' + default: "vllm" target: - description: 'Target to build' + description: "Target to build" required: false - default: 'runtime' + default: "runtime" platform: - description: 'Docker platform to build on, ie. linux/amd64' + description: "Docker platform to build on, ie. linux/amd64" required: false - default: 'linux/amd64' + default: "linux/amd64" image_tag: - description: 'Custom image tag (optional, defaults to framework:latest)' + description: "Custom image tag (optional, defaults to framework:latest)" required: false ngc_ci_access_token: - description: 'NGC CI Access Token' + description: "NGC CI Access Token" required: false ci_token: - description: 'CI Token' + description: "CI Token" required: false aws_default_region: - description: 'AWS Default Region' + description: "AWS Default Region" required: false sccache_s3_bucket: - description: 'SCCache S3 Bucket' + description: "SCCache S3 Bucket" required: false aws_account_id: - description: 'AWS Account ID' + description: "AWS Account ID" required: false aws_access_key_id: - description: 'AWS Access Key ID' + description: "AWS Access Key ID" required: false aws_secret_access_key: - description: 'AWS Secret Access Key' + description: "AWS Secret Access Key" required: false base_image_tag: - description: 'Optional override for base image tag passed to build.sh' + description: "Optional override for base image tag passed to build.sh" required: false runtime_image_tag: - description: 'Optional override for 
RUNTIME_IMAGE_TAG build-arg' + description: "Optional override for RUNTIME_IMAGE_TAG build-arg" required: false cuda_version: - description: 'Optional override for CUDA_VERSION build-arg' + description: "Optional override for CUDA_VERSION build-arg" required: false torch_backend: - description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)' + description: "Optional override for TORCH_BACKEND build-arg (e.g., cu129)" required: false outputs: image_tag: - description: 'Image Tag' + description: "Image Tag" value: ${{ steps.build.outputs.image_tag }} runs: @@ -65,9 +65,9 @@ runs: - name: Install awscli shell: bash run: | - curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install + curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" + unzip -q -o awscliv2.zip + sudo ./aws/install --update - name: Login to ECR shell: bash env: @@ -90,7 +90,7 @@ runs: env: GITHUB_TOKEN: ${{ inputs.ci_token }} AWS_DEFAULT_REGION: ${{ inputs.aws_default_region }} - SCCACHE_S3_BUCKET: ${{ inputs.sccache_s3_bucket }} + SCCACHE_S3_BUCKET: ${{ inputs.sccache_s3_bucket }} AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} PLATFORM: ${{ inputs.platform }} diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml new file mode 100644 index 0000000000..9fa684ff9f --- /dev/null +++ b/.github/workflows/gaie.yml @@ -0,0 +1,338 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +name: GAIE Deployment with Dynamo (vllm) + +on: + # workflow_dispatch: + push: + branches: + - dep-423-test + # pull_request: + # paths: + # - lib/bindings/c/src/** + # - lib/runtime/src/** + # - lib/llm/src/** + schedule: + # Run at 03:00 UTC on the 1st and 15th of each month (approx every 2 weeks) + - cron: "0 3 1,15 * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name || github.run_id }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + gaie: + name: Build Images, Deploy Dynamo, Install GAIE, create EPP image and make sure it works. + runs-on: cpu-amd-m5-2xlarge + env: + FRAMEWORK: vllm + PROFILE: agg + DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com + MODEL_NAME: Qwen/Qwen3-0.6B + OPERATOR_IMAGE_TAG: ${{ github.sha }}-operator-amd64 + VLLM_IMAGE_TAG: ${{ github.sha }}-vllm-amd64 + GATEWAY_API_VERSION: v1.3.0 + INFERENCE_EXTENSION_VERSION: v0.5.1 + KGATEWAY_VERSION: v2.0.3 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver: docker + + - name: Install tooling (curl, jq, yq, helm, kubectl) + shell: bash + run: | + set -euo pipefail + sudo apt-get update && sudo apt-get install -y curl bash openssl gettext git jq + curl -L https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o yq + sudo install -m 0755 yq /usr/local/bin/yq + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 + sudo chmod 700 get_helm.sh + sudo ./get_helm.sh + KUBECTL_VER=$(curl -L -s https://dl.k8s.io/release/stable.txt) + curl -LO "https://dl.k8s.io/release/${KUBECTL_VER}/bin/linux/amd64/kubectl" + sudo install -m 0755 kubectl /usr/local/bin/kubectl + + - name: Login to NGC + if: ${{ github.event_name == 'push' || github.event_name == 'schedule' }} + run: | + echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin 
+ + - name: Install awscli (for docker-build caching/login) + shell: bash + run: | + curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" + unzip -q -o awscliv2.zip + sudo ./aws/install --update + + - name: Login to ECR (for docker-build action compatibility) + shell: bash + env: + ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com + run: | + aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} + - name: Build Operator image (amd64) + id: build-operator + shell: bash + env: + ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com + run: | + set -euo pipefail + cd deploy/cloud/operator + docker buildx build --load \ + --platform linux/amd64 \ + --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \ + -f Dockerfile \ + -t dynamo-operator:latest . + + - name: Tag and Push Operator image to ACR + uses: ./.github/actions/docker-tag-push + with: + local_image: dynamo-operator:latest + push_tag: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ env.OPERATOR_IMAGE_TAG }} + aws_push: "false" + azure_push: "true" + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + - name: Build vllm runtime image (amd64) + id: build-vllm + uses: ./.github/actions/docker-build + with: + framework: vllm + target: runtime + platform: linux/amd64 + ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} + ci_token: ${{ secrets.CI_TOKEN }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }} + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + 
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Tag and Push vllm image to ACR + uses: ./.github/actions/docker-tag-push + with: + local_image: ${{ steps.build-vllm.outputs.image_tag }} + push_tag: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ env.VLLM_IMAGE_TAG }} + aws_push: "false" + azure_push: "true" + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + - name: Configure kubeconfig and namespace + shell: bash + run: | + set -euo pipefail + echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > "${{ github.workspace }}/.kubeconfig" + chmod 600 "${{ github.workspace }}/.kubeconfig" + # Persist KUBECONFIG for subsequent steps + echo "KUBECONFIG=${{ github.workspace }}/.kubeconfig" >> $GITHUB_ENV + # Also export for use within this step + export KUBECONFIG="${{ github.workspace }}/.kubeconfig" + # Create ephemeral namespace for Dynamo + PROFILE_SANITIZED="${PROFILE//_/-}" + echo "NAMESPACE=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED}" >> $GITHUB_ENV + echo "DEPLOYMENT_FILE=deploy/${PROFILE}.yaml" >> $GITHUB_ENV + kubectl delete namespace gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} || true + kubectl create namespace gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} + kubectl label namespaces gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} \ + nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true + kubectl config set-context --current --namespace=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} --kubeconfig "$KUBECONFIG" + + - name: Install Dynamo Operator Helm chart + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo 
pipefail + helm repo add bitnami https://charts.bitnami.com/bitnami + cd deploy/cloud/helm/platform/ + helm dep build . + helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \ + --set dynamo-operator.namespaceRestriction.enabled=true \ + --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \ + --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \ + --set dynamo-operator.controllerManager.manager.image.tag=${{ env.OPERATOR_IMAGE_TAG }} \ + --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret + timeout 300s kubectl rollout status deployment -n ${NAMESPACE} --watch + + - name: Configure image pull secrets and deploy vllm graph + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + export KUBE_NS=${NAMESPACE} + export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${VLLM_IMAGE_TAG}" + # Secrets + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n ${KUBE_NS} || true + kubectl create secret docker-registry docker-imagepullsecret \ + --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} \ + --docker-username=${{ secrets.AZURE_ACR_USER }} \ + --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} \ + --namespace=${KUBE_NS} || true + # Deploy vllm backend graph + cd $GITHUB_WORKSPACE/components/backends/${FRAMEWORK} + export GRAPH_NAME=$(yq e '.metadata.name' ${DEPLOYMENT_FILE}) + yq -i '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' ${DEPLOYMENT_FILE} + kubectl apply -n ${KUBE_NS} -f ${DEPLOYMENT_FILE} + sleep 20 + kubectl wait --for=condition=ready pod -l "nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME}" -n ${KUBE_NS} --timeout=1000s + + - name: Install GAIE CRDs and Kgateway + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + ${{ github.workspace 
}}/deploy/inference-gateway/install_gaie_crd_kgateway.sh + + - name: Clone GAIE repo into separate folder and set environment + shell: bash + run: | + set -euo pipefail + GAIE_CLONE_DIR="${{ github.workspace }}/external/gateway-api-inference-extension" + rm -rf "${GAIE_CLONE_DIR}" + mkdir -p "$(dirname "${GAIE_CLONE_DIR}")" + git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git "${GAIE_CLONE_DIR}" + cd "${GAIE_CLONE_DIR}" + git checkout ${INFERENCE_EXTENSION_VERSION} + echo "GAIE_DIR=${GAIE_CLONE_DIR}" >> $GITHUB_ENV + echo "DYNAMO_DIR=${{ github.workspace }}" >> $GITHUB_ENV + + - name: Run build-epp-dynamo.sh + shell: bash + env: + GAIE_DIR: ${{ env.GAIE_DIR }} + DYNAMO_DIR: ${{ env.DYNAMO_DIR }} + run: | + set -euo pipefail + cd ${DYNAMO_DIR}/deploy/inference-gateway + ./build-epp-dynamo.sh + docker tag us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1-dirty ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo/epp-inference-extension-dynamo:${{ github.run_id }} + docker push ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo/epp-inference-extension-dynamo:${{ github.run_id }} + + - name: Deploy GAIE Helm chart with built EPP image + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + GAIE_DIR: ${{ env.GAIE_DIR }} + DYNAMO_DIR: ${{ env.DYNAMO_DIR }} + run: | + set -euo pipefail + # Use the pushed EPP image from the previous step + EPP_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo/epp-inference-extension-dynamo:${{ github.run_id }}" + echo "Using EPP_IMAGE=${EPP_IMAGE}" + export EPP_IMAGE + + # Deploy Helm chart in the Dynamo repo pointing to the EPP image + cd "${DYNAMO_DIR}/deploy/inference-gateway" + helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n ${NAMESPACE} -f ./vllm_agg_qwen.yaml --set-string extension.image=${EPP_IMAGE} + + - name: Verify qwen-epp deployment readiness + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set 
-euo pipefail + NS=${NAMESPACE} + echo "Waiting for Deployment/qwen-epp to appear in namespace ${NS}..." + ATTEMPTS=60 + SLEEP=10 + FOUND=0 + for i in $(seq 1 ${ATTEMPTS}); do + if kubectl get deploy qwen-epp -n ${NS} >/dev/null 2>&1; then + FOUND=1 + break + fi + sleep ${SLEEP} + done + if [[ ${FOUND} -ne 1 ]]; then + echo "Deployment qwen-epp not found in ${NS} after $((ATTEMPTS*SLEEP)) seconds" >&2 + kubectl get deploy -n ${NS} || true + exit 1 + fi + echo "Deployment qwen-epp found. Waiting for rollout to complete..." + echo "Deployment image:" + kubectl get deploy/qwen-epp -n ${NS} -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' || true + + echo "Deriving pod selector from Deployment..." + SELECTOR=$(kubectl get deploy/qwen-epp -n ${NS} -o json | jq -r '.spec.selector.matchLabels | to_entries | map("\(.key)=\(.value)") | join(",")') + if [[ -n "${SELECTOR}" ]]; then + echo "Using selector: ${SELECTOR}" + echo "Current pods (name, image, ready, waitingReason, restartCount):" + kubectl get pods -n ${NS} -l "${SELECTOR}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].image}{"\t"}{range .status.containerStatuses[*]}{.ready}{"\t"}{.state.waiting.reason}{"\t"}{.restartCount}{"\n"}{end}{end}' || true + + # Fail fast on common crash conditions + if kubectl get pods -n ${NS} -l "${SELECTOR}" -o json | jq -e '.items[] | select(.status.containerStatuses != null) | .status.containerStatuses[] | select(.state.waiting.reason=="CrashLoopBackOff" or .state.waiting.reason=="ImagePullBackOff" or .state.waiting.reason=="ErrImagePull")' >/dev/null; then + echo "Detected CrashLoopBackOff/ImagePullBackOff/ErrImagePull in qwen-epp pods" >&2 + exit 1 + fi + else + echo "Could not derive a label selector from Deployment; skipping pod checks." 
+ fi + kubectl rollout status deploy/qwen-epp -n ${NS} --timeout=600s + kubectl get deploy qwen-epp -n ${NS} -o wide + + - name: Cleanup + if: always() + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + NS="${NAMESPACE:-}" + if [[ -n "${NS}" ]]; then + kubectl get all -n ${NS} || true + helm uninstall dynamo-platform -n ${NS} || true + kubectl delete dynamographdeployments --all -n ${NS} || true + kubectl delete namespace ${NS} || true + else + echo "NAMESPACE is unset; skipping namespace-scoped cleanup." >&2 + fi + # Uninstall Kgateway controller and CRDs (cluster-scoped) + helm uninstall kgateway -n kgateway-system || true + helm uninstall kgateway-crds -n kgateway-system || true + kubectl delete namespace kgateway-system || true + + # Remove Inference Extension CRDs (cluster-scoped) + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_EXTENSION_VERSION}/manifests.yaml || true + + # Remove Gateway API (cluster-scoped) + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api/releases/download/${GATEWAY_API_VERSION}/standard-install.yaml || true + + - name: Remove kubeconfig file + if: always() + shell: bash + run: | + rm -f "${{ github.workspace }}/.kubeconfig" + + # - name: Email on failure + # TODO + # if: failure() + # continue-on-error: true + # uses: dawidd6/action-send-mail@v3 + # with: + # server_address: ${{ secrets.SMTP_SERVER }} + # server_port: ${{ secrets.SMTP_PORT }} + # username: ${{ secrets.SMTP_USER }} + # password: ${{ secrets.SMTP_PASS }} + # subject: GAIE scheduled run failed + # to: atchernych@nvidia.com + # from: ${{ secrets.SMTP_FROM }} + # secure: true + # html_body: | + #
GAIE scheduled run failed.
+ #Run: details
diff --git a/deploy/inference-gateway/install_gaie_crd_kgateway.sh b/deploy/inference-gateway/install_gaie_crd_kgateway.sh index dc4e5ea107..5cf535c006 100755 --- a/deploy/inference-gateway/install_gaie_crd_kgateway.sh +++ b/deploy/inference-gateway/install_gaie_crd_kgateway.sh @@ -15,32 +15,118 @@ # See the License for the specific language governing permissions and # limitations under the License. +#!/usr/bin/env bash set -euo pipefail trap 'echo "Error at line $LINENO. Exiting."' ERR +MODEL_NAMESPACE=${MODEL_NAMESPACE:-my-model} +KGATEWAY_SYSTEM_NAMESPACE=${KGATEWAY_SYSTEM_NAMESPACE:-kgateway-system} + +GATEWAY_API_VERSION=${GATEWAY_API_VERSION:-v1.3.0} +INFERENCE_EXTENSION_VERSION=${INFERENCE_EXTENSION_VERSION:-v0.5.1} +KGATEWAY_VERSION=${KGATEWAY_VERSION:-v2.0.3} + +GATEWAY_API_MANIFEST="https://github.com/kubernetes-sigs/gateway-api/releases/download/${GATEWAY_API_VERSION}/standard-install.yaml" +GAIE_MANIFEST="https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_EXTENSION_VERSION}/manifests.yaml" +KGATEWAY_CRDS_CHART="oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds" +KGATEWAY_CHART="oci://cr.kgateway.dev/kgateway-dev/charts/kgateway" + + +GATEWAY_INSTANCE_MANIFEST="https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/v1.0.0/config/manifests/gateway/kgateway/gateway.yaml" + +# Baseline marker +BASELINE_CM_NAMESPACE=kube-system +BASELINE_CM_NAME=gaie-kgateway-baseline +BASELINE_KEY=versions +BASELINE_VAL="gateway_api=${GATEWAY_API_VERSION},gaie=${INFERENCE_EXTENSION_VERSION},kgateway=${KGATEWAY_VERSION}" + + +ns() { kubectl get ns "$1" >/dev/null 2>&1 || kubectl create ns "$1"; } +have_crd() { kubectl get crd "$1" >/dev/null 2>&1; } + +cm_matches() { + kubectl get configmap "$BASELINE_CM_NAME" -n "$BASELINE_CM_NAMESPACE" -o jsonpath='{.data.'"$BASELINE_KEY"'}' 2>/dev/null | grep -qxF "$BASELINE_VAL" +} + +set_cm() { + kubectl -n "$BASELINE_CM_NAMESPACE" create configmap "$BASELINE_CM_NAME" \ 
+ --from-literal="$BASELINE_KEY=$BASELINE_VAL" \ + --dry-run=client -o yaml | kubectl apply -f - +} + +helm_chart_version() { + # Prints deployed chart version (e.g., 2.0.3) or empty if not installed + local rel ns json + rel="$1"; ns="$2" + json=$(helm -n "$ns" ls -f "^${rel}$" -o json 2>/dev/null || true) + if [[ -z "$json" || "$json" == "[]" ]]; then + echo "" + else + # .[0].chart looks like "kgateway-2.0.3" → cut the suffix after last dash + echo "$json" | jq -r '.[0].chart' | awk -F- '{print $NF}' + fi +} + + +ns "$BASELINE_CM_NAMESPACE" +if cm_matches; then + echo "Baseline marker already set: $BASELINE_VAL" +else + echo "Setting/Updating baseline marker: $BASELINE_VAL" + set_cm +fi -MODEL_NAMESPACE=my-model -kubectl create namespace $MODEL_NAMESPACE || true -# Install the Gateway API -GATEWAY_API_VERSION=v1.3.0 -kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/$GATEWAY_API_VERSION/standard-install.yaml +ns "$MODEL_NAMESPACE" +ns "$KGATEWAY_SYSTEM_NAMESPACE" +# Install Gateway API (cluster-scoped) +if have_crd gateways.gateway.networking.k8s.io; then + echo "Gateway API CRDs already present; skipping install." +else + echo "Installing Gateway API ${GATEWAY_API_VERSION}..." + kubectl apply -f "$GATEWAY_API_MANIFEST" + kubectl wait --for=condition=Established crd/gateways.gateway.networking.k8s.io --timeout=120s +fi -# Install the Inference Extension CRDs -INFERENCE_EXTENSION_VERSION=v0.5.1 -kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$INFERENCE_EXTENSION_VERSION/manifests.yaml -n $MODEL_NAMESPACE +# Install GAIE CRDs (cluster-scoped) +if have_crd inferencepools.inference.networking.x-k8s.io; then + echo "GAIE CRDs already present; skipping install." +else + echo "Installing GAIE CRDs ${INFERENCE_EXTENSION_VERSION}..." 
+ kubectl apply -f "$GAIE_MANIFEST" + kubectl wait --for=condition=Established crd/inferencepools.inference.networking.x-k8s.io --timeout=120s || true +fi +# Install kGateway (cluster-scoped controller + CRDs) +# Only upgrade if the chart version differs or not installed yet. +current_crds_ver=$(helm_chart_version kgateway-crds "$KGATEWAY_SYSTEM_NAMESPACE") +current_ctrl_ver=$(helm_chart_version kgateway "$KGATEWAY_SYSTEM_NAMESPACE") -# Install and upgrade Kgateway (includes CRDs) -KGATEWAY_VERSION=v2.0.3 -KGATEWAY_SYSTEM_NAMESPACE=kgateway-system -kubectl create namespace $KGATEWAY_SYSTEM_NAMESPACE || true +if [[ "$current_crds_ver" != "$KGATEWAY_VERSION" ]]; then + echo "Installing/Upgrading kGateway CRDs to ${KGATEWAY_VERSION} (was: ${current_crds_ver:-none})..." + helm upgrade -i --create-namespace --namespace "$KGATEWAY_SYSTEM_NAMESPACE" \ + --version "$KGATEWAY_VERSION" kgateway-crds "$KGATEWAY_CRDS_CHART" +else + echo "kGateway CRDs already at ${KGATEWAY_VERSION}; skipping." +fi -helm upgrade -i --create-namespace --namespace $KGATEWAY_SYSTEM_NAMESPACE --version $KGATEWAY_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds +if [[ "$current_ctrl_ver" != "$KGATEWAY_VERSION" ]]; then + echo "Installing/Upgrading kGateway controller to ${KGATEWAY_VERSION} (was: ${current_ctrl_ver:-none})..." + helm upgrade -i --namespace "$KGATEWAY_SYSTEM_NAMESPACE" \ + --version "$KGATEWAY_VERSION" kgateway "$KGATEWAY_CHART" \ + --set inferenceExtension.enabled=true +else + echo "kGateway controller already at ${KGATEWAY_VERSION}; skipping." +fi -helm upgrade -i --namespace $KGATEWAY_SYSTEM_NAMESPACE --version $KGATEWAY_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true +# Install Gateway instance (namespaced) +if kubectl get gateway inference-gateway -n "$MODEL_NAMESPACE" >/dev/null 2>&1; then + echo "Gateway instance 'inference-gateway' already exists in ${MODEL_NAMESPACE}; skipping apply." 
+else + echo "Creating Gateway instance in ${MODEL_NAMESPACE}..." + kubectl apply -f "$GATEWAY_INSTANCE_MANIFEST" -n "$MODEL_NAMESPACE" +fi +echo "Done. Cluster-wide components only changed when needed." -# Deploy the Gateway Instance -kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/v1.0.0/config/manifests/gateway/kgateway/gateway.yaml -n $MODEL_NAMESPACE