From 25dc5d619e0615dab97e4a5606464828766a6245 Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 23 Oct 2025 07:33:03 -0700 Subject: [PATCH 01/10] more Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 305 +++++++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 .github/workflows/gaie.yml diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml new file mode 100644 index 0000000000..f8e524571d --- /dev/null +++ b/.github/workflows/gaie.yml @@ -0,0 +1,305 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: GAIE Deployment with Dynamo (vllm) + +on: + workflow_dispatch: + push: + branches: + - main + - release/*.*.* + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name || github.run_id }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + gaie: + name: Build Images, Deploy Dynamo, Install GAIE, and Run Script + runs-on: cpu-amd-m5-2xlarge + env: + FRAMEWORK: vllm + PROFILE: agg + DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com + DEPLOYMENT_FILE: deploy/${{ env.PROFILE }}.yaml + MODEL_NAME: Qwen/Qwen3-0.6B + OPERATOR_IMAGE_TAG: ${{ github.sha }}-operator-amd64 + VLLM_IMAGE_TAG: ${{ github.sha }}-vllm-amd64 + GATEWAY_API_VERSION: v1.3.0 + INFERENCE_EXTENSION_VERSION: v0.5.1 + KGATEWAY_VERSION: v2.0.3 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver: docker + + - name: Install tooling (curl, jq, yq, helm, kubectl) + shell: bash + run: | + set -euo pipefail + sudo apt-get update && sudo apt-get install -y curl bash openssl gettext git jq + curl -L https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o yq + sudo install -m 0755 yq /usr/local/bin/yq + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 + sudo 
chmod 700 get_helm.sh + sudo ./get_helm.sh + KUBECTL_VER=$(curl -L -s https://dl.k8s.io/release/stable.txt) + curl -LO "https://dl.k8s.io/release/${KUBECTL_VER}/bin/linux/amd64/kubectl" + sudo install -m 0755 kubectl /usr/local/bin/kubectl + + - name: Login to NGC (optional for base images) + if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' + run: | + echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin + + - name: Install awscli (for docker-build caching/login) + shell: bash + run: | + curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + + - name: Login to ECR (for docker-build action compatibility) + shell: bash + env: + ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com + run: | + aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} + + - name: Build Operator image (amd64) + id: build-operator + shell: bash + env: + ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com + run: | + set -euo pipefail + cd deploy/cloud/operator + docker buildx build --load \ + --platform linux/amd64 \ + --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \ + -f Dockerfile \ + -t dynamo-operator:latest . 
+ + - name: Tag and Push Operator image to ACR + uses: ./.github/actions/docker-tag-push + with: + local_image: dynamo-operator:latest + push_tag: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ env.OPERATOR_IMAGE_TAG }} + aws_push: "false" + azure_push: "true" + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + - name: Build vllm runtime image (amd64) + id: build-vllm + uses: ./.github/actions/docker-build + with: + framework: vllm + target: runtime + platform: linux/amd64 + ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} + ci_token: ${{ secrets.CI_TOKEN }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }} + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + + - name: Tag and Push vllm image to ACR + uses: ./.github/actions/docker-tag-push + with: + local_image: ${{ steps.build-vllm.outputs.image_tag }} + push_tag: ${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ env.VLLM_IMAGE_TAG }} + aws_push: "false" + azure_push: "true" + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + - name: Configure kubeconfig and namespace + shell: bash + run: | + set -euo pipefail + echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig + chmod 600 .kubeconfig + export KUBECONFIG=$(pwd)/.kubeconfig + # Create ephemeral namespace for Dynamo + PROFILE_SANITIZED="${PROFILE//_/-}" + echo "NAMESPACE=gh-job-id-${{ github.run_id 
}}-${FRAMEWORK}-${PROFILE_SANITIZED}" >> $GITHUB_ENV + kubectl delete namespace gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} || true + kubectl create namespace gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} + kubectl label namespaces gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} \ + nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true + kubectl config set-context --current --namespace=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} --kubeconfig "$KUBECONFIG" + + - name: Install Dynamo Operator Helm chart + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + helm repo add bitnami https://charts.bitnami.com/bitnami + cd deploy/cloud/helm/platform/ + helm dep build . + helm upgrade --install dynamo-platform . --namespace ${NAMESPACE} \ + --set dynamo-operator.namespaceRestriction.enabled=true \ + --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \ + --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \ + --set dynamo-operator.controllerManager.manager.image.tag=${{ env.OPERATOR_IMAGE_TAG }} \ + --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret + timeout 300s kubectl rollout status deployment -n ${NAMESPACE} --watch || true + + - name: Configure image pull secrets and deploy vllm graph + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + export KUBE_NS=${NAMESPACE} + export FRAMEWORK_RUNTIME_IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${VLLM_IMAGE_TAG}" + # Secrets + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n ${KUBE_NS} || true + kubectl create secret docker-registry docker-imagepullsecret \ + --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} \ + 
--docker-username=${{ secrets.AZURE_ACR_USER }} \ + --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} \ + --namespace=${KUBE_NS} || true + # Deploy vllm backend graph + cd $GITHUB_WORKSPACE/components/backends/${FRAMEWORK} + export GRAPH_NAME=$(yq e '.metadata.name' ${DEPLOYMENT_FILE}) + yq -i '.spec.services.[].extraPodSpec.mainContainer.image = env(FRAMEWORK_RUNTIME_IMAGE)' ${DEPLOYMENT_FILE} + kubectl apply -n ${KUBE_NS} -f ${DEPLOYMENT_FILE} + sleep 20 + kubectl wait --for=condition=ready pod -l "nvidia.com/dynamo-graph-deployment-name=${GRAPH_NAME}" -n ${KUBE_NS} --timeout=1000s + + - name: Install Gateway API (standard) + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/${GATEWAY_API_VERSION}/standard-install.yaml + + - name: Install Inference Extension CRDs + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + # Note: manifests are cluster-scoped; namespace flag kept to match request + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_EXTENSION_VERSION}/manifests.yaml -n my-model + + - name: Install Kgateway CRDs and controller + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + helm upgrade -i --create-namespace --namespace kgateway-system --version ${KGATEWAY_VERSION} kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds + helm upgrade -i --namespace kgateway-system --version ${KGATEWAY_VERSION} kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true + + - name: Deploy Gateway Instance + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + kubectl create namespace my-model || true + kubectl apply -f 
https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml -n my-model + + - name: Clone GAIE repo and set environment + shell: bash + run: | + set -euo pipefail + git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git + cd gateway-api-inference-extension + git checkout ${INFERENCE_EXTENSION_VERSION} + echo "GAIE_DIR=$(pwd)" >> $GITHUB_ENV + echo "DYNAMO_DIR=${{ github.workspace }}" >> $GITHUB_ENV + + - name: Run build-epp-dynamo.sh + shell: bash + env: + GAIE_DIR: ${{ env.GAIE_DIR }} + DYNAMO_DIR: ${{ env.DYNAMO_DIR }} + run: | + set -euo pipefail + cd ${DYNAMO_DIR}/deploy/inference-gateway + ./build-epp-dynamo.sh + docker tag us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.5.1-dirty gitlab-master.nvidia.com:5005/dl/ai-dynamo/dynamo/epp-inference-extension-dynamo:${{ github.run_id }} + docker push gitlab-master.nvidia.com:5005/dl/ai-dynamo/dynamo/epp-inference-extension-dynamo:${{ github.run_id }} + + - name: Deploy GAIE Helm chart with built EPP image + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + GAIE_DIR: ${{ env.GAIE_DIR }} + DYNAMO_DIR: ${{ env.DYNAMO_DIR }} + run: | + set -euo pipefail + # Discover the EPP image tag from the previous build + cd "${GAIE_DIR}" + EPP_IMAGE="$(docker images --format '{{.Repository}}:{{.Tag}}' | grep 'gateway-api-inference-extension/epp' | head -n1 || true)" + if [[ -z "${EPP_IMAGE}" ]]; then + # Fallback: reconstruct the default tag used by the Makefile + GIT_TAG=$(git describe --tags --dirty --always) + EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:${GIT_TAG}" + fi + echo "Using EPP_IMAGE=${EPP_IMAGE}" + export EPP_IMAGE + + # Deploy Helm chart in the Dynamo repo pointing to the EPP image + cd "${DYNAMO_DIR}/deploy/inference-gateway" + helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml 
--set-string extension.image=${EPP_IMAGE} + + - name: Verify qwen-epp deployment readiness in my-model + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + NS=my-model + echo "Waiting for Deployment/qwen-epp to appear in namespace ${NS}..." + ATTEMPTS=60 + SLEEP=10 + FOUND=0 + for i in $(seq 1 ${ATTEMPTS}); do + if kubectl get deploy qwen-epp -n ${NS} >/dev/null 2>&1; then + FOUND=1 + break + fi + sleep ${SLEEP} + done + if [[ ${FOUND} -ne 1 ]]; then + echo "Deployment qwen-epp not found in ${NS} after $((ATTEMPTS*SLEEP)) seconds" >&2 + kubectl get deploy -n ${NS} || true + exit 1 + fi + echo "Deployment qwen-epp found. Waiting for rollout to complete..." + kubectl rollout status deploy/qwen-epp -n ${NS} --timeout=600s + kubectl get deploy qwen-epp -n ${NS} -o wide + + - name: Cleanup (optional) + if: always() + shell: bash + env: + KUBECONFIG: ${{ github.workspace }}/.kubeconfig + run: | + set -euo pipefail + kubectl get all -n ${NAMESPACE} || true + helm uninstall dynamo-platform -n ${NAMESPACE} || true + kubectl delete dynamographdeployments --all -n ${NAMESPACE} || true + kubectl delete namespace ${NAMESPACE} || true + # GAIE related namespaces + kubectl get all -n my-model || true + kubectl delete namespace my-model || true From 00c094a897e4bcaa63d026d9d3c55e9d3a99c055 Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 23 Oct 2025 13:10:20 -0700 Subject: [PATCH 02/10] test EPP build Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 108 ++++++++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 30 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index f8e524571d..258a8d9f75 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -4,11 +4,19 @@ name: GAIE Deployment with Dynamo (vllm) on: - workflow_dispatch: - push: - branches: - - main - - release/*.*.* + # workflow_dispatch: + # push: + # branches: + # - main + # - 
release/*.*.* + # pull_request: + # paths: + # - lib/bindings/c/src/** + # - lib/runtime/src/** + # - lib/llm/src/** + schedule: + # Run at 03:00 UTC on the 1st and 15th of each month (approx every 2 weeks) + - cron: "0 3 1,15 * *" concurrency: group: ${{ github.workflow }}-${{ github.ref_name || github.run_id }} @@ -16,13 +24,12 @@ concurrency: jobs: gaie: - name: Build Images, Deploy Dynamo, Install GAIE, and Run Script + name: Build Images, Deploy Dynamo, Install GAIE, create EPP image and make sure it works. runs-on: cpu-amd-m5-2xlarge env: FRAMEWORK: vllm PROFILE: agg DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com - DEPLOYMENT_FILE: deploy/${{ env.PROFILE }}.yaml MODEL_NAME: Qwen/Qwen3-0.6B OPERATOR_IMAGE_TAG: ${{ github.sha }}-operator-amd64 VLLM_IMAGE_TAG: ${{ github.sha }}-vllm-amd64 @@ -136,6 +143,7 @@ jobs: # Create ephemeral namespace for Dynamo PROFILE_SANITIZED="${PROFILE//_/-}" echo "NAMESPACE=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED}" >> $GITHUB_ENV + echo "DEPLOYMENT_FILE=deploy/${PROFILE}.yaml" >> $GITHUB_ENV kubectl delete namespace gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} || true kubectl create namespace gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} kubectl label namespaces gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED} \ @@ -196,8 +204,8 @@ jobs: KUBECONFIG: ${{ github.workspace }}/.kubeconfig run: | set -euo pipefail - # Note: manifests are cluster-scoped; namespace flag kept to match request - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_EXTENSION_VERSION}/manifests.yaml -n my-model + # Note: manifests are cluster-scoped + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_EXTENSION_VERSION}/manifests.yaml - name: Install Kgateway CRDs and controller shell: bash @@ -214,17 +222,20 @@ jobs: KUBECONFIG: ${{ github.workspace 
}}/.kubeconfig run: | set -euo pipefail - kubectl create namespace my-model || true - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml -n my-model + kubectl create namespace ${NAMESPACE} || true + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/gateway.yaml -n ${NAMESPACE} - - name: Clone GAIE repo and set environment + - name: Clone GAIE repo into separate folder and set environment shell: bash run: | set -euo pipefail - git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git - cd gateway-api-inference-extension + GAIE_CLONE_DIR="${{ github.workspace }}/external/gateway-api-inference-extension" + rm -rf "${GAIE_CLONE_DIR}" + mkdir -p "$(dirname "${GAIE_CLONE_DIR}")" + git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git "${GAIE_CLONE_DIR}" + cd "${GAIE_CLONE_DIR}" git checkout ${INFERENCE_EXTENSION_VERSION} - echo "GAIE_DIR=$(pwd)" >> $GITHUB_ENV + echo "GAIE_DIR=${GAIE_CLONE_DIR}" >> $GITHUB_ENV echo "DYNAMO_DIR=${{ github.workspace }}" >> $GITHUB_ENV - name: Run build-epp-dynamo.sh @@ -247,28 +258,22 @@ jobs: DYNAMO_DIR: ${{ env.DYNAMO_DIR }} run: | set -euo pipefail - # Discover the EPP image tag from the previous build - cd "${GAIE_DIR}" - EPP_IMAGE="$(docker images --format '{{.Repository}}:{{.Tag}}' | grep 'gateway-api-inference-extension/epp' | head -n1 || true)" - if [[ -z "${EPP_IMAGE}" ]]; then - # Fallback: reconstruct the default tag used by the Makefile - GIT_TAG=$(git describe --tags --dirty --always) - EPP_IMAGE="us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:${GIT_TAG}" - fi + # Use the pushed EPP image from the previous step + EPP_IMAGE="gitlab-master.nvidia.com:5005/dl/ai-dynamo/dynamo/epp-inference-extension-dynamo:${{ github.run_id }}" echo "Using EPP_IMAGE=${EPP_IMAGE}" export EPP_IMAGE # 
Deploy Helm chart in the Dynamo repo pointing to the EPP image cd "${DYNAMO_DIR}/deploy/inference-gateway" - helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n my-model -f ./vllm_agg_qwen.yaml --set-string extension.image=${EPP_IMAGE} + helm upgrade --install dynamo-gaie ./helm/dynamo-gaie -n ${NAMESPACE} -f ./vllm_agg_qwen.yaml --set-string extension.image=${EPP_IMAGE} - - name: Verify qwen-epp deployment readiness in my-model + - name: Verify qwen-epp deployment readiness shell: bash env: KUBECONFIG: ${{ github.workspace }}/.kubeconfig run: | set -euo pipefail - NS=my-model + NS=${NAMESPACE} echo "Waiting for Deployment/qwen-epp to appear in namespace ${NS}..." ATTEMPTS=60 SLEEP=10 @@ -286,10 +291,28 @@ jobs: exit 1 fi echo "Deployment qwen-epp found. Waiting for rollout to complete..." + echo "Deployment image:" + kubectl get deploy/qwen-epp -n ${NS} -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' || true + + echo "Deriving pod selector from Deployment..." + SELECTOR=$(kubectl get deploy/qwen-epp -n ${NS} -o json | jq -r '.spec.selector.matchLabels | to_entries | map("\(.key)=\(.value)") | join(",")') + if [[ -n "${SELECTOR}" ]]; then + echo "Using selector: ${SELECTOR}" + echo "Current pods (name, image, ready, waitingReason, restartCount):" + kubectl get pods -n ${NS} -l "${SELECTOR}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.containers[*].image}{"\t"}{range .status.containerStatuses[*]}{.ready}{"\t"}{.state.waiting.reason}{"\t"}{.restartCount}{"\n"}{end}{end}' || true + + # Fail fast on common crash conditions + if kubectl get pods -n ${NS} -l "${SELECTOR}" -o json | jq -e '.items[] | select(.status.containerStatuses != null) | .status.containerStatuses[] | select(.state.waiting.reason=="CrashLoopBackOff" or .state.waiting.reason=="ImagePullBackOff" or .state.waiting.reason=="ErrImagePull")' >/dev/null; then + echo "Detected CrashLoopBackOff/ImagePullBackOff/ErrImagePull in qwen-epp pods" >&2 + exit 1 + fi + else + echo 
"Could not derive a label selector from Deployment; skipping pod checks." + fi kubectl rollout status deploy/qwen-epp -n ${NS} --timeout=600s kubectl get deploy qwen-epp -n ${NS} -o wide - - name: Cleanup (optional) + - name: Cleanup if: always() shell: bash env: @@ -300,6 +323,31 @@ jobs: helm uninstall dynamo-platform -n ${NAMESPACE} || true kubectl delete dynamographdeployments --all -n ${NAMESPACE} || true kubectl delete namespace ${NAMESPACE} || true - # GAIE related namespaces - kubectl get all -n my-model || true - kubectl delete namespace my-model || true + # Uninstall Kgateway controller and CRDs (cluster-scoped) + helm uninstall kgateway -n kgateway-system || true + helm uninstall kgateway-crds -n kgateway-system || true + kubectl delete namespace kgateway-system || true + + # Remove Inference Extension CRDs (cluster-scoped) + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${INFERENCE_EXTENSION_VERSION}/manifests.yaml || true + + # Remove Gateway API (cluster-scoped) + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api/releases/download/${GATEWAY_API_VERSION}/standard-install.yaml || true + + # - name: Email on failure + # TODO + # if: failure() + # continue-on-error: true + # uses: dawidd6/action-send-mail@v3 + # with: + # server_address: ${{ secrets.SMTP_SERVER }} + # server_port: ${{ secrets.SMTP_PORT }} + # username: ${{ secrets.SMTP_USER }} + # password: ${{ secrets.SMTP_PASS }} + # subject: GAIE scheduled run failed + # to: atchernych@nvidia.com + # from: ${{ secrets.SMTP_FROM }} + # secure: true + # html_body: | + #

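<p>GAIE scheduled run failed.</p> + # <p>Run: <a href="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}">details</a></p>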

From a0aabf08bf10f03036dd1c6039435963be6001f6 Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 23 Oct 2025 13:16:31 -0700 Subject: [PATCH 03/10] enable on this branch Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 258a8d9f75..8e35c28823 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -5,10 +5,9 @@ name: GAIE Deployment with Dynamo (vllm) on: # workflow_dispatch: - # push: - # branches: - # - main - # - release/*.*.* + push: + branches: + - dep-423-test # pull_request: # paths: # - lib/bindings/c/src/** From cdf1c1638b9483fb4dbded8172c254c7020da8ab Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 23 Oct 2025 13:30:03 -0700 Subject: [PATCH 04/10] cleanup Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 8e35c28823..4a0a891107 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -59,7 +59,7 @@ jobs: sudo install -m 0755 kubectl /usr/local/bin/kubectl - name: Login to NGC (optional for base images) - if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' + if: always() run: | echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin @@ -136,9 +136,12 @@ jobs: shell: bash run: | set -euo pipefail - echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig - chmod 600 .kubeconfig - export KUBECONFIG=$(pwd)/.kubeconfig + echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > "${{ github.workspace }}/.kubeconfig" + chmod 600 "${{ github.workspace }}/.kubeconfig" + # Persist KUBECONFIG for subsequent steps + echo "KUBECONFIG=${{ github.workspace }}/.kubeconfig" >> $GITHUB_ENV + # Also export 
for use within this step + export KUBECONFIG="${{ github.workspace }}/.kubeconfig" # Create ephemeral namespace for Dynamo PROFILE_SANITIZED="${PROFILE//_/-}" echo "NAMESPACE=gh-job-id-${{ github.run_id }}-${FRAMEWORK}-${PROFILE_SANITIZED}" >> $GITHUB_ENV @@ -164,7 +167,7 @@ jobs: --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \ --set dynamo-operator.controllerManager.manager.image.tag=${{ env.OPERATOR_IMAGE_TAG }} \ --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret - timeout 300s kubectl rollout status deployment -n ${NAMESPACE} --watch || true + timeout 300s kubectl rollout status deployment -n ${NAMESPACE} --watch - name: Configure image pull secrets and deploy vllm graph shell: bash @@ -333,6 +336,12 @@ jobs: # Remove Gateway API (cluster-scoped) kubectl delete -f https://github.com/kubernetes-sigs/gateway-api/releases/download/${GATEWAY_API_VERSION}/standard-install.yaml || true + - name: Remove kubeconfig file + if: always() + shell: bash + run: | + rm -f "${{ github.workspace }}/.kubeconfig" + # - name: Email on failure # TODO # if: failure() From 1b1eb842eb7f75fcf35267af163673b729c72b85 Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 23 Oct 2025 16:38:58 -0700 Subject: [PATCH 05/10] fix namespace Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 4a0a891107..889d3cba21 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -321,10 +321,15 @@ jobs: KUBECONFIG: ${{ github.workspace }}/.kubeconfig run: | set -euo pipefail - kubectl get all -n ${NAMESPACE} || true - helm uninstall dynamo-platform -n ${NAMESPACE} || true - kubectl delete dynamographdeployments --all -n ${NAMESPACE} || true - kubectl delete namespace ${NAMESPACE} || true + NS="${NAMESPACE:-}" + if [[ -n "${NS}" ]]; then + kubectl 
get all -n ${NS} || true + helm uninstall dynamo-platform -n ${NS} || true + kubectl delete dynamographdeployments --all -n ${NS} || true + kubectl delete namespace ${NS} || true + else + echo "NAMESPACE is unset; skipping namespace-scoped cleanup." >&2 + fi # Uninstall Kgateway controller and CRDs (cluster-scoped) helm uninstall kgateway -n kgateway-system || true helm uninstall kgateway-crds -n kgateway-system || true From d41fe3d04e3ef8af38123d6aaab227d113d40084 Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 23 Oct 2025 17:01:29 -0700 Subject: [PATCH 06/10] force login Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 889d3cba21..0e60f1fa45 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -77,6 +77,13 @@ jobs: run: | aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} + - name: Force fresh nvcr login + shell: bash + run: | + set -euo pipefail + docker logout nvcr.io || true + echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin + - name: Build Operator image (amd64) id: build-operator shell: bash From c9a50a3205837c92920e9363d81ed59bcde29d5e Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Thu, 23 Oct 2025 17:05:09 -0700 Subject: [PATCH 07/10] login to nvcro optionally Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 0e60f1fa45..2e80a76e65 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -59,7 +59,7 @@ jobs: sudo install -m 0755 kubectl /usr/local/bin/kubectl - name: Login to NGC (optional for base images) - if: always() + if: ${{ false }} run: | echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | 
docker login nvcr.io -u '$oauthtoken' --password-stdin @@ -77,12 +77,18 @@ jobs: run: | aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} - - name: Force fresh nvcr login + - name: Ensure nvcr distroless base is available shell: bash run: | set -euo pipefail - docker logout nvcr.io || true - echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin + if ! docker image inspect nvcr.io/nvidia/distroless/go:v3.1.13 >/dev/null 2>&1; then + echo "nvcr distroless not present; logging in and pulling..." + docker logout nvcr.io || true + echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin + docker pull nvcr.io/nvidia/distroless/go:v3.1.13 + else + echo "nvcr distroless already present; skipping login." + fi - name: Build Operator image (amd64) id: build-operator From e5be572953d6dcea7cb0b7e2940c34f2cd1bdfab Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Fri, 24 Oct 2025 10:06:40 -0700 Subject: [PATCH 08/10] rm re-login to nvcro Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 2e80a76e65..596e02d6db 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -77,18 +77,18 @@ jobs: run: | aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} - - name: Ensure nvcr distroless base is available - shell: bash - run: | - set -euo pipefail - if ! docker image inspect nvcr.io/nvidia/distroless/go:v3.1.13 >/dev/null 2>&1; then - echo "nvcr distroless not present; logging in and pulling..." 
- docker logout nvcr.io || true - echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin - docker pull nvcr.io/nvidia/distroless/go:v3.1.13 - else - echo "nvcr distroless already present; skipping login." - fi + # - name: Ensure nvcr distroless base is available + # shell: bash + # run: | + # set -euo pipefail + # if ! docker image inspect nvcr.io/nvidia/distroless/go:v3.1.13 >/dev/null 2>&1; then + # echo "nvcr distroless not present; logging in and pulling..." + # docker logout nvcr.io || true + # echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin + # docker pull nvcr.io/nvidia/distroless/go:v3.1.13 + # else + # echo "nvcr distroless already present; skipping login." + # fi - name: Build Operator image (amd64) id: build-operator From 4f5645d440d447b971c6038ea865afb7e90ee027 Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Fri, 24 Oct 2025 10:24:40 -0700 Subject: [PATCH 09/10] change login to nvcro Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 596e02d6db..8522b938f9 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -58,8 +58,8 @@ jobs: curl -LO "https://dl.k8s.io/release/${KUBECTL_VER}/bin/linux/amd64/kubectl" sudo install -m 0755 kubectl /usr/local/bin/kubectl - - name: Login to NGC (optional for base images) - if: ${{ false }} + - name: Login to NGC + if: ${{ github.event_name == 'push' || github.event_name == 'schedule' }} run: | echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin From c2f2f65e9c068fd1a85a4f92262bb92456c7148d Mon Sep 17 00:00:00 2001 From: Anna Tchernych Date: Fri, 24 Oct 2025 10:41:28 -0700 Subject: [PATCH 10/10] run after permissions update Signed-off-by: Anna Tchernych --- .github/workflows/gaie.yml | 14 -------------- 1 file
changed, 14 deletions(-) diff --git a/.github/workflows/gaie.yml b/.github/workflows/gaie.yml index 8522b938f9..981532c73b 100644 --- a/.github/workflows/gaie.yml +++ b/.github/workflows/gaie.yml @@ -76,20 +76,6 @@ jobs: ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com run: | aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} - - # - name: Ensure nvcr distroless base is available - # shell: bash - # run: | - # set -euo pipefail - # if ! docker image inspect nvcr.io/nvidia/distroless/go:v3.1.13 >/dev/null 2>&1; then - # echo "nvcr distroless not present; logging in and pulling..." - # docker logout nvcr.io || true - # echo "${{ secrets.NGC_CI_ACCESS_TOKEN }}" | docker login nvcr.io -u '$oauthtoken' --password-stdin - # docker pull nvcr.io/nvidia/distroless/go:v3.1.13 - # else - # echo "nvcr distroless already present; skipping login." - # fi - - name: Build Operator image (amd64) id: build-operator shell: bash