diff --git a/.github/workflows/weekly-fault-tolerance.yml b/.github/workflows/weekly-fault-tolerance.yml new file mode 100644 index 0000000000..da622cbb42 --- /dev/null +++ b/.github/workflows/weekly-fault-tolerance.yml @@ -0,0 +1,603 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: Weekly Fault Tolerance Tests + +on: + schedule: + # Run every Sunday at 5:00 PM PST (1:00 AM UTC Monday) + # Cron syntax: minute hour day-of-month month day-of-week + # Note: the cron schedule is fixed in UTC, so during PDT (daylight saving) the 1:00 AM UTC Monday run occurs at 6:00 PM PDT Sunday + - cron: '0 1 * * 1' + + # Allow manual triggering for testing + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-weekly-${{ github.ref_name || github.run_id }} + cancel-in-progress: false + +jobs: + # Check if we should run (skip if no changes in last 24h for scheduled runs) + should-run: + runs-on: ubuntu-latest + outputs: + run_tests: ${{ steps.check.outputs.run_tests }} + steps: + - name: Checkout code + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + with: + fetch-depth: 0 + + - name: Check for recent activity + id: check + run: | + # Always run if manually triggered + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "run_tests=true" >> $GITHUB_OUTPUT + echo "Manual trigger - running tests" + exit 0 + fi + + # For scheduled runs, check if there were commits in last 24 hours + COMMITS_LAST_24H=$(git log --since="24 hours ago" --oneline | wc -l) + if [ "$COMMITS_LAST_24H" -gt 0 ]; then + echo "run_tests=true" >> $GITHUB_OUTPUT + echo "Found $COMMITS_LAST_24H commits in last 24 hours - running tests" + else + echo "run_tests=false" >> $GITHUB_OUTPUT + echo "No commits in last 24 hours - skipping tests" + fi + + operator: + needs: should-run + if: needs.should-run.outputs.run_tests == 'true' + strategy: + fail-fast: false + matrix: + platform: + - { arch: amd64, 
runner: cpu-amd-m5-2xlarge } + name: operator (${{ matrix.platform.arch }}) + runs-on: ${{ matrix.platform.runner }} + steps: + - name: Checkout code + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver: docker + - name: Install awscli + shell: bash + run: | + curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + - name: Login to ECR + shell: bash + env: + ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com + run: | + aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} + - name: Build Container + id: build-image + shell: bash + env: + ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com + run: | + cd deploy/cloud/operator + docker buildx build --load \ + --platform linux/${{ matrix.platform.arch }} \ + --build-arg DOCKER_PROXY=${ECR_HOSTNAME}/dockerhub/ \ + -f Dockerfile \ + -t dynamo-operator:latest . 
+ - name: Docker Tag and Push + uses: ./.github/actions/docker-tag-push + with: + local_image: dynamo-operator:latest + push_tag: ai-dynamo/dynamo:${{ github.sha }}-operator-${{ matrix.platform.arch }} + aws_push: 'false' + azure_push: 'true' + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + vllm: + needs: should-run + if: needs.should-run.outputs.run_tests == 'true' + strategy: + fail-fast: false + matrix: + platform: + - { arch: amd64, runner: gpu-l40-amd64 } + name: vllm (${{ matrix.platform.arch }}) + runs-on: ${{ matrix.platform.runner }} + steps: + - name: Checkout code + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + - name: Build Container + id: build-image + uses: ./.github/actions/docker-build + with: + framework: vllm + target: runtime + platform: 'linux/${{ matrix.platform.arch }}' + ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} + ci_token: ${{ secrets.CI_TOKEN }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }} + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + - name: Docker Tag and Push + uses: ./.github/actions/docker-tag-push + with: + local_image: ${{ steps.build-image.outputs.image_tag }} + push_tag: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }} + aws_push: 'false' + azure_push: 'true' + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + trtllm: + needs: should-run + if: 
needs.should-run.outputs.run_tests == 'true' + strategy: + fail-fast: false + matrix: + platform: + - { arch: amd64, runner: gpu-l40-amd64 } + name: trtllm (${{ matrix.platform.arch }}) + runs-on: ${{ matrix.platform.runner }} + steps: + - name: Checkout code + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + - name: Build Container + id: build-image + uses: ./.github/actions/docker-build + with: + framework: trtllm + target: runtime + platform: 'linux/${{ matrix.platform.arch }}' + ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} + ci_token: ${{ secrets.CI_TOKEN }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }} + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + - name: Docker Tag and Push + uses: ./.github/actions/docker-tag-push + with: + local_image: ${{ steps.build-image.outputs.image_tag }} + push_tag: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }} + aws_push: 'false' + azure_push: 'true' + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + sglang: + needs: should-run + if: needs.should-run.outputs.run_tests == 'true' + runs-on: gpu-l40-amd64 + name: sglang (amd64) + steps: + - name: Checkout repository + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + - name: Build Container + id: build-image + uses: ./.github/actions/docker-build + with: + framework: sglang + target: runtime + platform: 'linux/amd64' + ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }} + ci_token: ${{ secrets.CI_TOKEN }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + sccache_s3_bucket: ${{ 
secrets.SCCACHE_S3_BUCKET }} + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + - name: Docker Tag and Push + uses: ./.github/actions/docker-tag-push + with: + local_image: ${{ steps.build-image.outputs.image_tag }} + push_tag: ai-dynamo/dynamo:${{ github.sha }}-sglang-amd64 + aws_push: 'false' + azure_push: 'true' + aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }} + aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }} + azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }} + azure_acr_user: ${{ secrets.AZURE_ACR_USER }} + azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }} + + deploy-test-fault-tolerance: + runs-on: cpu-amd-m5-2xlarge + needs: [should-run, operator, vllm, trtllm, sglang] + if: needs.should-run.outputs.run_tests == 'true' + permissions: + contents: read + strategy: + fail-fast: false + matrix: + test_scenario: + # SGLang scenarios + - sglang-agg-tp-1-dp-1-decode_worker + - sglang-agg-tp-1-dp-1-decode_worker_pod + - sglang-agg-tp-1-dp-1-frontend + - sglang-agg-tp-1-dp-1-frontend_pod + - sglang-agg-tp-1-dp-1-none + - sglang-agg-tp-1-dp-1-sglang_decode_detokenizer + - sglang-agg-tp-1-dp-1-sglang_decode_scheduler + - sglang-agg-tp-1-dp-2-decode_worker + - sglang-agg-tp-1-dp-2-decode_worker_pod + - sglang-agg-tp-1-dp-2-frontend + - sglang-agg-tp-1-dp-2-frontend_pod + - sglang-agg-tp-1-dp-2-none + - sglang-agg-tp-1-dp-2-sglang_decode_detokenizer + - sglang-agg-tp-1-dp-2-sglang_decode_scheduler + - sglang-agg-tp-2-dp-1-decode_worker + - sglang-agg-tp-2-dp-1-decode_worker_pod + - sglang-agg-tp-2-dp-1-frontend + - sglang-agg-tp-2-dp-1-frontend_pod + - sglang-agg-tp-2-dp-1-none + - sglang-agg-tp-2-dp-1-sglang_decode_detokenizer + - sglang-agg-tp-2-dp-1-sglang_decode_scheduler + - sglang-agg-tp-4-dp-1-decode_worker + - sglang-agg-tp-4-dp-1-decode_worker_pod + - sglang-agg-tp-4-dp-1-frontend + - sglang-agg-tp-4-dp-1-frontend_pod + - 
sglang-agg-tp-4-dp-1-none + - sglang-agg-tp-4-dp-1-sglang_decode_detokenizer + - sglang-agg-tp-4-dp-1-sglang_decode_scheduler + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-decode_worker + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-decode_worker_pod + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-frontend + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-frontend_pod + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-none + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-prefill_worker + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-prefill_worker_pod + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-sglang_decode_detokenizer + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-sglang_decode_scheduler + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-sglang_prefill_detokenizer + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-1-sglang_prefill_scheduler + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-frontend + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-frontend_pod + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-none + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-prefill_worker + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-prefill_worker_pod + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-sglang_decode_detokenizer + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-sglang_decode_scheduler + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-sglang_prefill_detokenizer + - sglang-disagg-prefill-tp-1-decode-tp-1-dp-2-sglang_prefill_scheduler + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-decode_worker + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-decode_worker_pod + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-frontend + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-frontend_pod + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-none + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-prefill_worker + - 
sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-prefill_worker_pod + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-sglang_decode_detokenizer + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-sglang_decode_scheduler + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-sglang_prefill_detokenizer + - sglang-disagg-prefill-tp-2-decode-tp-2-dp-1-sglang_prefill_scheduler + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-decode_worker + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-decode_worker_pod + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-frontend + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-frontend_pod + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-none + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-prefill_worker + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-prefill_worker_pod + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-sglang_decode_detokenizer + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-sglang_decode_scheduler + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-sglang_prefill_detokenizer + - sglang-disagg-prefill-tp-4-decode-tp-4-dp-1-sglang_prefill_scheduler + # TensorRT-LLM scenarios + - trtllm-agg-tp-1-dp-1-decode_worker + - trtllm-agg-tp-1-dp-1-decode_worker_pod + - trtllm-agg-tp-1-dp-1-frontend + - trtllm-agg-tp-1-dp-1-frontend_pod + - trtllm-agg-tp-1-dp-1-none + - trtllm-agg-tp-1-dp-1-trtllm_decode_engine_core + - trtllm-agg-tp-1-dp-2-decode_worker + - trtllm-agg-tp-1-dp-2-decode_worker_pod + - trtllm-agg-tp-1-dp-2-frontend + - trtllm-agg-tp-1-dp-2-frontend_pod + - trtllm-agg-tp-1-dp-2-none + - trtllm-agg-tp-1-dp-2-trtllm_decode_engine_core + - trtllm-agg-tp-2-dp-1-decode_worker + - trtllm-agg-tp-2-dp-1-decode_worker_pod + - trtllm-agg-tp-2-dp-1-frontend + - trtllm-agg-tp-2-dp-1-frontend_pod + - trtllm-agg-tp-2-dp-1-none + - trtllm-agg-tp-2-dp-1-trtllm_decode_engine_core + - trtllm-agg-tp-4-dp-1-decode_worker + - trtllm-agg-tp-4-dp-1-decode_worker_pod + - trtllm-agg-tp-4-dp-1-frontend + - trtllm-agg-tp-4-dp-1-frontend_pod + - trtllm-agg-tp-4-dp-1-none + - 
trtllm-agg-tp-4-dp-1-trtllm_decode_engine_core + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-decode_worker + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-decode_worker_pod + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-frontend + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-frontend_pod + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-none + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-prefill_worker + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-prefill_worker_pod + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-trtllm_decode_engine_core + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-1-trtllm_prefill_engine_core + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-frontend + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-frontend_pod + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-none + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-prefill_worker + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-prefill_worker_pod + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-trtllm_decode_engine_core + - trtllm-disagg-prefill-tp-1-decode-tp-1-dp-2-trtllm_prefill_engine_core + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-decode_worker + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-decode_worker_pod + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-frontend + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-frontend_pod + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-none + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-prefill_worker + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-prefill_worker_pod + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-trtllm_decode_engine_core + - trtllm-disagg-prefill-tp-2-decode-tp-2-dp-1-trtllm_prefill_engine_core + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-decode_worker + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-decode_worker_pod + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-frontend + - 
trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-frontend_pod + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-none + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-prefill_worker + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-prefill_worker_pod + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-trtllm_decode_engine_core + - trtllm-disagg-prefill-tp-4-decode-tp-4-dp-1-trtllm_prefill_engine_core + # vLLM scenarios + - vllm-agg-tp-1-dp-1-decode_worker + - vllm-agg-tp-1-dp-1-decode_worker_pod + - vllm-agg-tp-1-dp-1-frontend + - vllm-agg-tp-1-dp-1-frontend_pod + - vllm-agg-tp-1-dp-1-none + - vllm-agg-tp-1-dp-1-vllm_decode_engine_core + - vllm-agg-tp-1-dp-2-decode_worker + - vllm-agg-tp-1-dp-2-decode_worker_pod + - vllm-agg-tp-1-dp-2-frontend + - vllm-agg-tp-1-dp-2-frontend_pod + - vllm-agg-tp-1-dp-2-none + - vllm-agg-tp-1-dp-2-vllm_decode_engine_core + - vllm-agg-tp-2-dp-1-decode_worker + - vllm-agg-tp-2-dp-1-decode_worker_pod + - vllm-agg-tp-2-dp-1-frontend + - vllm-agg-tp-2-dp-1-frontend_pod + - vllm-agg-tp-2-dp-1-none + - vllm-agg-tp-2-dp-1-vllm_decode_engine_core + - vllm-agg-tp-4-dp-1-decode_worker + - vllm-agg-tp-4-dp-1-decode_worker_pod + - vllm-agg-tp-4-dp-1-frontend + - vllm-agg-tp-4-dp-1-frontend_pod + - vllm-agg-tp-4-dp-1-none + - vllm-agg-tp-4-dp-1-vllm_decode_engine_core + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-decode_worker + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-decode_worker_pod + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-frontend + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-frontend_pod + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-none + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-prefill_worker + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-prefill_worker_pod + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-vllm_decode_engine_core + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-1-vllm_prefill_engine_core + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-decode_worker_pod + - 
vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-frontend + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-frontend_pod + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-none + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-prefill_worker + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-prefill_worker_pod + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-vllm_decode_engine_core + - vllm-disagg-prefill-tp-1-decode-tp-1-dp-2-vllm_prefill_engine_core + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-decode_worker + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-decode_worker_pod + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-frontend + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-frontend_pod + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-none + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-prefill_worker + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-prefill_worker_pod + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-vllm_decode_engine_core + - vllm-disagg-prefill-tp-2-decode-tp-2-dp-1-vllm_prefill_engine_core + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-decode_worker + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-decode_worker_pod + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-frontend + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-frontend_pod + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-none + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-prefill_worker + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-prefill_worker_pod + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-vllm_decode_engine_core + - vllm-disagg-prefill-tp-4-decode-tp-4-dp-1-vllm_prefill_engine_core + name: deploy-test-fault-tolerance (${{ matrix.test_scenario }}) + env: + DYNAMO_INGRESS_SUFFIX: dev.aire.nvidia.com + steps: + - name: Checkout code + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 + + - name: Install awscli + shell: bash + run: | + curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + + - name: Login to ECR + shell: bash + env: + 
ECR_HOSTNAME: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_DEFAULT_REGION }}.amazonaws.com + run: | + aws ecr get-login-password --region ${{ secrets.AWS_DEFAULT_REGION }} | docker login --username AWS --password-stdin ${ECR_HOSTNAME} + + - name: Set namespace and install dependencies + run: | + # Extract framework from test scenario for unique namespace + FRAMEWORK=$(echo "${{ matrix.test_scenario }}" | cut -d'-' -f1) + # Namespace is unique per run_id + framework (NOT per matrix scenario: all scenarios of one framework in this run share it) + NAMESPACE="gh-weekly-${{ github.run_id }}-ft-${FRAMEWORK}" + # Values written to GITHUB_ENV only become visible in *subsequent* steps, so also export + # NAMESPACE into this shell for the kubectl set-context call later in this same step. + export NAMESPACE + echo "NAMESPACE=${NAMESPACE}" >> $GITHUB_ENV + set -x + # Install dependencies + sudo apt-get update && sudo apt-get install -y curl bash openssl gettext git jq python3 python3-pip python3-venv + + # Install yq + echo "Installing yq..." + curl -L https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o yq + sudo chmod 755 yq + sudo mv yq /usr/local/bin/ + # Install Helm + echo "Installing Helm..." + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 + sudo chmod 700 get_helm.sh + sudo ./get_helm.sh + # Install kubectl + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + sudo chmod 755 kubectl + sudo mv kubectl /usr/local/bin/ + + # Setup kubeconfig + echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > .kubeconfig + chmod 600 .kubeconfig + export KUBECONFIG=$(pwd)/.kubeconfig + kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}" + kubectl config current-context + + - name: Deploy Operator + run: | + set -x + export KUBECONFIG=$(pwd)/.kubeconfig + + # Create a namespace for this job + echo "Creating an ephemeral namespace..." 
+ # NOTE(review): all scenarios of the same framework in this run share this namespace; deleting/recreating it here can race with sibling matrix jobs — confirm intended + kubectl delete namespace $NAMESPACE || true + kubectl create namespace $NAMESPACE || true + echo "Attaching the labels for secrets and cleanup" + kubectl label namespaces ${NAMESPACE} nscleanup/enabled=true nscleanup/ttl=7200 gitlab-imagepull=enabled ngc-api=enabled nvcr-imagepull=enabled --overwrite=true + + # Set the namespace as default + kubectl config set-context --current --namespace=$NAMESPACE + + # Check if Istio is installed + kubectl get pods -n istio-system + # Check if default storage class exists + kubectl get storageclass + + # Install Helm chart + # NOTE(review): 'cat build.env' dumps the whole file into IMAGE_TAG; if build.env holds KEY=value lines this is not a usable tag, and IMAGE_TAG is not referenced by the helm command below (image tags come from github.sha) — confirm intent + export IMAGE_TAG=$(cat build.env) + echo $IMAGE_TAG + # NOTE(review): VIRTUAL_ENV / KUBE_NS / ISTIO_* / DYNAMO_CLOUD exports are not consumed by the helm command in this step — presumably read by tooling not visible here; verify + export VIRTUAL_ENV=/opt/dynamo/venv + export KUBE_NS=$NAMESPACE + export ISTIO_ENABLED=true + export ISTIO_GATEWAY=istio-system/ingress-alb + export VIRTUAL_SERVICE_SUPPORTS_HTTPS=true + export DYNAMO_CLOUD=https://${NAMESPACE}.${DYNAMO_INGRESS_SUFFIX} + + # Install dynamo env secrets + kubectl create secret generic hf-token-secret --from-literal=HF_TOKEN=${{ secrets.HF_TOKEN }} -n $KUBE_NS || true + # Create docker pull secret for operator image + kubectl create secret docker-registry docker-imagepullsecret --docker-server=${{ secrets.AZURE_ACR_HOSTNAME }} --docker-username=${{ secrets.AZURE_ACR_USER }} --docker-password=${{ secrets.AZURE_ACR_PASSWORD }} --namespace=${NAMESPACE} + # Install helm dependencies + helm repo add bitnami https://charts.bitnami.com/bitnami + cd deploy/cloud/helm/platform/ + helm dep build . + # Install platform with namespace restriction for single profile testing + helm upgrade --install dynamo-platform . 
--namespace ${NAMESPACE} \ + --set dynamo-operator.namespaceRestriction.enabled=true \ + --set dynamo-operator.namespaceRestriction.allowedNamespaces[0]=${NAMESPACE} \ + --set dynamo-operator.controllerManager.manager.image.repository=${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo \ + --set dynamo-operator.controllerManager.manager.image.tag=${{ github.sha }}-operator-amd64 \ + --set dynamo-operator.imagePullSecrets[0].name=docker-imagepullsecret + # Wait for all deployments to be ready + timeout 300s kubectl rollout status deployment -n $NAMESPACE --watch + cd - + + export KUBECONFIG=$(pwd)/.kubeconfig + kubectl config set-context --current --namespace=$NAMESPACE + + - name: Run Fault Tolerance Tests + run: | + set -x + export KUBECONFIG=$(pwd)/.kubeconfig + export NAMESPACE=$NAMESPACE + + # Extract framework from test scenario + FRAMEWORK=$(echo "${{ matrix.test_scenario }}" | cut -d'-' -f1) + export IMAGE="${{ secrets.AZURE_ACR_HOSTNAME }}/ai-dynamo/dynamo:${{ github.sha }}-${FRAMEWORK}-amd64" + + # Set up Python virtual environment and install dependencies + python3 -m venv venv + source venv/bin/activate + pip install --upgrade pip + + # Install core dependencies needed for tests (without full project install) + pip install -r container/deps/requirements.test.txt + pip install kubernetes==32.0.1 kubernetes_asyncio kr8s pyyaml requests tabulate pydantic + + # Add project source to PYTHONPATH for test imports + export PYTHONPATH=$(pwd):$(pwd)/components/src:$PYTHONPATH + + echo "Running weekly fault tolerance test: ${{ matrix.test_scenario }}" + echo "Using namespace: $NAMESPACE" + echo "Using image: $IMAGE" + + # Run the pytest command + pytest tests/fault_tolerance/deploy/test_deployment.py::test_fault_scenario[${{ matrix.test_scenario }}] \ + -s -v \ + --namespace ${NAMESPACE} \ + --image ${IMAGE} \ + --client-type legacy + + - name: Cleanup + if: always() + timeout-minutes: 5 + run: | + echo "${{ secrets.AZURE_AKS_CI_KUBECONFIG_B64 }}" | base64 -d > 
.kubeconfig + chmod 600 .kubeconfig + export KUBECONFIG=$(pwd)/.kubeconfig + kubectl config set-context --current --namespace=$NAMESPACE --kubeconfig "${KUBECONFIG}" + + # For debugging purposes, list all the resources before we uninstall + kubectl get all + + echo "Deleting all DynamoGraphDeployments in namespace $NAMESPACE..." + kubectl delete dynamographdeployments --all -n $NAMESPACE || true + + # Uninstall the helm chart + helm ls + helm uninstall dynamo-platform || true + + echo "Namespace $NAMESPACE deletion initiated, proceeding with cleanup..." + # NOTE(review): this namespace is shared by every scenario of the same framework in this run; deleting it here can tear down deployments of sibling matrix jobs that are still running — confirm + kubectl delete namespace $NAMESPACE || true + echo "Namespace $NAMESPACE completed." + + # Status check job to verify all tests passed + weekly-status-check: + runs-on: ubuntu-latest + needs: [deploy-test-fault-tolerance] + if: always() + steps: + - name: Check all tests passed + run: | + # jq -e exits non-zero unless every needed job result is "success" or "skipped", so this step fails on any failed/cancelled test job + echo '${{ toJson(needs) }}' | jq -e 'to_entries | map(.value.result) | all(. as $result | ["success", "skipped"] | any($result == .))' + + - name: Send notification on failure + if: failure() + run: | + echo "Weekly fault tolerance tests failed!" + echo "Workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + # Add notification logic here (e.g., Slack, email, etc.) +