From 28b94fdc9cf2e55a49764bcefaa66b66237de8b5 Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Thu, 23 Oct 2025 16:50:38 -0700 Subject: [PATCH 1/8] fix test uploading to not overwrite itself --- .github/actions/pytest/action.yml | 19 ++++++++++++++----- .../upload_complete_workflow_metrics.py | 3 ++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index 0037129a5e..0307dacc61 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -84,17 +84,24 @@ runs: ERROR_TESTS=$(grep -o 'errors="[0-9]*"' "$JUNIT_FILE" | grep -o '[0-9]*' | head -1 || echo "0") echo "📊 ${TOTAL_TESTS} tests completed (${FAILED_TESTS} failed, ${ERROR_TESTS} errors)" - # Create metadata file with step context information - METADATA_FILE="test-results/test_metadata.json" + # Create uniquely named metadata file with step context information + # Use framework-testtype-arch to make it unique per test run + METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.json" + JUNIT_UNIQUE_NAME="pytest_test_report_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.xml" + + # Rename XML file to unique name + mv "$JUNIT_FILE" "test-results/$JUNIT_UNIQUE_NAME" + echo '{' > "$METADATA_FILE" echo ' "job_name": "${{ github.job }}",' >> "$METADATA_FILE" echo ' "framework": "${{ inputs.framework }}",' >> "$METADATA_FILE" echo ' "test_type": "${{ inputs.test_type }}",' >> "$METADATA_FILE" echo ' "platform_arch": "${{ inputs.platform_arch }}",' >> "$METADATA_FILE" - echo ' "junit_xml_file": "pytest_test_report.xml",' >> "$METADATA_FILE" + echo ' "junit_xml_file": "'"$JUNIT_UNIQUE_NAME"'",' >> "$METADATA_FILE" echo ' "step_name": "Run ${{ inputs.test_type }} tests"' >> "$METADATA_FILE" echo '}' >> "$METADATA_FILE" - echo "📝 Created test metadata file" + echo "📝 Created test metadata file: $METADATA_FILE" + 
echo "📝 Renamed XML file to: $JUNIT_UNIQUE_NAME" else echo "⚠️ JUnit XML file not found - test results may not be available for upload" TOTAL_TESTS=0 @@ -110,5 +117,7 @@ runs: if: always() # Always upload test results, even if tests failed with: name: test-results-${{ inputs.framework }}-${{ inputs.test_type }}-${{ env.PLATFORM_ARCH }} - path: test-results/${{ env.PYTEST_XML_FILE }} + path: | + test-results/pytest_test_report_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.xml + test-results/test_metadata_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.json retention-days: 7 \ No newline at end of file diff --git a/.github/workflows/upload_complete_workflow_metrics.py b/.github/workflows/upload_complete_workflow_metrics.py index 1c0cc57fc5..f98ec68bd3 100644 --- a/.github/workflows/upload_complete_workflow_metrics.py +++ b/.github/workflows/upload_complete_workflow_metrics.py @@ -834,7 +834,8 @@ def _upload_test_metrics(self, job_data: Dict[str, Any]) -> None: return # Look for metadata files to get accurate step and framework info - metadata_files = glob.glob(f"{test_results_dir}/test_metadata.json") + # Updated pattern to match new unique naming: test_metadata_<framework>_<test_type>_<arch>.json + metadata_files = glob.glob(f"{test_results_dir}/test_metadata_*.json") if not metadata_files: print(f"⚠️ No test metadata files found in {test_results_dir}") From e4cd9d63a3df54422a4d7d86d505e1433c7ccdbf Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Thu, 23 Oct 2025 16:54:18 -0700 Subject: [PATCH 2/8] sanitize test type --- .github/actions/pytest/action.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index 0307dacc61..3a35b26181 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -86,8 +86,10 @@ runs: # Create uniquely named metadata file with step context information # Use 
framework-testtype-arch to make it unique per test run - METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.json" - JUNIT_UNIQUE_NAME="pytest_test_report_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.xml" + # Sanitize test_type to remove commas and spaces for safe filenames + TEST_TYPE_SAFE=$(echo "${{ inputs.test_type }}" | tr ', ' '_') + METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${TEST_TYPE_SAFE}_${{ inputs.platform_arch }}.json" + JUNIT_UNIQUE_NAME="pytest_test_report_${{ inputs.framework }}_${TEST_TYPE_SAFE}_${{ inputs.platform_arch }}.xml" # Rename XML file to unique name mv "$JUNIT_FILE" "test-results/$JUNIT_UNIQUE_NAME" @@ -117,7 +119,5 @@ runs: if: always() # Always upload test results, even if tests failed with: name: test-results-${{ inputs.framework }}-${{ inputs.test_type }}-${{ env.PLATFORM_ARCH }} - path: | - test-results/pytest_test_report_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.xml - test-results/test_metadata_${{ inputs.framework }}_${{ inputs.test_type }}_${{ inputs.platform_arch }}.json + path: test-results/ retention-days: 7 \ No newline at end of file From aec658cd5b45cfdfb16f4579a6fa3804fa6dc84f Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Thu, 23 Oct 2025 17:57:23 -0700 Subject: [PATCH 3/8] fix uploads --- .github/actions/pytest/action.yml | 12 ++++++--- .../upload_complete_workflow_metrics.py | 27 ++++++++++++++++--- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index 3a35b26181..acd2162f0b 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -73,6 +73,10 @@ runs: shell: bash run: | + # Sanitize test_type for filenames (always set this for artifact upload) + TEST_TYPE_SAFE=$(echo "${{ inputs.test_type }}" | tr ', ' '_') + echo "TEST_TYPE_SAFE=${TEST_TYPE_SAFE}" 
>> $GITHUB_ENV + # Check for JUnit XML file and determine test status JUNIT_FILE="test-results/pytest_test_report.xml" @@ -86,8 +90,6 @@ runs: # Create uniquely named metadata file with step context information # Use framework-testtype-arch to make it unique per test run - # Sanitize test_type to remove commas and spaces for safe filenames - TEST_TYPE_SAFE=$(echo "${{ inputs.test_type }}" | tr ', ' '_') METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${TEST_TYPE_SAFE}_${{ inputs.platform_arch }}.json" JUNIT_UNIQUE_NAME="pytest_test_report_${{ inputs.framework }}_${TEST_TYPE_SAFE}_${{ inputs.platform_arch }}.xml" @@ -118,6 +120,8 @@ runs: uses: actions/upload-artifact@v4 if: always() # Always upload test results, even if tests failed with: - name: test-results-${{ inputs.framework }}-${{ inputs.test_type }}-${{ env.PLATFORM_ARCH }} - path: test-results/ + name: test-results-${{ inputs.framework }}-${{ env.TEST_TYPE_SAFE }}-${{ env.PLATFORM_ARCH }} + path: | + test-results/pytest_test_report_${{ inputs.framework }}_${{ env.TEST_TYPE_SAFE }}_${{ inputs.platform_arch }}.xml + test-results/test_metadata_${{ inputs.framework }}_${{ env.TEST_TYPE_SAFE }}_${{ inputs.platform_arch }}.json retention-days: 7 \ No newline at end of file diff --git a/.github/workflows/upload_complete_workflow_metrics.py b/.github/workflows/upload_complete_workflow_metrics.py index f98ec68bd3..d3791693cd 100644 --- a/.github/workflows/upload_complete_workflow_metrics.py +++ b/.github/workflows/upload_complete_workflow_metrics.py @@ -827,6 +827,22 @@ def _upload_test_metrics(self, job_data: Dict[str, Any]) -> None: print(f"🧪 Looking for test results for job '{job_name}'") + # Determine framework from job name to filter metadata files + framework = None + job_name_lower = job_name.lower() + if "vllm" in job_name_lower: + framework = "vllm" + elif "sglang" in job_name_lower: + framework = "sglang" + elif "trtllm" in job_name_lower: + framework = "trtllm" + + if not framework: + 
print(f"⚠️ Could not determine framework from job name: {job_name}") + return + + print(f"📦 Job framework: {framework}") + # Look for test results directory test_results_dir = "test-results" if not os.path.exists(test_results_dir): @@ -835,13 +851,18 @@ def _upload_test_metrics(self, job_data: Dict[str, Any]) -> None: # Look for metadata files to get accurate step and framework info # Updated pattern to match new unique naming: test_metadata_<framework>_<test_type>_<arch>.json - metadata_files = glob.glob(f"{test_results_dir}/test_metadata_*.json") + # Filter by framework to only process this job's tests + metadata_files = glob.glob( + f"{test_results_dir}/test_metadata_{framework}_*.json" + ) if not metadata_files: - print(f"⚠️ No test metadata files found in {test_results_dir}") + print( + f"⚠️ No test metadata files found for framework '{framework}' in {test_results_dir}" + ) return - print(f"📄 Found {len(metadata_files)} test metadata files") + print(f"📄 Found {len(metadata_files)} test metadata files for {framework}") total_tests_processed = 0 From 4682d3de55c12988e71e45d25e1ab024a9c928b5 Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Fri, 24 Oct 2025 10:17:57 -0700 Subject: [PATCH 4/8] filter for platform arch --- .../upload_complete_workflow_metrics.py | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/.github/workflows/upload_complete_workflow_metrics.py b/.github/workflows/upload_complete_workflow_metrics.py index d3791693cd..d1d956592c 100644 --- a/.github/workflows/upload_complete_workflow_metrics.py +++ b/.github/workflows/upload_complete_workflow_metrics.py @@ -841,7 +841,23 @@ def _upload_test_metrics(self, job_data: Dict[str, Any]) -> None: print(f"⚠️ Could not determine framework from job name: {job_name}") return - print(f"📦 Job framework: {framework}") + # Determine platform architecture from job name + # Job names typically look like: "vllm (amd64)" or "sglang (arm64)" + platform_arch = None + if "(amd64)" 
in job_name_lower or "amd64" in job_name_lower: + platform_arch = "amd64" + elif "(arm64)" in job_name_lower or "arm64" in job_name_lower: + platform_arch = "arm64" + + if not platform_arch: + print( + f"⚠️ Could not determine platform architecture from job name: {job_name}" + ) + # Default to amd64 if not specified + platform_arch = "amd64" + print(f" Defaulting to platform_arch: {platform_arch}") + + print(f"📦 Job framework: {framework}, platform_arch: {platform_arch}") # Look for test results directory test_results_dir = "test-results" @@ -851,18 +867,20 @@ def _upload_test_metrics(self, job_data: Dict[str, Any]) -> None: # Look for metadata files to get accurate step and framework info # Updated pattern to match new unique naming: test_metadata_<framework>_<test_type>_<arch>.json - # Filter by framework to only process this job's tests + # Filter by both framework AND architecture to only process this job's tests metadata_files = glob.glob( - f"{test_results_dir}/test_metadata_{framework}_*.json" + f"{test_results_dir}/test_metadata_{framework}_*_{platform_arch}.json" ) if not metadata_files: print( - f"⚠️ No test metadata files found for framework '{framework}' in {test_results_dir}" + f"⚠️ No test metadata files found for framework '{framework}' with arch '{platform_arch}' in {test_results_dir}" ) return - print(f"📄 Found {len(metadata_files)} test metadata files for {framework}") + print( + f"📄 Found {len(metadata_files)} test metadata files for {framework} ({platform_arch})" + ) total_tests_processed = 0 From f6e04c67e6fdc537cfecc4fe0b86cfff6444f228 Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Fri, 24 Oct 2025 11:23:19 -0700 Subject: [PATCH 5/8] add arch for sglang --- .../container-validation-backends.yml | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/workflows/container-validation-backends.yml b/.github/workflows/container-validation-backends.yml index b197838242..c51c8258f1 100644 --- 
a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -174,14 +174,18 @@ jobs: sglang: needs: changed-files if: needs.changed-files.outputs.has_code_changes == 'true' - strategy: - fail-fast: false - matrix: - platform: - - { arch: amd64, runner: gpu-l40-amd64 } - - { arch: arm64, runner: cpu-arm-r8g-4xlarge } - name: sglang (${{ matrix.platform.arch }}) - runs-on: ${{ matrix.platform.runner }} + # OPS-1140: Uncomment this for sglang arm switch to wideep + # strategy: + # fail-fast: false + # matrix: + # platform: + # - { arch: amd64, runner: gpu-l40-amd64 } + # - { arch: arm64, runner: cpu-arm-r8g-4xlarge } + # name: sglang (${{ matrix.platform.arch }}) + # runs-on: ${{ matrix.platform.runner }} + # OPS-1140: Remove these lines when matrix is enabled, replaced with the above lines + name: sglang (amd64) + runs-on: gpu-l40-amd64 steps: - name: Checkout repository uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 @@ -223,7 +227,8 @@ jobs: pytest_marks: "unit and sglang and gpu_1" framework: "sglang" test_type: "unit" - platform_arch: ${{ matrix.platform.arch }} + platform_arch: amd64 + # OPS-1140: Replace above with ${{ matrix.platform.arch }} when matrix is enabled - name: Run e2e tests if: ${{ matrix.platform.arch != 'arm64' }} uses: ./.github/actions/pytest @@ -232,7 +237,8 @@ jobs: pytest_marks: "e2e and sglang and gpu_1" framework: "sglang" test_type: "e2e, gpu_1" - platform_arch: ${{ matrix.platform.arch }} + platform_arch: amd64 + # OPS-1140: Replace above with ${{ matrix.platform.arch }} when matrix is enabled trtllm: needs: changed-files From 24463d045912704c32badcdefa3f1bcf1ab77694 Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Wed, 29 Oct 2025 12:24:33 -0700 Subject: [PATCH 6/8] fix rebase --- .../container-validation-backends.yml | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/.github/workflows/container-validation-backends.yml 
b/.github/workflows/container-validation-backends.yml index c51c8258f1..b197838242 100644 --- a/.github/workflows/container-validation-backends.yml +++ b/.github/workflows/container-validation-backends.yml @@ -174,18 +174,14 @@ jobs: sglang: needs: changed-files if: needs.changed-files.outputs.has_code_changes == 'true' - # OPS-1140: Uncomment this for sglang arm switch to wideep - # strategy: - # fail-fast: false - # matrix: - # platform: - # - { arch: amd64, runner: gpu-l40-amd64 } - # - { arch: arm64, runner: cpu-arm-r8g-4xlarge } - # name: sglang (${{ matrix.platform.arch }}) - # runs-on: ${{ matrix.platform.runner }} - # OPS-1140: Remove these lines when matrix is enabled, replaced with the above lines - name: sglang (amd64) - runs-on: gpu-l40-amd64 + strategy: + fail-fast: false + matrix: + platform: + - { arch: amd64, runner: gpu-l40-amd64 } + - { arch: arm64, runner: cpu-arm-r8g-4xlarge } + name: sglang (${{ matrix.platform.arch }}) + runs-on: ${{ matrix.platform.runner }} steps: - name: Checkout repository uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 @@ -227,8 +223,7 @@ jobs: pytest_marks: "unit and sglang and gpu_1" framework: "sglang" test_type: "unit" - platform_arch: amd64 - # OPS-1140: Replace above with ${{ matrix.platform.arch }} when matrix is enabled + platform_arch: ${{ matrix.platform.arch }} - name: Run e2e tests if: ${{ matrix.platform.arch != 'arm64' }} uses: ./.github/actions/pytest @@ -237,8 +232,7 @@ jobs: pytest_marks: "e2e and sglang and gpu_1" framework: "sglang" test_type: "e2e, gpu_1" - platform_arch: amd64 - # OPS-1140: Replace above with ${{ matrix.platform.arch }} when matrix is enabled + platform_arch: ${{ matrix.platform.arch }} trtllm: needs: changed-files From 8433432eadd6f8a22beb038189a80d03891b10d6 Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Wed, 29 Oct 2025 14:46:04 -0700 Subject: [PATCH 7/8] skip deploy job xml checks --- .github/workflows/upload_complete_workflow_metrics.py | 5 +++++ 1 
file changed, 5 insertions(+) diff --git a/.github/workflows/upload_complete_workflow_metrics.py b/.github/workflows/upload_complete_workflow_metrics.py index d1d956592c..99d110bfa1 100644 --- a/.github/workflows/upload_complete_workflow_metrics.py +++ b/.github/workflows/upload_complete_workflow_metrics.py @@ -825,6 +825,11 @@ def _upload_test_metrics(self, job_data: Dict[str, Any]) -> None: job_name = job_data.get("name", "") job_id = str(job_data["id"]) + # Skip deployment test jobs (No pytest metadata files are created) + if job_name.lower().startswith("deploy"): + print(f"⏭️ Skipping test metrics for deployment job '{job_name}'") + return + print(f"🧪 Looking for test results for job '{job_name}'") # Determine framework from job name to filter metadata files From d19e4120e0a5af9dc60e40ceaa90703be5e9cfa2 Mon Sep 17 00:00:00 2001 From: Nate Mailhot Date: Thu, 30 Oct 2025 10:48:59 -0700 Subject: [PATCH 8/8] fix var names --- .github/actions/pytest/action.yml | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/actions/pytest/action.yml b/.github/actions/pytest/action.yml index acd2162f0b..af838d4af3 100644 --- a/.github/actions/pytest/action.yml +++ b/.github/actions/pytest/action.yml @@ -74,8 +74,9 @@ runs: run: | # Sanitize test_type for filenames (always set this for artifact upload) - TEST_TYPE_SAFE=$(echo "${{ inputs.test_type }}" | tr ', ' '_') - echo "TEST_TYPE_SAFE=${TEST_TYPE_SAFE}" >> $GITHUB_ENV + # Remove commas and spaces from test_type for use in filenames + STR_TEST_TYPE=$(echo "${{ inputs.test_type }}" | tr ', ' '_') + echo "STR_TEST_TYPE=${STR_TEST_TYPE}" >> $GITHUB_ENV # Check for JUnit XML file and determine test status JUNIT_FILE="test-results/pytest_test_report.xml" @@ -90,22 +91,22 @@ runs: # Create uniquely named metadata file with step context information # Use framework-testtype-arch to make it unique per test run - METADATA_FILE="test-results/test_metadata_${{ inputs.framework 
}}_${TEST_TYPE_SAFE}_${{ inputs.platform_arch }}.json" - JUNIT_UNIQUE_NAME="pytest_test_report_${{ inputs.framework }}_${TEST_TYPE_SAFE}_${{ inputs.platform_arch }}.xml" + METADATA_FILE="test-results/test_metadata_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.json" + JUNIT_NAME="pytest_test_report_${{ inputs.framework }}_${STR_TEST_TYPE}_${{ inputs.platform_arch }}.xml" # Rename XML file to unique name - mv "$JUNIT_FILE" "test-results/$JUNIT_UNIQUE_NAME" + mv "$JUNIT_FILE" "test-results/$JUNIT_NAME" echo '{' > "$METADATA_FILE" echo ' "job_name": "${{ github.job }}",' >> "$METADATA_FILE" echo ' "framework": "${{ inputs.framework }}",' >> "$METADATA_FILE" echo ' "test_type": "${{ inputs.test_type }}",' >> "$METADATA_FILE" echo ' "platform_arch": "${{ inputs.platform_arch }}",' >> "$METADATA_FILE" - echo ' "junit_xml_file": "'"$JUNIT_UNIQUE_NAME"'",' >> "$METADATA_FILE" + echo ' "junit_xml_file": "'"$JUNIT_NAME"'",' >> "$METADATA_FILE" echo ' "step_name": "Run ${{ inputs.test_type }} tests"' >> "$METADATA_FILE" echo '}' >> "$METADATA_FILE" echo "📝 Created test metadata file: $METADATA_FILE" - echo "📝 Renamed XML file to: $JUNIT_UNIQUE_NAME" + echo "📝 Renamed XML file to: $JUNIT_NAME" else echo "⚠️ JUnit XML file not found - test results may not be available for upload" TOTAL_TESTS=0 @@ -120,8 +121,8 @@ runs: uses: actions/upload-artifact@v4 if: always() # Always upload test results, even if tests failed with: - name: test-results-${{ inputs.framework }}-${{ env.TEST_TYPE_SAFE }}-${{ env.PLATFORM_ARCH }} + name: test-results-${{ inputs.framework }}-${{ env.STR_TEST_TYPE }}-${{ env.PLATFORM_ARCH }} path: | - test-results/pytest_test_report_${{ inputs.framework }}_${{ env.TEST_TYPE_SAFE }}_${{ inputs.platform_arch }}.xml - test-results/test_metadata_${{ inputs.framework }}_${{ env.TEST_TYPE_SAFE }}_${{ inputs.platform_arch }}.json + test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ 
inputs.platform_arch }}.xml + test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json retention-days: 7 \ No newline at end of file