From 7f66e40731c57e7f3bbeb14f9792c0ae9177ed41 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 15:17:43 +0200 Subject: [PATCH 1/4] ci: Reorder records To improve readability --- .github/workflows/_ci.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 589a42d3b..92b4f6413 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -117,6 +117,16 @@ jobs: DOCKERFILE: .github/container/Dockerfile.t5x.${{ inputs.ARCHITECTURE }} secrets: inherit + build-rosetta-t5x: + needs: build-upstream-t5x + uses: ./.github/workflows/_build_rosetta.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} + BASE_LIBRARY: t5x + secrets: inherit + build-upstream-pax: needs: build-jax uses: ./.github/workflows/_build.yaml @@ -130,16 +140,6 @@ jobs: DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }} secrets: inherit - build-rosetta-t5x: - needs: build-upstream-t5x - uses: ./.github/workflows/_build_rosetta.yaml - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} - BASE_LIBRARY: t5x - secrets: inherit - build-rosetta-pax: needs: build-upstream-pax uses: ./.github/workflows/_build_rosetta.yaml From 778a8569ff23becb626df8ea461fd4137f9d5d16 Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 15:22:44 +0200 Subject: [PATCH 2/4] refactor: Rename rosetta tests For easier readability --- .github/workflows/_ci.yaml | 6 +++--- .../{_test_pax_rosetta.yaml => _test_rosetta_pax.yaml} | 0 .../{_test_t5x_rosetta.yaml => _test_rosetta_t5x.yaml} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename .github/workflows/{_test_pax_rosetta.yaml => _test_rosetta_pax.yaml} (100%) rename 
.github/workflows/{_test_t5x_rosetta.yaml => _test_rosetta_t5x.yaml} (100%) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 92b4f6413..1eb6c3d3c 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -126,7 +126,7 @@ jobs: BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }} BASE_LIBRARY: t5x secrets: inherit - + build-upstream-pax: needs: build-jax uses: ./.github/workflows/_build.yaml @@ -310,7 +310,7 @@ jobs: test-rosetta-t5x: needs: build-rosetta-t5x if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_t5x_rosetta.yaml + uses: ./.github/workflows/_test_rosetta_t5x.yaml with: T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }} secrets: inherit @@ -433,7 +433,7 @@ jobs: test-rosetta-pax: needs: build-rosetta-pax if: inputs.ARCHITECTURE == 'amd64' # no images for arm64 - uses: ./.github/workflows/_test_pax_rosetta.yaml + uses: ./.github/workflows/_test_rosetta_pax.yaml with: PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }} secrets: inherit diff --git a/.github/workflows/_test_pax_rosetta.yaml b/.github/workflows/_test_rosetta_pax.yaml similarity index 100% rename from .github/workflows/_test_pax_rosetta.yaml rename to .github/workflows/_test_rosetta_pax.yaml diff --git a/.github/workflows/_test_t5x_rosetta.yaml b/.github/workflows/_test_rosetta_t5x.yaml similarity index 100% rename from .github/workflows/_test_t5x_rosetta.yaml rename to .github/workflows/_test_rosetta_t5x.yaml From 82d00dd1e2e1e20aa29d108bb50a154e4242739b Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 15:27:48 +0200 Subject: [PATCH 3/4] ci: Restore rosetta-t5x unit tests --- .github/workflows/_test_rosetta.yaml | 97 ----------------- .github/workflows/_test_rosetta_t5x.yaml | 131 +++++++++++++++++++---- 2 files changed, 109 insertions(+), 119 deletions(-) delete mode 100644 .github/workflows/_test_rosetta.yaml diff --git 
a/.github/workflows/_test_rosetta.yaml b/.github/workflows/_test_rosetta.yaml deleted file mode 100644 index 017662ea3..000000000 --- a/.github/workflows/_test_rosetta.yaml +++ /dev/null @@ -1,97 +0,0 @@ -name: ~test Rosetta - -on: - workflow_call: - inputs: - ROSETTA_IMAGE: - type: string - description: 'Rosetta image build by NVIDIA/JAX-Toolbox' - required: true - default: 'ghcr.io/nvidia/t5x:latest' - outputs: - TEST_ARTIFACT_NAME: - description: 'Name of the unit test artifact for downstream workflows' - value: ${{ jobs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }} - TEST_STATUS: - description: 'Summary pass/fail value indicating if results from tests are acceptable' - value: ${{ jobs.publish-test.outputs.STATUS }} - -env: - TEST_ARTIFACT_NAME: rosetta-test-logs - TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl - -jobs: - rosetta-unit-tests: - runs-on: [self-hosted, V100] - outputs: - TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }} - steps: - - name: Print environment variables - run: | - env - - - name: Print GPU information - run: nvidia-smi - - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Pull Rosetta image - shell: bash -x -e {0} - run: | - docker pull ${{ inputs.ROSETTA_IMAGE }} - docker tag ${{ inputs.ROSETTA_IMAGE }} rosetta:latest - - - name: Run Rosetta tests w/ docker - shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh - run: | - ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)")) - pip install "${ROSETTA_PATH}[test]" pytest-reportlog - pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true - - - name: Upload unit test json logs - uses: actions/upload-artifact@v4 - with: - name: ${{ env.TEST_ARTIFACT_NAME }} - path: ${{ env.TEST_LOG_LOCAL_PATH }} - - publish-test: - needs: rosetta-unit-tests - uses: 
./.github/workflows/_publish_badge.yaml - if: ( always() ) - secrets: inherit - with: - ENDPOINT_FILENAME: 'rosetta-unit-test-status.json' - PUBLISH: false - SCRIPT: | - ARTIFACTS="${{ needs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl" - all_outcomes() { - cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' - } - cnt_type() { - cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l - } - SKIPPED_TESTS=$(cnt_type skipped) - FAILED_TESTS=$(cnt_type failed) - PASSED_TESTS=$(cnt_type passed) - TOTAL_TESTS=$(all_outcomes | wc -l) - echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY - all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY - if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then - BADGE_COLOR=brightgreen - echo "STATUS=success" >> $GITHUB_OUTPUT - else - echo "STATUS=failure" >> $GITHUB_OUTPUT - if [[ $PASSED_TESTS -eq 0 ]]; then - BADGE_COLOR=red - else - BADGE_COLOR=yellow - fi - fi - echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT - echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT - echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT diff --git a/.github/workflows/_test_rosetta_t5x.yaml b/.github/workflows/_test_rosetta_t5x.yaml index 7bf6cc150..f6f43d8d2 100644 --- a/.github/workflows/_test_rosetta_t5x.yaml +++ b/.github/workflows/_test_rosetta_t5x.yaml @@ -6,26 +6,26 @@ on: T5X_IMAGE: type: string description: T5X image from ghcr.io/nvidia/t5x - default: 'ghcr.io/nvidia/t5x:latest' + default: "ghcr.io/nvidia/t5x:latest" required: false BADGE_FILENAME: type: string - description: 'Name of the endpoint JSON file for shields.io badge' + description: "Name of the endpoint JSON file for shields.io badge" required: false - default: 'badge-rosetta-t5x-mgmn-test.json' + default: "badge-rosetta-t5x-mgmn-test.json" ARTIFACT_NAME: 
type: string - description: 'Name of the artifact zip file' + description: "Name of the artifact zip file" required: false - default: 'artifact-rosetta-t5x-mgmn-test' + default: "artifact-rosetta-t5x-mgmn-test" FW_NAME: type: string - description: 'Name of the framework being used' + description: "Name of the framework being used" required: false - default: 'rosetta-t5x' + default: "rosetta-t5x" outputs: TEST_STATUS: - description: 'Summary pass/fail value indicating if results from tests are acceptable' + description: "Summary pass/fail value indicating if results from tests are acceptable" value: ${{ jobs.sitrep.outputs.STATUS }} env: @@ -33,7 +33,6 @@ env: VIT_BATCH_SIZE_PER_GPU: 256 jobs: - single-process-multi-device: strategy: matrix: @@ -63,10 +62,10 @@ jobs: uses: webfactory/ssh-agent@v0.9.0 with: ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - + - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v4 - + - name: Setup SSH known hosts id: ssh-known-hosts run: | @@ -182,7 +181,7 @@ jobs: dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"} json.dump(dump, f) EOF - + - name: Generate sitrep if: success() || failure() shell: bash -x -e {0} @@ -196,7 +195,7 @@ jobs: passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) total_tests=$(ls $EXIT_STATUSES | wc -l) - + if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then badge_message='error' badge_color=red @@ -402,7 +401,7 @@ jobs: passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) failed_tests=$(jq -r '. 
| select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) total_tests=$(ls $EXIT_STATUSES | wc -l) - + if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then badge_message='error' badge_color=red @@ -429,7 +428,7 @@ jobs: color="${badge_color}" \ to_json schemaVersion label message color \ > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json - + - name: Upload training logs as artifacts uses: actions/upload-artifact@v4 with: @@ -571,7 +570,7 @@ jobs: passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) total_tests=$(ls $EXIT_STATUSES | wc -l) - + if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then badge_message='error' badge_color=red @@ -744,7 +743,7 @@ jobs: passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l) failed_tests=$(jq -r '. 
| select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l) total_tests=$(ls $EXIT_STATUSES | wc -l) - + if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then badge_message='error' badge_color=red @@ -771,7 +770,7 @@ jobs: color="${badge_color}" \ to_json schemaVersion label message color \ > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json - + - name: Upload training logs as artifacts uses: actions/upload-artifact@v4 with: @@ -779,7 +778,13 @@ jobs: path: output/* metrics: - needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node] + needs: + [ + multi-gpu-multi-node, + single-process-multi-device, + vit-single-process-multi-device, + vit-multi-gpu-multi-node, + ] runs-on: ubuntu-22.04 steps: @@ -810,7 +815,7 @@ jobs: path: | report.jsonl *_metrics.json - + sitrep: needs: metrics if: "!cancelled()" @@ -820,10 +825,16 @@ jobs: BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }} ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }} FW_NAME: ${{ inputs.FW_NAME }} - + summary: runs-on: ubuntu-22.04 - needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node] + needs: + [ + multi-gpu-multi-node, + single-process-multi-device, + vit-single-process-multi-device, + vit-multi-gpu-multi-node, + ] if: "!cancelled()" steps: - name: Generate TensorBoard query URL @@ -848,3 +859,79 @@ jobs: if [[ ${{ needs.sitrep.outputs.STATUS }} != success ]]; then exit 1 fi + + unit-tests: + runs-on: [self-hosted, V100] + env: + TEST_ARTIFACT_NAME: rosetta-test-logs + TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl + steps: + - name: Print environment variables + run: | + env + + - name: Print GPU information + run: nvidia-smi + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - 
name: Pull Rosetta image + shell: bash -x -e {0} + run: | + docker pull ${{ inputs.T5X_IMAGE }} + docker tag ${{ inputs.T5X_IMAGE }} rosetta:latest + + - name: Run Rosetta tests w/ docker + shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh + run: | + ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)")) + pip install "${ROSETTA_PATH}[test]" pytest-reportlog + pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true + + - name: Upload unit test json logs + uses: actions/upload-artifact@v4 + with: + name: ${{ env.TEST_ARTIFACT_NAME }} + path: ${{ env.TEST_LOG_LOCAL_PATH }} + + publish-test: + needs: unit-tests + uses: ./.github/workflows/_publish_badge.yaml + if: ( always() ) + secrets: inherit + with: + ENDPOINT_FILENAME: "rosetta-unit-test-status.json" + PUBLISH: false + SCRIPT: | + ARTIFACTS="rosetta-test-logs/*.jsonl"  # keep in sync with TEST_ARTIFACT_NAME of the unit-tests job + all_outcomes() { + cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome' + } + cnt_type() { + cat $ARTIFACTS | jq '. 
| select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l + } + SKIPPED_TESTS=$(cnt_type skipped) + FAILED_TESTS=$(cnt_type failed) + PASSED_TESTS=$(cnt_type passed) + TOTAL_TESTS=$(all_outcomes | wc -l) + echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY + all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY + if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then + BADGE_COLOR=brightgreen + echo "STATUS=success" >> $GITHUB_OUTPUT + else + echo "STATUS=failure" >> $GITHUB_OUTPUT + if [[ $PASSED_TESTS -eq 0 ]]; then + BADGE_COLOR=red + else + BADGE_COLOR=yellow + fi + fi + echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT + echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT + echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT From 80ccc49f8c7461425475a55c71a10c6c97ff6e4b Mon Sep 17 00:00:00 2001 From: Oliver Koenig Date: Mon, 22 Apr 2024 16:48:32 +0200 Subject: [PATCH 4/4] style: Use single quotes --- .github/workflows/_test_rosetta_t5x.yaml | 66 ++++++++++++------------ .github/workflows/ci.yaml | 36 ++++++------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git a/.github/workflows/_test_rosetta_t5x.yaml b/.github/workflows/_test_rosetta_t5x.yaml index f6f43d8d2..fe121b841 100644 --- a/.github/workflows/_test_rosetta_t5x.yaml +++ b/.github/workflows/_test_rosetta_t5x.yaml @@ -6,26 +6,26 @@ on: T5X_IMAGE: type: string description: T5X image from ghcr.io/nvidia/t5x - default: "ghcr.io/nvidia/t5x:latest" + default: 'ghcr.io/nvidia/t5x:latest' required: false BADGE_FILENAME: type: string - description: "Name of the endpoint JSON file for shields.io badge" + description: 'Name of the endpoint JSON file for shields.io badge' required: false - default: "badge-rosetta-t5x-mgmn-test.json" + default: 'badge-rosetta-t5x-mgmn-test.json' ARTIFACT_NAME: type: string - description: "Name of the artifact zip file" + description: 
'Name of the artifact zip file' required: false - default: "artifact-rosetta-t5x-mgmn-test" + default: 'artifact-rosetta-t5x-mgmn-test' FW_NAME: type: string - description: "Name of the framework being used" + description: 'Name of the framework being used' required: false - default: "rosetta-t5x" + default: 'rosetta-t5x' outputs: TEST_STATUS: - description: "Summary pass/fail value indicating if results from tests are acceptable" + description: 'Summary pass/fail value indicating if results from tests are acceptable' value: ${{ jobs.sitrep.outputs.STATUS }} env: @@ -37,18 +37,18 @@ jobs: strategy: matrix: include: - - TEST_NAME: "1P1G_te-1" + - TEST_NAME: '1P1G_te-1' N_GPU: 1 - ADDITIONAL_ARGS: "" - EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" - - TEST_NAME: "1P1G_te-0" + ADDITIONAL_ARGS: '' + EXTRA_GIN_ARGS: '--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False' + - TEST_NAME: '1P1G_te-0' N_GPU: 1 - ADDITIONAL_ARGS: "--enable-te 0" - EXTRA_GIN_ARGS: "" - - TEST_NAME: "1P8G_te-1" + ADDITIONAL_ARGS: '--enable-te 0' + EXTRA_GIN_ARGS: '' + - TEST_NAME: '1P8G_te-1' N_GPU: 8 - ADDITIONAL_ARGS: "" - EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" + ADDITIONAL_ARGS: '' + EXTRA_GIN_ARGS: '--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False' fail-fast: false runs-on: ubuntu-22.04 @@ -233,26 +233,26 @@ jobs: strategy: matrix: include: - - TEST_NAME: "1N1G-te-1" + - TEST_NAME: '1N1G-te-1' N_GPU: 1 N_NODE: 1 - ADDITIONAL_ARGS: "" - EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" - - TEST_NAME: "1N8G-te-1" + ADDITIONAL_ARGS: '' + EXTRA_GIN_ARGS: '--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False' + - TEST_NAME: '1N8G-te-1' N_GPU: 8 N_NODE: 1 - ADDITIONAL_ARGS: "" - 
EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" - - TEST_NAME: "2N8G-te-1" + ADDITIONAL_ARGS: '' + EXTRA_GIN_ARGS: '--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False' + - TEST_NAME: '2N8G-te-1' N_GPU: 8 N_NODE: 2 - ADDITIONAL_ARGS: "" - EXTRA_GIN_ARGS: "--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False" - - TEST_NAME: "2N2G_te-0" + ADDITIONAL_ARGS: '' + EXTRA_GIN_ARGS: '--gin.train/utils.DatasetConfig.pack=False --gin.train_eval/utils.DatasetConfig.pack=False' + - TEST_NAME: '2N2G_te-0' N_GPU: 2 N_NODE: 2 - ADDITIONAL_ARGS: "--enable-te 0" - EXTRA_GIN_ARGS: "" + ADDITIONAL_ARGS: '--enable-te 0' + EXTRA_GIN_ARGS: '' fail-fast: false runs-on: ubuntu-22.04 @@ -818,7 +818,7 @@ jobs: sitrep: needs: metrics - if: "!cancelled()" + if: '!cancelled()' uses: ./.github/workflows/_sitrep_mgmn.yaml secrets: inherit with: @@ -835,7 +835,7 @@ jobs: vit-single-process-multi-device, vit-multi-gpu-multi-node, ] - if: "!cancelled()" + if: '!cancelled()' steps: - name: Generate TensorBoard query URL run: | @@ -852,7 +852,7 @@ jobs: outcome: needs: sitrep runs-on: ubuntu-22.04 - if: "!cancelled()" + if: '!cancelled()' steps: - name: Sets workflow status based on test outputs run: | @@ -905,7 +905,7 @@ jobs: if: ( always() ) secrets: inherit with: - ENDPOINT_FILENAME: "rosetta-unit-test-status.json" + ENDPOINT_FILENAME: 'rosetta-unit-test-status.json' PUBLISH: false SCRIPT: | ARTIFACTS="rosetta-test-logs/*.jsonl"  # keep in sync with TEST_ARTIFACT_NAME of the unit-tests job diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0098b83bf..75dddeeb8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -2,7 +2,7 @@ name: CI on: schedule: - - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC + - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC pull_request: types: - opened @@ -25,7 +25,7 @@ on: required: false 
MERGE_BUMPED_MANIFEST: type: boolean - description: "(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch" + description: '(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch' default: false required: false @@ -34,16 +34,15 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} permissions: - contents: write # to fetch code and push branch - actions: write # to cancel previous workflows - packages: write # to upload container - pull-requests: write # to make pull request for manifest bump + contents: write # to fetch code and push branch + actions: write # to cancel previous workflows + packages: write # to upload container + pull-requests: write # to make pull request for manifest bump env: DEFAULT_MANIFEST_ARTIFACT_NAME: bumped-manifest jobs: - metadata: runs-on: ubuntu-22.04 outputs: @@ -115,7 +114,7 @@ jobs: shell: bash -x -e {0} run: | bash bump.sh --input-manifest manifest.yaml --output-manifest manifest.yaml.new --base-patch-dir ./patches-new - + - name: Maybe replace current manifest/patches with the new one and show diff working-directory: .github/container shell: bash -x -e {0} @@ -168,12 +167,11 @@ jobs: steps: - name: "Tests Succeeded: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" id: test_result - run: - echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT + run: echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT - name: Check out the repository under ${GITHUB_WORKSPACE} uses: actions/checkout@v4 - + - name: Delete checked-out manifest and patches run: | rm .github/container/manifest.yaml @@ -185,7 +183,7 @@ jobs: name: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} path: .github/container/ - - name: "Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}" + - name: 'Create 
local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}' id: local_branch shell: bash -x -e {0} run: | @@ -213,7 +211,7 @@ jobs: git merge --ff-only ${{ needs.metadata.outputs.MANIFEST_BRANCH }} # Push the new change git push origin ${{ github.ref_name }} - + # We will create a Draft PR & remote branch if: # 1. The tests failed # 2. The merge failed @@ -244,12 +242,12 @@ jobs: draft: true env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: "Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}" + + - name: 'Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}' if: steps.create_pr.outcome == 'success' run: | echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY - + # Guard delete in simple check to protect other branches - name: Check that the branch matches znightly- prefix run: | @@ -271,7 +269,7 @@ jobs: make-publish-configs: runs-on: ubuntu-22.04 - if: ${{ !cancelled() }} + if: ${{ !cancelled() }} env: MEALKIT_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax-mealkit' || 'mock-jax-mealkit' }} FINAL_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax' || 'mock-jax' }} @@ -365,7 +363,7 @@ jobs: needs: - metadata - make-publish-configs - if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }} + if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS.config != '{"config":[]}' }} strategy: fail-fast: false matrix: ${{ fromJson(needs.make-publish-configs.outputs.PUBLISH_CONFIGS) }} @@ -381,7 +379,7 @@ jobs: finalize: needs: [metadata, amd64, arm64, publish-containers] - if: "!cancelled()" + if: '!cancelled()' uses: ./.github/workflows/_finalize.yaml with: BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }}