Skip to content

Commit 99db46d

Browse files
committed
ci: Restore rosetta-t5x unit tests
1 parent 246f8b6 commit 99db46d

File tree

2 files changed

+109
-119
lines changed

2 files changed

+109
-119
lines changed

.github/workflows/_test_rosetta.yaml

Lines changed: 0 additions & 97 deletions
This file was deleted.

.github/workflows/_test_rosetta_t5x.yaml

Lines changed: 109 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,34 +6,33 @@ on:
66
T5X_IMAGE:
77
type: string
88
description: T5X image from ghcr.io/nvidia/t5x
9-
default: 'ghcr.io/nvidia/t5x:latest'
9+
default: "ghcr.io/nvidia/t5x:latest"
1010
required: false
1111
BADGE_FILENAME:
1212
type: string
13-
description: 'Name of the endpoint JSON file for shields.io badge'
13+
description: "Name of the endpoint JSON file for shields.io badge"
1414
required: false
15-
default: 'badge-rosetta-t5x-mgmn-test.json'
15+
default: "badge-rosetta-t5x-mgmn-test.json"
1616
ARTIFACT_NAME:
1717
type: string
18-
description: 'Name of the artifact zip file'
18+
description: "Name of the artifact zip file"
1919
required: false
20-
default: 'artifact-rosetta-t5x-mgmn-test'
20+
default: "artifact-rosetta-t5x-mgmn-test"
2121
FW_NAME:
2222
type: string
23-
description: 'Name of the framework being used'
23+
description: "Name of the framework being used"
2424
required: false
25-
default: 'rosetta-t5x'
25+
default: "rosetta-t5x"
2626
outputs:
2727
TEST_STATUS:
28-
description: 'Summary pass/fail value indicating if results from tests are acceptable'
28+
description: "Summary pass/fail value indicating if results from tests are acceptable"
2929
value: ${{ jobs.sitrep.outputs.STATUS }}
3030

3131
env:
3232
BATCH_SIZE_PER_GPU: 32
3333
VIT_BATCH_SIZE_PER_GPU: 256
3434

3535
jobs:
36-
3736
single-process-multi-device:
3837
strategy:
3938
matrix:
@@ -63,10 +62,10 @@ jobs:
6362
uses: webfactory/[email protected]
6463
with:
6564
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
66-
65+
6766
- name: Check out the repository under ${GITHUB_WORKSPACE}
6867
uses: actions/checkout@v4
69-
68+
7069
- name: Setup SSH known hosts
7170
id: ssh-known-hosts
7271
run: |
@@ -182,7 +181,7 @@ jobs:
182181
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
183182
json.dump(dump, f)
184183
EOF
185-
184+
186185
- name: Generate sitrep
187186
if: success() || failure()
188187
shell: bash -x -e {0}
@@ -196,7 +195,7 @@ jobs:
196195
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
197196
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
198197
total_tests=$(ls $EXIT_STATUSES | wc -l)
199-
198+
200199
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
201200
badge_message='error'
202201
badge_color=red
@@ -402,7 +401,7 @@ jobs:
402401
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
403402
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
404403
total_tests=$(ls $EXIT_STATUSES | wc -l)
405-
404+
406405
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
407406
badge_message='error'
408407
badge_color=red
@@ -429,7 +428,7 @@ jobs:
429428
color="${badge_color}" \
430429
to_json schemaVersion label message color \
431430
> output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
432-
431+
433432
- name: Upload training logs as artifacts
434433
uses: actions/upload-artifact@v4
435434
with:
@@ -571,7 +570,7 @@ jobs:
571570
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
572571
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
573572
total_tests=$(ls $EXIT_STATUSES | wc -l)
574-
573+
575574
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
576575
badge_message='error'
577576
badge_color=red
@@ -744,7 +743,7 @@ jobs:
744743
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
745744
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
746745
total_tests=$(ls $EXIT_STATUSES | wc -l)
747-
746+
748747
if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
749748
badge_message='error'
750749
badge_color=red
@@ -771,15 +770,21 @@ jobs:
771770
color="${badge_color}" \
772771
to_json schemaVersion label message color \
773772
> output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
774-
773+
775774
- name: Upload training logs as artifacts
776775
uses: actions/upload-artifact@v4
777776
with:
778777
name: ${{ steps.meta.outputs.JOB_NAME }}
779778
path: output/*
780779

781780
metrics:
782-
needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
781+
needs:
782+
[
783+
multi-gpu-multi-node,
784+
single-process-multi-device,
785+
vit-single-process-multi-device,
786+
vit-multi-gpu-multi-node,
787+
]
783788
runs-on: ubuntu-22.04
784789

785790
steps:
@@ -810,7 +815,7 @@ jobs:
810815
path: |
811816
report.jsonl
812817
*_metrics.json
813-
818+
814819
sitrep:
815820
needs: metrics
816821
if: "!cancelled()"
@@ -820,10 +825,16 @@ jobs:
820825
BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
821826
ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
822827
FW_NAME: ${{ inputs.FW_NAME }}
823-
828+
824829
summary:
825830
runs-on: ubuntu-22.04
826-
needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
831+
needs:
832+
[
833+
multi-gpu-multi-node,
834+
single-process-multi-device,
835+
vit-single-process-multi-device,
836+
vit-multi-gpu-multi-node,
837+
]
827838
if: "!cancelled()"
828839
steps:
829840
- name: Generate TensorBoard query URL
@@ -848,3 +859,79 @@ jobs:
848859
if [[ ${{ needs.sitrep.outputs.STATUS }} != success ]]; then
849860
exit 1
850861
fi
862+
863+
# Runs the Rosetta pytest suite inside the T5X container on a self-hosted V100
# runner and uploads the pytest report log as a workflow artifact.
unit-tests:
  runs-on: [self-hosted, V100]
  env:
    # Name under which the report log is uploaded as an artifact.
    TEST_ARTIFACT_NAME: rosetta-test-logs
    # Where pytest writes its report log. /log is bind-mounted into the
    # container below, so the same path is valid on the runner host.
    TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl
  steps:
    - name: Print environment variables
      run: |
        env

    - name: Print GPU information
      run: nvidia-smi

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Pull Rosetta image
      shell: bash -x -e {0}
      env:
        # Hand the caller-supplied image reference to the script through the
        # environment instead of expanding it into the script text, so the
        # value is never parsed as shell code.
        T5X_IMAGE: ${{ inputs.T5X_IMAGE }}
      run: |
        docker pull "${T5X_IMAGE}"
        docker tag "${T5X_IMAGE}" rosetta:latest

    - name: Run Rosetta tests w/ docker
      # Custom shell: {0} is the generated script file, mounted as /cmd.sh and
      # executed by bash inside the container.
      shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
      # The docker command above passes no environment into the container, so
      # the log path is injected with an Actions expression, not a shell
      # variable. `|| true` keeps the step green when tests fail; the outcome
      # is derived from the report log by the `publish-test` job.
      run: |
        ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
        pip install "${ROSETTA_PATH}[test]" pytest-reportlog
        pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true

    - name: Upload unit test json logs
      uses: actions/upload-artifact@v4
      with:
        name: ${{ env.TEST_ARTIFACT_NAME }}
        path: ${{ env.TEST_LOG_LOCAL_PATH }}
901+
902+
# Summarises the pytest report log uploaded by `unit-tests` (pass/skip/fail
# counts) and hands the result to the reusable badge workflow.
publish-test:
  needs: unit-tests
  uses: ./.github/workflows/_publish_badge.yaml
  # Run even when `unit-tests` failed so a status is still reported.
  if: always()
  secrets: inherit
  with:
    ENDPOINT_FILENAME: "rosetta-unit-test-status.json"
    PUBLISH: false
    # The artifact directory is spelled out literally: the `needs` context only
    # holds jobs listed under `needs:` (here `unit-tests`), and that job
    # declares no outputs, so the former
    # `needs.rosetta-unit-tests.outputs.TEST_ARTIFACT_NAME` lookup expanded to
    # an empty string and the glob became `/*.jsonl`. The name must match
    # TEST_ARTIFACT_NAME in the `unit-tests` job.
    # NOTE(review): assumes _publish_badge.yaml downloads each artifact into a
    # directory named after the artifact - confirm against that workflow.
    SCRIPT: |
      ARTIFACTS="rosetta-test-logs/*.jsonl"
      all_outcomes() {
        cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
      }
      cnt_type() {
        cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
      }
      SKIPPED_TESTS=$(cnt_type skipped)
      FAILED_TESTS=$(cnt_type failed)
      PASSED_TESTS=$(cnt_type passed)
      TOTAL_TESTS=$(all_outcomes | wc -l)
      echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY
      all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY
      if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
        BADGE_COLOR=brightgreen
        echo "STATUS=success" >> $GITHUB_OUTPUT
      else
        echo "STATUS=failure" >> $GITHUB_OUTPUT
        if [[ $PASSED_TESTS -eq 0 ]]; then
          BADGE_COLOR=red
        else
          BADGE_COLOR=yellow
        fi
      fi
      echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT

0 commit comments

Comments
 (0)