66 T5X_IMAGE :
77 type : string
88 description : T5X image from ghcr.io/nvidia/t5x
9- default : ' ghcr.io/nvidia/t5x:latest'
9+ default : " ghcr.io/nvidia/t5x:latest"
1010 required : false
1111 BADGE_FILENAME :
1212 type : string
13- description : ' Name of the endpoint JSON file for shields.io badge'
13+ description : " Name of the endpoint JSON file for shields.io badge"
1414 required : false
15- default : ' badge-rosetta-t5x-mgmn-test.json'
15+ default : " badge-rosetta-t5x-mgmn-test.json"
1616 ARTIFACT_NAME :
1717 type : string
18- description : ' Name of the artifact zip file'
18+ description : " Name of the artifact zip file"
1919 required : false
20- default : ' artifact-rosetta-t5x-mgmn-test'
20+ default : " artifact-rosetta-t5x-mgmn-test"
2121 FW_NAME :
2222 type : string
23- description : ' Name of the framework being used'
23+ description : " Name of the framework being used"
2424 required : false
25- default : ' rosetta-t5x'
25+ default : " rosetta-t5x"
2626 outputs :
2727 TEST_STATUS :
28- description : ' Summary pass/fail value indicating if results from tests are acceptable'
28+ description : " Summary pass/fail value indicating if results from tests are acceptable"
2929 value : ${{ jobs.sitrep.outputs.STATUS }}
3030
3131env :
3232 BATCH_SIZE_PER_GPU : 32
3333 VIT_BATCH_SIZE_PER_GPU : 256
3434
3535jobs :
36-
3736 single-process-multi-device :
3837 strategy :
3938 matrix :
@@ -63,10 +62,10 @@ jobs:
6362 uses :
webfactory/[email protected] 6463 with :
6564 ssh-private-key : ${{ secrets.SSH_PRIVATE_KEY }}
66-
65+
6766 - name : Check out the repository under ${GITHUB_WORKSPACE}
6867 uses : actions/checkout@v4
69-
68+
7069 - name : Setup SSH known hosts
7170 id : ssh-known-hosts
7271 run : |
@@ -182,7 +181,7 @@ jobs:
182181 dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
183182 json.dump(dump, f)
184183 EOF
185-
184+
186185 - name : Generate sitrep
187186 if : success() || failure()
188187 shell : bash -x -e {0}
@@ -196,7 +195,7 @@ jobs:
196195 passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
197196 failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
198197 total_tests=$(ls $EXIT_STATUSES | wc -l)
199-
198+
200199 if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
201200 badge_message='error'
202201 badge_color=red
@@ -402,7 +401,7 @@ jobs:
402401 passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
403402 failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
404403 total_tests=$(ls $EXIT_STATUSES | wc -l)
405-
404+
406405 if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
407406 badge_message='error'
408407 badge_color=red
@@ -429,7 +428,7 @@ jobs:
429428 color="${badge_color}" \
430429 to_json schemaVersion label message color \
431430 > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
432-
431+
433432 - name : Upload training logs as artifacts
434433 uses : actions/upload-artifact@v4
435434 with :
@@ -571,7 +570,7 @@ jobs:
571570 passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
572571 failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
573572 total_tests=$(ls $EXIT_STATUSES | wc -l)
574-
573+
575574 if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
576575 badge_message='error'
577576 badge_color=red
@@ -744,7 +743,7 @@ jobs:
744743 passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
745744 failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
746745 total_tests=$(ls $EXIT_STATUSES | wc -l)
747-
746+
748747 if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
749748 badge_message='error'
750749 badge_color=red
@@ -771,15 +770,21 @@ jobs:
771770 color="${badge_color}" \
772771 to_json schemaVersion label message color \
773772 > output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json
774-
773+
775774 - name : Upload training logs as artifacts
776775 uses : actions/upload-artifact@v4
777776 with :
778777 name : ${{ steps.meta.outputs.JOB_NAME }}
779778 path : output/*
780779
781780 metrics :
782- needs : [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
781+ needs :
782+ [
783+ multi-gpu-multi-node,
784+ single-process-multi-device,
785+ vit-single-process-multi-device,
786+ vit-multi-gpu-multi-node,
787+ ]
783788 runs-on : ubuntu-22.04
784789
785790 steps :
@@ -810,7 +815,7 @@ jobs:
810815 path : |
811816 report.jsonl
812817 *_metrics.json
813-
818+
814819 sitrep :
815820 needs : metrics
816821 if : " !cancelled()"
@@ -820,10 +825,16 @@ jobs:
820825 BADGE_FILENAME : ${{ inputs.BADGE_FILENAME }}
821826 ARTIFACT_NAME : ${{ inputs.ARTIFACT_NAME }}
822827 FW_NAME : ${{ inputs.FW_NAME }}
823-
828+
824829 summary :
825830 runs-on : ubuntu-22.04
826- needs : [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
831+ needs :
832+ [
833+ multi-gpu-multi-node,
834+ single-process-multi-device,
835+ vit-single-process-multi-device,
836+ vit-multi-gpu-multi-node,
837+ ]
827838 if : " !cancelled()"
828839 steps :
829840 - name : Generate TensorBoard query URL
@@ -848,3 +859,79 @@ jobs:
848859 if [[ ${{ needs.sitrep.outputs.STATUS }} != success ]]; then
849860 exit 1
850861 fi
862+
# Runs the Rosetta test suite inside the T5X container on a self-hosted V100
# runner and uploads the pytest report log as a workflow artifact.
unit-tests:
  runs-on: [self-hosted, V100]
  env:
    TEST_ARTIFACT_NAME: rosetta-test-logs
    TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl
  # Expose the artifact name as a job output; publish-test reads it through
  # needs.unit-tests.outputs to build the path of the uploaded report.
  outputs:
    TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }}
  steps:
    - name: Print environment variables
      run: |
        env

    - name: Print GPU information
      run: nvidia-smi

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    # Re-tag the requested image so the next step's custom shell can refer to
    # a fixed name (rosetta:latest) regardless of the T5X_IMAGE input.
    - name: Pull Rosetta image
      shell: bash -x -e {0}
      run: |
        docker pull ${{ inputs.T5X_IMAGE }}
        docker tag ${{ inputs.T5X_IMAGE }} rosetta:latest

    # Custom shell: the step script ({0}) is mounted into the container as
    # /cmd.sh and executed there; /log is bind-mounted so the report written
    # inside the container is visible to the upload step on the host.
    - name: Run Rosetta tests w/ docker
      shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
      run: |
        ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
        pip install "${ROSETTA_PATH}[test]" pytest-reportlog
        # "|| true" keeps the step green on test failures; pass/fail is
        # derived later from the report log by the publish-test job.
        pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true

    - name: Upload unit test json logs
      uses: actions/upload-artifact@v4
      with:
        name: ${{ env.TEST_ARTIFACT_NAME }}
        path: ${{ env.TEST_LOG_LOCAL_PATH }}

# Summarises the pytest report log produced by unit-tests and hands the result
# (STATUS, LABEL, MESSAGE, COLOR) to the shared badge-publishing workflow.
publish-test:
  needs: unit-tests
  # Run even when unit-tests failed so a failure status is still reported.
  if: always()
  uses: ./.github/workflows/_publish_badge.yaml
  secrets: inherit
  with:
    ENDPOINT_FILENAME: "rosetta-unit-test-status.json"
    PUBLISH: false
    SCRIPT: |
      ARTIFACTS="${{ needs.unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
      all_outcomes() {
        cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
      }
      cnt_type() {
        cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
      }
      SKIPPED_TESTS=$(cnt_type skipped)
      FAILED_TESTS=$(cnt_type failed)
      PASSED_TESTS=$(cnt_type passed)
      TOTAL_TESTS=$(all_outcomes | wc -l)
      echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY
      all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY
      if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
        BADGE_COLOR=brightgreen
        echo "STATUS=success" >> $GITHUB_OUTPUT
      else
        echo "STATUS=failure" >> $GITHUB_OUTPUT
        if [[ $PASSED_TESTS -eq 0 ]]; then
          BADGE_COLOR=red
        else
          BADGE_COLOR=yellow
        fi
      fi
      echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT
0 commit comments