|
1 | 1 | name: Sandbox |
2 | | -run-name: CI-amd64 |
| 2 | + |
3 | 3 | on: |
| 4 | + schedule: |
| 5 | + - cron: '30 9 * * *' # daily at 09:30 UTC, i.e. 01:30 AM Pacific Standard Time (UTC-8) |
4 | 6 | pull_request: |
5 | 7 | types: |
6 | 8 | - opened |
7 | 9 | - reopened |
8 | 10 | - ready_for_review |
9 | 11 | - synchronize |
10 | 12 | paths-ignore: |
11 | | - - "**.md" |
| 13 | + - '**.md' |
| 14 | + workflow_dispatch: |
| 15 | + inputs: |
| 16 | + PUBLISH: |
| 17 | + type: boolean |
| 18 | + description: Publish dated images and update the 'latest' tag? |
| 19 | + default: false |
| 20 | + required: false |
| 21 | + BUMP_MANIFEST: |
| 22 | + type: boolean |
| 23 | + description: Bump git repos in manifest.yaml to head of tree? |
| 24 | + default: false |
| 25 | + required: false |
| 26 | + MERGE_BUMPED_MANIFEST: |
| 27 | + type: boolean |
| 28 | + description: '(used if BUMP_MANIFEST=true) If true: attempt to PR/merge manifest branch' |
| 29 | + default: false |
| 30 | + required: false |
12 | 31 |
|
13 | | -env: |
14 | | - DEFAULT_MANIFEST_ARTIFACT_NAME: bumped-manifest |
| 32 | +concurrency: # limits overlapping runs of this workflow |
| 33 | + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} # keyed by the PR head branch when set, otherwise by the unique run id |
| 34 | + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} # in-progress runs on main are never cancelled |
15 | 35 |
|
16 | 36 | permissions: |
17 | | - contents: read # to fetch code |
| 37 | + contents: write # to fetch code and push branch |
18 | 38 | actions: write # to cancel workflow runs (e.g. the draft-PR step cancels the current run via the REST API) |
19 | 39 | packages: write # to upload container |
| 40 | + pull-requests: write # to make pull request for manifest bump |
| 41 | + |
| 42 | +env: |
| 43 | + DEFAULT_MANIFEST_ARTIFACT_NAME: bumped-manifest |
20 | 44 |
|
21 | 45 | jobs: |
22 | 46 | metadata: |
23 | 47 | runs-on: ubuntu-22.04 |
24 | 48 | outputs: # values exported to downstream jobs as needs.metadata.outputs.* |
| 49 | + BUILD_DATE: ${{ steps.date.outputs.BUILD_DATE }} |
25 | 50 | PUBLISH: ${{ steps.if-publish.outputs.PUBLISH }} |
26 | 51 | BUMP_MANIFEST: ${{ steps.manifest-branch.outputs.BUMP_MANIFEST }} |
27 | 52 | MANIFEST_ARTIFACT_NAME: ${{ steps.manifest-branch.outputs.MANIFEST_ARTIFACT_NAME }} |
28 | 53 | MANIFEST_BRANCH: ${{ steps.manifest-branch.outputs.MANIFEST_BRANCH }} |
29 | 54 | MERGE_BUMPED_MANIFEST: ${{ steps.manifest-branch.outputs.MERGE_BUMPED_MANIFEST }} |
30 | 55 | steps: |
| 56 | + - name: Cancel workflow run if the trigger is a draft PR |
| 57 | + id: cancel-if-draft |
| 58 | + if: github.event_name == 'pull_request' && github.event.pull_request.draft == true |
| 59 | + run: | |
| 60 | + echo "Cancelling workflow for draft PR" |
| 61 | + curl -X POST -H "Authorization: token ${{ github.token }}" \ |
| 62 | + -H "Accept: application/vnd.github.v3+json" \ |
| 63 | + "https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel" |
| 64 | + while true; do sleep 1; done # spin until the cancellation lands so no later step starts; NOTE(review): curl runs without --fail, so a rejected request would leave this loop spinning until the job times out - confirm that is acceptable |
| 65 | +
|
31 | 66 | - name: Set build date |
32 | 67 | id: date |
33 | 68 | shell: bash -x -e {0} |
|
45 | 80 | id: manifest-branch |
46 | 81 | shell: bash -x -e {0} |
47 | 82 | run: | |
48 | | - BUMP_MANIFEST=${{ 'true' }} |
| 83 | + BUMP_MANIFEST=${{ github.event_name == 'schedule' || inputs.BUMP_MANIFEST || 'false' }} |
49 | 84 | MERGE_BUMPED_MANIFEST=${{ github.event_name == 'schedule' || inputs.MERGE_BUMPED_MANIFEST || 'false' }} |
50 | 85 | # Prepend nightly manifest branch with "z" to make it appear at the end |
51 | 86 | if [[ "$BUMP_MANIFEST" == "true" ]]; then |
@@ -103,48 +138,251 @@ jobs: |
103 | 138 | .github/container/manifest.yaml |
104 | 139 | .github/container/patches |
105 | 140 |
|
106 | | - build-base: |
107 | | - uses: ./.github/workflows/_build_base.yaml |
| 141 | + amd64: # calls the reusable _ci.yaml workflow with ARCHITECTURE=amd64 |
108 | 142 | needs: [metadata, bump-manifest] |
| 143 | + uses: ./.github/workflows/_ci.yaml |
109 | 144 | with: |
110 | 145 | ARCHITECTURE: amd64 |
111 | | - BUILD_DATE: 20240418 |
| 146 | + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} |
112 | 147 | MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} |
113 | 148 | secrets: inherit |
114 | 149 |
|
115 | | - build-jax: |
116 | | - needs: build-base |
117 | | - uses: ./.github/workflows/_build.yaml |
| 150 | + arm64: # calls the same reusable _ci.yaml workflow as amd64, with ARCHITECTURE=arm64 |
| 151 | + needs: [metadata, bump-manifest] |
| 152 | + uses: ./.github/workflows/_ci.yaml |
118 | 153 | with: |
119 | | - ARCHITECTURE: amd64 |
120 | | - ARTIFACT_NAME: artifact-jax-build |
121 | | - BADGE_FILENAME: badge-jax-build |
122 | | - BUILD_DATE: 20240418 |
123 | | - BASE_IMAGE: ${{ needs.build-base.outputs.DOCKER_TAG }} |
124 | | - CONTAINER_NAME: jax |
125 | | - DOCKERFILE: .github/container/Dockerfile.jax |
126 | | - RUNNER_SIZE: large |
| 154 | + ARCHITECTURE: arm64 |
| 155 | + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} |
| 156 | + MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} |
127 | 157 | secrets: inherit |
128 | 158 |
|
129 | | - build-upstream-maxtext: |
130 | | - needs: build-jax |
131 | | - uses: ./.github/workflows/_build.yaml |
| 159 | + # Only merge if everything succeeds |
| 160 | + merge-new-manifest: |
| 161 | + runs-on: ubuntu-22.04 |
| 162 | + if: ${{ !cancelled() && needs.metadata.outputs.MERGE_BUMPED_MANIFEST == 'true' && needs.metadata.outputs.MANIFEST_BRANCH != github.sha }} |
| 163 | + needs: |
| 164 | + - metadata |
| 165 | + - amd64 |
| 166 | + - arm64 |
| 167 | + steps: |
| 168 | + - name: "Tests Succeeded: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" |
| 169 | + id: test_result |
| 170 | + run: echo "SUCCEEDED=${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') }}" | tee -a $GITHUB_OUTPUT |
| 171 | + |
| 172 | + - name: Check out the repository under ${GITHUB_WORKSPACE} |
| 173 | + uses: actions/checkout@v4 |
| 174 | + |
| 175 | + - name: Delete checked-out manifest and patches |
| 176 | + run: | |
| 177 | + rm .github/container/manifest.yaml |
| 178 | + rm -rf .github/container/patches |
| 179 | +
|
| 180 | + - name: Replace checked-out manifest file/patches with bumped one |
| 181 | + uses: actions/download-artifact@v4 |
| 182 | + with: |
| 183 | + name: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} |
| 184 | + path: .github/container/ |
| 185 | + |
| 186 | + - name: 'Create local manifest branch: ${{ needs.metadata.outputs.MANIFEST_BRANCH }}' |
| 187 | + id: local_branch |
| 188 | + shell: bash -x -e {0} |
| 189 | + run: | |
| 190 | + git config user.name "JAX-Toolbox CI" |
| 191 | + git config user.email "jax@nvidia.com" |
| 192 | + git switch -c ${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 193 | + git status |
| 194 | + git add .github/container/patches/ # stage the patches directory explicitly so any new files in it are included; other tracked changes are picked up by `commit -a` |
| 195 | + git status |
| 196 | + # In the unusual situation where the manifest is the same even after bumping, |
| 197 | + # we will produce an empty commit with --allow-empty, which allows a PR to be |
| 198 | + # made and merged even with no changeset. |
| 199 | + git commit --allow-empty -a -m "Nightly Manifest Bump (${{ needs.metadata.outputs.BUILD_DATE }}) from: https://github.com/NVIDIA/JAX-Toolbox/actions/runs/${{ github.run_id }}" |
| 200 | +
|
| 201 | + - name: Try to merge manifest branch |
| 202 | + id: merge_local |
| 203 | + if: steps.test_result.outputs.SUCCEEDED == 'true' |
| 204 | + # Merge can fail |
| 205 | + continue-on-error: true |
| 206 | + shell: bash -x -e {0} |
| 207 | + run: | |
| 208 | + git switch ${{ github.ref_name }} |
| 209 | + # Pull this ref in case it was updated |
| 210 | + git pull --rebase |
| 211 | + git merge --ff-only ${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 212 | + # Push the new change |
| 213 | + git push origin ${{ github.ref_name }} |
| 214 | +
|
| 215 | + # We will create a Draft PR & remote branch if: |
| 216 | + # 1. The tests failed |
| 217 | + # 2. The merge failed |
| 218 | + - name: Create remote manifest branch |
| 219 | + id: create_remote_branch |
| 220 | + if: steps.test_result.outputs.SUCCEEDED == 'false' || steps.merge_local.outcome != 'success' |
| 221 | + shell: bash -x -e {0} |
| 222 | + run: | |
| 223 | + # Always abort in case in-progress merge |
| 224 | + git merge --abort || true |
| 225 | + git switch ${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 226 | + # Since the merge failed, create a remote and follow up with a PR |
| 227 | + git push --set-upstream origin ${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 228 | +
|
| 229 | + - name: Creating Draft PR for MANIFEST_BRANCH=${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 230 | + id: create_pr |
| 231 | + if: steps.test_result.outputs.SUCCEEDED == 'false' || steps.merge_local.outcome != 'success' |
| 232 | + uses: octokit/request-action@v2.x # generic GitHub REST call; the response JSON is read later as steps.create_pr.outputs.data |
| 233 | + with: |
| 234 | + route: POST /repos/{owner_and_repo}/pulls |
| 235 | + owner_and_repo: ${{ github.repository }} |
| 236 | + head: ${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 237 | + # Always try to merge back into the branch that triggered this workflow |
| 238 | + base: ${{ github.ref }} |
| 239 | + body: | |
| 240 | + https://github.com/NVIDIA/JAX-Toolbox/actions/runs/${{ github.run_id }} |
| 241 | + title: Nightly Manifest Bump (${{ needs.metadata.outputs.BUILD_DATE }}) |
| 242 | + draft: true |
| 243 | + env: |
| 244 | + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} |
| 245 | + |
| 246 | + - name: 'Log created PR: #${{ fromJson(steps.create_pr.outputs.data).number }}' |
| 247 | + if: steps.create_pr.outcome == 'success' |
| 248 | + run: | |
| 249 | + echo "https://github.com/NVIDIA/JAX-Toolbox/pull/${{ fromJson(steps.create_pr.outputs.data).number }}" | tee -a $GITHUB_STEP_SUMMARY |
| 250 | +
|
| 251 | + # Guard delete in simple check to protect other branches |
| 252 | + - name: Check that the branch matches znightly- prefix |
| 253 | + run: | |
| 254 | + if [[ "${{ needs.metadata.outputs.MANIFEST_BRANCH }}" != znightly-* ]]; then |
| 255 | + echo Tried to delete MANIFEST_BRANCH=${{ needs.metadata.outputs.MANIFEST_BRANCH }}, but did not start with "znightly-" |
| 256 | + exit 1 |
| 257 | + fi |
| 258 | +
|
| 259 | + # If merging fails b/c upstream conflict, branch is deleted to avoid clutter since changeset is preserved in PR |
| 260 | + - name: Deleting remote MANIFEST_BRANCH=${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 261 | + # Delete can fail if branch was already deleted or not created, e.g., if the PR successfully merges, then branch is also already deleted. |
| 262 | + continue-on-error: true |
| 263 | + uses: octokit/request-action@v2.x # generic GitHub REST call; here it issues the DELETE on the branch ref |
| 264 | + with: |
| 265 | + route: DELETE /repos/{owner_and_repo}/git/refs/heads/${{ needs.metadata.outputs.MANIFEST_BRANCH }} |
| 266 | + owner_and_repo: ${{ github.repository }} |
| 267 | + env: |
| 268 | + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} |
| 269 | + |
| 270 | + make-publish-configs: |
| 271 | + runs-on: ubuntu-22.04 |
| 272 | + if: ${{ !cancelled() }} |
| 273 | + env: |
| 274 | + MEALKIT_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax-mealkit' || 'mock-jax-mealkit' }} |
| 275 | + FINAL_IMAGE_REPO: ${{ needs.metadata.outputs.PUBLISH == 'true' && 'jax' || 'mock-jax' }} |
| 276 | + needs: |
| 277 | + - metadata |
| 278 | + - amd64 |
| 279 | + - arm64 |
| 280 | + outputs: |
| 281 | + PUBLISH_CONFIGS: ${{ steps.generate-configs.outputs.PUBLISH_CONFIGS }} |
| 282 | + steps: |
| 283 | + - id: generate-configs |
| 284 | + shell: bash -eu -o pipefail {0} |
| 285 | + run: | |
| 286 | + declare -a FLAVORS=( |
| 287 | + base |
| 288 | + jax |
| 289 | + triton |
| 290 | + equinox |
| 291 | + maxtext |
| 292 | + levanter |
| 293 | + upstream-t5x |
| 294 | + upstream-pax |
| 295 | + upstream-maxtext |
| 296 | + t5x |
| 297 | + pax |
| 298 | + grok |
| 299 | + ) |
| 300 | + declare -a STAGES=( |
| 301 | + mealkit |
| 302 | + final |
| 303 | + ) |
| 304 | +
|
| 305 | + ## create JSON specs for a 1D matrix of container publication jobs |
| 306 | +
|
| 307 | + ALL_TAGS=$( |
| 308 | + echo '${{ needs.amd64.outputs.DOCKER_TAGS }}' \ |
| 309 | + '${{ needs.arm64.outputs.DOCKER_TAGS }}' |\ |
| 310 | + jq -s 'add' |
| 311 | + ) |
| 312 | + PUBLISH_CONFIGS='[]' |
| 313 | +
|
| 314 | + for stage in "${STAGES[@]}"; do |
| 315 | + for flavor in "${FLAVORS[@]}";do |
| 316 | +
|
| 317 | + # collect images for different platforms, e.g. amd64 and arm64 |
| 318 | + matching_tags=$( |
| 319 | + echo "$ALL_TAGS" |\ |
| 320 | + jq -c ".[] | select(.stage == \"${stage}\" and .flavor == \"${flavor}\" and .tag != \"\")" |
| 321 | + ) |
| 322 | +
|
| 323 | + # source_image is a list of all platform-specific tags |
| 324 | + source_image=$(echo "${matching_tags}" | jq -c "[.tag]" | jq -s 'add') |
| 325 | + # if the build job failed without producing any images, skip this flavor |
| 326 | + n_source_images=$(echo "$source_image" | jq 'length') |
| 327 | + if [[ $n_source_images -gt 0 ]]; then |
| 328 | + echo "PUBLISH image $flavor with $n_source_images $stage containers" |
| 329 | +
|
| 330 | + # tag priority is the highest priority of all platform-specific tags |
| 331 | + priority=$(echo "${matching_tags}" | jq -r ".priority" | jq -s 'max') |
| 332 | +
|
| 333 | + # final images go to the repo named by FINAL_IMAGE_REPO (`jax`, or `mock-jax` when not publishing) |
| 334 | + # and mealkit images to MEALKIT_IMAGE_REPO (`jax-mealkit`, or `mock-jax-mealkit` when not publishing) |
| 335 | + case ${stage} in |
| 336 | + mealkit) |
| 337 | + target_image=${MEALKIT_IMAGE_REPO} |
| 338 | + ;; |
| 339 | + final) |
| 340 | + target_image=${FINAL_IMAGE_REPO} |
| 341 | + ;; |
| 342 | + esac |
| 343 | +
|
| 344 | + PUBLISH_CONFIGS=$( |
| 345 | + echo ${PUBLISH_CONFIGS} | jq -c ". + [{ |
| 346 | + \"flavor\": \"${flavor}\", |
| 347 | + \"target_image\": \"${target_image}\", |
| 348 | + \"priority\": \"${priority}\", |
| 349 | + \"source_image\": ${source_image}, |
| 350 | + \"stage\": \"${stage}\" |
| 351 | + }]" |
| 352 | + ) |
| 353 | + else |
| 354 | + echo "SKIPPED image $flavor with 0 $stage containers" |
| 355 | + fi |
| 356 | + done |
| 357 | + done |
| 358 | +
|
| 359 | + PUBLISH_CONFIGS=$(echo "$PUBLISH_CONFIGS" | jq -c '{"config": .}') |
| 360 | + echo ${PUBLISH_CONFIGS} | jq |
| 361 | + echo "PUBLISH_CONFIGS=${PUBLISH_CONFIGS}" >> $GITHUB_OUTPUT |
| 362 | +
|
| 363 | + publish-containers: # one matrix entry per publish config; the `if` compares the raw output string because job outputs are strings, not objects |
| 364 | + needs: |
| 365 | + - metadata |
| 366 | + - make-publish-configs |
| 367 | + if: ${{ !cancelled() && needs.make-publish-configs.outputs.PUBLISH_CONFIGS != '{"config":[]}' }} |
| 368 | + strategy: |
| 369 | + fail-fast: false |
| 370 | + matrix: ${{ fromJson(needs.make-publish-configs.outputs.PUBLISH_CONFIGS) }} |
| 371 | + uses: ./.github/workflows/_publish_container.yaml |
132 | 372 | with: |
133 | | - ARCHITECTURE: amd64 |
134 | | - ARTIFACT_NAME: artifact-maxtext-build |
135 | | - BADGE_FILENAME: badge-maxtext-build |
136 | | - BUILD_DATE: 20240418 |
137 | | - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} |
138 | | - CONTAINER_NAME: maxtext |
139 | | - DOCKERFILE: .github/container/Dockerfile.maxtext.amd64 |
140 | | - secrets: inherit |
| 373 | + ARTIFACT_NAME: ${{ matrix.config.stage }}-${{ matrix.config.flavor }} |
| 374 | + ARTIFACT_TAG: ${{ matrix.config.flavor }}-${{ needs.metadata.outputs.BUILD_DATE }} |
| 375 | + SOURCE_IMAGE: ${{ join(matrix.config.source_image, ' ') }} |
| 376 | + TARGET_IMAGE: ${{ matrix.config.target_image }} |
| 377 | + TARGET_TAGS: | |
| 378 | + type=raw,value=${{ matrix.config.flavor }},priority=${{ matrix.config.priority }} |
| 379 | + type=raw,value=${{ matrix.config.flavor }}-${{ needs.metadata.outputs.BUILD_DATE }},priority=${{ matrix.config.priority }} |
141 | 380 |
|
142 | | - build-rosetta-maxtext: |
143 | | - needs: build-upstream-maxtext |
144 | | - uses: ./.github/workflows/_build_rosetta.yaml |
| 381 | + finalize: |
| 382 | + needs: [metadata, amd64, arm64, publish-containers] |
| 383 | + if: '!cancelled()' # still runs when a needed job failed or was skipped; only a cancelled run skips it |
| 384 | + uses: ./.github/workflows/_finalize.yaml |
145 | 385 | with: |
146 | | - ARCHITECTURE: amd64 |
147 | | - BUILD_DATE: 20240418 |
148 | | - BASE_IMAGE: ${{ needs.build-upstream-maxtext.outputs.DOCKER_TAG_MEALKIT }} |
149 | | - BASE_LIBRARY: maxtext |
| 386 | + BUILD_DATE: ${{ needs.metadata.outputs.BUILD_DATE }} |
| 387 | + PUBLISH_BADGE: ${{ needs.metadata.outputs.PUBLISH == 'true' }} |
150 | 388 | secrets: inherit |
0 commit comments