diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml new file mode 100644 index 000000000..1d3acec18 --- /dev/null +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -0,0 +1,27 @@ +name: Store & Delete GHCR Token +description: Store and Delete the docker credentails for pulling from GHCR + +outputs: + token-name: + description: Name of the K8s secret to delete + value: ${{ steps.token.outputs.token-name }} + +runs: + using: "composite" + steps: + - name: Generate a UUID token + shell: bash + id: token + run: | + echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> $GITHUB_OUTPUT + - name: Delete GitHub Container Registry token + uses: ./.github/actions/with-post-step + with: + main: | + # Store GitHub Container Registry token as Kubernetes secret + kubectl create secret generic \ + ${{ steps.token.outputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + post: | + kubectl delete secret ${{ steps.token.outputs.token-name }} diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml new file mode 100644 index 000000000..dbeabe668 --- /dev/null +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -0,0 +1,37 @@ +name: Submit & Delete K8s Job +description: Submit and delete a K8s job after its execution + +inputs: + job-name: + description: The job name + required: true + job-config-file: + description: Path to the Kubernetes job YAML + required: true + +runs: + using: "composite" + steps: + - name: Submit and Delete Kubernetes job + uses: ./.github/actions/with-post-step + with: + main: | + echo "Submit K8s job" + kubectl apply -f "${{ inputs.job-config-file }}" + + # Wait for job to be craeted + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s + + # Wait for job to be unsuspended + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ 
inputs.job-name }} --timeout=7200s + + # Wait for pods to be running + kubectl wait --for=condition=Ready \ + --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \ + --timeout=600s pod + + # Stream logs + kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} + + post: | + kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/actions/with-post-step/action.yml b/.github/actions/with-post-step/action.yml new file mode 100644 index 000000000..69c2a6eff --- /dev/null +++ b/.github/actions/with-post-step/action.yml @@ -0,0 +1,42 @@ +# ==================================================================================================================== # +# Authors: # +# Patrick Lehmann # +# Unai Martinez-Corral # +# # +# ==================================================================================================================== # +# Copyright 2020-2024 The pyTooling Authors # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +# SPDX-License-Identifier: Apache-2.0 # +# ==================================================================================================================== # +name: With post step + +description: 'Generic JS Action to execute a main command and set a command as a post step.' + +inputs: + main: + description: 'Main command/script.' + required: true + post: + description: 'Post command/script.' 
+ required: true + key: + description: 'Name of the state variable used to detect the post step.' + required: false + default: POST + +runs: + using: 'node20' + main: 'main.js' + post: 'main.js' diff --git a/.github/actions/with-post-step/main.js b/.github/actions/with-post-step/main.js new file mode 100644 index 000000000..47a817cbc --- /dev/null +++ b/.github/actions/with-post-step/main.js @@ -0,0 +1,46 @@ +/* ================================================================================================================== * + * Authors: * + * Unai Martinez-Corral * + * * + * ================================================================================================================== * + * Copyright 2021-2022 Unai Martinez-Corral * + * Copyright 2022 Unai Martinez-Corral * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + * * + * SPDX-License-Identifier: Apache-2.0 * + * ================================================================================================================== * + * * + * Context: * + * * https://github.com/docker/login-action/issues/72 * + * * https://github.com/actions/runner/issues/1478 * + * ================================================================================================================== */ +const { spawn } = require("child_process"); +const { appendFileSync } = require("fs"); +const { EOL } = require("os"); + +function run(cmd) { + const subprocess = spawn(cmd, { stdio: "inherit", shell: true }); + subprocess.on("exit", (exitCode) => { + process.exitCode = exitCode; + }); +} + +const key = process.env.INPUT_KEY.toUpperCase(); + +if ( process.env[`STATE_${key}`] !== undefined ) { // Are we in the 'post' step? + run(process.env.INPUT_POST); +} else { // Otherwise, this is the main step + appendFileSync(process.env.GITHUB_STATE, `${key}=true${EOL}`); + run(process.env.INPUT_MAIN); +} \ No newline at end of file diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn new file mode 100644 index 000000000..ac73d07c6 --- /dev/null +++ b/.github/container/Dockerfile.axlearn @@ -0,0 +1,44 @@ +# syntax=docker/dockerfile:1-labs +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main +ARG SRC_PATH_AXLEARN=/opt/axlearn + +############################################################################### +## Download source and configure dependencies +############################################################################### +FROM ${BASE_IMAGE} AS mealkit +ARG URLREF_AXLEARN +ARG SRC_PATH_AXLEARN + +RUN git-clone.sh "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" + +# these packages are needed to run axlearn tests +# https://github.com/apple/axlearn/blob/main/pyproject.toml as reference +RUN <<"EOF" bash -ex + echo "-e ${SRC_PATH_AXLEARN}" > 
/opt/pip-tools.d/requirements-axlearn.in + cat <<REQUIREMENTS >> /opt/pip-tools.d/requirements-axlearn.in +aqtp==0.8.2 +einops==0.8.0 +nltk==3.7 +portpicker==1.6.0 +seqio==0.0.18 +protobuf==3.20.3 +pytest>=7.4.3 +REQUIREMENTS +EOF + + +############################################################################### +## Add test script to the path +############################################################################### + +ADD test-axlearn.sh /usr/local/bin/ + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### +FROM mealkit AS final + +RUN pip-finalize.sh + +WORKDIR ${SRC_PATH_AXLEARN} diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 6d8ceac9b..285da565c 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -46,6 +46,12 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then exit 1 fi +# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64 +if [ "$(uname -m)" = "x86_64" ]; then + sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt +else + echo "Skipping TF on $(uname -m)" +fi # --no-deps is required since conflicts can still appear during pip-sync pip-sync --pip-args '--no-deps --src /opt' requirements.txt diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh new file mode 100755 index 000000000..d1993cc03 --- /dev/null +++ b/.github/container/test-axlearn.sh @@ -0,0 +1,169 @@ +#!/bin/bash + +set -uo pipefail + +usage() { + echo "Run tests in axlearn with specified options." + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo " OPTIONS DESCRIPTION" + echo " -d, --directory DIR Directory to run tests in." + echo " Default: 'axlearn/axlearn/common'." + echo " -t, --test-files FILES Pattern for test files to run."
+ echo " Default: '*_test.py'." + echo " -o, --output DIRECTORY Output directory for logs and summary." + echo " Default: 'test_runs/'." + echo " -h, --help Show this help message and exit." + exit 1 +} + +# Default values +DIR='axlearn/axlearn/common' +TEST_FILES=() +OUTPUT_DIRECTORY='' + +# Parse args manually +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -d|--directory) + if [[ -z "$2" ]]; then + echo "Error: --directory requires an argument." + usage + fi + DIR="$2" + shift 2 + ;; + -t|--test-files) + shift + # Collect all arguments until the next option (starting with '-') + if [[ $# -eq 0 ]]; then + echo "Error: --test-files requires at least one file pattern." + usage + fi + echo "Option -t|--test-files with arguments:" + while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + echo " $1" + TEST_FILES+=("$1") + shift + done + ;; + -o|--output) + if [[ -z "$2" ]]; then + echo "Error: --output requires an argument." + usage + fi + OUTPUT_DIRECTORY="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + + +if [ -z "$OUTPUT_DIRECTORY" ]; then + timestamp=$(date +%Y%m%d_%H%M%S) + OUTPUT_DIRECTORY="test_runs/${timestamp}" +fi +LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs" + +mkdir -p "${LOG_DIRECTORY}" + +# Print out config for sanity check +echo "Configuration:" +echo " Directory: $DIR" +if [ "${#TEST_FILES[@]}" -gt 0 ]; then + echo " Test Files:" + for f in "${TEST_FILES[@]}"; do + echo " $f" + done +else + echo " Test Files Pattern: '*_test.py' (default)" +fi +echo " Output Directory: $OUTPUT_DIRECTORY" + +cd "$DIR" || exit 1 + +echo "Running tests..." 
+ +pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu +pip install timm transformers scikit-learn + + +if [ "${#TEST_FILES[@]}" -eq 0 ]; then + TEST_FILES=("*_test.py") +fi + +expanded_test_files=() +for pattern in "${TEST_FILES[@]}"; do + # retrieve all the files + files=( $pattern ) + if [ "${#files[@]}" -gt 0 ]; then + expanded_test_files+=( "${files[@]}" ) + else + echo "Warning: No files matched pattern '$pattern'" + fi +done + +if [ "${#expanded_test_files[@]}" -eq 0 ]; then + echo "No test files found to run." + exit 1 +fi + +# in case we have the exclusion list file +EXCLUDE_LIST_FILE="$DIR/exclusion_list.txt" +EXCLUDE_PATTERNS=() + +if [ -f "$EXCLUDE_LIST_FILE" ]; then + echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'" + mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE" +else + echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'" +fi + +final_test_files=() + +for test_file in "${expanded_test_files[@]}"; do + exclude=false + for pattern in "${EXCLUDE_PATTERNS[@]}"; do + if [[ "$(basename "$test_file")" == "$(basename "$pattern")" ]]; then + exclude=true + break + fi + done + if [ "$exclude" = false ]; then + final_test_files+=("$test_file") + fi +done + +# Initialize counters for test +failures=0 +passed=0 +SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" + + +for test_file in "${final_test_files[@]}"; do + echo "Running: ${test_file}" + log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log + log_file="${LOG_DIRECTORY}/${log_file_name}" + # run the tests and save them as *.log + pytest "${test_file}" --capture=tee-sys | tee "${log_file}" + exit_code=${PIPESTATUS[0]} + echo $exit_code + # write number of tests passed and failed + if [ $exit_code -eq 0 ]; then + echo "${test_file}: PASSED" >> "${SUMMARY_FILE}" + ((passed++)) + else + echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}" + ((failures++)) + fi + echo "" +done diff --git 
a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml new file mode 100644 index 000000000..a36411d73 --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -0,0 +1,66 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + # the job will run for 20 mins, as we can't set max_steps + activeDeadlineSeconds: 1200 + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn-fuji-model + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_disable_hlo_passes=rematerialization} + + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + export TF_GPU_ALLOCATOR=cuda_malloc_async + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + + python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git 
a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml new file mode 100644 index 000000000..8f70908da --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + test-axlearn.sh \ + --directory "." \ + --output "/opt/output/" \ + --test-files "/opt/axlearn/axlearn/common/*_test.py" + + sync + wait + # after execution flag the results have been produced + touch /opt/output/done + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + - name: upload + image: amazon/aws-cli + env: + - name: RUN_ID + value: PLACEHOLDER + command: + - sh + - -c + - | + while [ ! -f /opt/output/done ]; do + sleep 5 + done + # Upload to S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + # Upload logs to S3 bucket + aws s3 cp /opt/output/ s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/ --recursive --exclude "*" --include "*.log" + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 4fcf3e563..8ed17d9d6 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -176,9 +176,23 @@ jobs: URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} secrets: inherit + build-axlearn: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-axlearn-build + BADGE_FILENAME: badge-axlearn-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: axlearn + DOCKERFILE: .github/container/Dockerfile.axlearn + RUNNER_SIZE: large + secrets: inherit + collect-docker-tags: runs-on: ubuntu-22.04 - if: "!cancelled()" + if: ${{ !cancelled() }} needs: - build-base - build-jax @@ -189,6 +203,7 @@ jobs: - build-upstream-t5x - build-rosetta-t5x - build-gemma + - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} steps: @@ -206,6 +221,7 @@ jobs: {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ @@ -214,6 +230,7 @@ jobs: {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ ] @@ -376,9 +393,9 @@ jobs: *-execution-combine.log secrets: inherit - # test-nsys-jax generates several fresh .zip archive outputs 
by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed + #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + #not already have nsys-jax installed test-nsys-jax-archive: needs: test-nsys-jax if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -413,9 +430,8 @@ jobs: runs-on: eks env: JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess steps: - name: Check out the repository uses: actions/checkout@v4 @@ -425,59 +441,37 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | select(di == 1).spec.template.spec.containers[0].image = 
strenv(JAX_DOCKER_IMAGE) | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ .github/eks-workflow-files/job.yml git diff .github/eks-workflow-files/job.yml - name: Submit Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/job.yml - - name: Wait for Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 2 - done - - name: Stream Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax - # Clean up in case of errors as well as success - - name: Delete Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} - name: Configure post-processing job run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post-processing Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml - - name: Wait for post-processing Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; 
do - sleep 2 - done - - name: Stream post-processing Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # Clean up in case of errors as well as success - - name: Delete post-processing Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + - name: Submit post process Kubernetes job + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} # test-equinox: # needs: build-equinox @@ -582,7 +576,7 @@ jobs: ARTIFACTS: | test-levanter.log secrets: inherit - + # test-te: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -643,3 +637,136 @@ jobs: with: MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} secrets: inherit + + test-axlearn-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: axlearn-${{ github.run_id }} + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr + - name: Configure axlearn test job + run: | + # Replace placeholders in axlearn-job.yml with environment variables + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 
0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + .github/eks-workflow-files/axlearn/axlearn-job.yml + git diff .github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" + job-name: ${{ env.JOB_NAME }} + - name: Download logs from S3 + id: log-s3 + run: | + mkdir -p axlearn-output + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log" + + passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + + echo "Passed tests: $passed_tests" + echo "Failed tests: $failed_tests" + echo "Total tests: $total_tests" + echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT + echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT + - name: Generate sitrep + id: sitrep + if: ${{ !cancelled() }} + shell: bash -x -e {0} + run: | + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='Axlearn EKS Unit' + + total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ + failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ + passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ + errors="0" \ + summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ + badge_message="Passed $passed_tests out of $total_tests." 
\ + badge_color="brightgreen" + if [ "$failed_tests" -gt 0 ]; then + badge_color="red" + fi \ + + to_json \ + summary \ + errors total_tests passed_tests failed_tests \ + badge_label badge_color badge_message \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="Passed $passed_tests out of $total_tests." \ + color=$badge_color \ + to_json schemaVersion label message color \ + > badge-axlearn-test.json + + - name: Upload artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: "artifact-axlearn-test" + path: | + sitrep.json + badge-axlearn-test.json + axlearn-output/* + + # the fuji test will run for 20 minutes only, as per 2025-02-24 + # is not possible to set the `max_steps` value + # this will be done with a customer python code + test-axlearn-fuji-models-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr + - name: Configure axlearn test job + run: | + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: 
".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" + job-name: ${{ env.JOB_NAME }} + diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 54da0886e..f8b328b76 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -47,31 +47,30 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - # Replace underscores in TEST_NAME with - to make a valid Kubernetes name + - name: Create env vars + id: var + shell: bash + run: | JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}" LAUNCHER_NAME="${JOB_NAME}-launcher" TOKEN_NAME="${JOB_NAME}-token" # Make these available to later steps echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV" echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV" - echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV" - kubectl create secret generic \ - ${TOKEN_NAME} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | export WORKER_NAME="${JOB_NAME}-worker" yq -i '.metadata.name = strenv(JOB_NAME) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = 
strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/mpi-nccl-test.yml git diff .github/eks-workflow-files/mpi-nccl-test.yml - name: Submit Kubernetes job @@ -126,6 +125,3 @@ jobs: - name: Delete Kubernetes job if: always() run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${TOKEN_NAME} diff --git a/README.md b/README.md index 648208205..83053215e 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We support and test the following JAX frameworks and model architectures. More d | [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` | | [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` | | levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` | +| axlearn | Fuji | pretraining | `ghcr.io/nvidia/jax:axlearn` | + # Build Pipeline Status @@ -248,6 +249,30 @@ We support and test the following JAX frameworks and model architectures. More d + + + + + +
+ + + + + ghcr.io/nvidia/jax:axlearn + + + + +
+ + + +
+ + + +
diff --git a/docs/frameworks/axlearn/README.md b/docs/frameworks/axlearn/README.md new file mode 100644 index 000000000..ad7172ca7 --- /dev/null +++ b/docs/frameworks/axlearn/README.md @@ -0,0 +1,40 @@ +# AXLearn +[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. + + +## Hardware and Software Specifications + +The functionality has been validated on an AWS p5.48xlarge EKS cluster (8x H100 80G). + + +## Containers +We provide a multi-architecture container that is regularly updated. Use these containers to avoid dependency and environment issues. +- Latest container: ghcr.io/nvidia/jax:axlearn +- Nightly dated container: ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD + +When you start an interactive session: + +- Navigate to `/opt/axlearn` inside the container. +- Place your persistent files in a mounted directory (e.g. `/opt/axlearn/workspace`). + +## Launching a container +Use the following command to launch a container: +```bash +docker run -ti --gpus=all --net=host --ipc=host -v <WORKSPACE_PATH>:/opt/axlearn/workspace -w /opt/axlearn <container> /bin/bash +``` +where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the axlearn container. You can additionally add dataset and vocab paths with the `-v` flag. + +## Example: training `fuji-3B-v3-flash-single-host` on EKS +[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml) we're using for deploying the training of the Fuji-3B model, which uses flash attention and runs on a single host. The core part of the deployment is: +```bash +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu +``` +Where `CONFIG="fuji-3B-v3-flash-single-host"`.
The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4). + +## Testing +[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-job.yml) used for testing AXLearn functionalities. In particular, this test makes use of the [`test-axlearn.sh` script](../../../.github/container/test-axlearn.sh). The test runs `pytest` against all the tests contained in the `/opt/axlearn/axlearn/common` folder.