Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
98 commits
Select commit Hold shift + click to select a range
5bea679
start drafting support for axlearn
Steboss Jan 31, 2025
807d3df
fix test for axlearn
Steboss Feb 3, 2025
9947c08
add build for axlearn
Steboss Feb 3, 2025
ef775d5
install dependencies
Steboss Feb 4, 2025
15927d2
check tests
Steboss Feb 4, 2025
af9dad3
make the bash script executable
Steboss Feb 6, 2025
9e4d4a5
minimal ci to test axlearn
Steboss Feb 7, 2025
a3b8f26
fix requirements
Steboss Feb 10, 2025
94054fd
fix installation from pip
Steboss Feb 10, 2025
b9e893c
remove the nvidia-cunn package
Steboss Feb 10, 2025
4088e2f
fix input for tests
Steboss Feb 10, 2025
15781cb
fix test and create output
Steboss Feb 11, 2025
031cfb0
fix requirements
Steboss Feb 11, 2025
1fce714
setup for running axlearn tests on k8s
Steboss Feb 13, 2025
89d388c
fix indentation
Steboss Feb 13, 2025
6c47cf5
what an error
Steboss Feb 13, 2025
44ffb6e
add the k8s option
Steboss Feb 13, 2025
a926bf9
try a test with 5 files and avoid postprocessing on k8s
Steboss Feb 13, 2025
6349f44
fix test
Steboss Feb 14, 2025
004ed78
remove postprocess
Steboss Feb 14, 2025
cf85814
reusable actions test
Steboss Feb 14, 2025
f32ee76
fix
Steboss Feb 14, 2025
04c6cf9
test on single piece
Steboss Feb 14, 2025
cfc68db
add checkout
Steboss Feb 14, 2025
65eca97
restart ci
Steboss Feb 14, 2025
580bf73
general clean up
Steboss Feb 14, 2025
165eb87
Merge branch 'main' into sbosisio/support_axlearn
Steboss Feb 14, 2025
3cd5b78
Fix nsys
Steboss Feb 15, 2025
c7de742
Merge branch 'sbosisio/support_axlearn' of github.com:NVIDIA/JAX-Tool…
Steboss Feb 15, 2025
51307d9
fix typo
Steboss Feb 16, 2025
8d7af61
test on eks
Steboss Feb 17, 2025
ca15908
forgot the done for
Steboss Feb 17, 2025
9fe301c
move ghcr deletion a part
Steboss Feb 17, 2025
9125c82
try to replace postprocess
Steboss Feb 17, 2025
4b39c9c
fix nccl test
Steboss Feb 17, 2025
9516183
fix errors
Steboss Feb 17, 2025
cbee8bb
fix typo
Steboss Feb 17, 2025
8aed044
make a test with 5 files
Steboss Feb 18, 2025
91a2bf7
fix conflicts
Steboss Feb 18, 2025
1a97746
fix comments
Steboss Feb 18, 2025
715911c
Merge branch 'main' into sbosisio/support_axlearn
Steboss Feb 18, 2025
852d381
test axlearn
Steboss Feb 18, 2025
c4d3bbf
fix nccl test variables, install in test file, make a signal for test…
Steboss Feb 18, 2025
4b5a56b
Fix var output
Steboss Feb 18, 2025
d205f6a
test clean
Steboss Feb 18, 2025
8a9de05
fix test
Steboss Feb 18, 2025
5a0bb04
remove always
Steboss Feb 18, 2025
d7fb8c3
indentention error
Steboss Feb 18, 2025
d3500bd
fix runner size
Steboss Feb 18, 2025
569fb5f
try with post step
Steboss Feb 20, 2025
0de66b0
build axlearn with tensorflow-cpu
Steboss Feb 20, 2025
8fbacde
placeholder for models on eks
Steboss Feb 21, 2025
026b37a
test a setup for running fuji 1B on slurm
Steboss Feb 21, 2025
2dd21ad
fix naming
Steboss Feb 21, 2025
e350434
fix indt
Steboss Feb 21, 2025
2c8409d
set k8s jobs to run for 20 min
Steboss Feb 24, 2025
4777359
try a test on fuji 7B params
Steboss Feb 24, 2025
4518ec5
Merge branch 'main' into sbosisio/support_axlearn
Steboss Feb 24, 2025
1f3e1e4
upload test script for testing
Steboss Feb 24, 2025
ab1ff8a
Merge branch 'sbosisio/support_axlearn' of github.com:NVIDIA/JAX-Tool…
Steboss Feb 24, 2025
ea2a265
reset the 7B
Steboss Feb 24, 2025
5fd3400
address comments
Steboss Feb 24, 2025
d680e66
fix path for git
Steboss Feb 24, 2025
c200dea
fix error in bash
Steboss Feb 24, 2025
0b1a61f
fix the 3B model run on k8s
Steboss Feb 25, 2025
5693a5c
@olupton comments
Steboss Feb 26, 2025
64c646f
fix errors
Steboss Feb 26, 2025
9009dc4
test uuidgen
Steboss Feb 26, 2025
5604763
test with random
Steboss Feb 26, 2025
9d53298
no shell needed
Steboss Feb 26, 2025
2eba3b7
revert test nccl and simplify the submit k8s
Steboss Feb 26, 2025
900ebb2
Fix the nccl test
Steboss Feb 26, 2025
a5b8e08
do not add the shell
Steboss Feb 26, 2025
43f75a6
correct typos
Steboss Feb 26, 2025
e3a9e4e
fix the fuji eks model
Steboss Feb 26, 2025
785f8ae
remove k8s
Steboss Feb 27, 2025
fe70b0f
Merge branch 'main' into sbosisio/support_axlearn
Steboss Feb 27, 2025
7c2da3f
remove test-fuji.sh, test with slurm
Steboss Feb 27, 2025
4f1f9f7
Merge branch 'sbosisio/support_axlearn' of github.com:NVIDIA/JAX-Tool…
Steboss Feb 27, 2025
d2823b8
try to not install seqio for tensorflow
Steboss Feb 27, 2025
5afc8d9
recommit seqio
Steboss Feb 27, 2025
bbe8c3b
substitute tensorflow with cpu one
Steboss Feb 28, 2025
f711efc
fix the test
Steboss Feb 28, 2025
faf0b83
fix installation process
Steboss Feb 28, 2025
465264b
Merge branch 'main' into sbosisio/support_axlearn
Steboss Mar 3, 2025
b2579cb
@olupton comments work
Steboss Mar 3, 2025
fc64bbd
fix typo
Steboss Mar 3, 2025
7f186cc
fix readme, and copy of zip file, and xla flags
Steboss Mar 3, 2025
63ace5d
fix test error
Steboss Mar 3, 2025
ec6b548
run small test
Steboss Mar 3, 2025
68c0010
change with zip
Steboss Mar 3, 2025
46b6c9e
change with zip
Steboss Mar 3, 2025
7ef84c4
fix the copy
Steboss Mar 3, 2025
828073c
fixed tests and comments @olupton
Steboss Mar 3, 2025
3818333
Merge branch 'main' into sbosisio/support_axlearn
Steboss Mar 4, 2025
97f0215
fix ci typo
Steboss Mar 4, 2025
f1fbff2
Fix test-nsys-jax-eks
Steboss Mar 4, 2025
626d1a7
fix names in CI
Steboss Mar 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/actions/store-delete-k8s-ghcr/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Composite action: stores the runner's Docker credentials as a Kubernetes
# image-pull secret and removes that secret again in the step's post phase.
name: Store & Delete GHCR Token
description: Store and delete the Docker credentials for pulling from GHCR

outputs:
  token-name:
    description: Name of the K8s secret to delete
    value: ${{ steps.token.outputs.token-name }}

runs:
  using: "composite"
  steps:
    # The name is built from $RANDOM plus the run id/attempt (not a real UUID).
    - name: Generate a unique secret name
      shell: bash
      id: token
      run: |
        echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> "$GITHUB_OUTPUT"
    - name: Store and delete GitHub Container Registry token
      uses: ./.github/actions/with-post-step
      with:
        main: |
          # Store GitHub Container Registry token as Kubernetes secret
          kubectl create secret generic \
            ${{ steps.token.outputs.token-name }} \
            --from-file=.dockerconfigjson="$HOME/.docker/config.json" \
            --type=kubernetes.io/dockerconfigjson
        post: |
          # --ignore-not-found: do not fail the cleanup when the secret was
          # never created (e.g. the main command above failed).
          kubectl delete secret ${{ steps.token.outputs.token-name }} --ignore-not-found
37 changes: 37 additions & 0 deletions .github/actions/submit-delete-k8s-job/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Composite action: applies a Kubernetes job manifest, waits for the job to
# start, streams its logs, and deletes the manifest's resources in the step's
# post phase.
name: Submit & Delete K8s Job
description: Submit and delete a K8s job after its execution

inputs:
  job-name:
    description: The job name
    required: true
  job-config-file:
    description: Path to the Kubernetes job YAML
    required: true

runs:
  using: "composite"
  steps:
    - name: Submit and Delete Kubernetes job
      uses: ./.github/actions/with-post-step
      with:
        # NOTE: the script below has no `set -e`, so a failing command does not
        # stop the following ones from running.
        main: |
          echo "Submit K8s job"
          kubectl apply -f "${{ inputs.job-config-file }}"

          # Wait for job to be created
          kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s

          # Wait for job to be unsuspended
          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=7200s

          # Wait for pods to be running
          kubectl wait --for=condition=Ready \
            --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \
            --timeout=600s pod

          # Stream logs
          kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}

        post: |
          # --ignore-not-found: keep the cleanup from failing when the job was
          # never created (e.g. `kubectl apply` above failed).
          kubectl delete -f "${{ inputs.job-config-file }}" --ignore-not-found
42 changes: 42 additions & 0 deletions .github/actions/with-post-step/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# ==================================================================================================================== #
# Authors: #
# Patrick Lehmann #
# Unai Martinez-Corral #
# #
# ==================================================================================================================== #
# Copyright 2020-2024 The pyTooling Authors #
# #
# Licensed under the Apache License, Version 2.0 (the "License"); #
# you may not use this file except in compliance with the License. #
# You may obtain a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
# #
# SPDX-License-Identifier: Apache-2.0 #
# ==================================================================================================================== #
# Action metadata: runs one command as the step's main phase and registers a
# second command that is executed as the step's post phase.
name: With post step

description: 'Generic JS Action to execute a main command and set a command as a post step.'

inputs:
# Command/script executed in the main phase.
main:
description: 'Main command/script.'
required: true
# Command/script executed in the post phase.
post:
description: 'Post command/script.'
required: true
# Optional; main.js upper-cases this value and uses it as the name of the
# state variable (STATE_<KEY>) that tells the post phase from the main phase.
key:
description: 'Name of the state variable used to detect the post step.'
required: false
default: POST

runs:
using: 'node20'
# The same script is the entry point for both phases.
main: 'main.js'
post: 'main.js'
46 changes: 46 additions & 0 deletions .github/actions/with-post-step/main.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/* ================================================================================================================== *
* Authors: *
* Unai Martinez-Corral *
* *
* ================================================================================================================== *
* Copyright 2021-2022 Unai Martinez-Corral <[email protected]> *
* Copyright 2022 Unai Martinez-Corral <[email protected]> *
* *
* Licensed under the Apache License, Version 2.0 (the "License"); *
* you may not use this file except in compliance with the License. *
* You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
* *
* SPDX-License-Identifier: Apache-2.0 *
* ================================================================================================================== *
* *
* Context: *
* * https://github.com/docker/login-action/issues/72 *
* * https://github.com/actions/runner/issues/1478 *
* ================================================================================================================== */
const { spawn } = require("child_process");
const { appendFileSync } = require("fs");
const { EOL } = require("os");

/**
 * Run `cmd` through the system shell, sharing this process's stdio, and make
 * this process exit with the child's outcome.
 *
 * The result is reported asynchronously through `process.exitCode`:
 *   - the child's exit code when it exited on its own;
 *   - 1 when the child was terminated by a signal (the 'exit' event then
 *     carries `code === null`, which would otherwise be reported as success);
 *   - 1 when the child could not be spawned at all.
 *
 * @param {string} cmd Command line (possibly multi-line script) to execute.
 */
function run(cmd) {
  const subprocess = spawn(cmd, { stdio: "inherit", shell: true });

  // Spawn failures are delivered as 'error'; report them as a failed step
  // instead of crashing with an unhandled 'error' event.
  subprocess.on("error", (err) => {
    console.error(`Failed to run command: ${err.message}`);
    process.exitCode = 1;
  });

  subprocess.on("exit", (exitCode, signal) => {
    if (exitCode === null) {
      // Terminated by a signal: there is no exit code to forward.
      console.error(`Command terminated by signal ${signal}`);
      process.exitCode = 1;
      return;
    }
    process.exitCode = exitCode;
  });
}

// Name of the state variable that marks "the main phase has already run".
const stateKey = process.env.INPUT_KEY.toUpperCase();

// The state variable is only visible (as STATE_<KEY>) once an earlier
// invocation has written it to the GITHUB_STATE file.
const isPostStep = process.env[`STATE_${stateKey}`] !== undefined;

if (!isPostStep) {
  // Main phase: record the marker first, then execute the main command.
  appendFileSync(process.env.GITHUB_STATE, `${stateKey}=true${EOL}`);
  run(process.env.INPUT_MAIN);
} else {
  // Post phase: execute the clean-up command.
  run(process.env.INPUT_POST);
}
44 changes: 44 additions & 0 deletions .github/container/Dockerfile.axlearn
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main
ARG SRC_PATH_AXLEARN=/opt/axlearn

###############################################################################
## Download source and configure dependencies
###############################################################################
FROM ${BASE_IMAGE} AS mealkit
ARG URLREF_AXLEARN
ARG SRC_PATH_AXLEARN

RUN git-clone.sh "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}"

# these packages are needed to run axlearn tests
# https://github.com/apple/axlearn/blob/main/pyproject.toml as reference
# The first line (re)creates the requirements file with an editable install of
# the cloned source; the pinned test dependencies are appended after it.
RUN <<"EOF" bash -ex
echo "-e ${SRC_PATH_AXLEARN}" > /opt/pip-tools.d/requirements-axlearn.in
cat <<REQUIREMENTS >> /opt/pip-tools.d/requirements-axlearn.in
aqtp==0.8.2
einops==0.8.0
nltk==3.7
portpicker==1.6.0
seqio==0.0.18
protobuf==3.20.3
pytest>=7.4.3
REQUIREMENTS
EOF


###############################################################################
## Add test script to the path
###############################################################################

# COPY instead of ADD: this is a plain local file, so none of ADD's extra
# behaviours (remote URLs, archive auto-extraction) are wanted.
COPY test-axlearn.sh /usr/local/bin/

###############################################################################
## Install accumulated packages from the base image and the previous stage
###############################################################################
FROM mealkit AS final

RUN pip-finalize.sh

WORKDIR ${SRC_PATH_AXLEARN}
6 changes: 6 additions & 0 deletions .github/container/pip-finalize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then
exit 1
fi

# On amd64 only: rewrite a bare "tensorflow==X" pin in requirements.txt to
# "tensorflow-cpu==X"; every other architecture leaves the file untouched.
case "$(uname -m)" in
  x86_64)
    sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt
    ;;
  *)
    echo "Skipping TF on $(uname -m)"
    ;;
esac
# pip-sync still reports conflicts at this point, hence --no-deps
pip-sync --pip-args '--no-deps --src /opt' requirements.txt

Expand Down
169 changes: 169 additions & 0 deletions .github/container/test-axlearn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/bin/bash

set -uo pipefail

#######################################
# Print the command-line help and terminate the script.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; exits with status 1
#######################################
usage() {
  cat <<EOF
Run tests in axlearn with specified options.

Usage: $0 [OPTIONS]

 OPTIONS DESCRIPTION
 -d, --directory DIR Directory to run tests in.
 Default: 'axlearn/axlearn/common'.
 -t, --test-files FILES Pattern for test files to run.
 Default: '*_test.py'.
 -o, --output DIRECTORY Output directory for logs and summary.
 Default: 'test_runs/<timestamp>'.
 -h, --help Show this help message and exit.
EOF
  exit 1
}

# Default values
DIR='axlearn/axlearn/common'
TEST_FILES=()
OUTPUT_DIRECTORY=''

#######################################
# Parse the command-line options into the script-level settings.
# Globals:   DIR, TEST_FILES, OUTPUT_DIRECTORY (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   echoes the collected --test-files values and any error message
#            on stdout
# Returns:   0 when all arguments were consumed; on a missing value, an
#            unknown option or --help it calls usage, which exits the script
#######################################
parse_args() {
  local key
  while [[ $# -gt 0 ]]; do
    key="$1"
    case "$key" in
      -d|--directory)
        # "${2:-}" instead of "$2": with `set -u` a missing value would
        # otherwise abort with "unbound variable" before the error is shown.
        if [[ -z "${2:-}" ]]; then
          echo "Error: --directory requires an argument."
          usage
        fi
        DIR="$2"
        shift 2
        ;;
      -t|--test-files)
        shift
        # Collect all arguments until the next option (starting with '-')
        if [[ $# -eq 0 ]]; then
          echo "Error: --test-files requires at least one file pattern."
          usage
        fi
        echo "Option -t|--test-files with arguments:"
        while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do
          echo " $1"
          TEST_FILES+=("$1")
          shift
        done
        ;;
      -o|--output)
        # Same `set -u` guard as for --directory.
        if [[ -z "${2:-}" ]]; then
          echo "Error: --output requires an argument."
          usage
        fi
        OUTPUT_DIRECTORY="$2"
        shift 2
        ;;
      -h|--help)
        usage
        ;;
      *)
        echo "Unknown option: $1"
        usage
        ;;
    esac
  done
}

parse_args "$@"


# Fall back to a timestamped directory when no output location was given.
if [ -z "$OUTPUT_DIRECTORY" ]; then
  timestamp=$(date +%Y%m%d_%H%M%S)
  OUTPUT_DIRECTORY="test_runs/${timestamp}"
fi

# Anchor a relative output path to the directory the script was started from.
# The script changes into $DIR further down; without this, the log and summary
# paths used after the `cd` would point at a directory that was never created.
case "$OUTPUT_DIRECTORY" in
  /*) ;;
  *) OUTPUT_DIRECTORY="${PWD}/${OUTPUT_DIRECTORY}" ;;
esac
LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs"

# Without the log directory no result can be recorded, so stop right away.
if ! mkdir -p "${LOG_DIRECTORY}"; then
  echo "Error: cannot create log directory '${LOG_DIRECTORY}'." >&2
  exit 1
fi

# Print out config for sanity check
echo "Configuration:"
echo " Directory: $DIR"
if [ "${#TEST_FILES[@]}" -gt 0 ]; then
  echo " Test Files:"
  for f in "${TEST_FILES[@]}"; do
    echo " $f"
  done
else
  echo " Test Files Pattern: '*_test.py' (default)"
fi
echo " Output Directory: $OUTPUT_DIRECTORY"

# Everything below (pattern expansion, pytest) runs relative to the test
# directory.
cd "$DIR" || exit 1

printf '%s\n' "Running tests..."

# Extra packages installed at test time; torch/torchvision are taken from the
# PyTorch CPU wheel index.
torch_packages=(torch torchvision)
pip install "${torch_packages[@]}" --index-url https://download.pytorch.org/whl/cpu

extra_packages=(timm transformers scikit-learn)
pip install "${extra_packages[@]}"


# No explicit selection: run every test module in the current directory.
if [ "${#TEST_FILES[@]}" -eq 0 ]; then
  TEST_FILES=("*_test.py")
fi

# Turn nullglob on for the expansion below so that a wildcard pattern matching
# nothing expands to nothing. Without it the literal pattern is kept as a
# "file", which hides the warning and the "no test files" check. The previous
# setting is restored afterwards.
if shopt -q nullglob; then
  restore_nullglob=false
else
  restore_nullglob=true
  shopt -s nullglob
fi

expanded_test_files=()
for pattern in "${TEST_FILES[@]}"; do
  # Deliberately unquoted: the shell performs the pathname expansion.
  # Words without wildcard characters are kept as they are.
  # shellcheck disable=SC2206
  files=( $pattern )
  if [ "${#files[@]}" -gt 0 ]; then
    expanded_test_files+=( "${files[@]}" )
  else
    echo "Warning: No files matched pattern '$pattern'"
  fi
done

if [ "$restore_nullglob" = true ]; then
  shopt -u nullglob
fi

if [ "${#expanded_test_files[@]}" -eq 0 ]; then
  echo "No test files found to run."
  exit 1
fi

#######################################
# Print the path of the exclusion list belonging to a test directory.
# The script has already changed into the test directory at this point, so a
# relative directory must not be appended to the current directory again;
# the list is then looked up in the current directory instead.
# Arguments: $1 - test directory as given on the command line
# Outputs:   path of exclusion_list.txt on stdout
#######################################
exclusion_list_path() {
  local dir="$1"
  case "$dir" in
    /*) printf '%s\n' "${dir}/exclusion_list.txt" ;;
    *)  printf '%s\n' "${PWD}/exclusion_list.txt" ;;
  esac
}

#######################################
# Copy every expanded test file whose base name is not on the exclusion list
# into final_test_files.
# Globals:   expanded_test_files, EXCLUDE_PATTERNS (read)
#            final_test_files (written)
# Arguments: none
#######################################
filter_excluded_tests() {
  local -A excluded_names=()
  local entry name test_file

  # Compute each exclusion's base name once instead of once per test file.
  for entry in ${EXCLUDE_PATTERNS[@]+"${EXCLUDE_PATTERNS[@]}"}; do
    name=$(basename -- "$entry")
    # Blank lines can never match a test file; skip them.
    if [[ -n "$name" ]]; then
      excluded_names["$name"]=1
    fi
  done

  final_test_files=()
  for test_file in ${expanded_test_files[@]+"${expanded_test_files[@]}"}; do
    name=$(basename -- "$test_file")
    if [[ -n "$name" && -n "${excluded_names[$name]+x}" ]]; then
      continue
    fi
    final_test_files+=("$test_file")
  done
}

# in case we have the exclusion list file
EXCLUDE_LIST_FILE="$(exclusion_list_path "$DIR")"
EXCLUDE_PATTERNS=()

if [ -f "$EXCLUDE_LIST_FILE" ]; then
  echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'"
  mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE"
else
  echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'"
fi

final_test_files=()
filter_excluded_tests

# Initialize counters for test
failures=0
passed=0
SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt"

#######################################
# Run one test file with pytest, store its output as a log file and record
# the outcome.
# Globals:   LOG_DIRECTORY, SUMMARY_FILE (read); passed, failures (updated)
# Arguments: $1 - test file to run
# Outputs:   pytest output and its exit code on stdout; the same pytest
#            output in <LOG_DIRECTORY>/<file with '/' replaced by '__'>.log;
#            one PASSED/FAILED line appended to SUMMARY_FILE
#######################################
run_test_file() {
  local test_file="$1"
  local log_stem log_file exit_code

  echo "Running: ${test_file}"
  # Flatten the path so that every log lands directly in LOG_DIRECTORY.
  log_stem="${test_file%.py}"
  log_file="${LOG_DIRECTORY}/${log_stem//\//__}.log"

  # run the tests and save them as *.log
  pytest "${test_file}" --capture=tee-sys | tee "${log_file}"
  # Status of pytest itself, not of tee; must be read right after the pipeline.
  exit_code=${PIPESTATUS[0]}
  echo "$exit_code"

  # write number of tests passed and failed
  if [ "$exit_code" -eq 0 ]; then
    echo "${test_file}: PASSED" >> "${SUMMARY_FILE}"
    passed=$((passed + 1))
  else
    echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}"
    failures=$((failures + 1))
  fi
  echo ""
}

for test_file in ${final_test_files[@]+"${final_test_files[@]}"}; do
  run_test_file "$test_file"
done

# Report the totals collected above. The script's exit status is not derived
# from them; the per-file results are in SUMMARY_FILE.
echo "Summary: ${passed} passed, ${failures} failed"
Loading
Loading