diff --git a/.github/actions/store-delete-k8s-ghcr/action.yml b/.github/actions/store-delete-k8s-ghcr/action.yml new file mode 100644 index 000000000..1d3acec18 --- /dev/null +++ b/.github/actions/store-delete-k8s-ghcr/action.yml @@ -0,0 +1,27 @@ +name: Store & Delete GHCR Token +description: Store and Delete the docker credentails for pulling from GHCR + +outputs: + token-name: + description: Name of the K8s secret to delete + value: ${{ steps.token.outputs.token-name }} + +runs: + using: "composite" + steps: + - name: Generate a UUID token + shell: bash + id: token + run: | + echo "token-name=${RANDOM}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" >> $GITHUB_OUTPUT + - name: Delete GitHub Container Registry token + uses: ./.github/actions/with-post-step + with: + main: | + # Store GitHub Container Registry token as Kubernetes secret + kubectl create secret generic \ + ${{ steps.token.outputs.token-name }} \ + --from-file=.dockerconfigjson=$HOME/.docker/config.json \ + --type=kubernetes.io/dockerconfigjson + post: | + kubectl delete secret ${{ steps.token.outputs.token-name }} diff --git a/.github/actions/submit-delete-k8s-job/action.yml b/.github/actions/submit-delete-k8s-job/action.yml new file mode 100644 index 000000000..dbeabe668 --- /dev/null +++ b/.github/actions/submit-delete-k8s-job/action.yml @@ -0,0 +1,37 @@ +name: Submit & Delete K8s Job +description: Submit and delete a K8s job after its execution + +inputs: + job-name: + description: The job name + required: true + job-config-file: + description: Path to the Kubernetes job YAML + required: true + +runs: + using: "composite" + steps: + - name: Submit and Delete Kubernetes job + uses: ./.github/actions/with-post-step + with: + main: | + echo "Submit K8s job" + kubectl apply -f "${{ inputs.job-config-file }}" + + # Wait for job to be craeted + kubectl wait --for=create job/${{ inputs.job-name }} --timeout=60s + + # Wait for job to be unsuspended + kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ 
inputs.job-name }} --timeout=7200s + + # Wait for pods to be running + kubectl wait --for=condition=Ready \ + --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \ + --timeout=600s pod + + # Stream logs + kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }} + + post: | + kubectl delete -f "${{ inputs.job-config-file }}" diff --git a/.github/actions/with-post-step/action.yml b/.github/actions/with-post-step/action.yml new file mode 100644 index 000000000..69c2a6eff --- /dev/null +++ b/.github/actions/with-post-step/action.yml @@ -0,0 +1,42 @@ +# ==================================================================================================================== # +# Authors: # +# Patrick Lehmann # +# Unai Martinez-Corral # +# # +# ==================================================================================================================== # +# Copyright 2020-2024 The pyTooling Authors # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +# # +# SPDX-License-Identifier: Apache-2.0 # +# ==================================================================================================================== # +name: With post step + +description: 'Generic JS Action to execute a main command and set a command as a post step.' + +inputs: + main: + description: 'Main command/script.' + required: true + post: + description: 'Post command/script.' 
+ required: true + key: + description: 'Name of the state variable used to detect the post step.' + required: false + default: POST + +runs: + using: 'node20' + main: 'main.js' + post: 'main.js' diff --git a/.github/actions/with-post-step/main.js b/.github/actions/with-post-step/main.js new file mode 100644 index 000000000..47a817cbc --- /dev/null +++ b/.github/actions/with-post-step/main.js @@ -0,0 +1,46 @@ +/* ================================================================================================================== * + * Authors: * + * Unai Martinez-Corral * + * * + * ================================================================================================================== * + * Copyright 2021-2022 Unai Martinez-Corral * + * Copyright 2022 Unai Martinez-Corral * + * * + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. 
* + * * + * SPDX-License-Identifier: Apache-2.0 * + * ================================================================================================================== * + * * + * Context: * + * * https://github.com/docker/login-action/issues/72 * + * * https://github.com/actions/runner/issues/1478 * + * ================================================================================================================== */ +const { spawn } = require("child_process"); +const { appendFileSync } = require("fs"); +const { EOL } = require("os"); + +function run(cmd) { + const subprocess = spawn(cmd, { stdio: "inherit", shell: true }); + subprocess.on("exit", (exitCode) => { + process.exitCode = exitCode; + }); +} + +const key = process.env.INPUT_KEY.toUpperCase(); + +if ( process.env[`STATE_${key}`] !== undefined ) { // Are we in the 'post' step? + run(process.env.INPUT_POST); +} else { // Otherwise, this is the main step + appendFileSync(process.env.GITHUB_STATE, `${key}=true${EOL}`); + run(process.env.INPUT_MAIN); +} \ No newline at end of file diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn new file mode 100644 index 000000000..ac73d07c6 --- /dev/null +++ b/.github/container/Dockerfile.axlearn @@ -0,0 +1,44 @@ +# syntax=docker/dockerfile:1-labs +ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git#main +ARG SRC_PATH_AXLEARN=/opt/axlearn + +############################################################################### +## Download source and configure dependencies +############################################################################### +FROM ${BASE_IMAGE} AS mealkit +ARG URLREF_AXLEARN +ARG SRC_PATH_AXLEARN + +RUN git-clone.sh "${URLREF_AXLEARN}" "${SRC_PATH_AXLEARN}" + +# these packages are needed to run axlearn tests +# https://github.com/apple/axlearn/blob/main/pyproject.toml as reference +RUN <<"EOF" bash -ex + echo "-e ${SRC_PATH_AXLEARN}" > 
/opt/pip-tools.d/requirements-axlearn.in + cat <<REQUIREMENTS >> /opt/pip-tools.d/requirements-axlearn.in +aqtp==0.8.2 +einops==0.8.0 +nltk==3.7 +portpicker==1.6.0 +seqio==0.0.18 +protobuf==3.20.3 +pytest>=7.4.3 +REQUIREMENTS +EOF + + +############################################################################### +## Add test script to the path +############################################################################### + +ADD test-axlearn.sh /usr/local/bin/ + +############################################################################### +## Install accumulated packages from the base image and the previous stage +############################################################################### +FROM mealkit AS final + +RUN pip-finalize.sh + +WORKDIR ${SRC_PATH_AXLEARN} diff --git a/.github/container/pip-finalize.sh b/.github/container/pip-finalize.sh index 6d8ceac9b..285da565c 100755 --- a/.github/container/pip-finalize.sh +++ b/.github/container/pip-finalize.sh @@ -46,6 +46,12 @@ if [[ $(echo -n "$unpinned_vcs_dependencies" | wc -l) -gt 0 ]]; then exit 1 fi +# Replace any tensorflow==X with tensorflow-cpu==X in requirements.txt only on amd64 +if [ "$(uname -m)" = "x86_64" ]; then + sed -i 's/^tensorflow==\([0-9.*]\+\)$/tensorflow-cpu==\1/' requirements.txt +else + echo "Skipping TF on $(uname -m)" +fi # --no-deps is required since conflicts can still appear during pip-sync pip-sync --pip-args '--no-deps --src /opt' requirements.txt diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh new file mode 100755 index 000000000..d1993cc03 --- /dev/null +++ b/.github/container/test-axlearn.sh @@ -0,0 +1,169 @@ +#!/bin/bash + +set -uo pipefail + +usage() { + echo "Run tests in axlearn with specified options." + echo "" + echo "Usage: $0 [OPTIONS]" + echo "" + echo " OPTIONS DESCRIPTION" + echo " -d, --directory DIR Directory to run tests in." + echo " Default: 'axlearn/axlearn/common'." + echo " -t, --test-files FILES Pattern for test files to run."
+ echo " Default: '*_test.py'." + echo " -o, --output DIRECTORY Output directory for logs and summary." + echo " Default: 'test_runs/'." + echo " -h, --help Show this help message and exit." + exit 1 +} + +# Default values +DIR='axlearn/axlearn/common' +TEST_FILES=() +OUTPUT_DIRECTORY='' + +# Parse args manually +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -d|--directory) + if [[ -z "$2" ]]; then + echo "Error: --directory requires an argument." + usage + fi + DIR="$2" + shift 2 + ;; + -t|--test-files) + shift + # Collect all arguments until the next option (starting with '-') + if [[ $# -eq 0 ]]; then + echo "Error: --test-files requires at least one file pattern." + usage + fi + echo "Option -t|--test-files with arguments:" + while [[ $# -gt 0 && ! "$1" =~ ^- ]]; do + echo " $1" + TEST_FILES+=("$1") + shift + done + ;; + -o|--output) + if [[ -z "$2" ]]; then + echo "Error: --output requires an argument." + usage + fi + OUTPUT_DIRECTORY="$2" + shift 2 + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $1" + usage + ;; + esac +done + + +if [ -z "$OUTPUT_DIRECTORY" ]; then + timestamp=$(date +%Y%m%d_%H%M%S) + OUTPUT_DIRECTORY="test_runs/${timestamp}" +fi +LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs" + +mkdir -p "${LOG_DIRECTORY}" + +# Print out config for sanity check +echo "Configuration:" +echo " Directory: $DIR" +if [ "${#TEST_FILES[@]}" -gt 0 ]; then + echo " Test Files:" + for f in "${TEST_FILES[@]}"; do + echo " $f" + done +else + echo " Test Files Pattern: '*_test.py' (default)" +fi +echo " Output Directory: $OUTPUT_DIRECTORY" + +cd "$DIR" || exit 1 + +echo "Running tests..." 
+ +pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu +pip install timm transformers scikit-learn + + +if [ "${#TEST_FILES[@]}" -eq 0 ]; then + TEST_FILES=("*_test.py") +fi + +expanded_test_files=() +for pattern in "${TEST_FILES[@]}"; do + # retrieve all the files + files=( $pattern ) + if [ "${#files[@]}" -gt 0 ]; then + expanded_test_files+=( "${files[@]}" ) + else + echo "Warning: No files matched pattern '$pattern'" + fi +done + +if [ "${#expanded_test_files[@]}" -eq 0 ]; then + echo "No test files found to run." + exit 1 +fi + +# in case we have the exclusion list file +EXCLUDE_LIST_FILE="$DIR/exclusion_list.txt" +EXCLUDE_PATTERNS=() + +if [ -f "$EXCLUDE_LIST_FILE" ]; then + echo "Reading exclusion list from '$EXCLUDE_LIST_FILE'" + mapfile -t EXCLUDE_PATTERNS < "$EXCLUDE_LIST_FILE" +else + echo "Exclusion list file not found at '$EXCLUDE_LIST_FILE'" +fi + +final_test_files=() + +for test_file in "${expanded_test_files[@]}"; do + exclude=false + for pattern in "${EXCLUDE_PATTERNS[@]}"; do + if [[ "$(basename "$test_file")" == "$(basename "$pattern")" ]]; then + exclude=true + break + fi + done + if [ "$exclude" = false ]; then + final_test_files+=("$test_file") + fi +done + +# Initialize counters for test +failures=0 +passed=0 +SUMMARY_FILE="${OUTPUT_DIRECTORY}/summary.txt" + + +for test_file in "${final_test_files[@]}"; do + echo "Running: ${test_file}" + log_file_name=$(echo "${test_file%.py}" | sed 's/\//__/g').log + log_file="${LOG_DIRECTORY}/${log_file_name}" + # run the tests and save them as *.log + pytest "${test_file}" --capture=tee-sys | tee "${log_file}" + exit_code=${PIPESTATUS[0]} + echo $exit_code + # write number of tests passed and failed + if [ $exit_code -eq 0 ]; then + echo "${test_file}: PASSED" >> "${SUMMARY_FILE}" + ((passed++)) + else + echo "${test_file}: FAILED (Exit code: $exit_code)" >> "${SUMMARY_FILE}" + ((failures++)) + fi + echo "" +done diff --git 
a/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml new file mode 100644 index 000000000..a36411d73 --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml @@ -0,0 +1,66 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + # the job will run for 20 mins, as we can't set max_steps + activeDeadlineSeconds: 1200 + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn-fuji-model + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + BASEDIR="/opt/axlearn" + CONFIG="fuji-3B-v3-flash-single-host" + HLO_DUMP=0 + POSTFIX="" + + AR_THRESHOLD=1073741824 + AG_THRESHOLD=8589934592 + RS_THRESHOLD=8589934592 + BASE_XLA_FLAGS=${BASE_XLA_FLAGS:---xla_gpu_enable_latency_hiding_scheduler=true + --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 + --xla_gpu_all_gather_combine_threshold_bytes=1073741824 + --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824 + --xla_gpu_enable_pipelined_all_gather=true + --xla_gpu_enable_pipelined_reduce_scatter=true + --xla_gpu_enable_pipelined_all_reduce=true + --xla_gpu_enable_while_loop_double_buffering=true + --xla_disable_hlo_passes=rematerialization} + + export XLA_FLAGS="$BASE_XLA_FLAGS ${XLA_FLAGS:-}" + export TF_GPU_ALLOCATOR=cuda_malloc_async + + LOG_DIR=${BASEDIR}/logs + TRAINER_DIR=${LOG_DIR}/${CONFIG}${POSTFIX}-eks/trainer-dir + mkdir -p ${TRAINER_DIR} + + + python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git 
a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml new file mode 100644 index 000000000..8f70908da --- /dev/null +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue +spec: + completions: 1 + parallelism: 1 + template: + spec: + restartPolicy: Never + containers: + - name: axlearn + image: PLACEHOLDER + command: + - bash + - -xo + - pipefail + - -c + - | + test-axlearn.sh \ + --directory "." \ + --output "/opt/output/" \ + --test-files "/opt/axlearn/axlearn/common/*_test.py" + + sync + wait + # after execution flag the results have been produced + touch /opt/output/done + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: output + mountPath: /opt/output + - name: upload + image: amazon/aws-cli + env: + - name: RUN_ID + value: PLACEHOLDER + command: + - sh + - -c + - | + while [ ! -f /opt/output/done ]; do + sleep 5 + done + # Upload to S3 bucket + aws s3 cp /opt/output/summary.txt s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/summary.txt + # Upload logs to S3 bucket + aws s3 cp /opt/output/ s3://jax-toolbox-eks-output/axlearn/${RUN_ID}/ --recursive --exclude "*" --include "*.log" + volumeMounts: + - name: output + mountPath: /opt/output + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: output + emptyDir: {} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 4fcf3e563..8ed17d9d6 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -176,9 +176,23 @@ jobs: URLREF_PANOPTICAPI=${{ fromJson(inputs.SOURCE_URLREFS).PANOPTICAPI }} secrets: inherit + build-axlearn: + needs: build-jax + uses: ./.github/workflows/_build.yaml + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-axlearn-build + BADGE_FILENAME: badge-axlearn-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ 
needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: axlearn + DOCKERFILE: .github/container/Dockerfile.axlearn + RUNNER_SIZE: large + secrets: inherit + collect-docker-tags: runs-on: ubuntu-22.04 - if: "!cancelled()" + if: ${{ !cancelled() }} needs: - build-base - build-jax @@ -189,6 +203,7 @@ jobs: - build-upstream-t5x - build-rosetta-t5x - build-gemma + - build-axlearn outputs: TAGS: ${{ steps.collect-tags.outputs.TAGS }} steps: @@ -206,6 +221,7 @@ jobs: {"flavor": "upstream-t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "t5x", "stage": "final", "priority": 900, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "gemma", "stage": "final", "priority": 900, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_FINAL }}"},\ + {"flavor": "axlearn", "stage": "final", "priority": 900, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\ {"flavor": "jax", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "triton", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-triton.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "equinox", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\ @@ -214,6 +230,7 @@ jobs: {"flavor": "upstream-t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "t5x", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_MEALKIT }}"},\ {"flavor": "gemma", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-gemma.outputs.DOCKER_TAG_MEALKIT }}"},\ + {"flavor": "axlearn", "stage": "mealkit", "priority": 500, "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\ {}\ ] @@ -376,9 +393,9 @@ jobs: *-execution-combine.log secrets: inherit - # test-nsys-jax generates several fresh .zip archive outputs 
by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed + #test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + #runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does + #not already have nsys-jax installed test-nsys-jax-archive: needs: test-nsys-jax if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -413,9 +430,8 @@ jobs: runs-on: eks env: JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - TOKEN_NAME: ${{ github.run_id }}-${{ github.run_attempt }}-token + JOB_NAME: ${{ github.run_id }}-nsys-jax + POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess steps: - name: Check out the repository uses: actions/checkout@v4 @@ -425,59 +441,37 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - kubectl create secret generic \ - ${{ github.run_id }}-${{ github.run_attempt }}-token \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | select(di == 1).spec.template.spec.containers[0].image = 
strenv(JAX_DOCKER_IMAGE) | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)' \ .github/eks-workflow-files/job.yml git diff .github/eks-workflow-files/job.yml - name: Submit Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/job.yml - - name: Wait for Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-jax --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do - sleep 2 - done - - name: Stream Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-jax - # Clean up in case of errors as well as success - - name: Delete Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-jax + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: .github/eks-workflow-files/job.yml + job-name: ${{ env.JOB_NAME }} - name: Configure post-processing job run: | export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ .github/eks-workflow-files/post-process-job.yml git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post-processing Kubernetes job - run: kubectl apply -f .github/eks-workflow-files/post-process-job.yml - - name: Wait for post-processing Kubernetes job to start - run: | - while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ github.run_id }}-${{ github.run_attempt }}-postprocess --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; 
do - sleep 2 - done - - name: Stream post-processing Kubernetes job output - run: kubectl logs --all-containers=true --all-pods=true --follow job/${{ github.run_id }}-${{ github.run_attempt }}-postprocess - # Clean up in case of errors as well as success - - name: Delete post-processing Kubernetes job - if: always() - run: kubectl delete job ${{ github.run_id }}-${{ github.run_attempt }}-postprocess - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${{ github.run_id }}-${{ github.run_attempt }}-token + - name: Submit post process Kubernetes job + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: .github/eks-workflow-files/post-process-job.yml + job-name: ${{ env.POSTPROCESS_JOB_NAME }} # test-equinox: # needs: build-equinox @@ -582,7 +576,7 @@ jobs: ARTIFACTS: | test-levanter.log secrets: inherit - + # test-te: # needs: build-upstream-pax # if: inputs.ARCHITECTURE == 'amd64' # arm64 runners n/a @@ -643,3 +637,136 @@ jobs: with: MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} secrets: inherit + + test-axlearn-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: axlearn-${{ github.run_id }} + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr + - name: Configure axlearn test job + run: | + # Replace placeholders in axlearn-job.yml with environment variables + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 
0).spec.template.spec.containers[1].env[0].value = "${{ github.run_id }}" + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + .github/eks-workflow-files/axlearn/axlearn-job.yml + git diff .github/eks-workflow-files/axlearn/axlearn-job.yml + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" + job-name: ${{ env.JOB_NAME }} + - name: Download logs from S3 + id: log-s3 + run: | + mkdir -p axlearn-output + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/summary.txt axlearn-output/ + aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/ axlearn-output/ --recursive --exclude "*" --include "*.log" + + passed_tests=$(grep -c ": PASSED" axlearn-output/summary.txt || true) + failed_tests=$(grep -c ": FAILED" axlearn-output/summary.txt || true) + total_tests=$((failed_tests + passed_tests)) + + echo "Passed tests: $passed_tests" + echo "Failed tests: $failed_tests" + echo "Total tests: $total_tests" + echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT + echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT + echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT + - name: Generate sitrep + id: sitrep + if: ${{ !cancelled() }} + shell: bash -x -e {0} + run: | + # bring in utility functions + source .github/workflows/scripts/to_json.sh + + badge_label='Axlearn EKS Unit' + + total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ + failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ + passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ + errors="0" \ + summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ + badge_message="Passed $passed_tests out of $total_tests." 
\ + badge_color="brightgreen" + if [ "$failed_tests" -gt 0 ]; then + badge_color="red" + fi \ + + to_json \ + summary \ + errors total_tests passed_tests failed_tests \ + badge_label badge_color badge_message \ + > sitrep.json + + schemaVersion=1 \ + label="${badge_label}" \ + message="Passed $passed_tests out of $total_tests." \ + color=$badge_color \ + to_json schemaVersion label message color \ + > badge-axlearn-test.json + + - name: Upload artifacts + if: ${{ !cancelled() }} + uses: actions/upload-artifact@v4 + with: + name: "artifact-axlearn-test" + path: | + sitrep.json + badge-axlearn-test.json + axlearn-output/* + + # the fuji test will run for 20 minutes only, as per 2025-02-24 + # is not possible to set the `max_steps` value + # this will be done with a customer python code + test-axlearn-fuji-models-eks: + needs: build-axlearn + if: inputs.ARCHITECTURE == 'amd64' + runs-on: eks + env: + AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} + steps: + - name: Check out the repository + uses: actions/checkout@v4 + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr + - name: Configure axlearn test job + run: | + yq -i ea ' + select(di == 0).metadata.name = strenv(JOB_NAME) + | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + + - name: Submit & delete axlearn test + uses: ./.github/actions/submit-delete-k8s-job + with: + job-config-file: 
".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" + job-name: ${{ env.JOB_NAME }} + diff --git a/.github/workflows/_test_nccl.yaml b/.github/workflows/_test_nccl.yaml index 54da0886e..f8b328b76 100644 --- a/.github/workflows/_test_nccl.yaml +++ b/.github/workflows/_test_nccl.yaml @@ -47,31 +47,30 @@ jobs: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Store GitHub Container Registry token as Kubernetes secret - run: | - # Replace underscores in TEST_NAME with - to make a valid Kubernetes name + - name: Create env vars + id: var + shell: bash + run: | JOB_NAME="nccl-test-${{ github.run_id }}-${{ github.run_attempt }}-${TEST_NAME//_/-}" LAUNCHER_NAME="${JOB_NAME}-launcher" TOKEN_NAME="${JOB_NAME}-token" # Make these available to later steps echo "JOB_NAME=${JOB_NAME}" >> "$GITHUB_ENV" echo "LAUNCHER_NAME=${LAUNCHER_NAME}" >> "$GITHUB_ENV" - echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV" - kubectl create secret generic \ - ${TOKEN_NAME} \ - --from-file=.dockerconfigjson=$HOME/.docker/config.json \ - --type=kubernetes.io/dockerconfigjson + - name: K8s GHCR store and delete token + id: store-token + uses: ./.github/actions/store-delete-k8s-ghcr - name: Configure Kubernetes job run: | export WORKER_NAME="${JOB_NAME}-worker" yq -i '.metadata.name = strenv(JOB_NAME) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].name = strenv(LAUNCHER_NAME) - | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME) + | .spec.mpiReplicaSpecs.Launcher.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" | .spec.mpiReplicaSpecs.Launcher.template.spec.containers[].command[3] = strenv(TEST_NAME) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].image = strenv(BASE_IMAGE) | .spec.mpiReplicaSpecs.Worker.template.spec.containers[].name = 
strenv(WORKER_NAME) - | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)' \ + | .spec.mpiReplicaSpecs.Worker.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ .github/eks-workflow-files/mpi-nccl-test.yml git diff .github/eks-workflow-files/mpi-nccl-test.yml - name: Submit Kubernetes job @@ -126,6 +125,3 @@ jobs: - name: Delete Kubernetes job if: always() run: kubectl delete -f .github/eks-workflow-files/mpi-nccl-test.yml - - name: Delete GitHub Container Registry token - if: always() - run: kubectl delete secret ${TOKEN_NAME} diff --git a/README.md b/README.md index 648208205..83053215e 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ We support and test the following JAX frameworks and model architectures. More d | [t5x](./rosetta/rosetta/projects/imagen) | Imagen | pre-training | `ghcr.io/nvidia/t5x:imagen-2023-10-02.v3` | | [big vision](./rosetta/rosetta/projects/paligemma) | PaliGemma | fine-tuning, evaluation | `ghcr.io/nvidia/jax:gemma` | | levanter | GPT, LLaMA, MPT, Backpacks | pretraining, fine-tuning | `ghcr.io/nvidia/jax:levanter` | +| axlearn | Fuji | pretraining | `ghcr.io/nvidia/jax:axlearn` | + # Build Pipeline Status @@ -248,6 +249,30 @@ We support and test the following JAX frameworks and model architectures. More d + + + + + +
+ + + + + ghcr.io/nvidia/jax:axlearn + + + + +
+ + + +
+ + + +
diff --git a/docs/frameworks/axlearn/README.md b/docs/frameworks/axlearn/README.md new file mode 100644 index 000000000..ad7172ca7 --- /dev/null +++ b/docs/frameworks/axlearn/README.md @@ -0,0 +1,40 @@ +# AXLearn +[AXLearn](https://github.com/apple/axlearn) is a deep learning design framework, built on top of JAX and XLA, to support the development of large-scale models. + + +## Hardware and Software Specifications + +The functionality has been validated on an AWS p5.48xlarge EKS cluster (8x H100 80G). + + +## Containers +We provide a multi-architecture container that is regularly updated. Use these containers to avoid dependency and environment issues. +- Latest container: ghcr.io/nvidia/jax:axlearn +- Nightly dated container: ghcr.io/nvidia/jax:axlearn-YYYY-MM-DD + +When you start an interactive session: + +- Navigate to `/opt/axlearn` inside the container. +- Place your persistent files in a mounted directory (e.g. `/opt/axlearn/workspace`). + +## Launching a container +Use the following command to launch a container: +```bash +docker run -ti --gpus=all --net=host --ipc=host -v <WORKSPACE_PATH>:/opt/axlearn/workspace -w /opt/axlearn <container> /bin/bash +``` +where `WORKSPACE_PATH` is the path to the directory where you would like to store any persistent files and `container` is the name of the axlearn container. You can additionally add dataset and vocab paths with the `-v` flag. + +## Example: training `fuji-3B-v3-flash-single-host` on EKS +[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml) we're using for deploying the training of the Fuji-3B model, which uses flash attention and runs on a single host. The core part of the deployment is: +```bash +python3 -m axlearn.common.launch_trainer_main \ + --module=text.gpt.c4_trainer \ + --config=${CONFIG} \ + --trainer_dir=${TRAINER_DIR} \ + --data_dir=gs://axlearn-public/tensorflow_datasets \ + --jax_backend=gpu +``` +Where `CONFIG="fuji-3B-v3-flash-single-host"`.
The input dataset is the public tensorflow [C4 dataset](https://www.tensorflow.org/datasets/catalog/c4). + +## Testing +[Here is the YAML file](../../../.github/eks-workflow-files/axlearn/axlearn-job.yml) used for testing AXLearn functionalities. In particular, this test makes use of the [`test-axlearn.sh` script](../../../.github/container/test-axlearn.sh). The test runs `pytest` against all the tests contained in the `/opt/axlearn/axlearn/common` folder.