diff --git a/.devops/Dockerfile.dev.frontend b/.devops/Dockerfile.dev.frontend
new file mode 100644
index 0000000..de054d6
--- /dev/null
+++ b/.devops/Dockerfile.dev.frontend
@@ -0,0 +1,12 @@
+FROM node:20-alpine
+
+WORKDIR /app
+
+COPY frontend/package*.json ./
+RUN npm ci
+
+COPY frontend/ ./
+
+EXPOSE 5173
+
+CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0"]
diff --git a/.devops/Dockerfile.frontend b/.devops/Dockerfile.frontend
new file mode 100644
index 0000000..70ca5aa
--- /dev/null
+++ b/.devops/Dockerfile.frontend
@@ -0,0 +1,22 @@
+FROM node:20-alpine AS build
+
+WORKDIR /app
+
+COPY frontend/package*.json ./
+RUN npm ci
+
+COPY frontend/ ./
+
+ARG VITE_API_BASE_URL=/api
+ENV VITE_API_BASE_URL=${VITE_API_BASE_URL}
+
+RUN npm run build
+
+FROM nginx:1.27-alpine AS runtime
+
+COPY .devops/nginx.conf /etc/nginx/conf.d/default.conf
+COPY --from=build /app/dist /usr/share/nginx/html
+
+EXPOSE 80
+
+CMD ["nginx", "-g", "daemon off;"]
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bf49286..992ef59 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,12 +1,10 @@
 name: CI
 
 on:
-  push:
-    branches: [master, dev]
   workflow_dispatch:
     inputs:
       image:
-        description: "Which image to build?"
+        description: "Which image to build? (cpp=C++ engine, cpu=PyTorch CPU, cuda=PyTorch CUDA, all=all three)"
         required: true
         type: choice
         options:
@@ -14,7 +12,7 @@ on:
           - cpu
           - cuda
           - all
-      push:
+      push_image:
         description: "Push to ghcr.io?"
         required: true
         default: "true"
@@ -23,10 +21,10 @@ on:
 
 env:
   REGISTRY: ghcr.io
-  IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix
 
 jobs:
 
+
   file-integrity:
     name: File integrity
     if: github.event_name == 'push'
@@ -44,9 +42,9 @@ jobs:
           failed=0
           for f in "${files[@]}"; do
             if [ -f "$f" ]; then
-              echo "✅  $f"
+              echo "PASS: $f"
             else
-              echo "❌  $f — MISSING"
+              echo "FAIL: $f -- MISSING"
               failed=1
             fi
           done
@@ -66,8 +64,8 @@ jobs:
           args: "check engine/ --ignore E501 --exit-zero"
 
 
-  build-cpp:
-    name: C++ compile check
+  build-binary-linux:
+    name: Binary (ubuntu-latest)
     if: github.event_name == 'push'
     runs-on: ubuntu-latest
     steps:
@@ -85,10 +83,43 @@ jobs:
       - name: Smoke test
         run: ./quadtrix --help || true
 
+      - name: Upload binary
+        uses: actions/upload-artifact@v4
+        with:
+          name: quadtrix-linux-amd64
+          path: quadtrix
+          retention-days: 7
+
+
+  build-binary-macos:
+    name: Binary (macos-14)
+    if: github.event_name == 'push'
+    runs-on: macos-14
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Compile main.cpp
+        run: |
+          g++ -std=c++17 -O3 \
+            -I. -Iinclude \
+            -o quadtrix main.cpp
+
+      - name: Smoke test
+        run: ./quadtrix --help || true
+
+      - name: Package binary
+        run: tar -czf quadtrix-macos-arm64.tar.gz quadtrix
+
+      - name: Upload binary
+        uses: actions/upload-artifact@v4
+        with:
+          name: quadtrix-macos-arm64
+          path: quadtrix-macos-arm64.tar.gz
+          retention-days: 7
 
   build-cpp-image:
-    name: Build — cpp
-    if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpp' || inputs.image == 'all')
+    name: "Build -- cpp (C++ engine - linux/amd64 + arm64)"
+    if: ${{ github.event_name == 'workflow_dispatch' && (inputs.image == 'cpp' || inputs.image == 'all') }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -99,8 +130,11 @@ jobs:
       - uses: docker/setup-qemu-action@v3
       - uses: docker/setup-buildx-action@v3
 
+      - name: Set lowercase image prefix
+        run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV
+
       - name: Login to GHCR
-        if: inputs.push == 'true'
+        if: ${{ inputs.push_image == 'true' }}
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
@@ -123,7 +157,7 @@ jobs:
           context: .
           file: .devops/Dockerfile.cpp
           platforms: linux/amd64,linux/arm64
-          push: ${{ inputs.push == 'true' }}
+          push: ${{ inputs.push_image == 'true' }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=gha,scope=cpp
@@ -131,8 +165,8 @@ jobs:
 
 
   build-cpu-image:
-    name: Build — cpu
-    if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpu' || inputs.image == 'all')
+    name: "Build -- cpu (PyTorch CPU - linux/amd64 + arm64)"
+    if: ${{ github.event_name == 'workflow_dispatch' && (inputs.image == 'cpu' || inputs.image == 'all') }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -143,8 +177,11 @@ jobs:
       - uses: docker/setup-qemu-action@v3
       - uses: docker/setup-buildx-action@v3
 
+      - name: Set lowercase image prefix
+        run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV
+
       - name: Login to GHCR
-        if: inputs.push == 'true'
+        if: ${{ inputs.push_image == 'true' }}
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
@@ -167,7 +204,7 @@ jobs:
           context: .
           file: .devops/Dockerfile
           platforms: linux/amd64,linux/arm64
-          push: ${{ inputs.push == 'true' }}
+          push: ${{ inputs.push_image == 'true' }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=gha,scope=cpu
@@ -175,8 +212,8 @@ jobs:
 
 
   build-cuda-image:
-    name: Build — cuda
-    if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cuda' || inputs.image == 'all')
+    name: "Build -- cuda (PyTorch CUDA - linux/amd64 only)"
+    if: ${{ github.event_name == 'workflow_dispatch' && (inputs.image == 'cuda' || inputs.image == 'all') }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -186,8 +223,11 @@ jobs:
 
       - uses: docker/setup-buildx-action@v3
 
+      - name: Set lowercase image prefix
+        run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV
+
       - name: Login to GHCR
-        if: inputs.push == 'true'
+        if: ${{ inputs.push_image == 'true' }}
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
@@ -210,8 +250,8 @@ jobs:
           context: .
           file: .devops/Dockerfile.backend
           platforms: linux/amd64
-          push: ${{ inputs.push == 'true' }}
+          push: ${{ inputs.push_image == 'true' }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
           cache-from: type=gha,scope=cuda
-          cache-to: type=gha,mode=max,scope=cuda
\ No newline at end of file
+          cache-to: type=gha,mode=max,scope=cuda
diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index ca9493f..0986534 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -1,66 +1,86 @@
-name: Release
+name: Docker Images
 
 on:
   workflow_dispatch:
     inputs:
+      image:
+        description: "Image variant to build"
+        required: true
+        type: choice
+        options:
+          - cpp
+          - cpu
+          - cuda
+          - all
       version:
-        description: "Version tag (e.g. 1.2.3)"
+        description: "Optional image tag for manual runs"
+        required: false
+      push_image:
+        description: "Push to ghcr.io"
         required: true
+        default: "true"
+        type: choice
+        options: ["true", "false"]
+
+concurrency:
+  group: docker-images-${{ github.ref }}
+  cancel-in-progress: true
 
 env:
   REGISTRY: ghcr.io
   IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix
 
 jobs:
-
-  build-binaries:
-    name: Binary (${{ matrix.os }})
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-22.04, macos-14]
-        include:
-          - os: ubuntu-22.04
-            artifact_name: quadtrix-linux-x64
-            binary: quadtrix
-          - os: macos-14
-            artifact_name: quadtrix-macos-arm64
-            binary: quadtrix
+  build-cpp-image:
+    name: Docker cpp
+    if: ${{ inputs.image == 'cpp' || inputs.image == 'all' }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
     steps:
       - uses: actions/checkout@v4
 
-      - name: Compile (Linux)
-        if: runner.os == 'Linux'
-        run: |
-          sudo apt-get update && sudo apt-get install -y g++
-          g++ -std=c++17 -O3 -march=native \
-              -I. -Iinclude \
-              -o ${{ matrix.binary }} main.cpp
-          strip ${{ matrix.binary }}
-
-      - name: Compile (macOS)
-        if: runner.os == 'macOS'
-        run: |
-          g++ -std=c++17 -O3 -march=native \
-              -I. -Iinclude \
-              -o ${{ matrix.binary }} main.cpp
-
-      - name: Package
-        run: |
-          mkdir dist
-          cp ${{ matrix.binary }} dist/
-          cp README.md LICENSE dist/
-          tar -czf ${{ matrix.artifact_name }}.tar.gz -C dist .
-
-      - name: Upload to Release
-        uses: softprops/action-gh-release@v2
+      - uses: docker/setup-qemu-action@v3
+      - uses: docker/setup-buildx-action@v3
+
+      - name: Set lowercase image prefix
+        run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV"
+
+      - name: Login to GHCR
+        if: ${{ inputs.push_image == 'true' }}
+        uses: docker/login-action@v3
         with:
-          tag_name: v${{ github.event.inputs.version }}
-          files: ${{ matrix.artifact_name }}.tar.gz
-          generate_release_notes: true
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.IMAGE_PREFIX }}-cpp
+          tags: |
+            type=ref,event=tag
+            type=sha,prefix=sha-
+            type=raw,value=${{ inputs.version }},enable=${{ inputs.version != '' }}
+            type=raw,value=latest,enable=${{ inputs.push_image == 'true' }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: .devops/Dockerfile.cpp
+          platforms: linux/amd64,linux/arm64
+          push: ${{ inputs.push_image == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha,scope=cpp
+          cache-to: type=gha,mode=max,scope=cpp
 
-  publish-images:
-    name: Publish Docker images
+  build-cpu-image:
+    name: Docker cpu
+    if: ${{ inputs.image == 'cpu' || inputs.image == 'all' }}
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -71,62 +91,82 @@ jobs:
       - uses: docker/setup-qemu-action@v3
       - uses: docker/setup-buildx-action@v3
 
+      - name: Set lowercase image prefix
+        run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV"
+
       - name: Login to GHCR
+        if: ${{ inputs.push_image == 'true' }}
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REGISTRY }}
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Parse tag
-        id: tag
-        run: echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT
-
-      - name: Build & push backend
-        uses: docker/build-push-action@v6
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
         with:
-          context: .
-          file: .devops/Dockerfile.backend
-          platforms: linux/amd64,linux/arm64
-          push: true
+          images: ${{ env.IMAGE_PREFIX }}-cpu
           tags: |
-            ${{ env.IMAGE_PREFIX }}-backend:latest
-            ${{ env.IMAGE_PREFIX }}-backend:${{ steps.tag.outputs.VERSION }}
-          cache-from: type=gha,scope=backend
-          cache-to: type=gha,mode=max,scope=backend
+            type=ref,event=tag
+            type=sha,prefix=sha-
+            type=raw,value=${{ inputs.version }},enable=${{ inputs.version != '' }}
+            type=raw,value=latest,enable=${{ inputs.push_image == 'true' }}
 
-      - name: Build & push frontend
+      - name: Build and push
         uses: docker/build-push-action@v6
         with:
           context: .
-          file: .devops/Dockerfile.frontend
+          file: .devops/Dockerfile
           platforms: linux/amd64,linux/arm64
-          push: true
+          push: ${{ inputs.push_image == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha,scope=cpu
+          cache-to: type=gha,mode=max,scope=cpu
+
+  build-cuda-image:
+    name: Docker cuda
+    if: ${{ inputs.image == 'cuda' || inputs.image == 'all' }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: docker/setup-buildx-action@v3
+
+      - name: Set lowercase image prefix
+        run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV"
+
+      - name: Login to GHCR
+        if: ${{ inputs.push_image == 'true' }}
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.IMAGE_PREFIX }}-cuda
           tags: |
-            ${{ env.IMAGE_PREFIX }}-frontend:latest
-            ${{ env.IMAGE_PREFIX }}-frontend:${{ steps.tag.outputs.VERSION }}
-          cache-from: type=gha,scope=frontend
-          cache-to: type=gha,mode=max,scope=frontend
+            type=ref,event=tag
+            type=sha,prefix=sha-
+            type=raw,value=${{ inputs.version }},enable=${{ inputs.version != '' }}
+            type=raw,value=latest,enable=${{ inputs.push_image == 'true' }}
 
-      - name: Build & push cpp
+      - name: Build and push
         uses: docker/build-push-action@v6
         with:
           context: .
-          file: .devops/Dockerfile.cpp
-          platforms: linux/amd64,linux/arm64
-          push: true
-          tags: |
-            ${{ env.IMAGE_PREFIX }}-cpp:latest
-            ${{ env.IMAGE_PREFIX }}-cpp:${{ steps.tag.outputs.VERSION }}
-          cache-from: type=gha,scope=cpp
-          cache-to: type=gha,mode=max,scope=cpp
-
-      - name: Create Release summary
-        run: |
-          echo "## Docker images published" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
-          echo "| Image | Tags |" >> $GITHUB_STEP_SUMMARY
-          echo "|-------|------|" >> $GITHUB_STEP_SUMMARY
-          echo "| \`quadtrix-backend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
-          echo "| \`quadtrix-frontend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
-          echo "| \`quadtrix-cpp\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY
+          file: .devops/Dockerfile.backend
+          platforms: linux/amd64
+          push: ${{ inputs.push_image == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha,scope=cuda
+          cache-to: type=gha,mode=max,scope=cuda
diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml
index 4824b9e..b50de6b 100644
--- a/.github/workflows/pr-check.yml
+++ b/.github/workflows/pr-check.yml
@@ -15,7 +15,7 @@ jobs:
       pr-sha: ${{ steps.get-sha.outputs.sha }}
     steps:
       - name: Check commenter permission
-        uses: actions/github-script@v7
+        uses: actions/github-script@v9
         with:
           script: |
             const { data } = await github.rest.repos.getCollaboratorPermissionLevel({
@@ -34,7 +34,7 @@ jobs:
             }
 
       - name: React with rocket
-        uses: actions/github-script@v7
+        uses: actions/github-script@v9
         with:
           script: |
             await github.rest.reactions.createForIssueComment({
@@ -46,7 +46,7 @@ jobs:
 
       - name: Get PR head SHA
         id: get-sha
-        uses: actions/github-script@v7
+        uses: actions/github-script@v9
         with:
           script: |
             const { data: pr } = await github.rest.pulls.get({
@@ -57,7 +57,7 @@ jobs:
             core.setOutput('sha', pr.head.sha);
 
       - name: Set checks to pending
-        uses: actions/github-script@v7
+        uses: actions/github-script@v9
         with:
           script: |
             const sha = '${{ steps.get-sha.outputs.sha }}';
@@ -96,7 +96,7 @@ jobs:
 
       - name: Report status
         if: always()
-        uses: actions/github-script@v7
+        uses: actions/github-script@v9
         with:
           script: |
             await github.rest.repos.createCommitStatus({
@@ -158,7 +158,7 @@ jobs:
 
       - name: Report status
         if: always()
-        uses: actions/github-script@v7
+        uses: actions/github-script@v9
         with:
           script: |
             await github.rest.repos.createCommitStatus({
@@ -218,7 +218,7 @@ jobs:
 
       - name: Report status
         if: always()
-        uses: actions/github-script@v7
+        uses: actions/github-script@v9
         with:
           script: |
             await github.rest.repos.createCommitStatus({
@@ -237,7 +237,7 @@ jobs:
     runs-on: ubuntu-latest
     if: always()
     steps:
-      - uses: actions/github-script@v7
+      - uses: actions/github-script@v9
         with:
           script: |
             const jobs   = ${{ toJSON(needs) }};
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..e92b486
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,263 @@
+name: Release
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: "Release version, for example v1.2.3 or 1.2.3"
+        required: true
+
+concurrency:
+  group: release
+  cancel-in-progress: false
+
+env:
+  ARTIFACT_ROOT: release-assets
+
+jobs:
+  release-metadata:
+    name: Release metadata
+    runs-on: ubuntu-latest
+    outputs:
+      tag_name: ${{ steps.tag.outputs.tag_name }}
+    steps:
+      - id: tag
+        shell: bash
+        run: |
+          set -euo pipefail
+          raw_tag="${{ inputs.version }}"
+          if [[ "${raw_tag}" == v* ]]; then
+            tag_name="${raw_tag}"
+          else
+            tag_name="v${raw_tag}"
+          fi
+          echo "tag_name=${tag_name}" >> "$GITHUB_OUTPUT"
+
+  ubuntu-cpu:
+    name: Ubuntu ${{ matrix.build }} CPU
+    needs: release-metadata
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - build: x64
+            os: ubuntu-22.04
+          - build: arm64
+            os: ubuntu-24.04-arm
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Dependencies
+        shell: bash
+        run: |
+          set -euo pipefail
+          sudo apt-get update
+          sudo apt-get install -y build-essential file
+
+      - name: Toolchain workaround
+        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
+        shell: bash
+        run: |
+          set -euo pipefail
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Build
+        shell: bash
+        run: |
+          set -euo pipefail
+          ${CXX:-g++} -std=c++17 -O3 -DNDEBUG \
+            -I. -Iinclude \
+            -o quadtrix main.cpp
+          file quadtrix
+
+      - name: Smoke test
+        shell: bash
+        run: |
+          set +e
+          ./quadtrix --chat >/dev/null 2>&1
+          exit 0
+
+      - name: Pack artifacts
+        shell: bash
+        run: |
+          set -euo pipefail
+          package="quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-ubuntu-${{ matrix.build }}-cpu"
+          mkdir -p "${ARTIFACT_ROOT}/${package}"
+          cp quadtrix README.md LICENSE "${ARTIFACT_ROOT}/${package}/"
+          tar -czf "${package}.tar.gz" -C "${ARTIFACT_ROOT}" "${package}"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: quadtrix-bin-ubuntu-${{ matrix.build }}-cpu
+          path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-ubuntu-${{ matrix.build }}-cpu.tar.gz
+          if-no-files-found: error
+          retention-days: 30
+
+  windows-cpu:
+    name: Windows ${{ matrix.arch }} CPU
+    needs: release-metadata
+    runs-on: windows-2022
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - arch: x64
+            vcvars: x64
+          - arch: arm64
+            vcvars: amd64_arm64
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        shell: cmd
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.vcvars }}
+          cl /nologo /std:c++17 /O2 /DNDEBUG /EHsc /Iinclude /I. main.cpp /Fe:quadtrix.exe
+
+      - name: Smoke test
+        if: ${{ matrix.arch == 'x64' }}
+        shell: pwsh
+        run: |
+          $ErrorActionPreference = 'Continue'
+          & .\quadtrix.exe --chat | Out-Null
+          exit 0
+
+      - name: Pack artifacts
+        shell: pwsh
+        run: |
+          $package = "quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-windows-${{ matrix.arch }}-cpu"
+          New-Item -ItemType Directory -Force "${env:ARTIFACT_ROOT}\${package}" | Out-Null
+          Copy-Item quadtrix.exe "${env:ARTIFACT_ROOT}\${package}\"
+          Copy-Item README.md, LICENSE "${env:ARTIFACT_ROOT}\${package}\"
+          Compress-Archive -Path "${env:ARTIFACT_ROOT}\${package}\*" -DestinationPath "${package}.zip" -Force
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: quadtrix-bin-windows-${{ matrix.arch }}-cpu
+          path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-windows-${{ matrix.arch }}-cpu.zip
+          if-no-files-found: error
+          retention-days: 30
+
+  macos-cpu:
+    name: macOS ${{ matrix.build }} CPU
+    needs: release-metadata
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - build: arm64
+            arch: arm64
+            os: macos-14
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        shell: bash
+        run: |
+          set -euo pipefail
+          clang++ -std=c++17 -O3 -DNDEBUG -arch ${{ matrix.arch }} \
+            -I. -Iinclude \
+            -o quadtrix main.cpp
+          file quadtrix
+
+      - name: Smoke test
+        shell: bash
+        run: |
+          set +e
+          ./quadtrix --chat >/dev/null 2>&1
+          exit 0
+
+      - name: Pack artifacts
+        shell: bash
+        run: |
+          set -euo pipefail
+          package="quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-macos-${{ matrix.build }}-cpu"
+          mkdir -p "${ARTIFACT_ROOT}/${package}"
+          cp quadtrix README.md LICENSE "${ARTIFACT_ROOT}/${package}/"
+          tar -czf "${package}.tar.gz" -C "${ARTIFACT_ROOT}" "${package}"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: quadtrix-bin-macos-${{ matrix.build }}-cpu
+          path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-macos-${{ matrix.build }}-cpu.tar.gz
+          if-no-files-found: error
+          retention-days: 30
+
+  publish-release:
+    name: Publish GitHub release
+    needs:
+      - release-metadata
+      - ubuntu-cpu
+      - windows-cpu
+      - macos-cpu
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: dist
+          merge-multiple: true
+
+      - name: Write release notes
+        shell: bash
+        run: |
+          cat > release-notes.md <<'EOF'
+          macOS/iOS:
+
+          macOS Apple Silicon (arm64)
+          macOS Apple Silicon (arm64, KleidiAI enabled) DISABLED
+          macOS Intel (x64) SKIPPED
+          iOS XCFramework DISABLED
+
+          Linux:
+
+          Ubuntu x64 (CPU)
+          Ubuntu arm64 (CPU)
+          Ubuntu s390x (CPU) SKIPPED
+          Ubuntu x64 (Vulkan) DISABLED
+          Ubuntu arm64 (Vulkan) DISABLED
+          Ubuntu x64 (ROCm 7.2) DISABLED
+          Ubuntu x64 (OpenVINO) DISABLED
+          Ubuntu x64 (SYCL FP32) DISABLED
+
+          Android:
+
+          Android arm64 (CPU) DISABLED
+
+          Windows:
+
+          Windows x64 (CPU)
+          Windows arm64 (CPU)
+          Windows x64 (CUDA 12) - CUDA 12.4 DLLs DISABLED
+          Windows x64 (CUDA 13) - CUDA 13.3 DLLs DISABLED
+          Windows x64 (Vulkan) DISABLED
+          Windows x64 (SYCL) DISABLED
+          Windows x64 (HIP) DISABLED
+          EOF
+
+      - name: Publish release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: ${{ needs.release-metadata.outputs.tag_name }}
+          target_commitish: ${{ github.sha }}
+          prerelease: false
+          body_path: release-notes.md
+          files: dist/*
diff --git a/CUDA/includes/attention.cuh b/CUDA/includes/attention.cuh
new file mode 100644
index 0000000..7feac08
--- /dev/null
+++ b/CUDA/includes/attention.cuh
@@ -0,0 +1,29 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status attention_forward(
+    const TensorView& input_qkv,
+    TensorView preatt,
+    TensorView att,
+    TensorView output,
+    int num_heads,
+    cudaStream_t stream = nullptr);
+
+Status attention_backward(
+    const TensorView& grad_output,
+    const TensorView& input_qkv,
+    const TensorView& att,
+    TensorView grad_input_qkv,
+    TensorView grad_preatt,
+    TensorView grad_att,
+    int num_heads,
+    cudaStream_t stream = nullptr);
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/checkpoint.h b/CUDA/includes/checkpoint.h
new file mode 100644
index 0000000..ba91b0f
--- /dev/null
+++ b/CUDA/includes/checkpoint.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "tensor.cuh"
+
+namespace quadtrix {
+namespace cuda {
+
+struct CheckpointMetadata {
+    int vocab_size = 0;
+    int max_sequence_length = 0;
+    int num_layers = 0;
+    int num_heads = 0;
+    int channels = 0;
+};
+
+inline bool load_checkpoint_metadata(const char*, CheckpointMetadata*) {
+    return false;
+}
+
+inline bool save_tensor_checkpoint(const char*, const TensorView&) {
+    return false;
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/common.h b/CUDA/includes/common.h
new file mode 100644
index 0000000..36df155
--- /dev/null
+++ b/CUDA/includes/common.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <limits>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class DType : std::uint8_t {
+    F32,
+    F16,
+    BF16,
+    I32,
+    U8,
+};
+
+enum class DeviceKind : std::uint8_t {
+    CPU,
+    CUDA,
+};
+
+struct Status {
+    bool ok;
+    cudaError_t cuda_error;
+    const char* message;
+
+    static Status success() {
+        return {true, cudaSuccess, "ok"};
+    }
+
+    static Status failure(cudaError_t error, const char* message) {
+        return {false, error, message};
+    }
+};
+
+inline const char* dtype_name(DType dtype) {
+    switch (dtype) {
+        case DType::F32:
+            return "f32";
+        case DType::F16:
+            return "f16";
+        case DType::BF16:
+            return "bf16";
+        case DType::I32:
+            return "i32";
+        case DType::U8:
+            return "u8";
+    }
+    return "unknown";
+}
+
+inline std::size_t dtype_size(DType dtype) {
+    switch (dtype) {
+        case DType::F32:
+            return 4;
+        case DType::F16:
+            return 2;
+        case DType::BF16:
+            return 2;
+        case DType::I32:
+            return 4;
+        case DType::U8:
+            return 1;
+    }
+
+    std::fprintf(stderr, "Unknown CUDA dtype value %u\n", static_cast<unsigned int>(dtype));
+    std::abort();
+}
+
+inline bool checked_mul(std::size_t lhs, std::size_t rhs, std::size_t* out) {
+    if (lhs != 0 && rhs > std::numeric_limits<std::size_t>::max() / lhs) {
+        return false;
+    }
+    *out = lhs * rhs;
+    return true;
+}
+
+inline Status check_cuda(cudaError_t error, const char* expression, const char* file, int line) {
+    if (error == cudaSuccess) {
+        return Status::success();
+    }
+
+    std::fprintf(
+        stderr,
+        "CUDA error at %s:%d: %s failed with %s\n",
+        file,
+        line,
+        expression,
+        cudaGetErrorString(error));
+    return Status::failure(error, expression);
+}
+
+inline void abort_on_cuda(cudaError_t error, const char* expression, const char* file, int line) {
+    if (error == cudaSuccess) {
+        return;
+    }
+
+    std::fprintf(
+        stderr,
+        "Fatal CUDA error at %s:%d: %s failed with %s\n",
+        file,
+        line,
+        expression,
+        cudaGetErrorString(error));
+    std::abort();
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
+
+#define QUADTRIX_CUDA_CHECK(expr) \
+    ::quadtrix::cuda::check_cuda((expr), #expr, __FILE__, __LINE__)
+
+#define QUADTRIX_CUDA_ABORT(expr) \
+    ::quadtrix::cuda::abort_on_cuda((expr), #expr, __FILE__, __LINE__)
diff --git a/CUDA/includes/dataloader.h b/CUDA/includes/dataloader.h
new file mode 100644
index 0000000..fd3c47d
--- /dev/null
+++ b/CUDA/includes/dataloader.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+struct TokenBatchView {
+    const std::int32_t* inputs = nullptr;
+    const std::int32_t* targets = nullptr;
+    int batch_size = 0;
+    int sequence_length = 0;
+};
+
+class DataLoader {
+public:
+    DataLoader() = default;
+
+    bool next(TokenBatchView* batch) {
+        if (batch != nullptr) {
+            *batch = {};
+        }
+        return false;
+    }
+};
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/gelu.cuh b/CUDA/includes/gelu.cuh
new file mode 100644
index 0000000..af87e64
--- /dev/null
+++ b/CUDA/includes/gelu.cuh
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class GeluMode : std::uint8_t {
+    Exact,
+    Approximate,
+};
+
+Status gelu_forward(
+    const TensorView& input,
+    TensorView output,
+    GeluMode mode = GeluMode::Approximate,
+    cudaStream_t stream = nullptr);
+
+Status gelu_backward(
+    const TensorView& grad_output,
+    const TensorView& input,
+    TensorView grad_input,
+    GeluMode mode = GeluMode::Approximate,
+    cudaStream_t stream = nullptr);
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/global_norm.cuh b/CUDA/includes/global_norm.cuh
new file mode 100644
index 0000000..f418ab7
--- /dev/null
+++ b/CUDA/includes/global_norm.cuh
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status global_norm_squared(
+    const TensorView& grads,
+    TensorView partial_sums,
+    cudaStream_t stream = nullptr);
+
+Status clip_gradients_by_global_norm(
+    TensorView grads,
+    float global_norm,
+    float max_norm,
+    cudaStream_t stream = nullptr);
+
+inline float clip_scale(float global_norm, float max_norm) {
+    return global_norm > max_norm && global_norm > 0.0f ? max_norm / global_norm : 1.0f;
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/layernorm.cuh b/CUDA/includes/layernorm.cuh
new file mode 100644
index 0000000..2645537
--- /dev/null
+++ b/CUDA/includes/layernorm.cuh
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+namespace quadtrix {
+namespace cuda {
+
+Status layernorm_forward(
+    const TensorView& input,
+    const TensorView& gamma,
+    const TensorView& beta,
+    TensorView output,
+    TensorView mean,
+    TensorView rstd,
+    float epsilon = 1.0e-5f,
+    cudaStream_t stream = nullptr);
+
+Status layernorm_backward(
+    const TensorView& grad_output,
+    const TensorView& input,
+    const TensorView& gamma,
+    const TensorView& mean,
+    const TensorView& rstd,
+    TensorView grad_input,
+    TensorView grad_gamma,
+    TensorView grad_beta,
+    cudaStream_t stream = nullptr);
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/logger.h b/CUDA/includes/logger.h
new file mode 100644
index 0000000..219c50f
--- /dev/null
+++ b/CUDA/includes/logger.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <cstdarg>
+#include <cstdio>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class LogLevel {
+    Info,
+    Warn,
+    Error,
+};
+
+inline const char* log_level_name(LogLevel level) {
+    switch (level) {
+        case LogLevel::Info:
+            return "info";
+        case LogLevel::Warn:
+            return "warn";
+        case LogLevel::Error:
+            return "error";
+    }
+    return "unknown";
+}
+
+inline void log_message(LogLevel level, const char* format, ...) {
+    std::fprintf(level == LogLevel::Error ? stderr : stdout, "[cuda:%s] ", log_level_name(level));
+    va_list args;
+    va_start(args, format);
+    std::vfprintf(level == LogLevel::Error ? stderr : stdout, format, args);
+    va_end(args);
+    std::fprintf(level == LogLevel::Error ? stderr : stdout, "\n");
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/matmul.cuh b/CUDA/includes/matmul.cuh
new file mode 100644
index 0000000..12dd4b2
--- /dev/null
+++ b/CUDA/includes/matmul.cuh
@@ -0,0 +1,99 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+enum class MatmulTranspose : std::uint8_t {
+    None,
+    Transpose,
+};
+
+struct BlasStatus {
+    bool ok;
+    cublasStatus_t cublas_status;
+    const char* message;
+
+    static BlasStatus success() {
+        return {true, CUBLAS_STATUS_SUCCESS, "ok"};
+    }
+
+    static BlasStatus failure(cublasStatus_t status, const char* message) {
+        return {false, status, message};
+    }
+};
+
+const char* cublas_status_name(cublasStatus_t status);
+
+class BlasHandle {
+public:
+    explicit BlasHandle(int device_id = 0);
+    ~BlasHandle();
+
+    BlasHandle(const BlasHandle&) = delete;
+    BlasHandle& operator=(const BlasHandle&) = delete;
+
+    BlasHandle(BlasHandle&& other) noexcept;
+    BlasHandle& operator=(BlasHandle&& other) noexcept;
+
+    cublasHandle_t get() const {
+        return handle_;
+    }
+
+    int device_id() const {
+        return device_id_;
+    }
+
+    BlasStatus set_stream(cudaStream_t stream);
+
+private:
+    cublasHandle_t handle_ = nullptr;
+    int device_id_ = 0;
+};
+
+BlasStatus matmul(
+    BlasHandle& handle,
+    const TensorView& a,
+    MatmulTranspose op_a,
+    const TensorView& b,
+    MatmulTranspose op_b,
+    TensorView c,
+    float alpha = 1.0f,
+    float beta = 0.0f,
+    cudaStream_t stream = nullptr);
+
+BlasStatus matmul_forward(
+    BlasHandle& handle,
+    const TensorView& input,
+    const TensorView& weight,
+    TensorView output,
+    cudaStream_t stream = nullptr,
+    float alpha = 1.0f,
+    float beta = 0.0f);
+
+BlasStatus matmul_backward_input(
+    BlasHandle& handle,
+    const TensorView& grad_output,
+    const TensorView& weight,
+    TensorView grad_input,
+    cudaStream_t stream = nullptr,
+    float alpha = 1.0f,
+    float beta = 0.0f);
+
+BlasStatus matmul_backward_weight(
+    BlasHandle& handle,
+    const TensorView& input,
+    const TensorView& grad_output,
+    TensorView grad_weight,
+    cudaStream_t stream = nullptr,
+    float alpha = 1.0f,
+    float beta = 0.0f);
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/memory.cuh b/CUDA/includes/memory.cuh
new file mode 100644
index 0000000..e08fa4a
--- /dev/null
+++ b/CUDA/includes/memory.cuh
@@ -0,0 +1,120 @@
+#pragma once
+
+#include "common.h"
+#include "runtime.cuh"
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <utility>
+
+namespace quadtrix {
+namespace cuda {
+
+class DeviceBuffer {
+public:
+    DeviceBuffer() = default;
+
+    explicit DeviceBuffer(std::size_t bytes, int device_id = -1) {
+        allocate(bytes, device_id);
+    }
+
+    ~DeviceBuffer() {
+        release();
+    }
+
+    DeviceBuffer(const DeviceBuffer&) = delete;
+    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
+
+    DeviceBuffer(DeviceBuffer&& other) noexcept {
+        swap(other);
+    }
+
+    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
+        if (this != &other) {
+            release();
+            swap(other);
+        }
+        return *this;
+    }
+
+    void allocate(std::size_t bytes, int device_id = -1) {
+        release();
+        if (bytes == 0) {
+            return;
+        }
+        if (device_id >= 0) {
+            device_id_ = device_id;
+            DeviceGuard guard(device_id);
+            QUADTRIX_CUDA_ABORT(cudaMalloc(&ptr_, bytes));
+        } else {
+            device_id_ = current_device();
+            QUADTRIX_CUDA_ABORT(cudaMalloc(&ptr_, bytes));
+        }
+        bytes_ = bytes;
+    }
+
+    void release() {
+        if (ptr_ != nullptr) {
+            if (device_id_ >= 0) {
+                DeviceGuard guard(device_id_);
+                cudaFree(ptr_);
+            } else {
+                cudaFree(ptr_);
+            }
+            ptr_ = nullptr;
+            bytes_ = 0;
+            device_id_ = -1;
+        }
+    }
+
+    void* data() {
+        return ptr_;
+    }
+
+    const void* data() const {
+        return ptr_;
+    }
+
+    std::size_t bytes() const {
+        return bytes_;
+    }
+
+    bool empty() const {
+        return ptr_ == nullptr || bytes_ == 0;
+    }
+
+    int device_id() const {
+        return device_id_;
+    }
+
+    void swap(DeviceBuffer& other) noexcept {
+        std::swap(ptr_, other.ptr_);
+        std::swap(bytes_, other.bytes_);
+        std::swap(device_id_, other.device_id_);
+    }
+
+private:
+    void* ptr_ = nullptr;
+    std::size_t bytes_ = 0;
+    int device_id_ = -1;
+};
+
+inline Status copy_h2d(void* dst_device, const void* src_host, std::size_t bytes, cudaStream_t stream = nullptr) {
+    return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_device, src_host, bytes, cudaMemcpyHostToDevice, stream));
+}
+
+inline Status copy_d2h(void* dst_host, const void* src_device, std::size_t bytes, cudaStream_t stream = nullptr) {
+    return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_host, src_device, bytes, cudaMemcpyDeviceToHost, stream));
+}
+
+inline Status copy_d2d(void* dst_device, const void* src_device, std::size_t bytes, cudaStream_t stream = nullptr) {
+    return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_device, src_device, bytes, cudaMemcpyDeviceToDevice, stream));
+}
+
+inline Status memset_device(void* dst_device, int value, std::size_t bytes, cudaStream_t stream = nullptr) {
+    return QUADTRIX_CUDA_CHECK(cudaMemsetAsync(dst_device, value, bytes, stream));
+}
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/nccl_all_reduce.cuh b/CUDA/includes/nccl_all_reduce.cuh
new file mode 100644
index 0000000..c712a6a
--- /dev/null
+++ b/CUDA/includes/nccl_all_reduce.cuh
@@ -0,0 +1,96 @@
+#pragma once
+
+#include "tensor.cuh"
+
+#include <cuda_runtime.h>
+
+#ifdef QUADTRIX_ENABLE_NCCL
+#include <nccl.h>
+#else
+typedef struct {
+    char internal[128];
+} ncclUniqueId;
+typedef struct ncclComm* ncclComm_t;
+typedef enum {
+    ncclSuccess = 0,
+    ncclUnhandledCudaError = 1,
+    ncclSystemError = 2,
+    ncclInternalError = 3,
+    ncclInvalidArgument = 4,
+    ncclInvalidUsage = 5,
+    ncclNumResults = 6
+} ncclResult_t;
+#endif
+
+namespace quadtrix {
+namespace cuda {
+
+struct NcclStatus {
+    bool ok;
+    ncclResult_t nccl_status;
+    const char* message;
+
+    static NcclStatus success() {
+        return {true, ncclSuccess, "ok"};
+    }
+
+    static NcclStatus failure(ncclResult_t status, const char* message) {
+        return {false, status, message};
+    }
+};
+
+const char* nccl_status_name(ncclResult_t status);
+
+class NcclCommunicator {
+public:
+    NcclCommunicator() = default;
+    NcclCommunicator(ncclUniqueId unique_id, int world_size, int rank, int device_id);
+    ~NcclCommunicator();
+
+    NcclCommunicator(const NcclCommunicator&) = delete;
+    NcclCommunicator& operator=(const NcclCommunicator&) = delete;
+
+    NcclCommunicator(NcclCommunicator&& other) noexcept;
+    NcclCommunicator& operator=(NcclCommunicator&& other) noexcept;
+
+    ncclComm_t get() const {
+        return comm_;
+    }
+
+    int world_size() const {
+        return world_size_;
+    }
+
+    int rank() const {
+        return rank_;
+    }
+
+    int device_id() const {
+        return device_id_;
+    }
+
+    bool valid() const {
+        return comm_ != nullptr;
+    }
+
+private:
+    ncclComm_t comm_ = nullptr;
+    int world_size_ = 1;
+    int rank_ = 0;
+    int device_id_ = 0;
+};
+
+NcclStatus create_unique_id(ncclUniqueId* unique_id);
+
+NcclStatus all_reduce_sum(
+    NcclCommunicator& communicator,
+    TensorView tensor,
+    cudaStream_t stream = nullptr);
+
+NcclStatus all_reduce_average(
+    NcclCommunicator& communicator,
+    TensorView tensor,
+    cudaStream_t stream = nullptr);
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/includes/tensor.cuh b/CUDA/includes/tensor.cuh
new file mode 100644
index 0000000..c61d77e
--- /dev/null
+++ b/CUDA/includes/tensor.cuh
@@ -0,0 +1,168 @@
+#pragma once
+
+#include "common.h"
+#include "memory.cuh"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+
+namespace quadtrix {
+namespace cuda {
+
+constexpr int kMaxTensorDims = 8;
+
+struct TensorShape {
+    int rank = 0;
+    std::array<std::int64_t, kMaxTensorDims> dims{};
+    std::array<std::int64_t, kMaxTensorDims> strides{};
+
+    static TensorShape contiguous(const std::int64_t* sizes, int ndim) {
+        if (ndim < 1 || ndim > kMaxTensorDims) {
+            std::fprintf(stderr, "Tensor rank %d is outside supported range [1, %d]\n", ndim, kMaxTensorDims);
+            std::abort();
+        }
+
+        TensorShape shape;
+        shape.rank = ndim;
+        for (int i = 0; i < ndim; ++i) {
+            if (sizes[i] <= 0) {
+                std::fprintf(stderr, "Tensor dimension %d must be positive, got %lld\n", i, static_cast<long long>(sizes[i]));
+                std::abort();
+            }
+            shape.dims[i] = sizes[i];
+        }
+
+        std::int64_t stride = 1;
+        for (int i = ndim - 1; i >= 0; --i) {
+            shape.strides[i] = stride;
+            stride *= shape.dims[i];
+        }
+        return shape;
+    }
+
+    std::size_t numel() const {
+        std::size_t total = 1;
+        for (int i = 0; i < rank; ++i) {
+            if (dims[i] <= 0) {
+                return 0;
+            }
+            std::size_t next = 0;
+            if (!checked_mul(total, static_cast<std::size_t>(dims[i]), &next)) {
+                return 0;
+            }
+            total = next;
+        }
+        return rank == 0 ? 0 : total;
+    }
+
+    bool is_contiguous() const {
+        std::int64_t expected = 1;
+        for (int i = rank - 1; i >= 0; --i) {
+            if (strides[i] != expected) {
+                return false;
+            }
+            expected *= dims[i];
+        }
+        return true;
+    }
+};
+
+struct TensorView {
+    void* data = nullptr;
+    TensorShape shape;
+    DType dtype = DType::F32;
+    DeviceKind device = DeviceKind::CUDA;
+    int device_id = 0;
+
+    std::size_t numel() const {
+        return shape.numel();
+    }
+
+    std::size_t bytes() const {
+        std::size_t out = 0;
+        if (!checked_mul(numel(), dtype_size(dtype), &out)) {
+            return 0;
+        }
+        return out;
+    }
+
+    template <typename T>
+    T* data_as() {
+        return static_cast<T*>(data);
+    }
+
+    template <typename T>
+    const T* data_as() const {
+        return static_cast<const T*>(data);
+    }
+};
+
+class Tensor {
+public:
+    Tensor() = default;
+
+    Tensor(const std::int64_t* dims, int rank, DType dtype, int device_id = 0)
+        : shape_(TensorShape::contiguous(dims, rank)), dtype_(dtype), device_id_(device_id) {
+        allocate();
+    }
+
+    Tensor(const Tensor&) = delete;
+    Tensor& operator=(const Tensor&) = delete;
+    Tensor(Tensor&&) noexcept = default;
+    Tensor& operator=(Tensor&&) noexcept = default;
+
+    TensorView view() {
+        return {storage_.data(), shape_, dtype_, DeviceKind::CUDA, device_id_};
+    }
+
+    TensorView view() const {
+        return {const_cast<void*>(storage_.data()), shape_, dtype_, DeviceKind::CUDA, device_id_};
+    }
+
+    const TensorShape& shape() const {
+        return shape_;
+    }
+
+    DType dtype() const {
+        return dtype_;
+    }
+
+    int device_id() const {
+        return device_id_;
+    }
+
+    std::size_t numel() const {
+        return shape_.numel();
+    }
+
+    std::size_t bytes() const {
+        return storage_.bytes();
+    }
+
+    void* data() {
+        return storage_.data();
+    }
+
+    const void* data() const {
+        return storage_.data();
+    }
+
+private:
+    void allocate() {
+        std::size_t bytes = 0;
+        if (!checked_mul(shape_.numel(), dtype_size(dtype_), &bytes)) {
+            std::fprintf(stderr, "Tensor allocation size overflow\n");
+            std::abort();
+        }
+        storage_.allocate(bytes, device_id_);
+    }
+
+    TensorShape shape_;
+    DType dtype_ = DType::F32;
+    int device_id_ = 0;
+    DeviceBuffer storage_;
+};
+
+}  // namespace cuda
+}  // namespace quadtrix
diff --git a/CUDA/llmcpp/adamw.cuh b/CUDA/llmcpp/adamw.cuh
new file mode 100644
index 0000000..4453576
--- /dev/null
+++ b/CUDA/llmcpp/adamw.cuh
@@ -0,0 +1,98 @@
+/*
+AdamW kernel
+*/
+
+// llmc internal imports
+#include "cuda_common.h"
+#include "cuda_utils.cuh"
+
+// ----------------------------------------------------------------------------
+// CUDA kernels
+
+// Implements linear interpolation using only two floating-point operations (as opposed to three in a naive implementation).
+// Reference: https://developer.nvidia.com/blog/lerp-faster-cuda
+__device__ float lerp(float start, float end, float weight) {
+    return fma(weight, end, fma(-weight, start, start));
+}
+
+template <typename Tp, typename Tg>
+__device__ void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
+                             float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay,
+                             float grad_scale, unsigned int seed) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_parameters) { return; }  // guard
+
+    // get the gradient, m, and v for this parameter
+    float grad = grad_scale * (float)grads_memory[idx];
+    float m = m_memory[idx];
+    float v = v_memory[idx];
+    // update the first moment (momentum)
+    m = lerp(grad, m, beta1);
+    m_memory[idx] = m;
+    // update the second moment (RMSprop)
+    v = lerp(grad * grad, v, beta2);
+    v_memory[idx] = v;
+    m /= beta1_correction;  // m_hat
+    v /= beta2_correction;  // v_hat
+    // fetch the old value of this parameter as a float, from either source
+    float old_param = (master_params_memory != NULL) ? master_params_memory[idx] : (float)params_memory[idx];
+    // update this parameter
+    float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param));
+    // update our low precision version of the parameters using stochastic rounding
+    // this will be used in the next forward pass
+    stochastic_rounding(param, &params_memory[idx], seed);
+    // write the full, float version of the param into our master copy, if we maintain one
+    // this will be used in the next update
+    if (master_params_memory != NULL) { master_params_memory[idx] = param; }
+}
+
+template <typename Tp, typename Tg>
+__global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
+                              ptrdiff_t w_stride, ptrdiff_t g_stride, ptrdiff_t s_stride,
+                              float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay,
+                              float grad_scale, unsigned int seed) {
+    adamw_update(params_memory + blockIdx.y * w_stride,
+                 master_params_memory ? master_params_memory + blockIdx.y * s_stride : NULL,
+                 grads_memory + blockIdx.y * g_stride,
+                 m_memory + blockIdx.y * s_stride,
+                 v_memory + blockIdx.y * s_stride,
+                 num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, grad_scale,
+                 seed
+                 );
+}
+
+template <typename Tp>
+__global__ void init_from_master_kernel(Tp* params_memory, float* master_params_memory, size_t num_parameters,
+                                          ptrdiff_t w_stride, ptrdiff_t s_stride, unsigned int seed) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= num_parameters) { return; }
+    params_memory += blockIdx.y * w_stride; // adjust for layer offset
+    master_params_memory += blockIdx.y * s_stride;
+    stochastic_rounding(master_params_memory[idx], &params_memory[idx], seed);
+}
+
+template <typename Tp, typename Tg>
+void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters,
+                  ptrdiff_t w_stride, ptrdiff_t g_stride, ptrdiff_t s_stride,  int num_slices, float learning_rate, float beta1, float beta2, int t, float eps, float weight_decay,
+                  float grad_scale, unsigned int seed, cudaStream_t stream) {
+    // AdamW update
+    int block_size = 512;
+    int num_blocks = CEIL_DIV(num_parameters, block_size);
+    float beta1_correction = 1.0f - powf(beta1, t);
+    float beta2_correction = 1.0f - powf(beta2, t);
+    adamw_kernel3<<<dim3(num_blocks, num_slices), block_size, 0, stream>>>(params_memory, master_params_memory, grads_memory,
+                                                         m_memory, v_memory, num_parameters, w_stride, g_stride, s_stride,
+                                                         learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay,
+                                                         grad_scale, seed);
+    cudaCheck(cudaGetLastError());
+}
+
+template <typename Tp>
+void init_from_master(Tp* params_memory, float* master_params_memory, size_t num_parameters,
+                        ptrdiff_t w_stride, ptrdiff_t s_stride, int num_slices, unsigned int seed, cudaStream_t stream) {
+    int block_size = 512; // must match block size of adamw_update so that RNG also matches
+    int num_blocks = CEIL_DIV(num_parameters, block_size);
+    init_from_master_kernel<<<dim3(num_blocks, num_slices), block_size, 0, stream>>>
+                             (params_memory, master_params_memory, num_parameters, w_stride, s_stride, seed);
+    cudaCheck(cudaGetLastError());
+}
diff --git a/CUDA/llmcpp/cuda_utils.cuh b/CUDA/llmcpp/cuda_utils.cuh
new file mode 100644
index 0000000..030ec07
--- /dev/null
+++ b/CUDA/llmcpp/cuda_utils.cuh
@@ -0,0 +1,286 @@
+// Utilities for use in __device__ code
+
+#ifndef CUDA_UTILS_CUH
+#define CUDA_UTILS_CUH
+
+#include "cuda_common.h"
+
+// ----------------------------------------------------------------------------
+// Packed128 data structure that forces the compiler to use 128-bit loads/stores
+// in GPUs that support (the LDG.128 and STS.128 instructions)
+// This is a bit similar to the use of float4 in the case of 32-bit floats, but
+// supports arbitrary precision.
+
+template<class ElementType>
+struct alignas(16) Packed128 {
+    Packed128() = default;
+    __device__ explicit Packed128(int4 bits) {
+        static_assert(sizeof(bits) == sizeof(payload), "Size mismatch.");
+        memcpy(&payload, &bits, sizeof(bits));
+    }
+
+    __device__  static Packed128 constant(ElementType value) {
+        Packed128 result;
+        for(int k = 0; k < size; ++k) {
+            result.payload[k] = value;
+        }
+        return result;
+    }
+    __device__ static Packed128 zeros() {
+        return constant(0.f);
+    }
+    __device__ static Packed128 ones() {
+        return constant(1.f);
+    }
+
+    __device__ ElementType& operator[](int index) {
+        return payload[index];
+    }
+    __device__ const ElementType& operator[](int index) const {
+        return payload[index];
+    }
+    __device__ int4 get_bits() const {
+        int4 bits;
+        static_assert(sizeof(bits) == sizeof(payload), "Size mismatch.");
+        memcpy(&bits, &payload, sizeof(bits));
+        return bits;
+    }
+    static constexpr const size_t size = sizeof(int4) / sizeof(ElementType);
+    ElementType payload[size];
+};
+
+// load a Packed128 from an aligned memory address
+template<class ElementType>
+__device__ Packed128<ElementType> load128(const ElementType* address) {
+    return Packed128<ElementType>{*reinterpret_cast<const int4*>(address)};
+}
+// load a Packed128 from an aligned memory address with streaming cache hint
+template<class ElementType>
+__device__ Packed128<ElementType> load128cs(const ElementType* address) {
+    return Packed128<ElementType>{__ldcs(reinterpret_cast<const int4*>(address))};
+}
+// store a Packed128 to an aligned memory address
+template<class ElementType>
+__device__ void store128(ElementType* target, Packed128<ElementType> value) {
+    *reinterpret_cast<int4*>(target) = value.get_bits();
+}
+// store a Packed128 to an aligned memory address with streaming cache hint
+template<class ElementType>
+__device__ void store128cs(ElementType* target, Packed128<ElementType> value) {
+    __stcs(reinterpret_cast<int4*>(target), value.get_bits());
+}
+// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1
+template<class ElementType>
+__device__ void store128cg(ElementType* target, Packed128<ElementType> value) {
+    __stcg(reinterpret_cast<int4*>(target), value.get_bits());
+}
+
+// short-form typedefs
+typedef Packed128<float> f128;
+typedef Packed128<floatX> x128;
+
+// ----------------------------------------------------------------------------
+// DType support
+
+// enumerator to indentify the datatype of a tensor.
+enum class DType : uint8_t {
+    FP32, FP16, BF16
+};
+
+// Given a datatype enum, returns the underlying number of bytes
+// for a scalar of that type
+size_t sizeof_dtype(DType type) {
+    switch (type) {
+        case DType::FP32:
+            return sizeof(float);
+        case DType::FP16:
+            return sizeof(half);
+        case DType::BF16:
+            return sizeof(nv_bfloat16);
+        default: // handle or get compiler warning
+            fprintf(stderr, "Unknown datatype\n");
+            exit(EXIT_FAILURE);
+    }
+}
+
+DType dtype_of(float* f) { return DType::FP32; }
+DType dtype_of(nv_bfloat16 * f) { return DType::BF16; }
+DType dtype_of(half * f) { return DType::FP16; }
+
+
+
+// ----------------------------------------------------------------------------
+// Copy, cast functions
+
+// device functions and the kernel to cast data between types
+template<typename Td, typename Ts>
+__device__ Td cast_value(Ts val);
+
+template<>
+__device__ float cast_value<float, float>(float val) {
+    return val;
+}
+
+template<>
+__device__ float cast_value<float, half>(half val) {
+    return __half2float(val);
+}
+
+template<>
+__device__ float cast_value<float, __nv_bfloat16>(__nv_bfloat16 val) {
+    return __bfloat162float(val);
+}
+
+template<typename Td, typename Ts>
+__global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n, ptrdiff_t stride_dst, ptrdiff_t stride_src) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    // need to try grid stride looping for more perf later
+    if (idx < n) {
+        dst[idx + stride_dst * blockIdx.y] = cast_value<Td, Ts>(src[idx + stride_src * blockIdx.y]);
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Warp/Block communication primitives
+
+// warp-level reduction for summing values
+__device__ inline float warpReduceSum(float val) {
+    for (int offset = 16; offset > 0; offset /= 2) {
+        val += __shfl_xor_sync(0xFFFFFFFF, val, offset);
+    }
+    return val;
+}
+// warp-level reduction for finding the maximum value
+__device__ inline float warpReduceMax(float val) {
+    for (int offset = 16; offset > 0; offset /= 2) {
+        val = fmaxf(val, __shfl_xor_sync(0xFFFFFFFF, val, offset));
+    }
+    return val;
+}
+// requires all 32 threads in the warp to be active, but should work for any block size
+// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes
+// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end
+// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1
+using reduction_func_t = float (*) (float);
+template<reduction_func_t warp_reduction>
+__device__ inline float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) {
+    // two reductions of up to 1024 threads:
+    // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle)
+    __shared__ float shared_val[WARP_SIZE];
+    const int lane_id = threadIdx.x % WARP_SIZE;
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int num_warps = blockDim.x / WARP_SIZE;
+
+    float warp_val = warp_reduction(val);
+    if (lane_id == 0) { shared_val[warp_id] = warp_val; }
+    __syncthreads();
+    warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds;
+    float block_val = warp_reduction(warp_val);
+
+    if (final_sync) {
+        __syncthreads(); // only needed in loops when effectively reusing shared memory etc.
+    }
+    return block_val;
+}
+
+// Performs a _deterministic_ sum reduction. determinism is achieved by requiring that only
+// a single block be used.
+template<class Float>
+__global__ void global_sum_single_block_kernel(float* result, const Float* values, size_t count) {
+    assert(gridDim.x == 1);     // only a single block!
+    float thread_sum = 0;
+    for(size_t index = threadIdx.x; index < count; index += blockDim.x) {
+        thread_sum += (float)values[index];
+    }
+
+    float reduction = blockReduce<warpReduceSum>(thread_sum, true);
+    if(threadIdx.x == 0) {
+        *result = reduction;
+    }
+}
+
+template<class Float>
+void global_sum_deterministic(float* result, const Float* values, int count, cudaStream_t stream) {
+    global_sum_single_block_kernel<<<1, 1024, 0, stream>>>(result, values, count);
+    cudaCheck(cudaGetLastError());
+}
+
+// ----------------------------------------------------------------------------
+// memory management
+
+// allocate memory, preferrably on the device
+// returns a status code. 0 = OK, 1 = fell back to managed memory
+int cudaMallocConditionallyManaged(void** out, size_t bytes, const char *file, int line) {
+    // try to allocate
+    cudaError_t err = cudaMalloc(out, bytes);
+    if(err == cudaErrorMemoryAllocation) {
+        // if we OOM, fallback to a managed allocation. slower but at least won't crash.
+        cudaGetLastError(); // reset the error before the next API call
+        cudaCheck_(cudaMallocManaged(out, bytes), file, line);
+        cudaCheck_(cudaMemAdvise(*out, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), file, line);
+        return 1;
+    } else {
+        cudaCheck_(err, file, line);
+        return 0;
+    }
+}
+
+#define cudaMallocConditionallyManaged(out, bytes)\
+(cudaMallocConditionallyManaged((void**)out, bytes, __FILE__, __LINE__))
+
+// ----------------------------------------------------------------------------
+// Random Number Generation used in Stochastic Rounding
+
+// SquirrelNoise5 - Squirrel's Raw Noise utilities (version 5)
+// This gives us a random number from threadIdx/blockIdx + a single seed for the entire GPU
+// todo - possibly overkill and we don't need such high quality random numbers? (tbd)
+// http://eiserloh.net/noise/SquirrelNoise5.hpp
+__device__ __host__ constexpr unsigned int SquirrelNoise5(unsigned int positionX, unsigned int seed)
+{
+    constexpr unsigned int SQ5_BIT_NOISE1 = 0xd2a80a3f;	// 11010010101010000000101000111111
+    constexpr unsigned int SQ5_BIT_NOISE2 = 0xa884f197;	// 10101000100001001111000110010111
+    constexpr unsigned int SQ5_BIT_NOISE3 = 0x6C736F4B; // 01101100011100110110111101001011
+    constexpr unsigned int SQ5_BIT_NOISE4 = 0xB79F3ABB;	// 10110111100111110011101010111011
+    constexpr unsigned int SQ5_BIT_NOISE5 = 0x1b56c4f5;	// 00011011010101101100010011110101
+    unsigned int mangledBits = positionX;
+    mangledBits *= SQ5_BIT_NOISE1;
+    mangledBits += seed;
+    mangledBits ^= (mangledBits >> 9);
+    mangledBits += SQ5_BIT_NOISE2;
+    mangledBits ^= (mangledBits >> 11);
+    mangledBits *= SQ5_BIT_NOISE3;
+    mangledBits ^= (mangledBits >> 13);
+    mangledBits += SQ5_BIT_NOISE4;
+    mangledBits ^= (mangledBits >> 15);
+    mangledBits *= SQ5_BIT_NOISE5;
+    mangledBits ^= (mangledBits >> 17);
+    return mangledBits;
+}
+__device__ __host__ constexpr unsigned int Get2dNoiseUint(int indexX, int indexY, unsigned int seed)
+{
+    constexpr unsigned int PRIME_NUMBER = 198491317u; // Large prime number with non-boring bits
+    unsigned int x = static_cast<unsigned int>(indexX);
+    unsigned int y = static_cast<unsigned int>(indexY);
+
+    return SquirrelNoise5(x + (PRIME_NUMBER * y), seed);
+}
+
+// stochastic rounding built on top of Squirel Noise above (with seed updated per step via xorshift)
+__device__ __forceinline__ void stochastic_rounding(float in, __nv_bfloat16 *out, unsigned int seed) {
+    // todo - is this stochastic rounding *too good*? can we cut any corners?
+    // makes sure each thread gets a different random number
+    unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x * blockDim.x + blockIdx.y, seed);
+    unsigned int threshold = random & 0xFFFF;
+    unsigned int float_bits = __float_as_uint(in);
+    unsigned int rounded_bits = float_bits & 0x0000FFFF;
+    float_bits = (rounded_bits > threshold) ? (float_bits | 0xFFFF) : (float_bits  & ~0xFFFF);
+    *out = __float2bfloat16_rn(__uint_as_float(float_bits));
+}
+__device__ __forceinline__ void stochastic_rounding(float in, half *out, unsigned int random) {
+    *out = (float)in; // todo - implement this...
+}
+__device__ __forceinline__ void stochastic_rounding(float in, float *out, unsigned int random) {
+    *out = in; // dummy function for when floatX is float (FP32 mode)
+}
+
+#endif
\ No newline at end of file
diff --git a/CUDA/llmcpp/cudnn_att.cpp b/CUDA/llmcpp/cudnn_att.cpp
new file mode 100644
index 0000000..0330abe
--- /dev/null
+++ b/CUDA/llmcpp/cudnn_att.cpp
@@ -0,0 +1,297 @@
+// all cudnn-related functions are in this file, so that they don't need to be recompiled everytime
+// we change some unrelated piece of the code.
+// TODO this currently duplicates some of the utilities from the main file
+
+#define NOMINMAX
+#include <unistd.h>
+#include "cudnn_att.h"
+#include <cudnn_frontend.h>
+
+namespace fe = cudnn_frontend;
+
+// Specific configurations based on the enabled precision
+#if defined(ENABLE_FP32)
+static_assert(false, "cuDNN is not supported in FP32 mode.")
+// use fp16 (note: this may require gradient scaler, currently not implemented!)
+#elif defined(ENABLE_FP16)
+#define CUDNN_16BIT fe::DataType_t::HALF
+#else // Default to bfloat16
+#define CUDNN_16BIT fe::DataType_t::BFLOAT16
+#endif
+
+static cudnnHandle_t cudnn_handle;
+static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!)
+static void* cudnn_workspace = NULL;
+
+static void cuDNNCheck(cudnnStatus_t error, const char *file, int line) {
+    if (error != CUDNN_STATUS_SUCCESS) {
+        printf("[CUDNN ERROR] at file %s:%d:\n%s\n", file, line, cudnnGetErrorString(error));
+        exit(EXIT_FAILURE);
+    }
+};
+#define cuDNNCheck(err) (cuDNNCheck(err, __FILE__, __LINE__))
+
+static void checkCudnnFE(const fe::error_object& e, const char *file, int line) {
+    if(!e.is_good()) {
+        printf("[CUDNN ERROR] at file %s:%d:\n%s\n", file, line, e.err_msg.c_str());
+        exit(EXIT_FAILURE);
+    }
+}
+#define checkCudnnFE(err) checkCudnnFE(err, __FILE__, __LINE__)
+
+enum UIDs {
+    Q_UID,
+    K_UID,
+    V_UID,
+    Attn_scale_UID,
+    O_UID,
+    Stats_UID,
+    dO_UID,
+    dQ_UID,
+    dK_UID,
+    dV_UID
+};
+
+// Need a cache because graph->build_operation_graph() is slow but everything else seems fast
+using cache_type_fwd = std::map<std::tuple<int,int,int,int, int>, std::shared_ptr<fe::graph::Graph>>;
+using cache_type_bwd = std::map<std::tuple<int,int,int,int>, std::shared_ptr<fe::graph::Graph>>;
+
+// Loosely based on cuDNN frontend samples functions and massively simplified
+auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_inference_only) {
+
+    static cache_type_fwd user_maintained_cache_fwd;
+
+    auto key = std::make_tuple(B, H, T, HS, is_inference_only);
+
+    auto it = user_maintained_cache_fwd.find(key);
+    if (it != user_maintained_cache_fwd.end()) {
+        return it->second;
+    }
+
+    auto graph = std::make_shared<fe::graph::Graph>();
+    graph->set_io_data_type(CUDNN_16BIT)
+          .set_intermediate_data_type(fe::DataType_t::FLOAT)
+          .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute
+    auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q")
+                               .set_dim({B, H, T, HS})
+                               .set_uid(Q_UID)
+                               .set_stride({3 * H * HS * T,  HS, 3 * H * HS, 1}));
+    auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K")
+                               .set_dim({B, H, T, HS})
+                               .set_uid(K_UID)
+                               .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1}));
+    auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V")
+                               .set_dim({B, H, T, HS})
+                               .set_uid(V_UID)
+                               .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1}));
+    auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale")
+                               .set_dim({1, 1, 1, 1})
+                               .set_stride({1, 1, 1, 1})
+                               .set_uid(Attn_scale_UID)
+                               .set_is_pass_by_value(true)
+                               .set_data_type(fe::DataType_t::FLOAT));
+
+    auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention");
+    sdpa_options.set_is_inference(is_inference_only);
+    sdpa_options.set_attn_scale(attn_scale);
+    sdpa_options.set_causal_mask(true);
+
+    // Create the graph operation and get the output tensors back
+    auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options);
+
+    // Output is (B, T, NH, HS) BF16/FP16 and stats for backward pass is (B, NH, T) FP32
+    O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}).set_uid(O_UID);
+
+    assert(stats == nullptr || is_inference_only == false);
+    if (is_inference_only == false) {
+        stats->set_output(true).set_data_type(fe::DataType_t::FLOAT)
+                               .set_dim({B, H, T, 1})
+                               .set_stride({H * T, T, 1, 1})
+                               .set_uid(Stats_UID);
+    }
+
+    checkCudnnFE(graph->validate());
+
+    // Build the operation graph and execution part (this is the VERY SLOW PART)
+    checkCudnnFE(graph->build_operation_graph(cudnn_handle));
+    auto plans = graph->create_execution_plans({fe::HeurMode_t::A});
+    checkCudnnFE(graph->check_support(cudnn_handle));
+    checkCudnnFE(graph->build_plans(cudnn_handle));
+    // Reallocate the workspace if the required size is greater than the current workspace
+    // In H100 this may be around 16B
+    if (graph->get_workspace_size() > cudnn_workspace_size) {
+        if (cudnn_workspace_size > 0) {
+            cudaCheck(cudaFree(cudnn_workspace));
+        }
+        cudnn_workspace_size = graph->get_workspace_size();
+        cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size));
+    }
+
+    user_maintained_cache_fwd.insert({key, graph});
+
+    return graph;
+}
+
+auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) {
+    static cache_type_bwd user_maintained_cache_bwd;
+
+    auto key = std::make_tuple(B, NH, T, HS);
+
+    auto it = user_maintained_cache_bwd.find(key);
+    if (it != user_maintained_cache_bwd.end()) {
+        return it->second;
+    }
+
+    auto graph = std::make_shared<fe::graph::Graph>();
+    graph->set_io_data_type(CUDNN_16BIT)
+          .set_intermediate_data_type(fe::DataType_t::FLOAT)
+          .set_compute_data_type(fe::DataType_t::FLOAT);
+
+    // (B, N, 3, NH, HS)
+    // must come from inp (which means we also need to convert THAT to FP16)
+    auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q")
+                            .set_dim({B, NH, T, HS})
+                            .set_uid(Q_UID)
+                            .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}));
+    auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K")
+                            .set_dim({B, NH, T, HS})
+                            .set_uid(K_UID)
+                            .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}));
+    auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V")
+                            .set_dim({B, NH, T, HS})
+                            .set_uid(V_UID)
+                            .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}));
+    auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O")
+                            .set_dim({B, NH, T, HS})
+                            .set_uid(O_UID)
+                            .set_stride({NH * HS * T, HS, NH * HS, 1}));
+    auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO")
+                            .set_dim({B, NH, T, HS})
+                            .set_uid(dO_UID)
+                            .set_stride({NH * HS * T, HS, NH * HS, 1}));
+
+    auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats")
+                            .set_dim({B, NH, T, 1})
+                            .set_uid(Stats_UID)
+                            .set_stride({NH * T, T, 1, 1})
+                            .set_data_type(fe::DataType_t::FLOAT));
+    auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale")
+                            .set_dim({1, 1, 1, 1})
+                            .set_stride({1, 1, 1, 1})
+                            .set_is_pass_by_value(true)
+                            .set_uid(Attn_scale_UID)
+                            .set_data_type(fe::DataType_t::FLOAT));
+    auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward")
+#if CUDNN_FRONTEND_MAJOR_VERSION > 1 || CUDNN_FRONTEND_MINOR_VERSION >= 5
+                            .set_deterministic_algorithm(true) // 1.5+ needs this for determinism
+#endif
+                            .set_causal_mask(true)
+                            .set_attn_scale(attn_scale);
+
+    // Create the graph operation and get the output tensors back
+    auto [dQ, dK, dV] = graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options);
+
+    dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dQ_UID);
+    dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dK_UID);
+    dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dV_UID);
+
+    checkCudnnFE(graph->validate());
+
+    // Build the operation graph and execution part (this is the VERY SLOW PART)
+    checkCudnnFE(graph->build_operation_graph(cudnn_handle));
+    auto plans = graph->create_execution_plans({fe::HeurMode_t::A});
+    checkCudnnFE(graph->check_support(cudnn_handle));
+    checkCudnnFE(graph->build_plans(cudnn_handle));
+
+    // Reallocate the workspace if the required size is greater than the current workspace
+    // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum
+    if (graph->get_workspace_size() > cudnn_workspace_size) {
+        if (cudnn_workspace_size > 0) {
+            cudaCheck(cudaFree(cudnn_workspace));
+        }
+        cudnn_workspace_size = graph->get_workspace_size();
+        cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size));
+    }
+
+    user_maintained_cache_bwd.insert({key, graph});
+    return graph;
+}
+
+void attention_forward_cudnn(floatX* out,  // output: (B, T, NH, HS)
+                             float* stats, // output for backward pass: (B, NH, T)
+                             floatX* inp,  // input: (B, T, 3, NH, HS) QKV
+                             int B, int T, int NH, int C, cudaStream_t stream) {
+    NVTX_RANGE_FN();
+    int HS = C / NH; // number of features per head
+    bool is_inference_only = (stats == nullptr);
+
+    cuDNNCheck(cudnnSetStream(cudnn_handle, stream));
+
+    // Get graph and tensors from cache (or generate it on first use)
+    auto graph = lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only);
+
+    // Prepare all the tensor pointers for executing the graph
+    void* devPtrQ = inp;
+    void* devPtrK = (inp + C);
+    void* devPtrV = (inp + 2 * C);
+    float attn_scale_cpu = 1.0 / sqrtf(HS);
+    void* devPtrO = out;
+
+    // Build variant pack
+    std::unordered_map<int64_t , void*> variant_pack = {
+        {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {Attn_scale_UID, &attn_scale_cpu}, {O_UID, devPtrO}};
+
+    // Add the stats tensor unless we are only doing inference (only needed for backward pass)
+    if (is_inference_only == false) {
+        variant_pack[Stats_UID] = stats;
+    }
+
+    // Execute graph
+    checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace));
+    cudaCheck(cudaGetLastError());
+}
+
+void attention_backward_cudnn(floatX* dqkvr,                                       // output
+                              floatX* dout, floatX* qkvr, floatX* o, float* stats, // inputs
+                              int B, int T, int NH, int C, cudaStream_t stream) {
+    NVTX_RANGE_FN();
+    int HS = C / NH; // number of features per head
+
+    // Get graph and tensors from cache (or generate it on first use)
+    auto graph = lookup_cache_or_build_graph_bwd(B, NH, T, HS);
+
+    // Prepare all the tensor pointers for executing the graph
+    void* devPtrQ = qkvr;
+    void* devPtrK = (qkvr + NH * HS);
+    void* devPtrV = (qkvr + 2 * NH * HS);
+    void* devPtrO = o;
+    void* devPtrdO = dout;
+    void* devPtrStats = stats;
+    float attn_scale_cpu = 1.0 / sqrtf(HS);
+
+    void* devPtrdQ = dqkvr;
+    void* devPtrdK = (dqkvr + NH * HS);
+    void* devPtrdV = (dqkvr + 2 * NH * HS);
+
+    // Build variant pack that links each tensor to its data pointer
+    std::unordered_map<int64_t, void*> variant_pack = {
+        {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {O_UID, devPtrO}, {dO_UID, devPtrdO}, {Stats_UID, devPtrStats},
+        {dQ_UID, devPtrdQ}, {dK_UID, devPtrdK}, {dV_UID, devPtrdV},
+        {Attn_scale_UID, &attn_scale_cpu}};
+
+    // Execute graph
+    cuDNNCheck(cudnnSetStream(cudnn_handle, stream));
+    checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace));
+    cudaCheck(cudaGetLastError());
+}
+
+void create_cudnn() {
+    cuDNNCheck(cudnnCreate(&cudnn_handle));
+}
+
+void destroy_cudnn() {
+    if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); }
+    cuDNNCheck(cudnnDestroy(cudnn_handle));
+}
\ No newline at end of file
diff --git a/CUDA/llmcpp/dataloader.h b/CUDA/llmcpp/dataloader.h
new file mode 100644
index 0000000..0ee0588
--- /dev/null
+++ b/CUDA/llmcpp/dataloader.h
@@ -0,0 +1,496 @@
+#ifndef DATALOADER_H
+#define DATALOADER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>
+#include "utils.h"
+#include "rand.h"
+#ifndef _WIN32
+#include <glob.h>
+#endif
+#define HEADER_SIZE 256
+
+typedef struct
+{
+
+    int process_rank;
+    int num_processes;
+
+    size_t B;
+    size_t T;
+    size_t num_tokens;
+    size_t shard_num_samples;
+
+    glob_t glob_result;
+    size_t current_shard_idx;
+    size_t current_sample_idx;
+
+    FILE *tokens_file;
+
+    uint16_t *buffer;
+    int *inputs;
+    int *targets;
+
+    mt19937_state shuffle_rng;
+    int should_shuffle;
+    int *shard_indices;
+    int *intra_shard_indices;
+
+    size_t total_batch_size_bytes;
+    size_t local_batch_offset_bytes;
+    size_t header_bytes;
+    int64_t file_size_bytes;
+} DataLoader;
+
+int64_t dataloader_load_shard_(DataLoader *loader, int shard_index)
+{
+    if (loader->should_shuffle)
+    {
+        shard_index = loader->shard_indices[shard_index];
+    }
+
+    const char *filename = loader->glob_result.gl_pathv[shard_index];
+
+    if (loader->tokens_file != NULL)
+    {
+        fcloseCheck(loader->tokens_file);
+    }
+    loader->tokens_file = fopenCheck(filename, "rb");
+
+    int header[HEADER_SIZE];
+    freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file);
+    if (header[0] != 20240520)
+    {
+
+        printf("---> HINT: Are you passing in a correct file?\n");
+        printf("---> HINT: The data encoding may have changed, re-run data prepro or refer again to README.\n");
+        exit(EXIT_FAILURE);
+    }
+    if (header[1] != 1)
+    {
+        printf("Bad version in data file\n");
+        exit(EXIT_FAILURE);
+    }
+    int64_t ntok = header[2]; //
+    assert(ntok > 0);
+    fseekCheck(loader->tokens_file, 0, SEEK_END);
+    loader->file_size_bytes = ftell(loader->tokens_file);
+    fseekCheck(loader->tokens_file, 0, SEEK_SET);
+    int64_t expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t);
+    if (loader->file_size_bytes != expected_file_size)
+    {
+        printf("Error: file size is not as expected\n");
+        exit(EXIT_FAILURE);
+    }
+
+    loader->shard_num_samples = (ntok * sizeof(uint16_t) - sizeof(uint16_t)) / loader->total_batch_size_bytes;
+    return ntok;
+}
+
+void prepare_intra_shard_indices_(DataLoader *loader)
+{
+
+    if (loader->intra_shard_indices != NULL)
+    {
+
+        free(loader->intra_shard_indices);
+    }
+    loader->intra_shard_indices = (int *)mallocCheck(loader->shard_num_samples * sizeof(int));
+    init_identity_permutation(loader->intra_shard_indices, (int)loader->shard_num_samples);
+    random_permutation(loader->intra_shard_indices, (int)loader->shard_num_samples, &loader->shuffle_rng);
+}
+
+void dataloader_reset(DataLoader *loader)
+{
+    loader->current_shard_idx = 0;
+    loader->current_sample_idx = 0;
+
+    if (loader->should_shuffle)
+    {
+        random_permutation(loader->shard_indices, (int)loader->glob_result.gl_pathc, &loader->shuffle_rng);
+    }
+
+    dataloader_load_shard_(loader, (int)loader->current_shard_idx);
+
+    if (loader->should_shuffle)
+    {
+        prepare_intra_shard_indices_(loader);
+    }
+}
+
+void dataloader_advance_(DataLoader *loader)
+{
+    if (loader->current_shard_idx == loader->glob_result.gl_pathc - 1)
+    {
+
+        dataloader_reset(loader);
+        return;
+    }
+
+    loader->current_shard_idx = (loader->current_shard_idx + 1) % loader->glob_result.gl_pathc;
+    loader->current_sample_idx = 0;
+    dataloader_load_shard_(loader, (int)loader->current_shard_idx);
+
+    if (loader->should_shuffle)
+    {
+        prepare_intra_shard_indices_(loader);
+    }
+}
+
+void dataloader_init(DataLoader *loader,
+                     const char *filename_pattern,
+                     size_t B,
+                     size_t T,
+                     int process_rank,
+                     int num_processes,
+                     int should_shuffle)
+{
+    loader->process_rank = process_rank;
+    loader->num_processes = num_processes;
+    loader->B = B;
+    loader->T = T;
+    loader->tokens_file = NULL;
+    loader->should_shuffle = should_shuffle;
+    loader->header_bytes = HEADER_SIZE * sizeof(int);
+    loader->total_batch_size_bytes = ((loader->num_processes * (loader->B * loader->T)) * sizeof(uint16_t));
+    loader->local_batch_offset_bytes = loader->process_rank * loader->B * loader->T * sizeof(uint16_t);
+
+    int glob_status = glob(filename_pattern, 0, NULL, &loader->glob_result);
+    if (glob_status != 0)
+    {
+        printf("Error: failed to glob pattern: %s\n", filename_pattern);
+        exit(EXIT_FAILURE);
+    }
+    if (loader->glob_result.gl_pathc == 0)
+    {
+        printf("Error: no files found matching the pattern: %s\n", filename_pattern);
+        exit(EXIT_FAILURE);
+    }
+
+    if (should_shuffle)
+    {
+        mt19937_state shuffle_rng;
+        manual_seed(&shuffle_rng, 42 + process_rank);
+        loader->shuffle_rng = shuffle_rng;
+        loader->shard_indices = (int *)mallocCheck(loader->glob_result.gl_pathc * sizeof(int));
+        init_identity_permutation(loader->shard_indices, (int)loader->glob_result.gl_pathc);
+        loader->intra_shard_indices = NULL;
+    }
+
+    int64_t ntok_total = 0;
+    for (int shard_index = 0; shard_index < loader->glob_result.gl_pathc; shard_index++)
+    {
+        int64_t shard_ntok = dataloader_load_shard_(loader, shard_index);
+
+        assert(shard_ntok >= (int64_t)(num_processes * B * T + 1));
+        ntok_total += shard_ntok;
+    }
+
+    loader->buffer = (uint16_t *)mallocCheck((B * T + 1) * sizeof(uint16_t));
+    loader->inputs = (int *)mallocCheck(B * T * sizeof(int));
+    loader->targets = (int *)mallocCheck(B * T * sizeof(int));
+    loader->num_tokens = ntok_total;
+
+    dataloader_reset(loader);
+}
+
+void dataloader_load_batch(DataLoader *loader)
+{
+    assert(!loader->should_shuffle || (loader->should_shuffle && loader->intra_shard_indices != NULL));
+    assert(loader->current_sample_idx < loader->shard_num_samples);
+    size_t idx = loader->should_shuffle ? loader->intra_shard_indices[loader->current_sample_idx] : loader->current_sample_idx;
+    size_t global_batch_offset_bytes = idx * loader->total_batch_size_bytes;
+    int64_t current_offset = loader->header_bytes + global_batch_offset_bytes + loader->local_batch_offset_bytes;
+
+    size_t B = loader->B;
+    size_t T = loader->T;
+
+    fseekCheck(loader->tokens_file, (int)current_offset, SEEK_SET);
+    freadCheck(loader->buffer, sizeof(uint16_t), B * T + 1, loader->tokens_file);
+
+    for (int i = 0; i < B * T; i++)
+    {
+        loader->inputs[i] = (int)loader->buffer[i];
+        loader->targets[i] = (int)loader->buffer[i + 1];
+    }
+}
+
+void dataloader_next_batch(DataLoader *loader)
+{
+
+    if (loader->current_sample_idx >= loader->shard_num_samples)
+    {
+        dataloader_advance_(loader);
+    }
+    dataloader_load_batch(loader);
+    loader->current_sample_idx += 1;
+}
+
+void dataloader_resume(DataLoader *loader, size_t current_shard_idx, size_t current_sample_idx)
+{
+
+    loader->current_shard_idx = current_shard_idx;
+    loader->current_sample_idx = current_sample_idx;
+    dataloader_load_shard_(loader, (int)loader->current_shard_idx);
+}
+
+void dataloader_free(DataLoader *loader)
+{
+    free(loader->buffer);
+    free(loader->inputs);
+    free(loader->targets);
+    if (loader->should_shuffle)
+    {
+        free(loader->shard_indices);
+        free(loader->intra_shard_indices);
+    }
+    fcloseCheck(loader->tokens_file);
+    globfree(&loader->glob_result);
+}
+
+#define ASSUMED_NUM_COMPLETIONS 4
+#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
+
+typedef struct
+{
+
+    int process_rank;
+    int num_processes;
+
+    size_t B;
+    size_t T;
+    FILE *eval_file;
+    uint16_t *buffer;
+    int num_examples;
+    int num_batches;
+    int start_example_index;
+    int end_example_index;
+    int current_example_index;
+    int *inputs;
+    int *targets;
+    char *mask;
+    int *label;
+    int num_completions;
+} EvalLoader;
+
+void evalloader_reset(EvalLoader *loader)
+{
+    int examples_per_process = CEIL_DIV(loader->num_examples, loader->num_processes);
+    int can_fit_examples = (int)(loader->B / ASSUMED_NUM_COMPLETIONS);
+    if (can_fit_examples == 0)
+    {
+
+        printf("HellaSwag EvalLoader: batch size %zu is < %d\n", loader->B, ASSUMED_NUM_COMPLETIONS);
+        printf("---> HINT: Disable HellaSwag eval with -h 0, or increase batch size with -b\n");
+        exit(EXIT_FAILURE);
+    }
+    loader->num_batches = CEIL_DIV(examples_per_process, can_fit_examples);
+
+    loader->start_example_index = examples_per_process * loader->process_rank;
+    loader->end_example_index = examples_per_process * (loader->process_rank + 1);
+
+    if (loader->end_example_index > loader->num_examples)
+    {
+        loader->end_example_index = loader->num_examples;
+    }
+
+    int64_t header_bytes = HEADER_SIZE * sizeof(int);
+    fseekCheck(loader->eval_file, (int)header_bytes, SEEK_SET);
+    for (int i = 0; i < loader->start_example_index; i++)
+    {
+        uint16_t example_header[3];
+        // read 3 uint16_t values: <START_EXAMPLE>, <EXAMPLE_BYTES>, <EXAMPLE_INDEX>
+        freadCheck(&example_header[0], sizeof(uint16_t), 3, loader->eval_file);
+        // validate the <START_EXAMPLE> delimiter
+        assert(example_header[0] == 65535); // <START_EXAMPLE> delimiter
+        // validate the <EXAMPLE_INDEX>
+        assert(example_header[2] == i); // <EXAMPLE_INDEX> should match the loop index
+        // skip to the next example, keeping in mind that we already read the header
+        size_t remaining_bytes = example_header[1] - sizeof(uint16_t) * 3;
+        assert(remaining_bytes > 0); // we expect some bytes in the example
+        fseekCheck(loader->eval_file, (int)remaining_bytes, SEEK_CUR);
+    }
+    // now we are at the start of the example we want to start at, pointing at <START_EXAMPLE>
+    loader->current_example_index = loader->start_example_index;
+}
+
+void evalloader_init(EvalLoader *loader,
+                     const char *filename,
+                     size_t B,
+                     size_t T,
+                     int process_rank,
+                     int num_processes)
+{
+    loader->process_rank = process_rank;
+    loader->num_processes = num_processes;
+    loader->B = B;
+    loader->T = T;
+
+    // open the file and validate the header
+    loader->eval_file = fopenCheck(filename, "rb");
+    // validate the header
+    int header[HEADER_SIZE];
+    freadCheck(header, sizeof(int), HEADER_SIZE, loader->eval_file);
+    if (header[0] != 20240522)
+    {
+        printf("Bad magic in eval file\n");
+        exit(EXIT_FAILURE);
+    }
+    if (header[1] != 1)
+    {
+        printf("Bad version in data file\n");
+        exit(EXIT_FAILURE);
+    }
+    loader->num_examples = header[2];              // number of examples in the file
+    assert(loader->num_examples >= num_processes); // avoid headaches for now
+    size_t longest_example_bytes = header[3];      // longest example in the file
+    // basic sensibility check we could relax later. but roughly each example
+    // contains the prompt (or "context") and 4 completions, all of these have to be
+    // up to T tokens, and their tokens are uint16_t (so 2 bytes/token).
+    // There's a few more things in each example but they are minor.
+    // So longest example should be roughly this. Just trying to make sure it's sensible.
+    assert(longest_example_bytes > 0 && longest_example_bytes < (1 + ASSUMED_NUM_COMPLETIONS) * T * 2);
+
+    // allocate all the space we'll need
+    int can_fit_examples = (int)(B / ASSUMED_NUM_COMPLETIONS);
+    loader->buffer = (uint16_t *)mallocCheck(longest_example_bytes);
+    loader->inputs = (int *)calloc(B * T, sizeof(int));
+    loader->targets = (int *)calloc(B * T, sizeof(int));
+    loader->mask = (char *)mallocCheck(B * T * sizeof(char));
+    loader->label = (int *)mallocCheck(can_fit_examples * sizeof(int));
+
+    // reset the loader, to initialize it
+    evalloader_reset(loader);
+}
+
+void evalloader_next_example_(EvalLoader *loader, int example_batch_index)
+{
+    size_t B = loader->B;
+    size_t T = loader->T;
+    int batch_dim_offset = example_batch_index * ASSUMED_NUM_COMPLETIONS;
+    uint16_t example_header[3];
+    freadCheck(&example_header[0], sizeof(uint16_t), 3, loader->eval_file);
+    assert(example_header[0] == 65535);
+    assert(example_header[2] == loader->current_example_index);
+    assert(example_header[2] >= loader->start_example_index && example_header[2] < loader->end_example_index);
+
+    size_t example_bytes = example_header[1] - sizeof(uint16_t) * 3;
+    freadCheck(loader->buffer, sizeof(char), example_bytes, loader->eval_file);
+    int label = (int)loader->buffer[0];
+    int can_fit_examples = (int)(loader->B / ASSUMED_NUM_COMPLETIONS);
+    assert(label >= 0 && label < ASSUMED_NUM_COMPLETIONS);
+    assert(example_batch_index >= 0 && example_batch_index < can_fit_examples);
+    loader->label[example_batch_index] = label;
+    int num_completions = (int)loader->buffer[1];
+    assert(num_completions == ASSUMED_NUM_COMPLETIONS);
+    assert(batch_dim_offset + num_completions <= B);
+    loader->num_completions = num_completions;
+
+    int context_length = (int)loader->buffer[2];
+    uint16_t *context_tokens_start = &loader->buffer[3];
+    assert(context_length > 0 && context_length < T);
+    for (int b = 0; b < num_completions; b++)
+    {
+        for (int i = 0; i < context_length; i++)
+        {
+            int boff = batch_dim_offset + b;
+            int tok_cur = (int)context_tokens_start[i];
+            loader->inputs[boff * T + i] = tok_cur;
+        }
+    }
+    uint16_t *completions_iter = loader->buffer + 3 + context_length;
+    for (int c = 0; c < num_completions; c++)
+    {
+        int coff = batch_dim_offset + c;
+        int completion_length = (int)completions_iter[0];
+        uint16_t *completion_tokens_start = completions_iter + 1;
+        assert(completion_length > 0 && context_length + completion_length < T);
+        for (int i = 0; i < completion_length; i++)
+        {
+            int tok_cur = (int)completion_tokens_start[i];
+
+            loader->inputs[coff * T + context_length + i] = tok_cur;
+
+            loader->targets[coff * T + context_length + i - 1] = tok_cur;
+
+            loader->mask[coff * T + context_length + i - 1] = 1;
+        }
+        completions_iter += 1 + completion_length;
+        loader->current_example_index += 1;
+    }
+
+    void evalloader_next_batch(EvalLoader * loader)
+    {
+        size_t B = loader->B;
+        size_t T = loader->T;
+        memset(loader->mask, 0, B * T * sizeof(char));
+        int can_fit_examples = (int)(B / ASSUMED_NUM_COMPLETIONS);
+        for (int i = 0; i < can_fit_examples; i++)
+        {
+            if (loader->current_example_index >= loader->end_example_index)
+            {
+                break;
+            }
+            evalloader_next_example_(loader, i);
+        }
+    }
+
+    int evalloader_stat_losses(EvalLoader * loader, float *losses)
+    {
+        int correct = 0;
+        size_t B = loader->B;
+        size_t T = loader->T;
+        int can_fit_examples = (int)(B / ASSUMED_NUM_COMPLETIONS);
+        for (int i = 0; i < can_fit_examples; i++)
+        {
+            float min_loss = 0.0f;
+            int min_loss_index = -1;
+            char active = 0;
+            for (int b = 0; b < ASSUMED_NUM_COMPLETIONS; b++)
+            {
+                int boff = i * ASSUMED_NUM_COMPLETIONS + b;
+                float average_loss = 0.0f;
+                int count = 0;
+                for (int t = 0; t < T; t++)
+                {
+                    char mask = loader->mask[boff * T + t];
+                    if (mask == 1)
+                    {
+                        active = 1;
+                        average_loss += losses[boff * T + t];
+                        count++;
+                    }
+                }
+                if (count > 0)
+                {
+                    average_loss /= count;
+                }
+                if (b == 0 || average_loss < min_loss)
+                {
+                    min_loss = average_loss;
+                    min_loss_index = b;
+                }
+            }
+            if (active && (min_loss_index == loader->label[i]))
+            {
+                correct += 1;
+            }
+        }
+        return correct;
+    }
+
+    void evalloader_free(EvalLoader * loader)
+    {
+        free(loader->buffer);
+        free(loader->inputs);
+        free(loader->targets);
+        free(loader->mask);
+        free(loader->label);
+        fcloseCheck(loader->eval_file);
+    }
+
+#endif
\ No newline at end of file
diff --git a/CUDA/llmcpp/layernorm.cuh b/CUDA/llmcpp/layernorm.cuh
new file mode 100644
index 0000000..9777d06
--- /dev/null
+++ b/CUDA/llmcpp/layernorm.cuh
@@ -0,0 +1,505 @@
+/*
+LayerNorm CUDA kernel, and also Residual, because sometimes they are fused
+
+Note in llm.c we try to be clever in the backward pass to conserve memory.
+All parameters use a += in the backward pass, so we can do gradient accumulation.
+But all activations have = instead of += because these are faster (just read, no write).
+This is okay for all activations except for those in the residual stream, where the
+gradients have to add. We make sure that we do a += as necessary.
+E.g., the layernorms are connected to the residuals so we += in layernorm backward.
+*/
+
+#include <assert.h>
+// llmc internal imports
+#include "cuda_common.h"
+#include "cuda_utils.cuh"
+
+// ----------------------------------------------------------------------------
+// CUDA kernels
+
+__global__ void layernorm_forward_kernel3(floatX* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd,
+                                    const floatX*  __restrict__ inp, const floatX*  __restrict__ weight,
+                                    const floatX* __restrict__ bias, int N, int C) {
+    int lane_id = threadIdx.x % WARP_SIZE;
+    int warp_id = threadIdx.x / WARP_SIZE;
+    int num_warps = blockDim.x / WARP_SIZE;
+
+    int idx = blockIdx.x * num_warps + warp_id;
+    if(idx >= N) { return; } // guard
+
+    // the row of input that this group of threads is responsible for
+    const floatX* x = inp + idx * C;
+
+    // mean
+    float sum = 0.0f;
+    for (int i = lane_id; i < C; i += WARP_SIZE) {
+        sum += (float)x[i];
+    }
+    sum = warpReduceSum(sum);
+    float m = sum / C;
+    if(lane_id == 0 && mean != nullptr) {
+        __stcs(mean + idx, m);
+    }
+
+    // rstd
+    sum = 0.0f;
+    for (int i = lane_id; i < C; i += WARP_SIZE) {
+        float diff = (float)x[i] - m;
+        sum += diff * diff;
+    }
+    sum = warpReduceSum(sum);
+    float s = rsqrtf(sum / C + 1e-5f);
+    if(lane_id == 0 && rstd != nullptr) {
+        __stcs(rstd + idx, s);
+    }
+
+    // final normalization and scaling by weight/bias
+    floatX* o = out + idx * C;
+    for (int c = lane_id; c < C; c += WARP_SIZE) {
+        // load and store using the .cs "streaming" hint to the compiler,
+        // indicating that this data will not be reused soon, and can be streamed through the caches
+        // this allows the threads to get more cache-hits for the (shared) weight and bias parameters
+        float n = s * ((float)__ldcs(x+c) - m);
+        __stcs(o+c, (floatX)(n * (float)weight[c] + (float)bias[c]));
+    }
+}
+
+__global__ void layernorm_forward_kernel6(floatX* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd,
+                                    const floatX*  __restrict__ inp, const floatX*  __restrict__ weight,
+                                    const floatX* __restrict__ bias, int N, int C) {
+    assert(blockDim.x == WARP_SIZE);
+
+    // load weights and biases into shared memory
+    // do this before we allow any threads to exit!
+    extern __shared__ char* params[];
+    // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so
+    // let's keep everything as x128
+    x128* s_weight = reinterpret_cast<x128*>(params);
+    x128* s_bias = reinterpret_cast<x128*>(params) + (C / x128::size);
+    x128* s_in = reinterpret_cast<x128*>(params) + ((2 + threadIdx.y) * C / x128::size);
+
+    int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size;
+    for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) {
+        s_weight[i/x128::size] = load128(weight + i);
+        s_bias[i/x128::size] = load128(bias + i);
+    }
+    __syncthreads();
+
+    int idx = blockIdx.x * blockDim.y + threadIdx.y;
+    if(idx >= N) { return; } // guard
+
+    // adjust pointers to current token
+    inp += idx * C;
+    out += idx * C;
+
+    const float eps = 1e-5f;
+    float sum = 0.0f;
+    for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) {
+        const x128 in_data = load128cs(inp + c);
+        for(int k = 0; k < x128::size; ++k) {
+            sum += (float)in_data[k];
+        }
+        s_in[c / x128::size] = in_data;
+    }
+
+    sum = warpReduceSum(sum);
+    float m = sum / C;
+    float v = 0.f;
+
+    for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) {
+        const x128 in_data = s_in[c / x128::size];
+        for(int k = 0; k < x128::size; ++k) {
+            v += ((float)in_data[k] - m) * ((float)in_data[k] - m);
+        }
+    }
+
+    v = warpReduceSum(v) / C;
+    float s = rsqrtf(v + eps);
+
+    for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) {
+        const x128 in_data = s_in[c / x128::size];
+        const x128 w = s_weight[c / x128::size];
+        const x128 b = s_bias[c / x128::size];
+        x128 out_data;
+        for(int k = 0; k < x128::size; ++k) {
+            float n = s * ((float)in_data[k] - m); // normalized output
+            float o = n * (float)w[k] + (float)b[k]; // scale and shift it
+            out_data[k] = (floatX)o;
+        }
+
+        store128cs(out + c, out_data);
+    }
+    // cache the mean and rstd for the backward pass later
+    if(threadIdx.x == 0 && mean != nullptr) {
+        __stcs(mean + idx, m);
+    }
+    // store the rstd, no need to cache it
+    if(threadIdx.x == 0 && rstd != nullptr) {
+        __stcs(rstd + idx, s);
+    }
+}
+
+__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, float* mean, float* rstd,
+                                               const floatX* inp1, const floatX* inp2,
+                                               const floatX* weight, const floatX* bias,
+                                               int N, int C) {
+    assert(blockDim.x == WARP_SIZE);
+
+    // load weights and biases into shared memory
+    // do this before we allow any threads to exit!
+    extern __shared__ char* params[];
+    // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so
+    // let's keep everything as x128
+    x128* s_weight = reinterpret_cast<x128*>(params);
+    x128* s_bias = reinterpret_cast<x128*>(params) + (C / x128::size);
+    x128* s_res = reinterpret_cast<x128*>(params) + ((2 + threadIdx.y) * C / x128::size);
+
+    int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size;
+    for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) {
+        s_weight[i/x128::size] = load128(weight + i);
+        s_bias[i/x128::size] = load128(bias + i);
+    }
+    __syncthreads();
+
+    int idx = blockIdx.x * blockDim.y + threadIdx.y;
+    if(idx > N) return;
+
+    // adjust pointers to current token
+    residual += C * idx;
+    normed += C * idx;
+    inp1 += C * idx;
+    inp2 += C * idx;
+
+    const float eps = 1e-5f;
+    float sum = 0.0f;
+    for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) {
+        const x128 in1 = load128cs(inp1 + c);
+        const x128 in2 = load128cs(inp2 + c);
+        x128 out;
+        for(int k = 0; k < x128::size; ++k) {
+            out[k] = (float)in1[k] + (float)in2[k];
+            sum += (float)out[k];
+        }
+        store128cs(residual + c, out);
+        s_res[c / x128::size] = out;
+    }
+
+    sum = warpReduceSum(sum);
+    float m = sum / C;
+    float v = 0.f;
+
+    for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) {
+        const x128 res = s_res[c / x128::size];
+        for(int k = 0; k < x128::size; ++k) {
+            v += ((float)res[k] - m) * ((float)res[k] - m);
+        }
+    }
+
+    v = warpReduceSum(v) / C;
+    float s = rsqrtf(v + eps);
+
+    for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) {
+        const x128 res = s_res[c / x128::size];
+        const x128 w = s_weight[c / x128::size];
+        const x128 b = s_bias[c / x128::size];
+        x128 out;
+        for(int k = 0; k < x128::size; ++k) {
+            float n = s * ((float)res[k] - m); // normalized output
+            float o = n * (float)w[k] + (float)b[k]; // scale and shift it
+            out[k] = o;
+        }
+
+        store128cs(normed + c, out);
+    }
+    // cache the mean and rstd for the backward pass later
+    if(threadIdx.x == 0) {
+        mean[idx] = m;
+        rstd[idx] = s;
+    }
+}
+
+__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2) {
+    int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size;
+
+    x128 packed_out;
+    x128 packed_inp1 = load128cs(inp1 + idx);
+    x128 packed_inp2 = load128cs(inp2 + idx);
+    for (int k = 0; k < packed_inp1.size; k++) {
+        packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]);
+    }
+    store128(out + idx, packed_out);
+}
+
+__global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with only 1024 threads?
+    layernorm_backward_kernel10(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch,
+                                const floatX* dout, const floatX* inp, const floatX* weight,
+                                const float* mean, const float* rstd,
+                                int B, int T, int C) {
+    int BLOCK_SIZE = blockDim.x;
+    int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block
+    extern __shared__ float shared[];
+
+    int warpId = threadIdx.x / WARP_SIZE; // warp index within a block
+    int baseIdx = blockIdx.x * warpsInBlock + warpId;
+    int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp
+    int warpsInGrid = gridDim.x * warpsInBlock;
+    int C_per_iteration = WARP_SIZE * x128::size;
+    int iterations_C = CEIL_DIV(C, C_per_iteration); // + 2;
+
+    // the first half of shared memory is bias, second is weight
+    size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size);
+    float* dbias_shared = shared;
+    float* dweight_shared = shared + rounded_C;
+    // warp zero doesn't actually write to the _tmp_shared memory locations, so we don't need to reserve memory
+    // the obvious solution is to change the addressing below to use (threadId.x-32) as offset, but that causes
+    // register spills, so instead we mess with the base pointer here, which doesn't increase register usage.
+    float* dbias_tmp_shared = shared + 2 * rounded_C - WARP_SIZE * f128::size;
+    float* dweight_tmp_shared = shared + 2 * rounded_C + f128::size * BLOCK_SIZE - 2 * WARP_SIZE * f128::size;
+
+    // init shared memory to zero
+    for(int i = threadIdx.x * f128::size; i < rounded_C; i += BLOCK_SIZE * f128::size) {
+        store128(dbias_shared + i, f128::zeros());
+        store128(dweight_shared + i, f128::zeros());
+    }
+    __syncthreads();
+
+    for (int bt = baseIdx; bt < B * T; bt += warpsInGrid) {
+        const floatX* dout_bt = dout + bt * C;
+        const floatX* inp_bt = inp +bt * C;
+        floatX* dinp_bt = dinp + bt * C;
+
+        // first: two reduce operations
+        float dnorm_mean = 0.0f;
+        float dnorm_norm_mean = 0.0f;
+        for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) {
+            x128 dout128_i   = load128(dout_bt + i);
+            x128 inp128_i    = load128(inp_bt  + i);
+            x128 weight128_i = load128(weight  + i);
+            for (int k = 0; k < x128::size; k++) {
+                float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k];
+                dnorm_mean += dnorm_i;
+                dnorm_norm_mean += dnorm_i * (float)inp128_i[k];
+            }
+        }
+
+        const float mean_bt = mean[bt];
+        const float rstd_bt = rstd[bt];
+        dnorm_mean = warpReduceSum(dnorm_mean) / C;
+        dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C * rstd_bt - dnorm_mean * mean_bt * rstd_bt;
+
+        for (int c = 0; c < iterations_C; c++) {
+            int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration);
+
+            x128 dout128   = x128::zeros();
+            x128 inp128    = x128::zeros();
+            x128 dinp128   = x128::zeros();
+            x128 weight128 = x128::zeros();
+
+            if(global_index < C) {
+                dout128 = load128cs(dout_bt + global_index);
+                inp128 = load128cs(inp_bt + global_index);
+                dinp128 = load128(dinp_bt + global_index);
+                weight128 = load128(weight + global_index);
+            }
+
+            for(int o = 0; o < x128::size / f128::size; ++o) {
+                f128 dbias_f;
+                f128 dweight_f;
+                for(int i = 0; i < f128::size; ++i) {
+                    int x = o * f128::size + i;
+                    float dout_i = (float)dout128[x];
+                    float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt;
+                    dbias_f[i] = dout_i;
+                    dweight_f[i] = norm_bti * dout_i;
+
+                    float dval = 0.0f;
+                    dval += (float) weight128[x] * (float)dout128[x]; // term 1
+                    dval -= dnorm_mean; // term 2
+                    dval -= norm_bti * dnorm_norm_mean; // term 3
+                    dval *= rstd_bt; // final scale
+                    dinp128[x] = (floatX) ((float) dinp128[x] + dval);
+                }
+
+                if (warpId != 0) {
+                    store128(dbias_tmp_shared + threadIdx.x * f128::size, dbias_f);
+                    // this seems to generate a 64-bit store, instead of 128-bit.
+                    // however, forcing 128-bit (e.g., using inline ptx), results in register
+                    // spilling and much worse performance, so we'll keep it like this for now
+                    // but ideally, we could reduce the register pressure a little.
+                    store128(dweight_tmp_shared + threadIdx.x * f128::size, dweight_f);
+                }
+                __syncthreads();
+                if (warpId == 0) {
+                    for (int j = 1; j < warpsInBlock; j++) {
+                        f128 dbias_tmp = load128(dbias_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE));
+                        f128 dweight_tmp = load128(dweight_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE));
+                        for(int i = 0; i < f128::size; ++i) {
+                            dbias_f[i] += dbias_tmp[i];
+                            dweight_f[i] += dweight_tmp[i];
+                        }
+                    }
+                }
+                __syncthreads();
+                if (warpId == 0) {
+                    f128 db_old = load128(dbias_shared + global_index + f128::size * o);
+                    f128 dw_old = load128(dweight_shared + global_index + f128::size * o);
+                    for(int i = 0; i < f128::size; ++i) {
+                        dbias_f[i] += db_old[i];
+                        dweight_f[i] += dw_old[i];
+                    }
+                    store128(dbias_shared + global_index + f128::size * o, dbias_f);
+                    store128(dweight_shared + global_index + f128::size * o, dweight_f);
+                }
+            }
+            if(global_index < C) {
+                // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing
+                store128cg(dinp_bt + global_index, dinp128);
+            }
+        }
+    }
+    __syncthreads();
+    // Each block writes its partial sum to global memory
+    // The last block to finish becomes responsible for summing up all the partial sums
+    // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel)
+    unsigned int* scratchFlag = (unsigned int*)(scratch);
+    // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned
+    scratch += 32;
+    float* scratch_dbias = scratch;
+    float* scratch_dweight = scratch + C;
+    for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) {
+        // Write to global memory in the same "shared memory banking friendly" order
+        store128(scratch_dbias + i + 2*C*blockIdx.x, load128(dbias_shared + i));
+        store128(scratch_dweight + i + 2*C*blockIdx.x, load128(dweight_shared + i));
+    }
+    __syncthreads();
+    // that portion of shared memory is no longer used, so we can repurpose it for the scratch flag.
+    unsigned int *tmp_flag = (unsigned int*)(shared + 2*rounded_C);
+    if (threadIdx.x == 0) {
+        *tmp_flag = atomicInc(scratchFlag, gridDim.x);
+    }
+    __syncthreads();
+    if (*tmp_flag == gridDim.x-1) {
+        // Reduction of the partial sums by the final block
+        // todo - there isn't enough parallelism even inside that single SM...
+        // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?!
+        for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) {
+            f128 dbias_accum = f128::zeros();
+            f128 dweight_accum = f128::zeros();
+
+            for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) {
+                int offset = i + 2*C*read_block_idx;
+                f128 dbias128 = load128(scratch_dbias + offset);
+                f128 dweight128 = load128(scratch_dweight + offset);
+                for(int k = 0; k < f128::size; k++) {
+                    dbias_accum[k] += dbias128[k];
+                    dweight_accum[k] += dweight128[k];
+                }
+            }
+            store128(dbias_shared + i, dbias_accum);
+            store128(dweight_shared + i, dweight_accum);
+        }
+        __syncthreads();
+
+        // convert from float/FP32 to floatX/BF16 for the final write
+        // this is separate because it cannot use as many warps as the above (f128 vs x128)
+        // todo - if we split this code into another kernel, we could maybe do it at the same time?
+        for (int c = warpId; c < iterations_C; c += warpsInBlock) {
+            int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration);
+            if (global_index >= C) {
+                break;
+            }
+
+            x128 dbias128 = load128(dbias + global_index);
+            x128 dweight128 = load128(dweight + global_index);
+            for(int o = 0; o < x128::size / f128::size; ++o) {
+                f128 s_db = load128(dbias_shared + global_index + o * f128::size);
+                f128 s_dw = load128(dweight_shared + global_index + o * f128::size);
+                for(int i = 0; i < f128::size; ++i) {
+                    int x = o * f128::size + i;
+                    dbias128[x] = (floatX)(s_db[i] + (float)dbias128[x]);
+                    dweight128[x] = (floatX)(s_dw[i] + (float)dweight128[x]);
+                }
+            }
+            store128(dbias + global_index, dbias128);
+            store128(dweight + global_index, dweight128);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// kernel launchers
+
+// similar to `fused_residual_forward5`
+void layernorm_forward(floatX* out, float* mean, float* rstd,
+                       floatX* inp, const floatX* weight, const floatX* bias,
+                       int B, int T, int C, cudaStream_t stream) {
+    NVTX_RANGE_FN();
+    const int block_size = 256;
+    int block_y = block_size / WARP_SIZE;
+    const int N = B * T;
+    const int grid_size = CEIL_DIV(N, block_y);
+    size_t smem = (2 + block_y) * C * sizeof(floatX);
+
+    // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute
+    // this may fail, in which case we fall back to the smem free implementation.
+    cudaCheck(cudaGetLastError());
+    auto status = cudaFuncSetAttribute(layernorm_forward_kernel6, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
+    cudaCheck(cudaGetLastError());
+    if (status == cudaSuccess) {
+        layernorm_forward_kernel6<<<grid_size, dim3(WARP_SIZE, block_y), smem, stream>>>(out, mean, rstd, inp, weight, bias, N, C);
+    } else {
+        // fall back to the version without shared memory
+        const int grid_size_fb = CEIL_DIV(N * WARP_SIZE, block_size);
+        layernorm_forward_kernel3<<<grid_size_fb, block_size, 0, stream>>>(out, mean, rstd, inp, weight, bias, N, C);
+    }
+    cudaCheck(cudaGetLastError());
+}
+
+void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N, cudaStream_t stream) {
+    NVTX_RANGE_FN();
+    const int block_size = 256;
+    assert(N % (block_size * x128::size) == 0);
+    const int grid_size = CEIL_DIV(N, block_size * x128::size);
+    residual_forward_kernel<<<grid_size, block_size, 0, stream>>>(out, inp1, inp2);
+    cudaCheck(cudaGetLastError());
+}
+
+void fused_residual_forward5(floatX* residual, floatX* normed, float* mean, float* rstd,
+                             const floatX* inp1, const floatX* inp2,
+                             const floatX* weight, const floatX* bias,
+                             int N, int C, cudaStream_t stream) {
+    const int block_size = 256;
+    int block_y = block_size / WARP_SIZE;
+    const int grid_size = CEIL_DIV(N, block_y);
+    size_t smem = (2 + block_y) * C * sizeof(floatX);
+
+    // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute
+    // this may fail, in which case we fall back to the smem free implementation.
+    cudaCheck(cudaGetLastError());
+    auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem);
+    cudaCheck(cudaGetLastError());
+    if(status == cudaSuccess) {
+        fused_residual_forward_kernel5<<<grid_size, dim3(WARP_SIZE, block_y), smem, stream>>>(residual, normed,
+                                                                                              mean, rstd, inp1, inp2,
+                                                                                              weight, bias, N, C);
+    } else {
+        residual_forward(residual, inp1, inp2, N*C, stream);
+        layernorm_forward(normed, mean, rstd, residual, weight, bias, N, 1, C, stream);
+    }
+    cudaCheck(cudaGetLastError());
+}
+
+void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch,
+                        const floatX* dout, const floatX* inp, const floatX* weight, const float* mean, const float* rstd,
+                        int B, int T, int C, cudaStream_t stream) {
+    NVTX_RANGE_FN();
+    const int block_size = 512;
+    const int blocks_per_sm = 2; // supported on every architecture and less cache thrashing than 3
+    const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount;
+    size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size);
+    size_t shared_mem_size = (2 * rounded_C + 2 * (block_size - 32) * f128::size) * sizeof(float);
+
+    cudaCheck(cudaMemsetAsync(scratch, 0, 1 * sizeof(float), stream)); // only need to reset the flag to 0
+    layernorm_backward_kernel10<<<grid_size, block_size, shared_mem_size, stream>>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C);
+    cudaCheck(cudaGetLastError());
+}
diff --git a/CUDA/llmcpp/mfu.h b/CUDA/llmcpp/mfu.h
new file mode 100644
index 0000000..1c40b7b
--- /dev/null
+++ b/CUDA/llmcpp/mfu.h
@@ -0,0 +1,244 @@
+#ifndef MFU_H
+#define MFU_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if __has_include(<nvml.h>)
+#define USE_NVML 1
+#include <nvml.h>
+#else
+#define USE_NVML 0
+#endif
+
+// tied to enum PrecisionMode, in a future refactor make them the same
+#define MFUH_PRECISION_FP32 0
+#define MFUH_PRECISION_FP16 1
+#define MFUH_PRECISION_BF16 2
+
+#if USE_NVML
+inline void nvml_check(nvmlReturn_t status, const char *file, int line) {
+    if (status != NVML_SUCCESS) {
+        printf("[NVML ERROR] at file %s:%d:\n%s\n", file, line, nvmlErrorString(status));
+        exit(EXIT_FAILURE);
+    }
+};
+#define nvmlCheck(err) (nvml_check(err, __FILE__, __LINE__))
+#endif
+
+
+typedef struct {
+    float TF_32;       // tensor-core performance 32 bit
+    float BF_16_32;    // bf16 with 32 bit accumulate
+    float FP_16_32;    // fp16 with 32 bit accumulate
+    float FP_16_16;    // fp16 with 16 bit accumulate
+    float FP_8_32;     // and so on
+    float FP_8_16;
+    float CLOCK;        // clock frequency from the spec sheet
+    float CORES;        // #TCs from the spec sheet
+} PerfData;
+
+// basic default data from the nvidia whitepapers
+static const PerfData VOLTA = {125.0f, -1.f, 125.f, -1.f, -1.f, -1.f, 1530.f, 640.f};
+static const PerfData AMPERE_DATACENTER = {156.f, 312.f, 312.f, 312.f, -1.f, -1.f, 1410.f, 432.f};
+static const PerfData AMPERE_CONSUMER = {40.f, 80.f, 80.f, 160.f, -1.f, -1.f, 1860.f, 336.f};
+static const PerfData HOPPER = {378.f, 756.f, 756.f, 756.f, 1513.f, 1513.f, 1620.f, 456.f};
+static const PerfData ADA = {82.6f, 165.2f, 165.2f, 330.3f, 330.3f, 660.6f, 2520.f, 512.f};
+
+typedef struct {
+    const char* name;
+    const PerfData* perf_data;
+    float new_cores;
+    float new_mhz;
+} GPUEntry;
+
+// the overrides for each specific GPU
+static GPUEntry gpu_db[] = {
+    {"Tesla V100-SXM2-16GB", &VOLTA, 640, 1530},
+    {"Tesla V100-PCIE-32GB", &VOLTA, 640, 1530},
+    {"NVIDIA A100-PCIE-40GB", &AMPERE_DATACENTER, 432, 1410},
+    {"NVIDIA A100-PCIE-80GB", &AMPERE_DATACENTER, 432, 1410},
+    {"NVIDIA A100-SXM4-40GB", &AMPERE_DATACENTER, 432, 1410},
+    {"NVIDIA A100-SXM4-80GB", &AMPERE_DATACENTER, 432, 1410},
+    {"NVIDIA RTX A2000", &AMPERE_CONSUMER, 104, 1200},
+    {"NVIDIA RTX A4000", &AMPERE_CONSUMER, 192, 1560},
+    {"NVIDIA RTX A4500", &AMPERE_CONSUMER, 224, 1650},
+    {"NVIDIA RTX A5000", &AMPERE_CONSUMER, 256, 1695},
+    {"NVIDIA RTX A5500", &AMPERE_CONSUMER, 320, 1770},
+    {"NVIDIA RTX A6000", &AMPERE_CONSUMER, 336, 1800},
+    {"NVIDIA GeForce RTX 3090 Ti", &AMPERE_CONSUMER, 336, 1860},
+    {"NVIDIA GeForce RTX 3090", &AMPERE_CONSUMER, 328, 1695},
+    {"NVIDIA GeForce RTX 3080 Ti", &AMPERE_CONSUMER, 320, 1665},
+    {"NVIDIA GeForce RTX 3080", &AMPERE_CONSUMER, 272, 1710},
+    {"NVIDIA GeForce RTX 3070 Ti", &AMPERE_CONSUMER, 192, 1770},
+    {"NVIDIA GeForce RTX 3070", &AMPERE_CONSUMER, 184, 1725},
+    {"NVIDIA GeForce RTX 3060 Ti", &AMPERE_CONSUMER, 152, 1665},
+    {"NVIDIA GeForce RTX 3060", &AMPERE_CONSUMER, 112, 1777},
+    {"NVIDIA RTX A2000 ADA", &ADA, 88, 2130},
+    {"NVIDIA RTX A4000 ADA", &ADA, 192, 2175},
+    {"NVIDIA RTX A4500 ADA", &ADA, 224, 2580},
+    {"NVIDIA RTX A5000 ADA", &ADA, 400, 2550},
+    {"NVIDIA RTX A5880 ADA", &ADA, 440, 2460},
+    {"NVIDIA RTX A6000 ADA", &ADA, 568, 2505},
+    {"NVIDIA GeForce RTX 4090", &ADA, 512, 2520},
+    {"NVIDIA GeForce RTX 4080 SUPER", &ADA, 320, 2550},
+    {"NVIDIA GeForce RTX 4080", &ADA, 304, 2505},
+    {"NVIDIA GeForce RTX 4070 Ti SUPER", &ADA, 264, 2610},
+    {"NVIDIA GeForce RTX 4070 Ti", &ADA, 240, 2610},
+    {"NVIDIA GeForce RTX 4070 SUPER", &ADA, 224, 2475},
+    {"NVIDIA GeForce RTX 4070", &ADA, 184, 2475},
+    {"NVIDIA GeForce RTX 4070", &ADA, 184, 2475},
+    {"NVIDIA GeForce RTX 4060 Ti", &ADA, 136, 2535},
+    {"NVIDIA GeForce RTX 4060", &ADA, 96, 2460},
+    {"NVIDIA H100 PCIe", &HOPPER, 456, 1620},
+    {"NVIDIA H100 80GB HBM3", &HOPPER, 528, 1830}, // HBM3 = SXM5
+};
+
+float get_flops_promised(const char* device, int precision_mode) {
+    /*
+    This function is used to estimate the Model Flops Utilization (MFU)
+    basically we have to figure out how many flops the GPU can do per second.
+    Note that this is not a simple endeavor and may well go wrong! The details are tricky.
+    The returned value is in units of 1e12.
+
+    For the non-top models, actual performance numbers aren't that easy to find, e.g.,
+    here https://www.techpowerup.com/gpu-specs/rtx-a4000.c3756, does "Theoretical Performance"
+    seems to be without tensor cores.
+
+    So, instead we use that all these cards just use the same types of tensor cores in different
+    numbers and at different frequencies. Then we just need to look up these two easily accesible
+    numbers for all the other GPUs.
+    linear scaling seems to work: comparing spec sheet and calculation:
+    4080: 304TCs, 2505 GHz; 97.5TFlops = 165.2/512*304 /2520 * 2505
+
+    Original numbers for the top GPUS are from.
+    https://resources.nvidia.com/en-us-tensor-core
+    https://images.nvidia.com/aem-dam/Solutions/geforce/ada/nvidia-ada-gpu-architecture.pdf
+    */
+
+   // validate the precision mode as one of the three possible values
+    if (!(precision_mode == MFUH_PRECISION_FP32 || precision_mode == MFUH_PRECISION_FP16 || precision_mode == MFUH_PRECISION_BF16)) {
+        fprintf(stderr, "Invalid precision mode: %d\n", precision_mode);
+        return -1.0f;
+    }
+
+    // do a linear search until you find our GPU, then calculate the flops promised
+    int num_gpu_entries = sizeof(gpu_db) / sizeof(gpu_db[0]);
+    for (int i = 0; i < num_gpu_entries; i++) {
+        if (strcmp(gpu_db[i].name, device) == 0) {
+            const PerfData* perf_data = gpu_db[i].perf_data;
+
+            // look up the default flops value for the given precision mode
+            float value = -1.0f;
+            if (precision_mode == MFUH_PRECISION_BF16) { value = perf_data->BF_16_32; }
+            if (precision_mode == MFUH_PRECISION_FP32) { value = perf_data->TF_32; }
+            if (precision_mode == MFUH_PRECISION_FP16) { value = perf_data->FP_16_32; }
+
+            // we'd get here if we're e.g. trying to use BF16 on Volta GPU or something...
+            if (value < 0.0f) {
+                fprintf(stderr, "No data for GPU %s and precision mode %d\n", device, precision_mode);
+                return -1.0f;
+            }
+
+            // adjust flops based on the specific core count and clock frequency of this GPU
+            float new_cores = gpu_db[i].new_cores;
+            float new_mhz = gpu_db[i].new_mhz;
+            float adjusted = value * (new_cores / perf_data->CORES) * (new_mhz / perf_data->CLOCK);
+            return adjusted;
+        }
+    }
+
+    return -1.0f; // ¯\_(ツ)_/¯
+}
+
+struct GPUUtilInfo {
+    unsigned int clock;
+    unsigned int max_clock;
+    unsigned int power;
+    unsigned int power_limit;
+    unsigned int fan;
+    unsigned int temperature;
+    unsigned int temp_slowdown;
+
+    float gpu_utilization;
+    float mem_utilization;
+    const char* throttle_reason;
+};
+
+// lazily initialize nvml and generate a handle to the GPU
+#if USE_NVML
+nvmlDevice_t nvml_get_device() {
+    static bool needs_init = true;
+    static nvmlDevice_t device;
+    if(needs_init) {
+        needs_init = false;
+        nvmlCheck(nvmlInit());
+        nvmlCheck(nvmlDeviceGetHandleByIndex_v2(0, &device));
+    }
+    return device;
+}
+
+// convert throttle reason bitfield into a text reason.
+// this is a lossy conversion; we just want to give some idea of what is happening
+const char* get_throttle_reason(unsigned long long bits) {
+    if(bits & (nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown)) {
+        return "power cap";
+    } else if (bits & (nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown)) {
+        return "thermal cap";
+    } else if (bits & (nvmlClocksThrottleReasonAll)) {
+        return "other cap";
+    } else {
+        return "no cap";
+    }
+}
+
+// gather data for a GPUUtilInfo object
+GPUUtilInfo get_gpu_utilization_info() {
+    GPUUtilInfo info;
+    nvmlDevice_t device = nvml_get_device();
+    // query different infos directly
+    nvmlCheck(nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &info.clock));
+    nvmlCheck(nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &info.max_clock));
+    nvmlCheck(nvmlDeviceGetPowerManagementLimit(device, &info.power_limit));
+    nvmlCheck(nvmlDeviceGetPowerUsage(device, &info.power));
+    nvmlCheck(nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info.temperature));
+    nvmlCheck(nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &info.temp_slowdown));
+    unsigned long long throttle;
+    nvmlCheck(nvmlDeviceGetCurrentClocksThrottleReasons(device, &throttle));
+    info.throttle_reason = get_throttle_reason(throttle);
+    nvmlCheck(nvmlDeviceGetFanSpeed(device, &info.fan));
+
+    // for "utilization", we look at recorded samples. In principle, we could query the driver for how many samples
+    // to request, but then we'd need to dynamically allocate sufficient space. Let's just hard-code a limit of 128,
+    // and have no memory management required
+    constexpr const int BUFFER_LIMIT = 128;
+    nvmlSample_t buffer[BUFFER_LIMIT];
+    nvmlValueType_t v_type;
+    unsigned int sample_count = BUFFER_LIMIT;
+    nvmlCheck(nvmlDeviceGetSamples(device, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
+    float gpu_utilization = 0.f;
+    for(unsigned i = 0; i < sample_count; ++i) {
+        gpu_utilization += (float)buffer[i].sampleValue.uiVal;
+    }
+    gpu_utilization /= (float)sample_count;
+
+    // sample count may have been modified by the query above; reset back to buffer size
+    sample_count = BUFFER_LIMIT;
+    nvmlCheck(nvmlDeviceGetSamples(device, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer));
+    float mem_utilization = 0.f;
+    for(unsigned i = 0; i < sample_count; ++i) {
+        mem_utilization += (float)buffer[i].sampleValue.uiVal;
+    }
+    mem_utilization /= (float)sample_count;
+
+    info.gpu_utilization = gpu_utilization;
+    info.mem_utilization = mem_utilization;
+    return info;
+}
+#else
+GPUUtilInfo get_gpu_utilization_info() {
+    fprintf(stderr, "Error: Compiled without nvml support. Cannot perform additional GPU state tracking.");
+    exit(EXIT_FAILURE);
+}
+#endif
+#endif // MFU_H
diff --git a/CUDA/llmcpp/rand.h b/CUDA/llmcpp/rand.h
new file mode 100644
index 0000000..b66aa04
--- /dev/null
+++ b/CUDA/llmcpp/rand.h
@@ -0,0 +1,240 @@
+/*
+Mersenne Twisters implementation, numerically identical to torch.
+
+Example usage:
+
+    mt19937_state state;
+    manual_seed(&state, 137);
+    printf("%u\n", randint32(&state));
+    printf("%u\n", randint32(&state));
+    printf("%u\n", randint32(&state));
+    printf("%u\n", randint32(&state));
+    printf("%u\n", randint32(&state));
+
+    float t8[8];
+    normal_(t8, 8, 0, 1, &state);
+    for (int i = 0; i < 8; i++) {
+        printf("%f\n", t8[i]);
+    }
+    printf("%u\n", randint32(&state));
+
+    float t16[16];
+    normal_(t16, 16, 0, 1, &state);
+    for (int i = 0; i < 16; i++) {
+        printf("%f\n", t16[i]);
+    }
+    printf("%u\n", randint32(&state));
+
+PyTorch reference (producing identical results):
+
+    import torch
+    torch.manual_seed(137)
+    print(torch.randint(0, 0xFFFFFFFF, [1]).item())
+    print(torch.randint(0, 0xFFFFFFFF, [1]).item())
+    print(torch.randint(0, 0xFFFFFFFF, [1]).item())
+    print(torch.randint(0, 0xFFFFFFFF, [1]).item())
+    print(torch.randint(0, 0xFFFFFFFF, [1]).item())
+    t = torch.zeros(8);
+    t.normal_()
+    for i in range(len(t)) :
+        print(t[i].item())
+    print(torch.randint(0, 0xFFFFFFFF, [1]).item())
+    t = torch.zeros(16);
+    t.normal_()
+    for i in range(len(t)) :
+        print(t[i].item())
+    print(torch.randint(0, 0xFFFFFFFF, [1]).item())
+
+Both output:
+
+    4053805790
+    2173880614
+    380293709
+    1237255315
+    2986595568
+    0.7947664260864258
+    1.4369317293167114
+    - 0.2292192131280899
+    0.47556325793266296
+    - 0.6334410905838013
+    - 0.5791953802108765
+    - 0.0925704762339592
+    - 0.8659197092056274
+    2186503452
+    - 1.2813878059387207
+    - 2.646395683288574
+    - 0.06569503247737885
+    0.2180829495191574
+    - 0.46536165475845337
+    - 0.33108410239219666
+    2.5485482215881348
+    0.10425379872322083
+    0.8460659980773926
+    0.9462448358535767
+    - 0.2913765013217926
+    0.34313806891441345
+    - 1.1186704635620117
+    - 0.18305328488349915
+    - 2.3153159618377686
+    0.3961987793445587
+    2756748748
+*/
+
+#ifndef RAND_H
+#define RAND_H
+
+#include <math.h>
+
+#define MERSENNE_STATE_M 397u
+#define MERSENNE_STATE_N 624u
+
+#define LMASK 0x7ffffffful
+#define UMASK 0x80000000ul
+
+// Copyright(c) Makoto Matsumoto and Takuji Nishimura
+
+// This implementation follows PyTorch so that we are numerically identical when running verification tests.
+
+typedef struct {
+    unsigned long long seed_;
+    int left_;
+    unsigned int next_;
+    unsigned int state_[MERSENNE_STATE_N];
+    unsigned int MATRIX_A[2];
+} mt19937_state;
+
+void manual_seed(mt19937_state* state, unsigned int seed) {
+    state->MATRIX_A[0] = 0x0u;
+    state->MATRIX_A[1] = 0x9908b0df;
+    state->state_[0] = seed & 0xffffffff;
+    for (unsigned int j = 1; j < MERSENNE_STATE_N; j++) {
+        state->state_[j] = 1812433253 * (state->state_[j - 1] ^ (state->state_[j - 1] >> 30)) + j;
+        state->state_[j] &= 0xffffffff;
+    }
+    state->left_ = 1;
+    state->next_ = 0;
+}
+
+void next_state(mt19937_state* state) {
+    state->left_ = MERSENNE_STATE_N;
+    state->next_ = 0;
+    unsigned int y, j;
+    for (j = 0; j < MERSENNE_STATE_N - MERSENNE_STATE_M; j++) {
+        y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK);
+        state->state_[j] = state->state_[j + MERSENNE_STATE_M] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1];
+    }
+    for (; j < MERSENNE_STATE_N - 1; j++) {
+        y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK);
+        state->state_[j] = state->state_[j + (MERSENNE_STATE_M - MERSENNE_STATE_N)] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1];
+    }
+    y = (state->state_[MERSENNE_STATE_N - 1] & UMASK) | (state->state_[0] & LMASK);
+    state->state_[MERSENNE_STATE_N - 1] = state->state_[MERSENNE_STATE_M - 1] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1];
+}
+
+unsigned int randint32(mt19937_state* state) {
+    if (!state) return 0;
+    if (state->MATRIX_A[0] != 0 || state->MATRIX_A[1] != 0x9908b0df) manual_seed(state, 5489); // auto-initialize
+    if (--state->left_ <= 0) {
+        next_state(state);
+    }
+    unsigned int y = state->state_[state->next_++];
+    y ^= y >> 11;
+    y ^= (y << 7) & 0x9d2c5680;
+    y ^= (y << 15) & 0xefc60000;
+    y ^= y >> 18;
+    return y;
+}
+
+inline unsigned long long randint64(mt19937_state* state) {
+    return (((unsigned long long)(randint32(state)) << 32) | randint32(state));
+}
+
+inline float randfloat32(mt19937_state* state) {
+    return (randint32(state) & ((1ull << 24) - 1)) * (1.0f / (1ull << 24));
+}
+
+inline double randfloat64(mt19937_state* state) {
+    return (randint64(state) & ((1ull << 53) - 1)) * (1.0 / (1ull << 53));
+}
+
+void uniform_(float* data, unsigned int numel, float from, float to, mt19937_state* state) {
+    for (unsigned int t = 0; t < numel; t++) {
+        data[t] = randfloat32(state) * (to - from) + from;
+    }
+}
+
+// Box-Muller transform: maps uniform random numbers to Gaussian distributed numbers
+// https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
+void normal_fill_16(float* data, float mean, float std) {
+    #define EPSILONE 1e-12f
+    for (unsigned int t = 0; t < 8; t++) {
+        float u1 = 1 - data[t];
+        float u2 = data[t + 8];
+        float radius = sqrtf(-2 * logf(u1 + EPSILONE));
+        float theta = (float) (2.0 * M_PI * u2);
+        data[t] = (radius * cosf(theta) * std + mean);
+        data[t + 8] = (radius * sinf(theta) * std + mean);
+    }
+}
+
+void normal_fill(float* data, unsigned int numel, float mean, float std, mt19937_state* state) {
+    for (unsigned int t = 0; t < numel; t++) {
+        data[t] = randfloat32(state);
+    }
+    for (unsigned int i = 0; i < numel - 15; i += 16) {
+        normal_fill_16(data + i, mean, std);
+    }
+    if (numel % 16 != 0) {
+        // recompute the last 16 values
+        data = data + numel - 16;
+        for (unsigned int i = 0; i < 16; i++) {
+            data[i] = randfloat32(state);
+        }
+        normal_fill_16(data, mean, std);
+    }
+}
+
+void normal_(float* data, unsigned int numel, float mean, float std, mt19937_state* state) {
+    #define EPSILONE 1e-12f
+    if (numel >= 16) {
+        normal_fill(data, numel, mean, std, state);
+    }
+    else {
+        double next_double_normal_sample = 0.0; // make compiler warning happy, won't be used
+        int has_next_double_normal_sample = 0;
+        for (unsigned int  t = 0; t < numel; t++) {
+            if (has_next_double_normal_sample) {
+                data[t] = (float)(next_double_normal_sample * std + mean);
+                has_next_double_normal_sample = 0;
+                continue;
+            }
+            // for numel < 16 we draw a double (float64)
+            float u1 = (float) randfloat64(state);
+            float u2 = (float) randfloat64(state);
+            float radius = sqrtf(-2 * logf(1 - u2 + EPSILONE));
+            float theta = (float) (2.0 * M_PI * u1);
+            next_double_normal_sample = radius * sinf(theta);
+            has_next_double_normal_sample = 1;
+            data[t] = (radius * cosf(theta) * std + mean);
+        }
+    }
+}
+
+void init_identity_permutation(int *data, int numel) {
+    for (int i = 0; i < numel; i++) {
+        data[i] = i;
+    }
+}
+
+void random_permutation(int* data, int numel, mt19937_state* state) {
+    for (int i = numel - 1; i > 0; i--) {
+        // pick an index j in [0, i] with equal probability
+        int j = randint32(state) % (i + 1);
+        // swap i <-> j
+        int tmp = data[i];
+        data[i] = data[j];
+        data[j] = tmp;
+    }
+}
+
+#endif
\ No newline at end of file
diff --git a/CUDA/llmcpp/utils.h b/CUDA/llmcpp/utils.h
new file mode 100644
index 0000000..775534c
--- /dev/null
+++ b/CUDA/llmcpp/utils.h
@@ -0,0 +1,223 @@
+/*
+ This file contains utilities shared between the different training scripts.
+ In particular, we define a series of macros xxxCheck that call the corresponding
+ C standard library function and check its return code. If an error was reported,
+ the program prints some debug information and exits.
+*/
+#ifndef UTILS_H
+#define UTILS_H
+
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+// implementation of dirent for Windows is in dev/unistd.h
+#ifndef _WIN32
+#include <dirent.h>
+#include <arpa/inet.h>
+#endif
+
+// ----------------------------------------------------------------------------
+// fread convenience utils, with nice handling of error checking using macros
+// simple replace fopen, fread, fclose, fseek
+// with fopenCheck, freadCheck, fcloseCheck, fseekCheck
+
+extern inline FILE *fopen_check(const char *path, const char *mode, const char *file, int line) {
+    FILE *fp = fopen(path, mode);
+    if (fp == NULL) {
+        fprintf(stderr, "Error: Failed to open file '%s' at %s:%d\n", path, file, line);
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  File: %s\n", file);
+        fprintf(stderr, "  Line: %d\n", line);
+        fprintf(stderr, "  Path: %s\n", path);
+        fprintf(stderr, "  Mode: %s\n", mode);
+        fprintf(stderr, "---> HINT 1: dataset files/code have moved to dev/data recently (May 20, 2024). You may have to mv them from the legacy data/ dir to dev/data/(dataset), or re-run the data preprocessing script. Refer back to the main README\n");
+        fprintf(stderr, "---> HINT 2: possibly try to re-run `python train_gpt2.py`\n");
+        exit(EXIT_FAILURE);
+    }
+    return fp;
+}
+
+#define fopenCheck(path, mode) fopen_check(path, mode, __FILE__, __LINE__)
+
+extern inline void fread_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char *file, int line) {
+    size_t result = fread(ptr, size, nmemb, stream);
+    if (result != nmemb) {
+        if (feof(stream)) {
+            fprintf(stderr, "Error: Unexpected end of file at %s:%d\n", file, line);
+        } else if (ferror(stream)) {
+            fprintf(stderr, "Error: File read error at %s:%d\n", file, line);
+        } else {
+            fprintf(stderr, "Error: Partial read at %s:%d. Expected %zu elements, read %zu\n",
+                    file, line, nmemb, result);
+        }
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  File: %s\n", file);
+        fprintf(stderr, "  Line: %d\n", line);
+        fprintf(stderr, "  Expected elements: %zu\n", nmemb);
+        fprintf(stderr, "  Read elements: %zu\n", result);
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define freadCheck(ptr, size, nmemb, stream) fread_check(ptr, size, nmemb, stream, __FILE__, __LINE__)
+
+extern inline void fclose_check(FILE *fp, const char *file, int line) {
+    if (fclose(fp) != 0) {
+        fprintf(stderr, "Error: Failed to close file at %s:%d\n", file, line);
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  File: %s\n", file);
+        fprintf(stderr, "  Line: %d\n", line);
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define fcloseCheck(fp) fclose_check(fp, __FILE__, __LINE__)
+
+extern inline void sclose_check(int sockfd, const char *file, int line) {
+    if (close(sockfd) != 0) {
+        fprintf(stderr, "Error: Failed to close socket at %s:%d\n", file, line);
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  File: %s\n", file);
+        fprintf(stderr, "  Line: %d\n", line);
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define scloseCheck(sockfd) sclose_check(sockfd, __FILE__, __LINE__)
+
+#ifdef _WIN32
+extern inline void closesocket_check(int sockfd, const char *file, int line) {
+    if (closesocket(sockfd) != 0) {
+        fprintf(stderr, "Error: Failed to close socket at %s:%d\n", file, line);
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  File: %s\n", file);
+        fprintf(stderr, "  Line: %d\n", line);
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define closesocketCheck(sockfd) closesocket_check(sockfd, __FILE__, __LINE__)
+#endif
+
+extern inline void fseek_check(FILE *fp, long off, int whence, const char *file, int line) {
+    if (fseek(fp, off, whence) != 0) {
+        fprintf(stderr, "Error: Failed to seek in file at %s:%d\n", file, line);
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  Offset: %ld\n", off);
+        fprintf(stderr, "  Whence: %d\n", whence);
+        fprintf(stderr, "  File:   %s\n", file);
+        fprintf(stderr, "  Line:   %d\n", line);
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define fseekCheck(fp, off, whence) fseek_check(fp, off, whence, __FILE__, __LINE__)
+
+extern inline void fwrite_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char *file, int line) {
+    size_t result = fwrite(ptr, size, nmemb, stream);
+    if (result != nmemb) {
+        if (feof(stream)) {
+            fprintf(stderr, "Error: Unexpected end of file at %s:%d\n", file, line);
+        } else if (ferror(stream)) {
+            fprintf(stderr, "Error: File write error at %s:%d\n", file, line);
+        } else {
+            fprintf(stderr, "Error: Partial write at %s:%d. Expected %zu elements, wrote %zu\n",
+                    file, line, nmemb, result);
+        }
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  File: %s\n", file);
+        fprintf(stderr, "  Line: %d\n", line);
+        fprintf(stderr, "  Expected elements: %zu\n", nmemb);
+        fprintf(stderr, "  Written elements: %zu\n", result);
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define fwriteCheck(ptr, size, nmemb, stream) fwrite_check(ptr, size, nmemb, stream, __FILE__, __LINE__)
+
+// ----------------------------------------------------------------------------
+// malloc error-handling wrapper util
+
+extern inline void *malloc_check(size_t size, const char *file, int line) {
+    void *ptr = malloc(size);
+    if (ptr == NULL) {
+        fprintf(stderr, "Error: Memory allocation failed at %s:%d\n", file, line);
+        fprintf(stderr, "Error details:\n");
+        fprintf(stderr, "  File: %s\n", file);
+        fprintf(stderr, "  Line: %d\n", line);
+        fprintf(stderr, "  Size: %zu bytes\n", size);
+        exit(EXIT_FAILURE);
+    }
+    return ptr;
+}
+
+#define mallocCheck(size) malloc_check(size, __FILE__, __LINE__)
+
+
+// ----------------------------------------------------------------------------
+// check that all tokens are within range
+extern inline void token_check(const int* tokens, int token_count, int vocab_size, const char *file, int line) {
+    for(int i = 0; i < token_count; i++) {
+        if(!(0 <= tokens[i] && tokens[i] < vocab_size)) {
+            fprintf(stderr, "Error: Token out of vocabulary at %s:%d\n", file, line);
+            fprintf(stderr, "Error details:\n");
+            fprintf(stderr, "  File: %s\n", file);
+            fprintf(stderr, "  Line: %d\n", line);
+            fprintf(stderr, "  Token: %d\n", tokens[i]);
+            fprintf(stderr, "  Position: %d\n", i);
+            fprintf(stderr, "  Vocab: %d\n", vocab_size);
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+#define tokenCheck(tokens, count, vocab) token_check(tokens, count, vocab, __FILE__, __LINE__)
+
+// ----------------------------------------------------------------------------
+// I/O ops
+
+extern inline void create_dir_if_not_exists(const char *dir) {
+    if (dir == NULL) { return; }
+    struct stat st = {0};
+    if (stat(dir, &st) == -1) {
+        if (mkdir(dir, 0700) == -1) {
+            printf("ERROR: could not create directory: %s\n", dir);
+            exit(EXIT_FAILURE);
+        }
+        printf("created directory: %s\n", dir);
+    }
+}
+
+extern inline int find_max_step(const char* output_log_dir) {
+    // find the DONE file in the log dir with highest step count
+    if (output_log_dir == NULL) { return -1; }
+    DIR* dir;
+    struct dirent* entry;
+    int max_step = -1;
+    dir = opendir(output_log_dir);
+    if (dir == NULL) { return -1; }
+    while ((entry = readdir(dir)) != NULL) {
+        if (strncmp(entry->d_name, "DONE_", 5) == 0) {
+            int step = atoi(entry->d_name + 5);
+            if (step > max_step) {
+                max_step = step;
+            }
+        }
+    }
+    closedir(dir);
+    return max_step;
+}
+
+extern inline int ends_with_bin(const char* str) {
+    // checks if str ends with ".bin". could be generalized in the future.
+    if (str == NULL) { return 0; }
+    size_t len = strlen(str);
+    const char* suffix = ".bin";
+    size_t suffix_len = strlen(suffix);
+    if (len < suffix_len) { return 0; }
+    int suffix_matches = strncmp(str + len - suffix_len, suffix, suffix_len) == 0;
+    return suffix_matches;
+}
+
+#endif
\ No newline at end of file
diff --git a/CUDA/llmcpp/zero.cuh b/CUDA/llmcpp/zero.cuh
new file mode 100644
index 0000000..e6c5b6e
--- /dev/null
+++ b/CUDA/llmcpp/zero.cuh
@@ -0,0 +1,597 @@
+/*
+Utilities for ZeRO sharding
+*/
+
+#ifndef LLMC_ZERO_CUH
+#define LLMC_ZERO_CUH
+
+#include <cuda_runtime_api.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+
+#ifdef MULTI_GPU
+#include <nccl.h>
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+#endif
+
+// defines: fcloseCheck, fwriteCheck, scloseCheck, sclosesocketCheck
+#include "utils.h"
+
+// ----------------------------------------------------------------------------
+// Multi-GPU related
+#ifdef MULTI_GPU
+
+#if defined(ENABLE_FP32)
+const ncclDataType_t ncclFloatX = ncclFloat;
+#elif defined(ENABLE_FP16)
+const ncclDataType_t ncclFloatX = ncclHalf;
+#else // Default to bfloat16
+const ncclDataType_t ncclFloatX = ncclBfloat16;
+#endif
+
+void nccl_check(ncclResult_t status, const char *file, int line) {
+    if (status != ncclSuccess) {
+        printf("[NCCL ERROR] at file %s:%d:\n%s\n", file, line, ncclGetErrorString(status));
+        exit(EXIT_FAILURE);
+    }
+}
+#define ncclCheck(err) (nccl_check(err, __FILE__, __LINE__))
+
+#ifdef USE_MPI
+void mpi_check(int status, const char *file, int line) {
+    if (status != MPI_SUCCESS) {
+        char mpi_error[4096];
+        int mpi_error_len = 0;
+        assert(MPI_Error_string(status, &mpi_error[0], &mpi_error_len) == MPI_SUCCESS);
+        printf("[MPI ERROR] at file %s:%d:\n%.*s\n", file, line, mpi_error_len, mpi_error);
+        exit(EXIT_FAILURE);
+    }
+}
+#define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__))
+#endif
+
+#endif // MULTI_GPU
+
+// ----------------------------------------------------------------------------
+// Parameters specific to training on multiple GPUs.
+typedef struct {
+    int process_rank;      // Rank of this process among all processes. 0 if no multi-GPU.
+    int num_processes;     // Total number of processes. 1 if no multi-GPU.
+    int local_device_idx;  // This process GPU index on current machine. 0 if no multi-GPU.
+
+    // Zero Redundancy Optimizer stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html
+    // 0-Disabled
+    // 1-Optimizer State Sharding (OSS)
+    // 2-Optimizer + Gradient State Sharding (SDP)
+    // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP)
+    int zero_stage;
+    size_t shard_num_parameters;
+#ifdef MULTI_GPU
+    ncclComm_t nccl_comm;       // NCCL communication primitive, used for collective multi-GPU work.
+    cudaStream_t nccl_stream;   // CUDA Stream to perform NCCL operations.
+    cudaEvent_t compute_nccl_sync; // Event used to synchronize NCCL with the compute
+    float* unified_buffer;
+#endif
+} MultiGpuConfig;
+
+// one global variable to hold the multi-GPU configuration for this process
+// inline, so we can include this header multiple times without getting multiple definitions
+inline MultiGpuConfig multi_gpu_config;
+
+#ifdef MULTI_GPU
+
+#ifdef _WIN32
+void send_nccl_id_to_clients_windows(ncclUniqueId *nccl_id, SOCKET client_sockets[], int num_clients) {
+    for (int i = 0; i < num_clients; ++i) {
+        if (send(client_sockets[i], (const char *)nccl_id, sizeof(*nccl_id), 0) == SOCKET_ERROR) {
+            printf("Failed to send nccl_id");
+            WSACleanup();
+            exit(EXIT_FAILURE);
+        }
+        closesocketCheck(client_sockets[i]);
+    }
+}
+#else
+void send_nccl_id_to_clients(ncclUniqueId *nccl_id, int client_sockets[], int num_clients) {
+    for (int i = 0; i < num_clients; ++i) {
+        if (send(client_sockets[i], nccl_id, sizeof(*nccl_id), 0) == -1) {
+            printf("Failed to send nccl_id");
+            exit(EXIT_FAILURE);
+        }
+        scloseCheck(client_sockets[i]);
+    }
+}
+#endif
+
+#ifdef _WIN32
+// Same as get_nccl_id_via_tcp but for Windows
+ncclUniqueId get_nccl_id_via_tcp_windows(MultiGpuConfig* result, const char* server_ip) {
+    ncclUniqueId nccl_id;
+
+    int SERVER_PORT = 12345;  // hardcoded an arbitrary port number between 1024 and 49151 (registered ports)
+    WSADATA wsaData;
+    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
+        printf("WSAStartup failed");
+        exit(EXIT_FAILURE);
+    }
+
+    if (result->process_rank == 0) {
+        ncclCheck(ncclGetUniqueId(&nccl_id));
+
+        int MAX_CLIENTS = result->num_processes - 1;
+        SOCKET client_sockets[MAX_CLIENTS];
+        int num_clients = 0;
+        SOCKET server_socket, new_socket;
+        struct sockaddr_in address;
+        int addrlen = sizeof(address);
+
+        // Step 1) create a server TCP socket
+        if ((server_socket = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) {
+            printf("Socket failed");
+            WSACleanup();
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 2) set the server address and port
+        address.sin_family = AF_INET;  // IPv4
+        address.sin_addr.s_addr = inet_addr(server_ip);
+        address.sin_port = htons(SERVER_PORT);
+
+        // Step 3) bind the socket to the address and port
+        if (bind(server_socket, (struct sockaddr *)&address, sizeof(address)) == SOCKET_ERROR) {
+            printf("Bind failed");
+            closesocketCheck(server_socket);
+            WSACleanup();
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 4) MAX_CLIENTS specifies the maximum number of clients that can be queued for this server
+        if (listen(server_socket, MAX_CLIENTS) == SOCKET_ERROR) {
+            printf("Listen failed");
+            closesocketCheck(server_socket);
+            WSACleanup();
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 5) accept connections from clients
+        printf("Waiting for clients to connect...\n");
+        while (num_clients < MAX_CLIENTS) {
+            if ((new_socket = accept(server_socket, (struct sockaddr *)&address, &addrlen)) == INVALID_SOCKET) {
+                printf("Accept failed");
+                closesocketCheck(server_socket);
+                WSACleanup();
+                exit(EXIT_FAILURE);
+            }
+            client_sockets[num_clients++] = new_socket;
+            printf("Client %d connected\n", num_clients);
+        }
+
+        // Step 6) send the NCCL ID to all clients
+        send_nccl_id_to_clients_windows(&nccl_id, client_sockets, num_clients);
+        printf("NCCL ID sent to all clients\n");
+
+        closesocketCheck(server_socket);
+    } else {
+        int num_connection_attempts = 5;
+        int time_to_sleep = 2;
+        SOCKET client_socket;
+        struct sockaddr_in serv_addr;
+
+        // Step 1) create a client TCP socket
+        if ((client_socket = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) {
+            printf("Socket creation error");
+            WSACleanup();
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 2) set the server address and port
+        serv_addr.sin_family = AF_INET;
+        serv_addr.sin_port = htons(SERVER_PORT);
+        if (inet_pton(AF_INET, server_ip, &serv_addr.sin_addr) <= 0) {
+            printf("Invalid address or address not supported");
+            closesocketCheck(client_socket);
+            WSACleanup();
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 3) Try to connect to the server - retry up to `num_connection_attempts` times if the connection fails
+        while (connect(client_socket, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) == SOCKET_ERROR) {
+            printf("%d Connection failed, retrying in %d seconds\n", result->process_rank, time_to_sleep);
+            if (--num_connection_attempts == 0) {
+                printf("Failed to connect to the server\n");
+                closesocketCheck(client_socket);
+                WSACleanup();
+                exit(EXIT_FAILURE);
+            }
+            Sleep(time_to_sleep * 1000);
+        }
+
+        // Step 4) receive the NCCL ID from the server
+        if (recv(client_socket, (char *)&nccl_id, sizeof(nccl_id), 0) <= 0) {
+            printf("Failed to receive nccl_id");
+            closesocketCheck(client_socket);
+            WSACleanup();
+            exit(EXIT_FAILURE);
+        }
+
+        printf("Received NCCL ID\n");
+        closesocketCheck(client_socket);
+    }
+
+    WSACleanup();
+    return nccl_id;
+}
+#else
+ncclUniqueId get_nccl_id_via_tcp(MultiGpuConfig* result, const char* server_ip) {
+    ncclUniqueId nccl_id;
+
+    int SERVER_PORT = 12345;  // hardcoded an arbitrary port number between 1024 and 49151 (registered ports)
+    if (result->process_rank == 0) {
+        ncclCheck(ncclGetUniqueId(&nccl_id));
+
+        int MAX_CLIENTS = result->num_processes - 1;
+        int client_sockets[MAX_CLIENTS];
+        int num_clients = 0;
+        int server_socket, new_socket;
+        struct sockaddr_in address;
+        int addrlen = sizeof(address);
+        int opt = 1;
+
+        // Step 1) create a server TCP socket
+        if ((server_socket = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+            printf("Socket failed");
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 2) set socket options
+        // SOL_SOCKET - means that option is configured at socket level
+        // SO_REUSEADDR - allows to bind to an address which is in a TIME_WAIT state (already used by another socket) - useful when restarting the server
+        // SO_REUSEPORT - allows to bind to the same port multiple times
+        if (setsockopt(server_socket, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)) < 0) {
+            printf("Setsockopt failed");
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 3) set the server address and port
+        address.sin_family = AF_INET;  // IPv4
+        address.sin_addr.s_addr = inet_addr(server_ip); // alternatively use INADDR_ANY to bind to all interfaces, currently we only allow ethernet
+        address.sin_port = htons(SERVER_PORT);
+
+        // Step 4) bind the socket to the address and port
+        if (bind(server_socket, (struct sockaddr *)&address, sizeof(address)) < 0) {
+            printf("Bind failed");
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 5) MAX_CLIENTS specifies the maximum number of clients that can be queued for this server
+        if (listen(server_socket, MAX_CLIENTS) < 0) {
+            printf("Listen failed");
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 6) accept connections from clients
+        printf("Waiting for clients to connect...\n");
+        while (num_clients < MAX_CLIENTS) {
+            if ((new_socket = accept(server_socket, (struct sockaddr *)&address, (socklen_t*)&addrlen)) < 0) {
+                printf("Accept failed");
+                exit(EXIT_FAILURE);
+            }
+            client_sockets[num_clients++] = new_socket;
+            printf("Client %d connected\n", num_clients);
+        }
+
+        // Step 7) send the NCCL ID to all clients
+        send_nccl_id_to_clients(&nccl_id, client_sockets, num_clients);
+        printf("NCCL ID sent to all clients\n");
+
+        scloseCheck(server_socket);
+    } else {
+        int num_connection_attempts = 5;
+        int time_to_sleep = 2;
+        int client_socket;
+        struct sockaddr_in serv_addr;
+
+        // Step 1) create a client TCP socket
+        if ((client_socket = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+            printf("Socket creation error");
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 2) set the server address and port
+        serv_addr.sin_family = AF_INET;
+        serv_addr.sin_port = htons(SERVER_PORT);
+        if (inet_pton(AF_INET, server_ip, &serv_addr.sin_addr) <= 0) {
+            printf("Invalid address or address not supported");
+            exit(EXIT_FAILURE);
+        }
+
+        // Step 3) Try to connect to the server - retry up to `num_connection_attempts` times if the connection fails
+        while (connect(client_socket, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+            printf("%d Connection failed, retrying in %d seconds\n", result->process_rank, time_to_sleep);
+            if (--num_connection_attempts == 0) {
+                printf("Failed to connect to the server\n");
+                exit(EXIT_FAILURE);
+            }
+            sleep(time_to_sleep);
+        }
+
+        // Step 4) receive the NCCL ID from the server
+        if (recv(client_socket, &nccl_id, sizeof(nccl_id), 0) <= 0) {
+            printf("Failed to receive nccl_id");
+            exit(EXIT_FAILURE);
+        }
+
+        printf("Received NCCL ID\n");
+        scloseCheck(client_socket);
+    }
+
+    return nccl_id;
+}
+#endif
+
+ncclUniqueId get_nccl_id_via_fs(MultiGpuConfig* result, char* fs_path) {
+    // Works assuming that the filesystem is shared among all processes
+    ncclUniqueId nccl_id;
+    FILE* idFile;
+    static char filename[1024];
+    snprintf(filename, sizeof(filename), "%s/ncclUniqueId.sync", fs_path);
+
+    if (result->process_rank != 0) {  // client processse should wait for the server to write to the file
+        // This is a naive and not 100% robust way to synchronize the processes but it should work almost always
+        sleep(2);
+    }
+
+    if (result->process_rank == 0) {
+        ncclCheck(ncclGetUniqueId(&nccl_id));
+        idFile = fopen(filename, "wb");
+        assert(idFile != NULL);
+        fwriteCheck(&nccl_id, sizeof(nccl_id), 1, idFile);
+        fcloseCheck(idFile);
+    } else {
+        // Other ranks wait until the file is available and read the unique ID
+        do {
+            sleep(1);  // 1 second
+            idFile = fopen(filename, "rb");
+            if (idFile != NULL) break;
+        } while (idFile == NULL);
+        freadCheck(&nccl_id, sizeof(nccl_id), 1, idFile);
+        fcloseCheck(idFile);
+    }
+
+    return nccl_id;
+}
+
+#ifdef USE_MPI
+// Determine which GPU this process should use.
+// Processes on the same machines use different GPU indicies. Processes on other machines don't.
+// Copied from NCCL examples: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-2-one-device-per-process-or-thread
+int multi_gpu_get_local_device_idx(int process_rank, int num_processes) {
+    char hostname[1024];
+    hostname[1023] = '\0';
+    // All processes on the same machine will share the same hostname.
+    gethostname(hostname, 1023);
+    for (int i=0; i < 1024; i++) {
+        if (hostname[i] == '.') {
+            hostname[i] = '\0';
+            break;
+        }
+    }
+    uint64_t hostname_hash = 5381u;
+    for (int c = 0; hostname[c] != '\0'; c++){ hostname_hash = ((hostname_hash << 5u) + hostname_hash) ^ hostname[c]; }
+
+    // Distribute all hostname hashes to all processes.
+    uint64_t* all_hostsname_hashes = (uint64_t*)malloc(num_processes * sizeof(uint64_t));
+    all_hostsname_hashes[process_rank] = hostname_hash;
+    mpiCheck(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, all_hostsname_hashes, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
+
+    // Identify which GPU we need to use.
+    int local_device_idx = 0;
+    for (int current_process = 0; current_process < num_processes; ++current_process) {
+        if (current_process == process_rank) {
+        // Found my gpu, local_device_idx now has my target GPU index.
+        break;
+        }
+        if (all_hostsname_hashes[current_process] == all_hostsname_hashes[process_rank]) {
+        // This process ID runs on the same machine, but it's not me, skip this GPU
+        local_device_idx++;
+        }
+    }
+
+    free(all_hostsname_hashes);
+    return local_device_idx;
+}
+#endif
+
+#endif
+
+MultiGpuConfig multi_gpu_config_init(int num_processes, int process_rank, int gpus_per_node, char* server_ip, char* fs_path, char* init_method) {
+#ifdef MULTI_GPU
+    MultiGpuConfig result;
+    ncclUniqueId nccl_id;
+    // Get nccl_id using MPI, TCP, or FS (file system synchronization) methods
+    // On newer slurm versions (slurm-wlm package) PMIx is disabled so we can not use MPI for NCCL init in multi node setup
+    if (strcmp(init_method, "mpi") == 0) {
+        #ifdef USE_MPI
+        mpiCheck(MPI_Init(NULL, NULL));
+        mpiCheck(MPI_Comm_rank(MPI_COMM_WORLD, &result.process_rank));
+        mpiCheck(MPI_Comm_size(MPI_COMM_WORLD, &result.num_processes));
+        result.local_device_idx = multi_gpu_get_local_device_idx(result.process_rank, result.num_processes);
+        if (result.process_rank == 0) {
+            ncclCheck(ncclGetUniqueId(&nccl_id));
+        }
+        mpiCheck(MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD));
+        #else
+        printf("MPI support is disabled. Please enable MPI support to use MPI-based NCCL-init method.\n");
+        exit(EXIT_FAILURE);
+        #endif
+    } else {
+        result.process_rank = process_rank;
+        result.num_processes = num_processes;
+        result.local_device_idx = process_rank % gpus_per_node;
+        if (strcmp(init_method, "tcp") == 0) {
+            #ifdef _WIN32
+            nccl_id = get_nccl_id_via_tcp_windows(&result, server_ip);
+            #else
+            nccl_id = get_nccl_id_via_tcp(&result, server_ip);
+            #endif
+        } else if (strcmp(init_method, "fs") == 0) {
+            nccl_id = get_nccl_id_via_fs(&result, fs_path);
+        } else {
+            printf("Invalid NCCL-init method\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+    cudaCheck(cudaSetDevice(result.local_device_idx));
+    ncclCheck(ncclCommInitRank(&result.nccl_comm, result.num_processes, nccl_id, result.process_rank));
+    cudaCheck(cudaStreamCreate(&result.nccl_stream));
+    // event without timing for maximum performance
+    cudaCheck(cudaEventCreate(&result.compute_nccl_sync, cudaEventDisableTiming));
+    nvtxNameCudaStreamA(result.nccl_stream, "nccl stream");
+    nvtxNameCudaEventA(result.compute_nccl_sync, "nccl compute sync");
+    cudaCheck(cudaMallocManaged(&result.unified_buffer, sizeof(float)));
+    return result;
+#else
+    printf("Multi-GPU support is disabled. Using a single GPU.\n");
+    cudaCheck(cudaSetDevice(0));
+    MultiGpuConfig result;
+    result.process_rank = 0;
+    result.num_processes = 1;
+    result.local_device_idx = 0;
+    return result;
+#endif
+}
+
+void multi_gpu_config_free(MultiGpuConfig* config) {
+#ifdef MULTI_GPU
+    ncclCheck(ncclCommDestroy(config->nccl_comm));
+    cudaCheck(cudaStreamDestroy(config->nccl_stream));
+    cudaCheck(cudaEventDestroy(config->compute_nccl_sync));
+    cudaCheck(cudaFree(config->unified_buffer));
+    #ifdef USE_MPI
+    mpiCheck(MPI_Finalize());
+    #endif
+#endif
+}
+
+void multi_gpu_barrier(const MultiGpuConfig* config) {
+#ifdef MULTI_GPU
+    if (config->num_processes > 1) {
+        ncclCheck(ncclAllReduce(config->unified_buffer, config->unified_buffer, sizeof(float), ncclFloat, ncclSum, config->nccl_comm, config->nccl_stream));
+    }
+    cudaCheck(cudaDeviceSynchronize());
+#endif
+}
+
+// Offset and size of a tensor shard
+typedef struct {
+    ptrdiff_t offset;
+    size_t size;
+} ShardInfo;
+
+// Get info about sharding for a tensor of elements many numbers
+ShardInfo multi_gpu_get_shard_offset(size_t elements, const MultiGpuConfig* config, int shard_at_stage) {
+    const int nproc = config->num_processes;
+    if(config->zero_stage >= shard_at_stage) {
+        if (elements % nproc != 0) {
+            fprintf(stderr, "Number of elements %zu must be a multiple of the number of processes %d\n", elements, nproc);
+            exit(EXIT_FAILURE);
+        }
+        return {(ptrdiff_t) (config->process_rank * (elements / nproc)), elements / nproc};
+    } else {
+        return {0, elements};
+    }
+}
+
+// Block NCCL stream until computations on compute_stream are done, then aggregate multiple pointers in an NCCL group.
+// This can work either as an all-reduce (i.e., no ZeRo), or a reduce-scatter (ZeRO 1).
+// The awkward `(&pointers)[N]` syntax ensures we are capturing the parameters as sized arrays, so that it becomes impossible
+// to call this function if pointers and pointers_sizes do not match.
+template<int N>
+void multi_gpu_async_reduce_gradient(
+        floatX* const (&pointers)[N], const size_t (&pointers_sizes)[N],
+        MultiGpuConfig* config, cudaStream_t compute_stream) {
+    if (config->num_processes == 1) {
+        return; // no multi-GPU, just exit.
+    }
+
+#ifdef MULTI_GPU
+    NVTX_RANGE_FN();
+    // mark an event on the compute stream, and immediately wait on this in the nccl stream
+    // this means that the nccl stream won't start executing before all compute kernels that
+    // have been submitted before this point have finished.
+    // by using an event instead of cudaSyncStream, we avoid having to synchronize the host, and
+    // can enqueue new work to the GPU right away.
+    cudaCheck(cudaEventRecord(config->compute_nccl_sync, compute_stream));
+    cudaCheck(cudaStreamWaitEvent(config->nccl_stream, config->compute_nccl_sync));
+    ncclCheck(ncclGroupStart()); // NCCL group: aggregate all pointers in a single NCCL GPU kernel.
+    for (int i = 0; i < N; ++i) {
+        if(config->zero_stage == 0) {
+            ncclCheck(ncclAllReduce(
+                    pointers[i], pointers[i],
+                    pointers_sizes[i],
+                    ncclFloatX, ncclAvg,
+                    config->nccl_comm, config->nccl_stream
+            ));
+        } else if(config->zero_stage == 1) {
+            assert(pointers_sizes[i] % config->num_processes == 0);
+            size_t shard_size = pointers_sizes[i] / config->num_processes;
+            ptrdiff_t shard_offset = (ptrdiff_t)shard_size * config->process_rank;
+            ncclCheck(ncclReduceScatter(
+                    pointers[i], pointers[i] + shard_offset,
+                    shard_size,
+                    ncclFloatX, ncclAvg,
+                    config->nccl_comm, config->nccl_stream
+            ));
+        }
+    }
+    ncclCheck(ncclGroupEnd());
+#endif
+}
+
+// convenience macro that only prints if the rank of process is zero
+#define printf0(...) if (::multi_gpu_config.process_rank == 0) { printf(__VA_ARGS__); }
+
+void set_zero_configs(MultiGpuConfig* config, int zero_stage, size_t total_parameters) {
+    config->zero_stage = 0;
+    config->shard_num_parameters = total_parameters;
+    // Check the Zero Stage and define sharding parameters
+    if (zero_stage == 0) {
+        printf0("| Zero Optimization is disabled                                              |\n");
+    }
+    else if (zero_stage == 1) {
+        if (total_parameters % config->num_processes != 0) {
+            printf0("| Zero Optimization is disabled, Can't equally partition parameters          |\n");
+            config->zero_stage = 0;
+        }
+        else {
+            config->zero_stage = 1;
+            config->shard_num_parameters = total_parameters / config->num_processes;
+        }
+    }
+    else{
+        printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported  |\n");
+        config->zero_stage = 0;
+    }
+}
+
+// Compute sum of a single CPU value across all GPU processes. No-op when multi-GPU is disabled.
+float multi_gpu_cpu_float_sum(float value, MultiGpuConfig* config) {
+#ifdef MULTI_GPU
+    if (config->num_processes == 1) return value;
+
+    float* unified_buffer = config->unified_buffer;
+    *unified_buffer = value;
+    ncclCheck(ncclAllReduce(unified_buffer, unified_buffer, sizeof(float), ncclFloat, ncclSum, config->nccl_comm, config->nccl_stream));
+    cudaCheck(cudaDeviceSynchronize());
+    return *unified_buffer;
+#else
+    return value;
+#endif
+}
+
+#endif
+
diff --git a/README.md b/README.md
index 56f99cc..b441cf4 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,12 @@
-# Quadtrix.cpp
+# Quadtrix.cpp (llm.cpp)
 
-<p align="center">
+<h1 align="center">
   <img width="785" height="261" alt="image" src="https://github.com/user-attachments/assets/7bd2d8c6-d1e3-4ca0-96c0-0161d3cf235a" />
-</p>
+</h1><br>
+
+
+  [![Release](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/release.yml/badge.svg)](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/release.yml)  [![Package](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/docker-publish.yml)
+  [![CI](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/ci.yml/badge.svg)](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/ci.yml)
 
 A local large language model with a modular, multi-path execution architecture. Train, run inference, and serve a chat interface — all from a single repository, across bare-metal C++, PyTorch, and a React frontend.
 
diff --git a/benchmark/results/python_benchmark.csv b/benchmark/results/python_benchmark.csv
new file mode 100644
index 0000000..c264086
--- /dev/null
+++ b/benchmark/results/python_benchmark.csv
@@ -0,0 +1,13 @@
+suite,name,backend,batch_size,sequence_length,tokens,avg_ms,median_ms,min_ms,max_ms,p90_ms,p95_ms,std_ms,tokens_per_sec,samples,loss,memory_mb,notes
+data,tokenizer_encode,python,0,0,220975,169.76018999121152,164.62069997214712,124.44350001169369,211.44290000665933,204.09656001720577,207.76973001193255,29.81756930091779,1301689.165236207,10,,188.39453125,
+data,batch_sample_to_device,python,4,32,128,0.34600000944919884,0.2575500402599573,0.2452000044286251,0.8668999653309584,0.48601999878883345,0.6764599820598955,0.1852238791693057,369942.18642873614,10,,189.2734375,
+primitive,matmul_3d_1x16,python,1,16,16,0.028490001568570733,0.026749970857053995,0.024800014216452837,0.04350004019215703,0.03351001651026308,0.038505028351210044,0.005415998067507546,561600.5306805843,10,,181.234375,
+primitive,matmul_3d_4x32,python,4,32,128,0.047069991705939174,0.043849984649568796,0.03890000516548753,0.07130001904442906,0.05185997579246759,0.06157999741844831,0.008578937902311791,2719354.632557738,10,,181.296875,
+primitive,attention_scores_4x32,python,4,32,128,0.11958999675698578,0.10689999908208847,0.10410003596916795,0.20840001525357366,0.12946999049745497,0.16893500287551425,0.030181239200475163,1070323.6346774376,10,,181.93359375,
+forward,batch1_seq8,python,1,8,8,16.073119995417073,15.318600024329498,14.594200009014457,20.715299993753433,17.489159997785464,19.102229995769445,1.798887105385644,497.7253950870173,10,10.797359466552734,166.43359375,
+forward,batch1_seq32,python,1,32,32,21.528740011854097,21.653899981174618,20.405600022058934,22.147400013636798,22.095200035255402,22.1213000244461,0.548371285312407,1486.3851754622074,10,10.882255554199219,190.01171875,
+forward,batch4_seq32,python,4,32,128,44.681840017437935,45.51370002445765,37.46199997840449,54.08870003884658,48.26489001279697,51.17679502582177,4.5654932173684655,2864.6984983171146,10,10.885703086853027,253.171875,
+training,adamw_step_b4_s32,python,4,32,128,229.80256001465023,207.2436999878846,200.93890000134706,321.9230000395328,279.9100400414318,300.9165200404823,46.404669570312535,556.9998871720134,5,10.602718353271484,392.30078125,
+generation,empty,python,1,1,32,563.3423800056335,548.9804000244476,466.00820001913235,704.8150000046007,643.7829400005285,674.2989700025645,72.57013670803387,56.80382150492566,10,,218.44140625,
+generation,short,python,1,6,32,524.1239399998449,524.1038500098512,493.7280000303872,561.7482999805361,549.8817999905441,555.8150499855401,20.612269243289685,61.054261326070076,10,,218.47265625,
+generation,long,python,1,32,32,561.3779200008139,560.0390000035986,545.9933000383899,574.2078999755904,570.0918399612419,572.1498699684162,7.699534842668483,57.00259817834233,10,,218.14453125,
diff --git a/init.py b/init.py
new file mode 100644
index 0000000..e870447
--- /dev/null
+++ b/init.py
@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+import os
+import signal
+import subprocess
+import sys
+import time
+import webbrowser
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parent
+BACKEND = ROOT / "backend"
+FRONTEND = ROOT / "frontend"
+DEFAULT_CHECKPOINT = ROOT / "engine" / "best_model.pt"
+
+
+def npm_command() -> str:
+    return "npm.cmd" if os.name == "nt" else "npm"
+
+
+def python_command() -> str:
+    venv_python = ROOT / ".venv" / ("Scripts/python.exe" if os.name == "nt" else "bin/python")
+    return str(venv_python) if venv_python.exists() else sys.executable
+
+
+def start_process(name: str, command: list[str], cwd: Path, env: dict[str, str]) -> subprocess.Popen:
+    print(f"[start] {name}: {' '.join(command)}")
+    return subprocess.Popen(command, cwd=str(cwd), env=env)
+
+
+def stop_process(process: subprocess.Popen) -> None:
+    if process.poll() is not None:
+        return
+    if os.name == "nt":
+        process.terminate()
+    else:
+        process.send_signal(signal.SIGTERM)
+    try:
+        process.wait(timeout=8)
+    except subprocess.TimeoutExpired:
+        process.kill()
+
+
+def main() -> int:
+    api_port = os.environ.get("API_PORT", "3001")
+    frontend_port = os.environ.get("FRONTEND_PORT", "5173")
+    checkpoint = Path(os.environ.get("TORCH_CHECKPOINT_PATH", str(DEFAULT_CHECKPOINT))).resolve()
+
+    if not checkpoint.exists():
+        print(f"[error] .pt checkpoint not found: {checkpoint}")
+        print("        Set TORCH_CHECKPOINT_PATH to your best_model.pt file.")
+        return 1
+
+    backend_env = os.environ.copy()
+    backend_env.update(
+        {
+            "API_PORT": api_port,
+            "CORS_ORIGINS": f"http://localhost:{frontend_port},http://127.0.0.1:{frontend_port}",
+            "TORCH_CHECKPOINT_PATH": str(checkpoint),
+        }
+    )
+
+    frontend_env = os.environ.copy()
+    frontend_env.update(
+        {
+            "VITE_API_BASE_URL": f"http://localhost:{api_port}",
+            "VITE_TORCH_ONLY": "1",
+        }
+    )
+
+    backend = start_process(
+        "backend (.pt)",
+        [python_command(), "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", api_port, "--reload"],
+        BACKEND,
+        backend_env,
+    )
+    frontend = start_process(
+        "frontend",
+        [npm_command(), "run", "dev", "--", "--port", frontend_port],
+        FRONTEND,
+        frontend_env,
+    )
+
+    url = f"http://localhost:{frontend_port}"
+    print(f"[ready] frontend: {url}")
+    print(f"[ready] backend : http://localhost:{api_port}")
+    print("[mode]  PyTorch .pt only")
+    print("[stop]  Press Ctrl+C to stop both servers.")
+
+    if os.environ.get("NO_BROWSER") != "1":
+        time.sleep(2)
+        webbrowser.open(url)
+
+    try:
+        while True:
+            if backend.poll() is not None:
+                print(f"[exit] backend stopped with code {backend.returncode}")
+                return backend.returncode or 1
+            if frontend.poll() is not None:
+                print(f"[exit] frontend stopped with code {frontend.returncode}")
+                return frontend.returncode or 1
+            time.sleep(1)
+    except KeyboardInterrupt:
+        print("\n[stop] stopping servers...")
+        return 0
+    finally:
+        stop_process(frontend)
+        stop_process(backend)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())