diff --git a/.devops/Dockerfile.dev.frontend b/.devops/Dockerfile.dev.frontend new file mode 100644 index 0000000..de054d6 --- /dev/null +++ b/.devops/Dockerfile.dev.frontend @@ -0,0 +1,12 @@ +FROM node:20-alpine + +WORKDIR /app + +COPY frontend/package*.json ./ +RUN npm ci + +COPY frontend/ ./ + +EXPOSE 5173 + +CMD ["npm", "run", "dev", "--", "--host", "0.0.0.0"] diff --git a/.devops/Dockerfile.frontend b/.devops/Dockerfile.frontend new file mode 100644 index 0000000..70ca5aa --- /dev/null +++ b/.devops/Dockerfile.frontend @@ -0,0 +1,22 @@ +FROM node:20-alpine AS build + +WORKDIR /app + +COPY frontend/package*.json ./ +RUN npm ci + +COPY frontend/ ./ + +ARG VITE_API_BASE_URL=/api +ENV VITE_API_BASE_URL=${VITE_API_BASE_URL} + +RUN npm run build + +FROM nginx:1.27-alpine AS runtime + +COPY .devops/nginx.conf /etc/nginx/conf.d/default.conf +COPY --from=build /app/dist /usr/share/nginx/html + +EXPOSE 80 + +CMD ["nginx", "-g", "daemon off;"] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf49286..992ef59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,12 +1,10 @@ name: CI on: - push: - branches: [master, dev] workflow_dispatch: inputs: image: - description: "Which image to build?" + description: "Which image to build? (cpp=C++ engine, cpu=PyTorch CPU, cuda=PyTorch CUDA, all=all three)" required: true type: choice options: @@ -14,7 +12,7 @@ on: - cpu - cuda - all - push: + push_image: description: "Push to ghcr.io?" required: true default: "true" @@ -23,10 +21,10 @@ on: env: REGISTRY: ghcr.io - IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix jobs: + file-integrity: name: File integrity if: github.event_name == 'push' @@ -44,9 +42,9 @@ jobs: failed=0 for f in "${files[@]}"; do if [ -f "$f" ]; then - echo "✅ $f" + echo "PASS: $f" else - echo "❌ $f — MISSING" + echo "FAIL: $f -- MISSING" failed=1 fi done @@ -66,8 +64,8 @@ jobs: args: "check engine/ --ignore E501 --exit-zero" - build-cpp: - name: C++ compile check + build-binary-linux: + name: Binary (ubuntu-latest) if: github.event_name == 'push' runs-on: ubuntu-latest steps: @@ -85,10 +83,43 @@ jobs: - name: Smoke test run: ./quadtrix --help || true + - name: Upload binary + uses: actions/upload-artifact@v4 + with: + name: quadtrix-linux-amd64 + path: quadtrix + retention-days: 7 + + + build-binary-macos: + name: Binary (macos-14) + if: github.event_name == 'push' + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + + - name: Compile main.cpp + run: | + g++ -std=c++17 -O3 \ + -I. -Iinclude \ + -o quadtrix main.cpp + + - name: Smoke test + run: ./quadtrix --help || true + + - name: Package binary + run: tar -czf quadtrix-macos-arm64.tar.gz quadtrix + + - name: Upload binary + uses: actions/upload-artifact@v4 + with: + name: quadtrix-macos-arm64 + path: quadtrix-macos-arm64.tar.gz + retention-days: 7 build-cpp-image: - name: Build — cpp - if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpp' || inputs.image == 'all') + name: "Build -- cpp (C++ engine - linux/amd64 + arm64)" + if: ${{ github.event_name == 'workflow_dispatch' && (inputs.image == 'cpp' || inputs.image == 'all') }} runs-on: ubuntu-latest permissions: contents: read @@ -99,8 +130,11 @@ jobs: - uses: docker/setup-qemu-action@v3 - uses: docker/setup-buildx-action@v3 + - name: Set lowercase image prefix + run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV + - name: Login to GHCR - if: inputs.push == 'true' + if: ${{ inputs.push_image == 'true' }} uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} @@ -123,7 +157,7 @@ jobs: context: . file: .devops/Dockerfile.cpp platforms: linux/amd64,linux/arm64 - push: ${{ inputs.push == 'true' }} + push: ${{ inputs.push_image == 'true' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha,scope=cpp @@ -131,8 +165,8 @@ jobs: build-cpu-image: - name: Build — cpu - if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cpu' || inputs.image == 'all') + name: "Build -- cpu (PyTorch CPU - linux/amd64 + arm64)" + if: ${{ github.event_name == 'workflow_dispatch' && (inputs.image == 'cpu' || inputs.image == 'all') }} runs-on: ubuntu-latest permissions: contents: read @@ -143,8 +177,11 @@ jobs: - uses: docker/setup-qemu-action@v3 - uses: docker/setup-buildx-action@v3 + - name: Set lowercase image prefix + run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV + - name: Login to GHCR - if: inputs.push == 'true' + if: ${{ inputs.push_image == 'true' }} uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} @@ -167,7 +204,7 @@ jobs: context: . file: .devops/Dockerfile platforms: linux/amd64,linux/arm64 - push: ${{ inputs.push == 'true' }} + push: ${{ inputs.push_image == 'true' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha,scope=cpu @@ -175,8 +212,8 @@ jobs: build-cuda-image: - name: Build — cuda - if: github.event_name == 'workflow_dispatch' && (inputs.image == 'cuda' || inputs.image == 'all') + name: "Build -- cuda (PyTorch CUDA - linux/amd64 only)" + if: ${{ github.event_name == 'workflow_dispatch' && (inputs.image == 'cuda' || inputs.image == 'all') }} runs-on: ubuntu-latest permissions: contents: read @@ -186,8 +223,11 @@ jobs: - uses: docker/setup-buildx-action@v3 + - name: Set lowercase image prefix + run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> $GITHUB_ENV + - name: Login to GHCR - if: inputs.push == 'true' + if: ${{ inputs.push_image == 'true' }} uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} @@ -210,8 +250,8 @@ jobs: context: . file: .devops/Dockerfile.backend platforms: linux/amd64 - push: ${{ inputs.push == 'true' }} + push: ${{ inputs.push_image == 'true' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha,scope=cuda - cache-to: type=gha,mode=max,scope=cuda \ No newline at end of file + cache-to: type=gha,mode=max,scope=cuda diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index ca9493f..0986534 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -1,66 +1,86 @@ -name: Release +name: Docker Images on: workflow_dispatch: inputs: + image: + description: "Image variant to build" + required: true + type: choice + options: + - cpp + - cpu + - cuda + - all version: - description: "Version tag (e.g. 1.2.3)" + description: "Optional image tag for manual runs" + required: false + push_image: + description: "Push to ghcr.io" required: true + default: "true" + type: choice + options: ["true", "false"] + +concurrency: + group: docker-images-${{ github.ref }} + cancel-in-progress: true env: REGISTRY: ghcr.io IMAGE_PREFIX: ghcr.io/${{ github.repository_owner }}/quadtrix jobs: - - build-binaries: - name: Binary (${{ matrix.os }}) - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-22.04, macos-14] - include: - - os: ubuntu-22.04 - artifact_name: quadtrix-linux-x64 - binary: quadtrix - - os: macos-14 - artifact_name: quadtrix-macos-arm64 - binary: quadtrix + build-cpp-image: + name: Docker cpp + if: ${{ inputs.image == 'cpp' || inputs.image == 'all' }} + runs-on: ubuntu-latest + permissions: + contents: read + packages: write steps: - uses: actions/checkout@v4 - - name: Compile (Linux) - if: runner.os == 'Linux' - run: | - sudo apt-get update && sudo apt-get install -y g++ - g++ -std=c++17 -O3 -march=native \ - -I. -Iinclude \ - -o ${{ matrix.binary }} main.cpp - strip ${{ matrix.binary }} - - - name: Compile (macOS) - if: runner.os == 'macOS' - run: | - g++ -std=c++17 -O3 -march=native \ - -I. -Iinclude \ - -o ${{ matrix.binary }} main.cpp - - - name: Package - run: | - mkdir dist - cp ${{ matrix.binary }} dist/ - cp README.md LICENSE dist/ - tar -czf ${{ matrix.artifact_name }}.tar.gz -C dist . - - - name: Upload to Release - uses: softprops/action-gh-release@v2 + - uses: docker/setup-qemu-action@v3 + - uses: docker/setup-buildx-action@v3 + + - name: Set lowercase image prefix + run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV" + + - name: Login to GHCR + if: ${{ inputs.push_image == 'true' }} + uses: docker/login-action@v3 with: - tag_name: v${{ github.event.inputs.version }} - files: ${{ matrix.artifact_name }}.tar.gz - generate_release_notes: true + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_PREFIX }}-cpp + tags: | + type=ref,event=tag + type=sha,prefix=sha- + type=raw,value=${{ inputs.version }},enable=${{ inputs.version != '' }} + type=raw,value=latest,enable=${{ inputs.push_image == 'true' }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: . + file: .devops/Dockerfile.cpp + platforms: linux/amd64,linux/arm64 + push: ${{ inputs.push_image == 'true' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha,scope=cpp + cache-to: type=gha,mode=max,scope=cpp - publish-images: - name: Publish Docker images + build-cpu-image: + name: Docker cpu + if: ${{ inputs.image == 'cpu' || inputs.image == 'all' }} runs-on: ubuntu-latest permissions: contents: read @@ -71,62 +91,82 @@ jobs: - uses: docker/setup-qemu-action@v3 - uses: docker/setup-buildx-action@v3 + - name: Set lowercase image prefix + run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV" + - name: Login to GHCR + if: ${{ inputs.push_image == 'true' }} uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Parse tag - id: tag - run: echo "VERSION=${{ github.event.inputs.version }}" >> $GITHUB_OUTPUT - - - name: Build & push backend - uses: docker/build-push-action@v6 + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 with: - context: . - file: .devops/Dockerfile.backend - platforms: linux/amd64,linux/arm64 - push: true + images: ${{ env.IMAGE_PREFIX }}-cpu tags: | - ${{ env.IMAGE_PREFIX }}-backend:latest - ${{ env.IMAGE_PREFIX }}-backend:${{ steps.tag.outputs.VERSION }} - cache-from: type=gha,scope=backend - cache-to: type=gha,mode=max,scope=backend + type=ref,event=tag + type=sha,prefix=sha- + type=raw,value=${{ inputs.version }},enable=${{ inputs.version != '' }} + type=raw,value=latest,enable=${{ inputs.push_image == 'true' }} - - name: Build & push frontend + - name: Build and push uses: docker/build-push-action@v6 with: context: . - file: .devops/Dockerfile.frontend + file: .devops/Dockerfile platforms: linux/amd64,linux/arm64 - push: true + push: ${{ inputs.push_image == 'true' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha,scope=cpu + cache-to: type=gha,mode=max,scope=cpu + + build-cuda-image: + name: Docker cuda + if: ${{ inputs.image == 'cuda' || inputs.image == 'all' }} + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v4 + + - uses: docker/setup-buildx-action@v3 + + - name: Set lowercase image prefix + run: echo "IMAGE_PREFIX=ghcr.io/${GITHUB_REPOSITORY_OWNER,,}/quadtrix" >> "$GITHUB_ENV" + + - name: Login to GHCR + if: ${{ inputs.push_image == 'true' }} + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.IMAGE_PREFIX }}-cuda tags: | - ${{ env.IMAGE_PREFIX }}-frontend:latest - ${{ env.IMAGE_PREFIX }}-frontend:${{ steps.tag.outputs.VERSION }} - cache-from: type=gha,scope=frontend - cache-to: type=gha,mode=max,scope=frontend + type=ref,event=tag + type=sha,prefix=sha- + type=raw,value=${{ inputs.version }},enable=${{ inputs.version != '' }} + type=raw,value=latest,enable=${{ inputs.push_image == 'true' }} - - name: Build & push cpp + - name: Build and push uses: docker/build-push-action@v6 with: context: . - file: .devops/Dockerfile.cpp - platforms: linux/amd64,linux/arm64 - push: true - tags: | - ${{ env.IMAGE_PREFIX }}-cpp:latest - ${{ env.IMAGE_PREFIX }}-cpp:${{ steps.tag.outputs.VERSION }} - cache-from: type=gha,scope=cpp - cache-to: type=gha,mode=max,scope=cpp - - - name: Create Release summary - run: | - echo "## Docker images published" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "| Image | Tags |" >> $GITHUB_STEP_SUMMARY - echo "|-------|------|" >> $GITHUB_STEP_SUMMARY - echo "| \`quadtrix-backend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY - echo "| \`quadtrix-frontend\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY - echo "| \`quadtrix-cpp\` | \`latest\`, \`${{ steps.tag.outputs.VERSION }}\` |" >> $GITHUB_STEP_SUMMARY + file: .devops/Dockerfile.backend + platforms: linux/amd64 + push: ${{ inputs.push_image == 'true' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha,scope=cuda + cache-to: type=gha,mode=max,scope=cuda diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml index 4824b9e..b50de6b 100644 --- a/.github/workflows/pr-check.yml +++ b/.github/workflows/pr-check.yml @@ -15,7 +15,7 @@ jobs: pr-sha: ${{ steps.get-sha.outputs.sha }} steps: - name: Check commenter permission - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | const { data } = await github.rest.repos.getCollaboratorPermissionLevel({ @@ -34,7 +34,7 @@ jobs: } - name: React with rocket - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | await github.rest.reactions.createForIssueComment({ @@ -46,7 +46,7 @@ jobs: - name: Get PR head SHA id: get-sha - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | const { data: pr } = await github.rest.pulls.get({ @@ -57,7 +57,7 @@ jobs: core.setOutput('sha', pr.head.sha); - name: Set checks to pending - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | const sha = '${{ steps.get-sha.outputs.sha }}'; @@ -96,7 +96,7 @@ jobs: - name: Report status if: always() - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | await github.rest.repos.createCommitStatus({ @@ -158,7 +158,7 @@ jobs: - name: Report status if: always() - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | await github.rest.repos.createCommitStatus({ @@ -218,7 +218,7 @@ jobs: - name: Report status if: always() - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | await github.rest.repos.createCommitStatus({ @@ -237,7 +237,7 @@ jobs: runs-on: ubuntu-latest if: always() steps: - - uses: actions/github-script@v7 + - uses: actions/github-script@v9 with: script: | const jobs = ${{ toJSON(needs) }}; diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..e92b486 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,263 @@ +name: Release + +on: + workflow_dispatch: + inputs: + version: + description: "Release version, for example v1.2.3 or 1.2.3" + required: true + +concurrency: + group: release + cancel-in-progress: false + +env: + ARTIFACT_ROOT: release-assets + +jobs: + release-metadata: + name: Release metadata + runs-on: ubuntu-latest + outputs: + tag_name: ${{ steps.tag.outputs.tag_name }} + steps: + - id: tag + shell: bash + run: | + set -euo pipefail + raw_tag="${{ inputs.version }}" + if [[ "${raw_tag}" == v* ]]; then + tag_name="${raw_tag}" + else + tag_name="v${raw_tag}" + fi + echo "tag_name=${tag_name}" >> "$GITHUB_OUTPUT" + + ubuntu-cpu: + name: Ubuntu ${{ matrix.build }} CPU + needs: release-metadata + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - build: x64 + os: ubuntu-22.04 + - build: arm64 + os: ubuntu-24.04-arm + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Dependencies + shell: bash + run: | + set -euo pipefail + sudo apt-get update + sudo apt-get install -y build-essential file + + - name: Toolchain workaround + if: ${{ contains(matrix.os, 'ubuntu-24.04') }} + shell: bash + run: | + set -euo pipefail + sudo apt-get install -y gcc-14 g++-14 + echo "CC=gcc-14" >> "$GITHUB_ENV" + echo "CXX=g++-14" >> "$GITHUB_ENV" + + - name: Build + shell: bash + run: | + set -euo pipefail + ${CXX:-g++} -std=c++17 -O3 -DNDEBUG \ + -I. -Iinclude \ + -o quadtrix main.cpp + file quadtrix + + - name: Smoke test + shell: bash + run: | + set +e + ./quadtrix --chat >/dev/null 2>&1 + exit 0 + + - name: Pack artifacts + shell: bash + run: | + set -euo pipefail + package="quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-ubuntu-${{ matrix.build }}-cpu" + mkdir -p "${ARTIFACT_ROOT}/${package}" + cp quadtrix README.md LICENSE "${ARTIFACT_ROOT}/${package}/" + tar -czf "${package}.tar.gz" -C "${ARTIFACT_ROOT}" "${package}" + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: quadtrix-bin-ubuntu-${{ matrix.build }}-cpu + path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-ubuntu-${{ matrix.build }}-cpu.tar.gz + if-no-files-found: error + retention-days: 30 + + windows-cpu: + name: Windows ${{ matrix.arch }} CPU + needs: release-metadata + runs-on: windows-2022 + strategy: + fail-fast: false + matrix: + include: + - arch: x64 + vcvars: x64 + - arch: arm64 + vcvars: amd64_arm64 + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Build + shell: cmd + run: | + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.vcvars }} + cl /nologo /std:c++17 /O2 /DNDEBUG /EHsc /Iinclude /I. main.cpp /Fe:quadtrix.exe + + - name: Smoke test + if: ${{ matrix.arch == 'x64' }} + shell: pwsh + run: | + $ErrorActionPreference = 'Continue' + & .\quadtrix.exe --chat | Out-Null + exit 0 + + - name: Pack artifacts + shell: pwsh + run: | + $package = "quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-windows-${{ matrix.arch }}-cpu" + New-Item -ItemType Directory -Force "${env:ARTIFACT_ROOT}\${package}" | Out-Null + Copy-Item quadtrix.exe "${env:ARTIFACT_ROOT}\${package}\" + Copy-Item README.md, LICENSE "${env:ARTIFACT_ROOT}\${package}\" + Compress-Archive -Path "${env:ARTIFACT_ROOT}\${package}\*" -DestinationPath "${package}.zip" -Force + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: quadtrix-bin-windows-${{ matrix.arch }}-cpu + path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-windows-${{ matrix.arch }}-cpu.zip + if-no-files-found: error + retention-days: 30 + + macos-cpu: + name: macOS ${{ matrix.build }} CPU + needs: release-metadata + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - build: arm64 + arch: arm64 + os: macos-14 + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Build + shell: bash + run: | + set -euo pipefail + clang++ -std=c++17 -O3 -DNDEBUG -arch ${{ matrix.arch }} \ + -I. -Iinclude \ + -o quadtrix main.cpp + file quadtrix + + - name: Smoke test + shell: bash + run: | + set +e + ./quadtrix --chat >/dev/null 2>&1 + exit 0 + + - name: Pack artifacts + shell: bash + run: | + set -euo pipefail + package="quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-macos-${{ matrix.build }}-cpu" + mkdir -p "${ARTIFACT_ROOT}/${package}" + cp quadtrix README.md LICENSE "${ARTIFACT_ROOT}/${package}/" + tar -czf "${package}.tar.gz" -C "${ARTIFACT_ROOT}" "${package}" + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: quadtrix-bin-macos-${{ matrix.build }}-cpu + path: quadtrix-${{ needs.release-metadata.outputs.tag_name }}-bin-macos-${{ matrix.build }}-cpu.tar.gz + if-no-files-found: error + retention-days: 30 + + publish-release: + name: Publish GitHub release + needs: + - release-metadata + - ubuntu-cpu + - windows-cpu + - macos-cpu + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + path: dist + merge-multiple: true + + - name: Write release notes + shell: bash + run: | + cat > release-notes.md <<'EOF' + macOS/iOS: + + macOS Apple Silicon (arm64) + macOS Apple Silicon (arm64, KleidiAI enabled) DISABLED + macOS Intel (x64) SKIPPED + iOS XCFramework DISABLED + + Linux: + + Ubuntu x64 (CPU) + Ubuntu arm64 (CPU) + Ubuntu s390x (CPU) SKIPPED + Ubuntu x64 (Vulkan) DISABLED + Ubuntu arm64 (Vulkan) DISABLED + Ubuntu x64 (ROCm 7.2) DISABLED + Ubuntu x64 (OpenVINO) DISABLED + Ubuntu x64 (SYCL FP32) DISABLED + + Android: + + Android arm64 (CPU) DISABLED + + Windows: + + Windows x64 (CPU) + Windows arm64 (CPU) + Windows x64 (CUDA 12) - CUDA 12.4 DLLs DISABLED + Windows x64 (CUDA 13) - CUDA 13.3 DLLs DISABLED + Windows x64 (Vulkan) DISABLED + Windows x64 (SYCL) DISABLED + Windows x64 (HIP) DISABLED + EOF + + - name: Publish release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ needs.release-metadata.outputs.tag_name }} + target_commitish: ${{ github.sha }} + prerelease: false + body_path: release-notes.md + files: dist/* diff --git a/CUDA/includes/attention.cuh b/CUDA/includes/attention.cuh new file mode 100644 index 0000000..7feac08 --- /dev/null +++ b/CUDA/includes/attention.cuh @@ -0,0 +1,29 @@ +#pragma once + +#include "tensor.cuh" + +#include + +namespace quadtrix { +namespace cuda { + +Status attention_forward( + const TensorView& input_qkv, + TensorView preatt, + TensorView att, + TensorView output, + int num_heads, + cudaStream_t stream = nullptr); + +Status attention_backward( + const TensorView& grad_output, + const TensorView& input_qkv, + const TensorView& att, + TensorView grad_input_qkv, + TensorView grad_preatt, + TensorView grad_att, + int num_heads, + cudaStream_t stream = nullptr); + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/checkpoint.h b/CUDA/includes/checkpoint.h new file mode 100644 index 0000000..ba91b0f --- /dev/null +++ b/CUDA/includes/checkpoint.h @@ -0,0 +1,25 @@ +#pragma once + +#include "tensor.cuh" + +namespace quadtrix { +namespace cuda { + +struct CheckpointMetadata { + int vocab_size = 0; + int max_sequence_length = 0; + int num_layers = 0; + int num_heads = 0; + int channels = 0; +}; + +inline bool load_checkpoint_metadata(const char*, CheckpointMetadata*) { + return false; +} + +inline bool save_tensor_checkpoint(const char*, const TensorView&) { + return false; +} + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/common.h b/CUDA/includes/common.h new file mode 100644 index 0000000..36df155 --- /dev/null +++ b/CUDA/includes/common.h @@ -0,0 +1,120 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace quadtrix { +namespace cuda { + +enum class DType : std::uint8_t { + F32, + F16, + BF16, + I32, + U8, +}; + +enum class DeviceKind : std::uint8_t { + CPU, + CUDA, +}; + +struct Status { + bool ok; + cudaError_t cuda_error; + const char* message; + + static Status success() { + return {true, cudaSuccess, "ok"}; + } + + static Status failure(cudaError_t error, const char* message) { + return {false, error, message}; + } +}; + +inline const char* dtype_name(DType dtype) { + switch (dtype) { + case DType::F32: + return "f32"; + case DType::F16: + return "f16"; + case DType::BF16: + return "bf16"; + case DType::I32: + return "i32"; + case DType::U8: + return "u8"; + } + return "unknown"; +} + +inline std::size_t dtype_size(DType dtype) { + switch (dtype) { + case DType::F32: + return 4; + case DType::F16: + return 2; + case DType::BF16: + return 2; + case DType::I32: + return 4; + case DType::U8: + return 1; + } + + std::fprintf(stderr, "Unknown CUDA dtype value %u\n", static_cast(dtype)); + std::abort(); +} + +inline bool checked_mul(std::size_t lhs, std::size_t rhs, std::size_t* out) { + if (lhs != 0 && rhs > std::numeric_limits::max() / lhs) { + return false; + } + *out = lhs * rhs; + return true; +} + +inline Status check_cuda(cudaError_t error, const char* expression, const char* file, int line) { + if (error == cudaSuccess) { + return Status::success(); + } + + std::fprintf( + stderr, + "CUDA error at %s:%d: %s failed with %s\n", + file, + line, + expression, + cudaGetErrorString(error)); + return Status::failure(error, expression); +} + +inline void abort_on_cuda(cudaError_t error, const char* expression, const char* file, int line) { + if (error == cudaSuccess) { + return; + } + + std::fprintf( + stderr, + "Fatal CUDA error at %s:%d: %s failed with %s\n", + file, + line, + expression, + cudaGetErrorString(error)); + std::abort(); +} + +} // namespace cuda +} // namespace quadtrix + +#define QUADTRIX_CUDA_CHECK(expr) \ + ::quadtrix::cuda::check_cuda((expr), #expr, __FILE__, __LINE__) + +#define QUADTRIX_CUDA_ABORT(expr) \ + ::quadtrix::cuda::abort_on_cuda((expr), #expr, __FILE__, __LINE__) diff --git a/CUDA/includes/dataloader.h b/CUDA/includes/dataloader.h new file mode 100644 index 0000000..fd3c47d --- /dev/null +++ b/CUDA/includes/dataloader.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +namespace quadtrix { +namespace cuda { + +struct TokenBatchView { + const std::int32_t* inputs = nullptr; + const std::int32_t* targets = nullptr; + int batch_size = 0; + int sequence_length = 0; +}; + +class DataLoader { +public: + DataLoader() = default; + + bool next(TokenBatchView* batch) { + if (batch != nullptr) { + *batch = {}; + } + return false; + } +}; + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/gelu.cuh b/CUDA/includes/gelu.cuh new file mode 100644 index 0000000..af87e64 --- /dev/null +++ b/CUDA/includes/gelu.cuh @@ -0,0 +1,31 @@ +#pragma once + +#include "tensor.cuh" + +#include + +#include + +namespace quadtrix { +namespace cuda { + +enum class GeluMode : std::uint8_t { + Exact, + Approximate, +}; + +Status gelu_forward( + const TensorView& input, + TensorView output, + GeluMode mode = GeluMode::Approximate, + cudaStream_t stream = nullptr); + +Status gelu_backward( + const TensorView& grad_output, + const TensorView& input, + TensorView grad_input, + GeluMode mode = GeluMode::Approximate, + cudaStream_t stream = nullptr); + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/global_norm.cuh b/CUDA/includes/global_norm.cuh new file mode 100644 index 0000000..f418ab7 --- /dev/null +++ b/CUDA/includes/global_norm.cuh @@ -0,0 +1,26 @@ +#pragma once + +#include "tensor.cuh" + +#include + +namespace quadtrix { +namespace cuda { + +Status global_norm_squared( + const TensorView& grads, + TensorView partial_sums, + cudaStream_t stream = nullptr); + +Status clip_gradients_by_global_norm( + TensorView grads, + float global_norm, + float max_norm, + cudaStream_t stream = nullptr); + +inline float clip_scale(float global_norm, float max_norm) { + return global_norm > max_norm && global_norm > 0.0f ? max_norm / global_norm : 1.0f; +} + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/layernorm.cuh b/CUDA/includes/layernorm.cuh new file mode 100644 index 0000000..2645537 --- /dev/null +++ b/CUDA/includes/layernorm.cuh @@ -0,0 +1,32 @@ +#pragma once + +#include "tensor.cuh" + +#include + +namespace quadtrix { +namespace cuda { + +Status layernorm_forward( + const TensorView& input, + const TensorView& gamma, + const TensorView& beta, + TensorView output, + TensorView mean, + TensorView rstd, + float epsilon = 1.0e-5f, + cudaStream_t stream = nullptr); + +Status layernorm_backward( + const TensorView& grad_output, + const TensorView& input, + const TensorView& gamma, + const TensorView& mean, + const TensorView& rstd, + TensorView grad_input, + TensorView grad_gamma, + TensorView grad_beta, + cudaStream_t stream = nullptr); + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/logger.h b/CUDA/includes/logger.h new file mode 100644 index 0000000..219c50f --- /dev/null +++ b/CUDA/includes/logger.h @@ -0,0 +1,37 @@ +#pragma once + +#include +#include + +namespace quadtrix { +namespace cuda { + +enum class LogLevel { + Info, + Warn, + Error, +}; + +inline const char* log_level_name(LogLevel level) { + switch (level) { + case LogLevel::Info: + return "info"; + case LogLevel::Warn: + return "warn"; + case LogLevel::Error: + return "error"; + } + return "unknown"; +} + +inline void log_message(LogLevel level, const char* format, ...) { + std::fprintf(level == LogLevel::Error ? stderr : stdout, "[cuda:%s] ", log_level_name(level)); + va_list args; + va_start(args, format); + std::vfprintf(level == LogLevel::Error ? stderr : stdout, format, args); + va_end(args); + std::fprintf(level == LogLevel::Error ? stderr : stdout, "\n"); +} + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/matmul.cuh b/CUDA/includes/matmul.cuh new file mode 100644 index 0000000..12dd4b2 --- /dev/null +++ b/CUDA/includes/matmul.cuh @@ -0,0 +1,99 @@ +#pragma once + +#include "tensor.cuh" + +#include +#include + +#include + +namespace quadtrix { +namespace cuda { + +enum class MatmulTranspose : std::uint8_t { + None, + Transpose, +}; + +struct BlasStatus { + bool ok; + cublasStatus_t cublas_status; + const char* message; + + static BlasStatus success() { + return {true, CUBLAS_STATUS_SUCCESS, "ok"}; + } + + static BlasStatus failure(cublasStatus_t status, const char* message) { + return {false, status, message}; + } +}; + +const char* cublas_status_name(cublasStatus_t status); + +class BlasHandle { +public: + explicit BlasHandle(int device_id = 0); + ~BlasHandle(); + + BlasHandle(const BlasHandle&) = delete; + BlasHandle& operator=(const BlasHandle&) = delete; + + BlasHandle(BlasHandle&& other) noexcept; + BlasHandle& operator=(BlasHandle&& other) noexcept; + + cublasHandle_t get() const { + return handle_; + } + + int device_id() const { + return device_id_; + } + + BlasStatus set_stream(cudaStream_t stream); + +private: + cublasHandle_t handle_ = nullptr; + int device_id_ = 0; +}; + +BlasStatus matmul( + BlasHandle& handle, + const TensorView& a, + MatmulTranspose op_a, + const TensorView& b, + MatmulTranspose op_b, + TensorView c, + float alpha = 1.0f, + float beta = 0.0f, + cudaStream_t stream = nullptr); + +BlasStatus matmul_forward( + BlasHandle& handle, + const TensorView& input, + const TensorView& weight, + TensorView output, + cudaStream_t stream = nullptr, + float alpha = 1.0f, + float beta = 0.0f); + +BlasStatus matmul_backward_input( + BlasHandle& handle, + const TensorView& grad_output, + const TensorView& weight, + TensorView grad_input, + cudaStream_t stream = nullptr, + float alpha = 1.0f, + float beta = 0.0f); + +BlasStatus matmul_backward_weight( + BlasHandle& handle, + const TensorView& input, + const TensorView& grad_output, + TensorView grad_weight, + cudaStream_t stream = nullptr, + float alpha = 1.0f, + float beta = 0.0f); + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/memory.cuh b/CUDA/includes/memory.cuh new file mode 100644 index 0000000..e08fa4a --- /dev/null +++ b/CUDA/includes/memory.cuh @@ -0,0 +1,120 @@ +#pragma once + +#include "common.h" +#include "runtime.cuh" + +#include + +#include +#include + +namespace quadtrix { +namespace cuda { + +class DeviceBuffer { +public: + DeviceBuffer() = default; + + explicit DeviceBuffer(std::size_t bytes, int device_id = -1) { + allocate(bytes, device_id); + } + + ~DeviceBuffer() { + release(); + } + + DeviceBuffer(const DeviceBuffer&) = delete; + DeviceBuffer& operator=(const DeviceBuffer&) = delete; + + DeviceBuffer(DeviceBuffer&& other) noexcept { + swap(other); + } + + DeviceBuffer& operator=(DeviceBuffer&& other) noexcept { + if (this != &other) { + release(); + swap(other); + } + return *this; + } + + void allocate(std::size_t bytes, int device_id = -1) { + release(); + if (bytes == 0) { + return; + } + if (device_id >= 0) { + device_id_ = device_id; + DeviceGuard guard(device_id); + QUADTRIX_CUDA_ABORT(cudaMalloc(&ptr_, bytes)); + } else { + device_id_ = current_device(); + QUADTRIX_CUDA_ABORT(cudaMalloc(&ptr_, bytes)); + } + bytes_ = bytes; + } + + void release() { + if (ptr_ != nullptr) { + if (device_id_ >= 0) { + DeviceGuard guard(device_id_); + cudaFree(ptr_); + } else { + cudaFree(ptr_); + } + ptr_ = nullptr; + bytes_ = 0; + device_id_ = -1; + } + } + + void* data() { + return ptr_; + } + + const void* data() const { + return ptr_; + } + + std::size_t bytes() const { + return bytes_; + } + + bool empty() const { + return ptr_ == nullptr || bytes_ == 0; + } + + int device_id() const { + return device_id_; + } + + void swap(DeviceBuffer& other) noexcept { + std::swap(ptr_, other.ptr_); + std::swap(bytes_, other.bytes_); + std::swap(device_id_, other.device_id_); + } + +private: + void* ptr_ = nullptr; + std::size_t bytes_ = 0; + int device_id_ = -1; +}; + +inline Status copy_h2d(void* dst_device, const void* src_host, std::size_t bytes, cudaStream_t stream = nullptr) { + return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_device, src_host, bytes, cudaMemcpyHostToDevice, stream)); +} + +inline Status copy_d2h(void* dst_host, const void* src_device, std::size_t bytes, cudaStream_t stream = nullptr) { + return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_host, src_device, bytes, cudaMemcpyDeviceToHost, stream)); +} + +inline Status copy_d2d(void* dst_device, const void* src_device, std::size_t bytes, cudaStream_t stream = nullptr) { + return QUADTRIX_CUDA_CHECK(cudaMemcpyAsync(dst_device, src_device, bytes, cudaMemcpyDeviceToDevice, stream)); +} + +inline Status memset_device(void* dst_device, int value, std::size_t bytes, cudaStream_t stream = nullptr) { + return QUADTRIX_CUDA_CHECK(cudaMemsetAsync(dst_device, value, bytes, stream)); +} + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/nccl_all_reduce.cuh b/CUDA/includes/nccl_all_reduce.cuh new file mode 100644 index 0000000..c712a6a --- /dev/null +++ b/CUDA/includes/nccl_all_reduce.cuh @@ -0,0 +1,96 @@ +#pragma once + +#include "tensor.cuh" + +#include + +#ifdef QUADTRIX_ENABLE_NCCL +#include +#else +typedef struct { + char internal[128]; +} ncclUniqueId; +typedef struct ncclComm* ncclComm_t; +typedef enum { + ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclNumResults = 6 +} ncclResult_t; +#endif + +namespace quadtrix { +namespace cuda { + +struct NcclStatus { + bool ok; + ncclResult_t nccl_status; + const char* message; + + static NcclStatus success() { + return {true, ncclSuccess, "ok"}; + } + + static NcclStatus failure(ncclResult_t status, const char* message) { + return {false, status, message}; + } +}; + +const char* nccl_status_name(ncclResult_t status); + +class NcclCommunicator { +public: + NcclCommunicator() = default; + NcclCommunicator(ncclUniqueId unique_id, int world_size, int rank, int device_id); + ~NcclCommunicator(); + + NcclCommunicator(const NcclCommunicator&) = delete; + NcclCommunicator& operator=(const NcclCommunicator&) = delete; + + NcclCommunicator(NcclCommunicator&& other) noexcept; + NcclCommunicator& operator=(NcclCommunicator&& other) noexcept; + + ncclComm_t get() const { + return comm_; + } + + int world_size() const { + return world_size_; + } + + int rank() const { + return rank_; + } + + int device_id() const { + return device_id_; + } + + bool valid() const { + return comm_ != nullptr; + } + +private: + ncclComm_t comm_ = nullptr; + int world_size_ = 1; + int rank_ = 0; + int device_id_ = 0; +}; + +NcclStatus create_unique_id(ncclUniqueId* unique_id); + +NcclStatus all_reduce_sum( + NcclCommunicator& communicator, + TensorView tensor, + cudaStream_t stream = nullptr); + +NcclStatus all_reduce_average( + NcclCommunicator& communicator, + TensorView tensor, + cudaStream_t stream = nullptr); + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/includes/tensor.cuh b/CUDA/includes/tensor.cuh new file mode 100644 index 0000000..c61d77e --- /dev/null +++ b/CUDA/includes/tensor.cuh @@ -0,0 +1,168 @@ +#pragma once + +#include "common.h" +#include "memory.cuh" + +#include +#include +#include + +namespace quadtrix { +namespace cuda { + +constexpr int kMaxTensorDims = 8; + +struct TensorShape { + int rank = 0; + std::array dims{}; + std::array strides{}; + + static TensorShape contiguous(const std::int64_t* sizes, int ndim) { + if (ndim < 1 || ndim > kMaxTensorDims) { + std::fprintf(stderr, "Tensor rank %d is outside supported range [1, %d]\n", ndim, kMaxTensorDims); + std::abort(); + } + + TensorShape shape; + shape.rank = ndim; + for (int i = 0; i < ndim; ++i) { + if (sizes[i] <= 0) { + std::fprintf(stderr, "Tensor dimension %d must be positive, got %lld\n", i, static_cast(sizes[i])); + std::abort(); + } + shape.dims[i] = sizes[i]; + } + + std::int64_t stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + shape.strides[i] = stride; + stride *= shape.dims[i]; + } + return shape; + } + + std::size_t numel() const { + std::size_t total = 1; + for (int i = 0; i < rank; ++i) { + if (dims[i] <= 0) { + return 0; + } + std::size_t next = 0; + if (!checked_mul(total, static_cast(dims[i]), &next)) { + return 0; + } + total = next; + } + return rank == 0 ? 0 : total; + } + + bool is_contiguous() const { + std::int64_t expected = 1; + for (int i = rank - 1; i >= 0; --i) { + if (strides[i] != expected) { + return false; + } + expected *= dims[i]; + } + return true; + } +}; + +struct TensorView { + void* data = nullptr; + TensorShape shape; + DType dtype = DType::F32; + DeviceKind device = DeviceKind::CUDA; + int device_id = 0; + + std::size_t numel() const { + return shape.numel(); + } + + std::size_t bytes() const { + std::size_t out = 0; + if (!checked_mul(numel(), dtype_size(dtype), &out)) { + return 0; + } + return out; + } + + template + T* data_as() { + return static_cast(data); + } + + template + const T* data_as() const { + return static_cast(data); + } +}; + +class Tensor { +public: + Tensor() = default; + + Tensor(const std::int64_t* dims, int rank, DType dtype, int device_id = 0) + : shape_(TensorShape::contiguous(dims, rank)), dtype_(dtype), device_id_(device_id) { + allocate(); + } + + Tensor(const Tensor&) = delete; + Tensor& operator=(const Tensor&) = delete; + Tensor(Tensor&&) noexcept = default; + Tensor& operator=(Tensor&&) noexcept = default; + + TensorView view() { + return {storage_.data(), shape_, dtype_, DeviceKind::CUDA, device_id_}; + } + + TensorView view() const { + return {const_cast(storage_.data()), shape_, dtype_, DeviceKind::CUDA, device_id_}; + } + + const TensorShape& shape() const { + return shape_; + } + + DType dtype() const { + return dtype_; + } + + int device_id() const { + return device_id_; + } + + std::size_t numel() const { + return shape_.numel(); + } + + std::size_t bytes() const { + return storage_.bytes(); + } + + void* data() { + return storage_.data(); + } + + const void* data() const { + return storage_.data(); + } + +private: + void allocate() { + std::size_t bytes = 0; + if (!checked_mul(shape_.numel(), dtype_size(dtype_), &bytes)) { + std::fprintf(stderr, "Tensor allocation size overflow\n"); + std::abort(); + } + storage_.allocate(bytes, device_id_); + } + + TensorShape shape_; + DType dtype_ = DType::F32; + int device_id_ = 0; + DeviceBuffer storage_; +}; + +} // namespace cuda +} // namespace quadtrix diff --git a/CUDA/llmcpp/adamw.cuh b/CUDA/llmcpp/adamw.cuh new file mode 100644 index 0000000..4453576 --- /dev/null +++ b/CUDA/llmcpp/adamw.cuh @@ -0,0 +1,98 @@ +/* +AdamW kernel +*/ + +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +// Implements linear interpolation using only two floating-point operations (as opposed to three in a naive implementation). +// Reference: https://developer.nvidia.com/blog/lerp-faster-cuda +__device__ float lerp(float start, float end, float weight) { + return fma(weight, end, fma(-weight, start, start)); +} + +template +__device__ void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, + float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, + float grad_scale, unsigned int seed) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_parameters) { return; } // guard + + // get the gradient, m, and v for this parameter + float grad = grad_scale * (float)grads_memory[idx]; + float m = m_memory[idx]; + float v = v_memory[idx]; + // update the first moment (momentum) + m = lerp(grad, m, beta1); + m_memory[idx] = m; + // update the second moment (RMSprop) + v = lerp(grad * grad, v, beta2); + v_memory[idx] = v; + m /= beta1_correction; // m_hat + v /= beta2_correction; // v_hat + // fetch the old value of this parameter as a float, from either source + float old_param = (master_params_memory != NULL) ? master_params_memory[idx] : (float)params_memory[idx]; + // update this parameter + float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param)); + // update our low precision version of the parameters using stochastic rounding + // this will be used in the next forward pass + stochastic_rounding(param, ¶ms_memory[idx], seed); + // write the full, float version of the param into our master copy, if we maintain one + // this will be used in the next update + if (master_params_memory != NULL) { master_params_memory[idx] = param; } +} + +template +__global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t g_stride, ptrdiff_t s_stride, + float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, + float grad_scale, unsigned int seed) { + adamw_update(params_memory + blockIdx.y * w_stride, + master_params_memory ? master_params_memory + blockIdx.y * s_stride : NULL, + grads_memory + blockIdx.y * g_stride, + m_memory + blockIdx.y * s_stride, + v_memory + blockIdx.y * s_stride, + num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, grad_scale, + seed + ); +} + +template +__global__ void init_from_master_kernel(Tp* params_memory, float* master_params_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t s_stride, unsigned int seed) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_parameters) { return; } + params_memory += blockIdx.y * w_stride; // adjust for layer offset + master_params_memory += blockIdx.y * s_stride; + stochastic_rounding(master_params_memory[idx], ¶ms_memory[idx], seed); +} + +template +void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t g_stride, ptrdiff_t s_stride, int num_slices, float learning_rate, float beta1, float beta2, int t, float eps, float weight_decay, + float grad_scale, unsigned int seed, cudaStream_t stream) { + // AdamW update + int block_size = 512; + int num_blocks = CEIL_DIV(num_parameters, block_size); + float beta1_correction = 1.0f - powf(beta1, t); + float beta2_correction = 1.0f - powf(beta2, t); + adamw_kernel3<<>>(params_memory, master_params_memory, grads_memory, + m_memory, v_memory, num_parameters, w_stride, g_stride, s_stride, + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, + grad_scale, seed); + cudaCheck(cudaGetLastError()); +} + +template +void init_from_master(Tp* params_memory, float* master_params_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t s_stride, int num_slices, unsigned int seed, cudaStream_t stream) { + int block_size = 512; // must match block size of adamw_update so that RNG also matches + int num_blocks = CEIL_DIV(num_parameters, block_size); + init_from_master_kernel<<>> + (params_memory, master_params_memory, num_parameters, w_stride, s_stride, seed); + cudaCheck(cudaGetLastError()); +} diff --git a/CUDA/llmcpp/cuda_utils.cuh b/CUDA/llmcpp/cuda_utils.cuh new file mode 100644 index 0000000..030ec07 --- /dev/null +++ b/CUDA/llmcpp/cuda_utils.cuh @@ -0,0 +1,286 @@ +// Utilities for use in __device__ code + +#ifndef CUDA_UTILS_CUH +#define CUDA_UTILS_CUH + +#include "cuda_common.h" + +// ---------------------------------------------------------------------------- +// Packed128 data structure that forces the compiler to use 128-bit loads/stores +// in GPUs that support (the LDG.128 and STS.128 instructions) +// This is a bit similar to the use of float4 in the case of 32-bit floats, but +// supports arbitrary precision. + +template +struct alignas(16) Packed128 { + Packed128() = default; + __device__ explicit Packed128(int4 bits) { + static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); + memcpy(&payload, &bits, sizeof(bits)); + } + + __device__ static Packed128 constant(ElementType value) { + Packed128 result; + for(int k = 0; k < size; ++k) { + result.payload[k] = value; + } + return result; + } + __device__ static Packed128 zeros() { + return constant(0.f); + } + __device__ static Packed128 ones() { + return constant(1.f); + } + + __device__ ElementType& operator[](int index) { + return payload[index]; + } + __device__ const ElementType& operator[](int index) const { + return payload[index]; + } + __device__ int4 get_bits() const { + int4 bits; + static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); + memcpy(&bits, &payload, sizeof(bits)); + return bits; + } + static constexpr const size_t size = sizeof(int4) / sizeof(ElementType); + ElementType payload[size]; +}; + +// load a Packed128 from an aligned memory address +template +__device__ Packed128 load128(const ElementType* address) { + return Packed128{*reinterpret_cast(address)}; +} +// load a Packed128 from an aligned memory address with streaming cache hint +template +__device__ Packed128 load128cs(const ElementType* address) { + return Packed128{__ldcs(reinterpret_cast(address))}; +} +// store a Packed128 to an aligned memory address +template +__device__ void store128(ElementType* target, Packed128 value) { + *reinterpret_cast(target) = value.get_bits(); +} +// store a Packed128 to an aligned memory address with streaming cache hint +template +__device__ void store128cs(ElementType* target, Packed128 value) { + __stcs(reinterpret_cast(target), value.get_bits()); +} +// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1 +template +__device__ void store128cg(ElementType* target, Packed128 value) { + __stcg(reinterpret_cast(target), value.get_bits()); +} + +// short-form typedefs +typedef Packed128 f128; +typedef Packed128 x128; + +// ---------------------------------------------------------------------------- +// DType support + +// enumerator to indentify the datatype of a tensor. +enum class DType : uint8_t { + FP32, FP16, BF16 +}; + +// Given a datatype enum, returns the underlying number of bytes +// for a scalar of that type +size_t sizeof_dtype(DType type) { + switch (type) { + case DType::FP32: + return sizeof(float); + case DType::FP16: + return sizeof(half); + case DType::BF16: + return sizeof(nv_bfloat16); + default: // handle or get compiler warning + fprintf(stderr, "Unknown datatype\n"); + exit(EXIT_FAILURE); + } +} + +DType dtype_of(float* f) { return DType::FP32; } +DType dtype_of(nv_bfloat16 * f) { return DType::BF16; } +DType dtype_of(half * f) { return DType::FP16; } + + + +// ---------------------------------------------------------------------------- +// Copy, cast functions + +// device functions and the kernel to cast data between types +template +__device__ Td cast_value(Ts val); + +template<> +__device__ float cast_value(float val) { + return val; +} + +template<> +__device__ float cast_value(half val) { + return __half2float(val); +} + +template<> +__device__ float cast_value(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n, ptrdiff_t stride_dst, ptrdiff_t stride_src) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // need to try grid stride looping for more perf later + if (idx < n) { + dst[idx + stride_dst * blockIdx.y] = cast_value(src[idx + stride_src * blockIdx.y]); + } +} + +// ---------------------------------------------------------------------------- +// Warp/Block communication primitives + +// warp-level reduction for summing values +__device__ inline float warpReduceSum(float val) { + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_xor_sync(0xFFFFFFFF, val, offset); + } + return val; +} +// warp-level reduction for finding the maximum value +__device__ inline float warpReduceMax(float val) { + for (int offset = 16; offset > 0; offset /= 2) { + val = fmaxf(val, __shfl_xor_sync(0xFFFFFFFF, val, offset)); + } + return val; +} +// requires all 32 threads in the warp to be active, but should work for any block size +// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes +// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end +// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1 +using reduction_func_t = float (*) (float); +template +__device__ inline float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { + // two reductions of up to 1024 threads: + // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) + __shared__ float shared_val[WARP_SIZE]; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + + float warp_val = warp_reduction(val); + if (lane_id == 0) { shared_val[warp_id] = warp_val; } + __syncthreads(); + warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds; + float block_val = warp_reduction(warp_val); + + if (final_sync) { + __syncthreads(); // only needed in loops when effectively reusing shared memory etc. + } + return block_val; +} + +// Performs a _deterministic_ sum reduction. determinism is achieved by requiring that only +// a single block be used. +template +__global__ void global_sum_single_block_kernel(float* result, const Float* values, size_t count) { + assert(gridDim.x == 1); // only a single block! + float thread_sum = 0; + for(size_t index = threadIdx.x; index < count; index += blockDim.x) { + thread_sum += (float)values[index]; + } + + float reduction = blockReduce(thread_sum, true); + if(threadIdx.x == 0) { + *result = reduction; + } +} + +template +void global_sum_deterministic(float* result, const Float* values, int count, cudaStream_t stream) { + global_sum_single_block_kernel<<<1, 1024, 0, stream>>>(result, values, count); + cudaCheck(cudaGetLastError()); +} + +// ---------------------------------------------------------------------------- +// memory management + +// allocate memory, preferrably on the device +// returns a status code. 0 = OK, 1 = fell back to managed memory +int cudaMallocConditionallyManaged(void** out, size_t bytes, const char *file, int line) { + // try to allocate + cudaError_t err = cudaMalloc(out, bytes); + if(err == cudaErrorMemoryAllocation) { + // if we OOM, fallback to a managed allocation. slower but at least won't crash. + cudaGetLastError(); // reset the error before the next API call + cudaCheck_(cudaMallocManaged(out, bytes), file, line); + cudaCheck_(cudaMemAdvise(*out, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), file, line); + return 1; + } else { + cudaCheck_(err, file, line); + return 0; + } +} + +#define cudaMallocConditionallyManaged(out, bytes)\ +(cudaMallocConditionallyManaged((void**)out, bytes, __FILE__, __LINE__)) + +// ---------------------------------------------------------------------------- +// Random Number Generation used in Stochastic Rounding + +// SquirrelNoise5 - Squirrel's Raw Noise utilities (version 5) +// This gives us a random number from threadIdx/blockIdx + a single seed for the entire GPU +// todo - possibly overkill and we don't need such high quality random numbers? (tbd) +// http://eiserloh.net/noise/SquirrelNoise5.hpp +__device__ __host__ constexpr unsigned int SquirrelNoise5(unsigned int positionX, unsigned int seed) +{ + constexpr unsigned int SQ5_BIT_NOISE1 = 0xd2a80a3f; // 11010010101010000000101000111111 + constexpr unsigned int SQ5_BIT_NOISE2 = 0xa884f197; // 10101000100001001111000110010111 + constexpr unsigned int SQ5_BIT_NOISE3 = 0x6C736F4B; // 01101100011100110110111101001011 + constexpr unsigned int SQ5_BIT_NOISE4 = 0xB79F3ABB; // 10110111100111110011101010111011 + constexpr unsigned int SQ5_BIT_NOISE5 = 0x1b56c4f5; // 00011011010101101100010011110101 + unsigned int mangledBits = positionX; + mangledBits *= SQ5_BIT_NOISE1; + mangledBits += seed; + mangledBits ^= (mangledBits >> 9); + mangledBits += SQ5_BIT_NOISE2; + mangledBits ^= (mangledBits >> 11); + mangledBits *= SQ5_BIT_NOISE3; + mangledBits ^= (mangledBits >> 13); + mangledBits += SQ5_BIT_NOISE4; + mangledBits ^= (mangledBits >> 15); + mangledBits *= SQ5_BIT_NOISE5; + mangledBits ^= (mangledBits >> 17); + return mangledBits; +} +__device__ __host__ constexpr unsigned int Get2dNoiseUint(int indexX, int indexY, unsigned int seed) +{ + constexpr unsigned int PRIME_NUMBER = 198491317u; // Large prime number with non-boring bits + unsigned int x = static_cast(indexX); + unsigned int y = static_cast(indexY); + + return SquirrelNoise5(x + (PRIME_NUMBER * y), seed); +} + +// stochastic rounding built on top of Squirel Noise above (with seed updated per step via xorshift) +__device__ __forceinline__ void stochastic_rounding(float in, __nv_bfloat16 *out, unsigned int seed) { + // todo - is this stochastic rounding *too good*? can we cut any corners? + // makes sure each thread gets a different random number + unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x * blockDim.x + blockIdx.y, seed); + unsigned int threshold = random & 0xFFFF; + unsigned int float_bits = __float_as_uint(in); + unsigned int rounded_bits = float_bits & 0x0000FFFF; + float_bits = (rounded_bits > threshold) ? (float_bits | 0xFFFF) : (float_bits & ~0xFFFF); + *out = __float2bfloat16_rn(__uint_as_float(float_bits)); +} +__device__ __forceinline__ void stochastic_rounding(float in, half *out, unsigned int random) { + *out = (float)in; // todo - implement this... +} +__device__ __forceinline__ void stochastic_rounding(float in, float *out, unsigned int random) { + *out = in; // dummy function for when floatX is float (FP32 mode) +} + +#endif \ No newline at end of file diff --git a/CUDA/llmcpp/cudnn_att.cpp b/CUDA/llmcpp/cudnn_att.cpp new file mode 100644 index 0000000..0330abe --- /dev/null +++ b/CUDA/llmcpp/cudnn_att.cpp @@ -0,0 +1,297 @@ +// all cudnn-related functions are in this file, so that they don't need to be recompiled everytime +// we change some unrelated piece of the code. +// TODO this currently duplicates some of the utilities from the main file + +#define NOMINMAX +#include +#include "cudnn_att.h" +#include + +namespace fe = cudnn_frontend; + +// Specific configurations based on the enabled precision +#if defined(ENABLE_FP32) +static_assert(false, "cuDNN is not supported in FP32 mode.") +// use fp16 (note: this may require gradient scaler, currently not implemented!) +#elif defined(ENABLE_FP16) +#define CUDNN_16BIT fe::DataType_t::HALF +#else // Default to bfloat16 +#define CUDNN_16BIT fe::DataType_t::BFLOAT16 +#endif + +static cudnnHandle_t cudnn_handle; +static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) +static void* cudnn_workspace = NULL; + +static void cuDNNCheck(cudnnStatus_t error, const char *file, int line) { + if (error != CUDNN_STATUS_SUCCESS) { + printf("[CUDNN ERROR] at file %s:%d:\n%s\n", file, line, cudnnGetErrorString(error)); + exit(EXIT_FAILURE); + } +}; +#define cuDNNCheck(err) (cuDNNCheck(err, __FILE__, __LINE__)) + +static void checkCudnnFE(const fe::error_object& e, const char *file, int line) { + if(!e.is_good()) { + printf("[CUDNN ERROR] at file %s:%d:\n%s\n", file, line, e.err_msg.c_str()); + exit(EXIT_FAILURE); + } +} +#define checkCudnnFE(err) checkCudnnFE(err, __FILE__, __LINE__) + +enum UIDs { + Q_UID, + K_UID, + V_UID, + Attn_scale_UID, + O_UID, + Stats_UID, + dO_UID, + dQ_UID, + dK_UID, + dV_UID +}; + +// Need a cache because graph->build_operation_graph() is slow but everything else seems fast +using cache_type_fwd = std::map, std::shared_ptr>; +using cache_type_bwd = std::map, std::shared_ptr>; + +// Loosely based on cuDNN frontend samples functions and massively simplified +auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_inference_only) { + + static cache_type_fwd user_maintained_cache_fwd; + + auto key = std::make_tuple(B, H, T, HS, is_inference_only); + + auto it = user_maintained_cache_fwd.find(key); + if (it != user_maintained_cache_fwd.end()) { + return it->second; + } + + auto graph = std::make_shared(); + graph->set_io_data_type(CUDNN_16BIT) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + // QKV is (B, T, 3, NH, HS) which cuDNN can handle directly without an external permute + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") + .set_dim({B, H, T, HS}) + .set_uid(Q_UID) + .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") + .set_dim({B, H, T, HS}) + .set_uid(K_UID) + .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") + .set_dim({B, H, T, HS}) + .set_uid(V_UID) + .set_stride({3 * H * HS * T, HS, 3 * H * HS, 1})); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_uid(Attn_scale_UID) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + + auto sdpa_options = fe::graph::SDPA_attributes().set_name("flash_attention"); + sdpa_options.set_is_inference(is_inference_only); + sdpa_options.set_attn_scale(attn_scale); + sdpa_options.set_causal_mask(true); + + // Create the graph operation and get the output tensors back + auto [O, stats] = graph->sdpa(Q, K, V, sdpa_options); + + // Output is (B, T, NH, HS) BF16/FP16 and stats for backward pass is (B, NH, T) FP32 + O->set_output(true).set_dim({B, H, T, HS}).set_stride({H * HS * T, HS, H * HS, 1}).set_uid(O_UID); + + assert(stats == nullptr || is_inference_only == false); + if (is_inference_only == false) { + stats->set_output(true).set_data_type(fe::DataType_t::FLOAT) + .set_dim({B, H, T, 1}) + .set_stride({H * T, T, 1, 1}) + .set_uid(Stats_UID); + } + + checkCudnnFE(graph->validate()); + + // Build the operation graph and execution part (this is the VERY SLOW PART) + checkCudnnFE(graph->build_operation_graph(cudnn_handle)); + auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); + checkCudnnFE(graph->check_support(cudnn_handle)); + checkCudnnFE(graph->build_plans(cudnn_handle)); + // Reallocate the workspace if the required size is greater than the current workspace + // In H100 this may be around 16B + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + + user_maintained_cache_fwd.insert({key, graph}); + + return graph; +} + +auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) { + static cache_type_bwd user_maintained_cache_bwd; + + auto key = std::make_tuple(B, NH, T, HS); + + auto it = user_maintained_cache_bwd.find(key); + if (it != user_maintained_cache_bwd.end()) { + return it->second; + } + + auto graph = std::make_shared(); + graph->set_io_data_type(CUDNN_16BIT) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + + // (B, N, 3, NH, HS) + // must come from inp (which means we also need to convert THAT to FP16) + auto Q = graph->tensor(fe::graph::Tensor_attributes().set_name("Q") + .set_dim({B, NH, T, HS}) + .set_uid(Q_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto K = graph->tensor(fe::graph::Tensor_attributes().set_name("K") + .set_dim({B, NH, T, HS}) + .set_uid(K_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto V = graph->tensor(fe::graph::Tensor_attributes().set_name("V") + .set_dim({B, NH, T, HS}) + .set_uid(V_UID) + .set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1})); + auto O = graph->tensor(fe::graph::Tensor_attributes().set_name("O") + .set_dim({B, NH, T, HS}) + .set_uid(O_UID) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + auto dO = graph->tensor(fe::graph::Tensor_attributes().set_name("dO") + .set_dim({B, NH, T, HS}) + .set_uid(dO_UID) + .set_stride({NH * HS * T, HS, NH * HS, 1})); + + auto stats = graph->tensor(fe::graph::Tensor_attributes().set_name("stats") + .set_dim({B, NH, T, 1}) + .set_uid(Stats_UID) + .set_stride({NH * T, T, 1, 1}) + .set_data_type(fe::DataType_t::FLOAT)); + auto attn_scale = graph->tensor(fe::graph::Tensor_attributes().set_name("attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_uid(Attn_scale_UID) + .set_data_type(fe::DataType_t::FLOAT)); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") +#if CUDNN_FRONTEND_MAJOR_VERSION > 1 || CUDNN_FRONTEND_MINOR_VERSION >= 5 + .set_deterministic_algorithm(true) // 1.5+ needs this for determinism +#endif + .set_causal_mask(true) + .set_attn_scale(attn_scale); + + // Create the graph operation and get the output tensors back + auto [dQ, dK, dV] = graph->sdpa_backward(Q, K, V, O, dO, stats, sdpa_backward_options); + + dQ->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dQ_UID); + dK->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dK_UID); + dV->set_output(true).set_dim({B, NH, T, HS}).set_stride({3 * NH * HS * T, HS, 3 * NH * HS, 1}).set_uid(dV_UID); + + checkCudnnFE(graph->validate()); + + // Build the operation graph and execution part (this is the VERY SLOW PART) + checkCudnnFE(graph->build_operation_graph(cudnn_handle)); + auto plans = graph->create_execution_plans({fe::HeurMode_t::A}); + checkCudnnFE(graph->check_support(cudnn_handle)); + checkCudnnFE(graph->build_plans(cudnn_handle)); + + // Reallocate the workspace if the required size is greater than the current workspace + // By default, cuDNN uses up to 256MiB of workspace, so we don't want to just allocate the maximum + if (graph->get_workspace_size() > cudnn_workspace_size) { + if (cudnn_workspace_size > 0) { + cudaCheck(cudaFree(cudnn_workspace)); + } + cudnn_workspace_size = graph->get_workspace_size(); + cudaCheck(cudaMalloc(&cudnn_workspace, cudnn_workspace_size)); + } + + user_maintained_cache_bwd.insert({key, graph}); + return graph; +} + +void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) + float* stats, // output for backward pass: (B, NH, T) + floatX* inp, // input: (B, T, 3, NH, HS) QKV + int B, int T, int NH, int C, cudaStream_t stream) { + NVTX_RANGE_FN(); + int HS = C / NH; // number of features per head + bool is_inference_only = (stats == nullptr); + + cuDNNCheck(cudnnSetStream(cudnn_handle, stream)); + + // Get graph and tensors from cache (or generate it on first use) + auto graph = lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); + + // Prepare all the tensor pointers for executing the graph + void* devPtrQ = inp; + void* devPtrK = (inp + C); + void* devPtrV = (inp + 2 * C); + float attn_scale_cpu = 1.0 / sqrtf(HS); + void* devPtrO = out; + + // Build variant pack + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {Attn_scale_UID, &attn_scale_cpu}, {O_UID, devPtrO}}; + + // Add the stats tensor unless we are only doing inference (only needed for backward pass) + if (is_inference_only == false) { + variant_pack[Stats_UID] = stats; + } + + // Execute graph + checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); + cudaCheck(cudaGetLastError()); +} + +void attention_backward_cudnn(floatX* dqkvr, // output + floatX* dout, floatX* qkvr, floatX* o, float* stats, // inputs + int B, int T, int NH, int C, cudaStream_t stream) { + NVTX_RANGE_FN(); + int HS = C / NH; // number of features per head + + // Get graph and tensors from cache (or generate it on first use) + auto graph = lookup_cache_or_build_graph_bwd(B, NH, T, HS); + + // Prepare all the tensor pointers for executing the graph + void* devPtrQ = qkvr; + void* devPtrK = (qkvr + NH * HS); + void* devPtrV = (qkvr + 2 * NH * HS); + void* devPtrO = o; + void* devPtrdO = dout; + void* devPtrStats = stats; + float attn_scale_cpu = 1.0 / sqrtf(HS); + + void* devPtrdQ = dqkvr; + void* devPtrdK = (dqkvr + NH * HS); + void* devPtrdV = (dqkvr + 2 * NH * HS); + + // Build variant pack that links each tensor to its data pointer + std::unordered_map variant_pack = { + {Q_UID, devPtrQ}, {K_UID, devPtrK}, {V_UID, devPtrV}, {O_UID, devPtrO}, {dO_UID, devPtrdO}, {Stats_UID, devPtrStats}, + {dQ_UID, devPtrdQ}, {dK_UID, devPtrdK}, {dV_UID, devPtrdV}, + {Attn_scale_UID, &attn_scale_cpu}}; + + // Execute graph + cuDNNCheck(cudnnSetStream(cudnn_handle, stream)); + checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); + cudaCheck(cudaGetLastError()); +} + +void create_cudnn() { + cuDNNCheck(cudnnCreate(&cudnn_handle)); +} + +void destroy_cudnn() { + if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } + cuDNNCheck(cudnnDestroy(cudnn_handle)); +} \ No newline at end of file diff --git a/CUDA/llmcpp/dataloader.h b/CUDA/llmcpp/dataloader.h new file mode 100644 index 0000000..0ee0588 --- /dev/null +++ b/CUDA/llmcpp/dataloader.h @@ -0,0 +1,496 @@ +#ifndef DATALOADER_H +#define DATALOADER_H + +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "rand.h" +#ifndef _WIN32 +#include +#endif +#define HEADER_SIZE 256 + +typedef struct +{ + + int process_rank; + int num_processes; + + size_t B; + size_t T; + size_t num_tokens; + size_t shard_num_samples; + + glob_t glob_result; + size_t current_shard_idx; + size_t current_sample_idx; + + FILE *tokens_file; + + uint16_t *buffer; + int *inputs; + int *targets; + + mt19937_state shuffle_rng; + int should_shuffle; + int *shard_indices; + int *intra_shard_indices; + + size_t total_batch_size_bytes; + size_t local_batch_offset_bytes; + size_t header_bytes; + int64_t file_size_bytes; +} DataLoader; + +int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) +{ + if (loader->should_shuffle) + { + shard_index = loader->shard_indices[shard_index]; + } + + const char *filename = loader->glob_result.gl_pathv[shard_index]; + + if (loader->tokens_file != NULL) + { + fcloseCheck(loader->tokens_file); + } + loader->tokens_file = fopenCheck(filename, "rb"); + + int header[HEADER_SIZE]; + freadCheck(header, sizeof(int), HEADER_SIZE, loader->tokens_file); + if (header[0] != 20240520) + { + + printf("---> HINT: Are you passing in a correct file?\n"); + printf("---> HINT: The data encoding may have changed, re-run data prepro or refer again to README.\n"); + exit(EXIT_FAILURE); + } + if (header[1] != 1) + { + printf("Bad version in data file\n"); + exit(EXIT_FAILURE); + } + int64_t ntok = header[2]; // + assert(ntok > 0); + fseekCheck(loader->tokens_file, 0, SEEK_END); + loader->file_size_bytes = ftell(loader->tokens_file); + fseekCheck(loader->tokens_file, 0, SEEK_SET); + int64_t expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t); + if (loader->file_size_bytes != expected_file_size) + { + printf("Error: file size is not as expected\n"); + exit(EXIT_FAILURE); + } + + loader->shard_num_samples = (ntok * sizeof(uint16_t) - sizeof(uint16_t)) / loader->total_batch_size_bytes; + return ntok; +} + +void prepare_intra_shard_indices_(DataLoader *loader) +{ + + if (loader->intra_shard_indices != NULL) + { + + free(loader->intra_shard_indices); + } + loader->intra_shard_indices = (int *)mallocCheck(loader->shard_num_samples * sizeof(int)); + init_identity_permutation(loader->intra_shard_indices, (int)loader->shard_num_samples); + random_permutation(loader->intra_shard_indices, (int)loader->shard_num_samples, &loader->shuffle_rng); +} + +void dataloader_reset(DataLoader *loader) +{ + loader->current_shard_idx = 0; + loader->current_sample_idx = 0; + + if (loader->should_shuffle) + { + random_permutation(loader->shard_indices, (int)loader->glob_result.gl_pathc, &loader->shuffle_rng); + } + + dataloader_load_shard_(loader, (int)loader->current_shard_idx); + + if (loader->should_shuffle) + { + prepare_intra_shard_indices_(loader); + } +} + +void dataloader_advance_(DataLoader *loader) +{ + if (loader->current_shard_idx == loader->glob_result.gl_pathc - 1) + { + + dataloader_reset(loader); + return; + } + + loader->current_shard_idx = (loader->current_shard_idx + 1) % loader->glob_result.gl_pathc; + loader->current_sample_idx = 0; + dataloader_load_shard_(loader, (int)loader->current_shard_idx); + + if (loader->should_shuffle) + { + prepare_intra_shard_indices_(loader); + } +} + +void dataloader_init(DataLoader *loader, + const char *filename_pattern, + size_t B, + size_t T, + int process_rank, + int num_processes, + int should_shuffle) +{ + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + loader->tokens_file = NULL; + loader->should_shuffle = should_shuffle; + loader->header_bytes = HEADER_SIZE * sizeof(int); + loader->total_batch_size_bytes = ((loader->num_processes * (loader->B * loader->T)) * sizeof(uint16_t)); + loader->local_batch_offset_bytes = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); + + int glob_status = glob(filename_pattern, 0, NULL, &loader->glob_result); + if (glob_status != 0) + { + printf("Error: failed to glob pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + if (loader->glob_result.gl_pathc == 0) + { + printf("Error: no files found matching the pattern: %s\n", filename_pattern); + exit(EXIT_FAILURE); + } + + if (should_shuffle) + { + mt19937_state shuffle_rng; + manual_seed(&shuffle_rng, 42 + process_rank); + loader->shuffle_rng = shuffle_rng; + loader->shard_indices = (int *)mallocCheck(loader->glob_result.gl_pathc * sizeof(int)); + init_identity_permutation(loader->shard_indices, (int)loader->glob_result.gl_pathc); + loader->intra_shard_indices = NULL; + } + + int64_t ntok_total = 0; + for (int shard_index = 0; shard_index < loader->glob_result.gl_pathc; shard_index++) + { + int64_t shard_ntok = dataloader_load_shard_(loader, shard_index); + + assert(shard_ntok >= (int64_t)(num_processes * B * T + 1)); + ntok_total += shard_ntok; + } + + loader->buffer = (uint16_t *)mallocCheck((B * T + 1) * sizeof(uint16_t)); + loader->inputs = (int *)mallocCheck(B * T * sizeof(int)); + loader->targets = (int *)mallocCheck(B * T * sizeof(int)); + loader->num_tokens = ntok_total; + + dataloader_reset(loader); +} + +void dataloader_load_batch(DataLoader *loader) +{ + assert(!loader->should_shuffle || (loader->should_shuffle && loader->intra_shard_indices != NULL)); + assert(loader->current_sample_idx < loader->shard_num_samples); + size_t idx = loader->should_shuffle ? loader->intra_shard_indices[loader->current_sample_idx] : loader->current_sample_idx; + size_t global_batch_offset_bytes = idx * loader->total_batch_size_bytes; + int64_t current_offset = loader->header_bytes + global_batch_offset_bytes + loader->local_batch_offset_bytes; + + size_t B = loader->B; + size_t T = loader->T; + + fseekCheck(loader->tokens_file, (int)current_offset, SEEK_SET); + freadCheck(loader->buffer, sizeof(uint16_t), B * T + 1, loader->tokens_file); + + for (int i = 0; i < B * T; i++) + { + loader->inputs[i] = (int)loader->buffer[i]; + loader->targets[i] = (int)loader->buffer[i + 1]; + } +} + +void dataloader_next_batch(DataLoader *loader) +{ + + if (loader->current_sample_idx >= loader->shard_num_samples) + { + dataloader_advance_(loader); + } + dataloader_load_batch(loader); + loader->current_sample_idx += 1; +} + +void dataloader_resume(DataLoader *loader, size_t current_shard_idx, size_t current_sample_idx) +{ + + loader->current_shard_idx = current_shard_idx; + loader->current_sample_idx = current_sample_idx; + dataloader_load_shard_(loader, (int)loader->current_shard_idx); +} + +void dataloader_free(DataLoader *loader) +{ + free(loader->buffer); + free(loader->inputs); + free(loader->targets); + if (loader->should_shuffle) + { + free(loader->shard_indices); + free(loader->intra_shard_indices); + } + fcloseCheck(loader->tokens_file); + globfree(&loader->glob_result); +} + +#define ASSUMED_NUM_COMPLETIONS 4 +#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N)) + +typedef struct +{ + + int process_rank; + int num_processes; + + size_t B; + size_t T; + FILE *eval_file; + uint16_t *buffer; + int num_examples; + int num_batches; + int start_example_index; + int end_example_index; + int current_example_index; + int *inputs; + int *targets; + char *mask; + int *label; + int num_completions; +} EvalLoader; + +void evalloader_reset(EvalLoader *loader) +{ + int examples_per_process = CEIL_DIV(loader->num_examples, loader->num_processes); + int can_fit_examples = (int)(loader->B / ASSUMED_NUM_COMPLETIONS); + if (can_fit_examples == 0) + { + + printf("HellaSwag EvalLoader: batch size %zu is < %d\n", loader->B, ASSUMED_NUM_COMPLETIONS); + printf("---> HINT: Disable HellaSwag eval with -h 0, or increase batch size with -b\n"); + exit(EXIT_FAILURE); + } + loader->num_batches = CEIL_DIV(examples_per_process, can_fit_examples); + + loader->start_example_index = examples_per_process * loader->process_rank; + loader->end_example_index = examples_per_process * (loader->process_rank + 1); + + if (loader->end_example_index > loader->num_examples) + { + loader->end_example_index = loader->num_examples; + } + + int64_t header_bytes = HEADER_SIZE * sizeof(int); + fseekCheck(loader->eval_file, (int)header_bytes, SEEK_SET); + for (int i = 0; i < loader->start_example_index; i++) + { + uint16_t example_header[3]; + // read 3 uint16_t values: , , + freadCheck(&example_header[0], sizeof(uint16_t), 3, loader->eval_file); + // validate the delimiter + assert(example_header[0] == 65535); // delimiter + // validate the + assert(example_header[2] == i); // should match the loop index + // skip to the next example, keeping in mind that we already read the header + size_t remaining_bytes = example_header[1] - sizeof(uint16_t) * 3; + assert(remaining_bytes > 0); // we expect some bytes in the example + fseekCheck(loader->eval_file, (int)remaining_bytes, SEEK_CUR); + } + // now we are at the start of the example we want to start at, pointing at + loader->current_example_index = loader->start_example_index; +} + +void evalloader_init(EvalLoader *loader, + const char *filename, + size_t B, + size_t T, + int process_rank, + int num_processes) +{ + loader->process_rank = process_rank; + loader->num_processes = num_processes; + loader->B = B; + loader->T = T; + + // open the file and validate the header + loader->eval_file = fopenCheck(filename, "rb"); + // validate the header + int header[HEADER_SIZE]; + freadCheck(header, sizeof(int), HEADER_SIZE, loader->eval_file); + if (header[0] != 20240522) + { + printf("Bad magic in eval file\n"); + exit(EXIT_FAILURE); + } + if (header[1] != 1) + { + printf("Bad version in data file\n"); + exit(EXIT_FAILURE); + } + loader->num_examples = header[2]; // number of examples in the file + assert(loader->num_examples >= num_processes); // avoid headaches for now + size_t longest_example_bytes = header[3]; // longest example in the file + // basic sensibility check we could relax later. but roughly each example + // contains the prompt (or "context") and 4 completions, all of these have to be + // up to T tokens, and their tokens are uint16_t (so 2 bytes/token). + // There's a few more things in each example but they are minor. + // So longest example should be roughly this. Just trying to make sure it's sensible. + assert(longest_example_bytes > 0 && longest_example_bytes < (1 + ASSUMED_NUM_COMPLETIONS) * T * 2); + + // allocate all the space we'll need + int can_fit_examples = (int)(B / ASSUMED_NUM_COMPLETIONS); + loader->buffer = (uint16_t *)mallocCheck(longest_example_bytes); + loader->inputs = (int *)calloc(B * T, sizeof(int)); + loader->targets = (int *)calloc(B * T, sizeof(int)); + loader->mask = (char *)mallocCheck(B * T * sizeof(char)); + loader->label = (int *)mallocCheck(can_fit_examples * sizeof(int)); + + // reset the loader, to initialize it + evalloader_reset(loader); +} + +void evalloader_next_example_(EvalLoader *loader, int example_batch_index) +{ + size_t B = loader->B; + size_t T = loader->T; + int batch_dim_offset = example_batch_index * ASSUMED_NUM_COMPLETIONS; + uint16_t example_header[3]; + freadCheck(&example_header[0], sizeof(uint16_t), 3, loader->eval_file); + assert(example_header[0] == 65535); + assert(example_header[2] == loader->current_example_index); + assert(example_header[2] >= loader->start_example_index && example_header[2] < loader->end_example_index); + + size_t example_bytes = example_header[1] - sizeof(uint16_t) * 3; + freadCheck(loader->buffer, sizeof(char), example_bytes, loader->eval_file); + int label = (int)loader->buffer[0]; + int can_fit_examples = (int)(loader->B / ASSUMED_NUM_COMPLETIONS); + assert(label >= 0 && label < ASSUMED_NUM_COMPLETIONS); + assert(example_batch_index >= 0 && example_batch_index < can_fit_examples); + loader->label[example_batch_index] = label; + int num_completions = (int)loader->buffer[1]; + assert(num_completions == ASSUMED_NUM_COMPLETIONS); + assert(batch_dim_offset + num_completions <= B); + loader->num_completions = num_completions; + + int context_length = (int)loader->buffer[2]; + uint16_t *context_tokens_start = &loader->buffer[3]; + assert(context_length > 0 && context_length < T); + for (int b = 0; b < num_completions; b++) + { + for (int i = 0; i < context_length; i++) + { + int boff = batch_dim_offset + b; + int tok_cur = (int)context_tokens_start[i]; + loader->inputs[boff * T + i] = tok_cur; + } + } + uint16_t *completions_iter = loader->buffer + 3 + context_length; + for (int c = 0; c < num_completions; c++) + { + int coff = batch_dim_offset + c; + int completion_length = (int)completions_iter[0]; + uint16_t *completion_tokens_start = completions_iter + 1; + assert(completion_length > 0 && context_length + completion_length < T); + for (int i = 0; i < completion_length; i++) + { + int tok_cur = (int)completion_tokens_start[i]; + + loader->inputs[coff * T + context_length + i] = tok_cur; + + loader->targets[coff * T + context_length + i - 1] = tok_cur; + + loader->mask[coff * T + context_length + i - 1] = 1; + } + completions_iter += 1 + completion_length; + loader->current_example_index += 1; + } + + void evalloader_next_batch(EvalLoader * loader) + { + size_t B = loader->B; + size_t T = loader->T; + memset(loader->mask, 0, B * T * sizeof(char)); + int can_fit_examples = (int)(B / ASSUMED_NUM_COMPLETIONS); + for (int i = 0; i < can_fit_examples; i++) + { + if (loader->current_example_index >= loader->end_example_index) + { + break; + } + evalloader_next_example_(loader, i); + } + } + + int evalloader_stat_losses(EvalLoader * loader, float *losses) + { + int correct = 0; + size_t B = loader->B; + size_t T = loader->T; + int can_fit_examples = (int)(B / ASSUMED_NUM_COMPLETIONS); + for (int i = 0; i < can_fit_examples; i++) + { + float min_loss = 0.0f; + int min_loss_index = -1; + char active = 0; + for (int b = 0; b < ASSUMED_NUM_COMPLETIONS; b++) + { + int boff = i * ASSUMED_NUM_COMPLETIONS + b; + float average_loss = 0.0f; + int count = 0; + for (int t = 0; t < T; t++) + { + char mask = loader->mask[boff * T + t]; + if (mask == 1) + { + active = 1; + average_loss += losses[boff * T + t]; + count++; + } + } + if (count > 0) + { + average_loss /= count; + } + if (b == 0 || average_loss < min_loss) + { + min_loss = average_loss; + min_loss_index = b; + } + } + if (active && (min_loss_index == loader->label[i])) + { + correct += 1; + } + } + return correct; + } + + void evalloader_free(EvalLoader * loader) + { + free(loader->buffer); + free(loader->inputs); + free(loader->targets); + free(loader->mask); + free(loader->label); + fcloseCheck(loader->eval_file); + } + +#endif \ No newline at end of file diff --git a/CUDA/llmcpp/layernorm.cuh b/CUDA/llmcpp/layernorm.cuh new file mode 100644 index 0000000..9777d06 --- /dev/null +++ b/CUDA/llmcpp/layernorm.cuh @@ -0,0 +1,505 @@ +/* +LayerNorm CUDA kernel, and also Residual, because sometimes they are fused + +Note in llm.c we try to be clever in the backward pass to conserve memory. +All parameters use a += in the backward pass, so we can do gradient accumulation. +But all activations have = instead of += because these are faster (just read, no write). +This is okay for all activations except for those in the residual stream, where the +gradients have to add. We make sure that we do a += as necessary. +E.g., the layernorms are connected to the residuals so we += in layernorm backward. +*/ + +#include +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +__global__ void layernorm_forward_kernel3(floatX* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd, + const floatX* __restrict__ inp, const floatX* __restrict__ weight, + const floatX* __restrict__ bias, int N, int C) { + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int num_warps = blockDim.x / WARP_SIZE; + + int idx = blockIdx.x * num_warps + warp_id; + if(idx >= N) { return; } // guard + + // the row of input that this group of threads is responsible for + const floatX* x = inp + idx * C; + + // mean + float sum = 0.0f; + for (int i = lane_id; i < C; i += WARP_SIZE) { + sum += (float)x[i]; + } + sum = warpReduceSum(sum); + float m = sum / C; + if(lane_id == 0 && mean != nullptr) { + __stcs(mean + idx, m); + } + + // rstd + sum = 0.0f; + for (int i = lane_id; i < C; i += WARP_SIZE) { + float diff = (float)x[i] - m; + sum += diff * diff; + } + sum = warpReduceSum(sum); + float s = rsqrtf(sum / C + 1e-5f); + if(lane_id == 0 && rstd != nullptr) { + __stcs(rstd + idx, s); + } + + // final normalization and scaling by weight/bias + floatX* o = out + idx * C; + for (int c = lane_id; c < C; c += WARP_SIZE) { + // load and store using the .cs "streaming" hint to the compiler, + // indicating that this data will not be reused soon, and can be streamed through the caches + // this allows the threads to get more cache-hits for the (shared) weight and bias parameters + float n = s * ((float)__ldcs(x+c) - m); + __stcs(o+c, (floatX)(n * (float)weight[c] + (float)bias[c])); + } +} + +__global__ void layernorm_forward_kernel6(floatX* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd, + const floatX* __restrict__ inp, const floatX* __restrict__ weight, + const floatX* __restrict__ bias, int N, int C) { + assert(blockDim.x == WARP_SIZE); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! + extern __shared__ char* params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_in = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx >= N) { return; } // guard + + // adjust pointers to current token + inp += idx * C; + out += idx * C; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = load128cs(inp + c); + for(int k = 0; k < x128::size; ++k) { + sum += (float)in_data[k]; + } + s_in[c / x128::size] = in_data; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = s_in[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)in_data[k] - m) * ((float)in_data[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = s_in[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out_data; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)in_data[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out_data[k] = (floatX)o; + } + + store128cs(out + c, out_data); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0 && mean != nullptr) { + __stcs(mean + idx, m); + } + // store the rstd, no need to cache it + if(threadIdx.x == 0 && rstd != nullptr) { + __stcs(rstd + idx, s); + } +} + +__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, float* mean, float* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + assert(blockDim.x == WARP_SIZE); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! + extern __shared__ char* params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + } + store128cs(residual + c, out); + s_res[c / x128::size] = out; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 res = s_res[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)res[k] - m) * ((float)res[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + +__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2) { + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + + x128 packed_out; + x128 packed_inp1 = load128cs(inp1 + idx); + x128 packed_inp2 = load128cs(inp2 + idx); + for (int k = 0; k < packed_inp1.size; k++) { + packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); + } + store128(out + idx, packed_out); +} + +__global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with only 1024 threads? + layernorm_backward_kernel10(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, + const float* mean, const float* rstd, + int B, int T, int C) { + int BLOCK_SIZE = blockDim.x; + int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block + extern __shared__ float shared[]; + + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block + int baseIdx = blockIdx.x * warpsInBlock + warpId; + int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp + int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = WARP_SIZE * x128::size; + int iterations_C = CEIL_DIV(C, C_per_iteration); // + 2; + + // the first half of shared memory is bias, second is weight + size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size); + float* dbias_shared = shared; + float* dweight_shared = shared + rounded_C; + // warp zero doesn't actually write to the _tmp_shared memory locations, so we don't need to reserve memory + // the obvious solution is to change the addressing below to use (threadId.x-32) as offset, but that causes + // register spills, so instead we mess with the base pointer here, which doesn't increase register usage. + float* dbias_tmp_shared = shared + 2 * rounded_C - WARP_SIZE * f128::size; + float* dweight_tmp_shared = shared + 2 * rounded_C + f128::size * BLOCK_SIZE - 2 * WARP_SIZE * f128::size; + + // init shared memory to zero + for(int i = threadIdx.x * f128::size; i < rounded_C; i += BLOCK_SIZE * f128::size) { + store128(dbias_shared + i, f128::zeros()); + store128(dweight_shared + i, f128::zeros()); + } + __syncthreads(); + + for (int bt = baseIdx; bt < B * T; bt += warpsInGrid) { + const floatX* dout_bt = dout + bt * C; + const floatX* inp_bt = inp +bt * C; + floatX* dinp_bt = dinp + bt * C; + + // first: two reduce operations + float dnorm_mean = 0.0f; + float dnorm_norm_mean = 0.0f; + for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * (float)inp128_i[k]; + } + } + + const float mean_bt = mean[bt]; + const float rstd_bt = rstd[bt]; + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C * rstd_bt - dnorm_mean * mean_bt * rstd_bt; + + for (int c = 0; c < iterations_C; c++) { + int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration); + + x128 dout128 = x128::zeros(); + x128 inp128 = x128::zeros(); + x128 dinp128 = x128::zeros(); + x128 weight128 = x128::zeros(); + + if(global_index < C) { + dout128 = load128cs(dout_bt + global_index); + inp128 = load128cs(inp_bt + global_index); + dinp128 = load128(dinp_bt + global_index); + weight128 = load128(weight + global_index); + } + + for(int o = 0; o < x128::size / f128::size; ++o) { + f128 dbias_f; + f128 dweight_f; + for(int i = 0; i < f128::size; ++i) { + int x = o * f128::size + i; + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + dbias_f[i] = dout_i; + dweight_f[i] = norm_bti * dout_i; + + float dval = 0.0f; + dval += (float) weight128[x] * (float)dout128[x]; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = (floatX) ((float) dinp128[x] + dval); + } + + if (warpId != 0) { + store128(dbias_tmp_shared + threadIdx.x * f128::size, dbias_f); + // this seems to generate a 64-bit store, instead of 128-bit. + // however, forcing 128-bit (e.g., using inline ptx), results in register + // spilling and much worse performance, so we'll keep it like this for now + // but ideally, we could reduce the register pressure a little. + store128(dweight_tmp_shared + threadIdx.x * f128::size, dweight_f); + } + __syncthreads(); + if (warpId == 0) { + for (int j = 1; j < warpsInBlock; j++) { + f128 dbias_tmp = load128(dbias_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE)); + f128 dweight_tmp = load128(dweight_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE)); + for(int i = 0; i < f128::size; ++i) { + dbias_f[i] += dbias_tmp[i]; + dweight_f[i] += dweight_tmp[i]; + } + } + } + __syncthreads(); + if (warpId == 0) { + f128 db_old = load128(dbias_shared + global_index + f128::size * o); + f128 dw_old = load128(dweight_shared + global_index + f128::size * o); + for(int i = 0; i < f128::size; ++i) { + dbias_f[i] += db_old[i]; + dweight_f[i] += dw_old[i]; + } + store128(dbias_shared + global_index + f128::size * o, dbias_f); + store128(dweight_shared + global_index + f128::size * o, dweight_f); + } + } + if(global_index < C) { + // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); + } + } + } + __syncthreads(); + // Each block writes its partial sum to global memory + // The last block to finish becomes responsible for summing up all the partial sums + // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel) + unsigned int* scratchFlag = (unsigned int*)(scratch); + // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned + scratch += 32; + float* scratch_dbias = scratch; + float* scratch_dweight = scratch + C; + for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) { + // Write to global memory in the same "shared memory banking friendly" order + store128(scratch_dbias + i + 2*C*blockIdx.x, load128(dbias_shared + i)); + store128(scratch_dweight + i + 2*C*blockIdx.x, load128(dweight_shared + i)); + } + __syncthreads(); + // that portion of shared memory is no longer used, so we can repurpose it for the scratch flag. + unsigned int *tmp_flag = (unsigned int*)(shared + 2*rounded_C); + if (threadIdx.x == 0) { + *tmp_flag = atomicInc(scratchFlag, gridDim.x); + } + __syncthreads(); + if (*tmp_flag == gridDim.x-1) { + // Reduction of the partial sums by the final block + // todo - there isn't enough parallelism even inside that single SM... + // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! + for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) { + f128 dbias_accum = f128::zeros(); + f128 dweight_accum = f128::zeros(); + + for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) { + int offset = i + 2*C*read_block_idx; + f128 dbias128 = load128(scratch_dbias + offset); + f128 dweight128 = load128(scratch_dweight + offset); + for(int k = 0; k < f128::size; k++) { + dbias_accum[k] += dbias128[k]; + dweight_accum[k] += dweight128[k]; + } + } + store128(dbias_shared + i, dbias_accum); + store128(dweight_shared + i, dweight_accum); + } + __syncthreads(); + + // convert from float/FP32 to floatX/BF16 for the final write + // this is separate because it cannot use as many warps as the above (f128 vs x128) + // todo - if we split this code into another kernel, we could maybe do it at the same time? + for (int c = warpId; c < iterations_C; c += warpsInBlock) { + int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration); + if (global_index >= C) { + break; + } + + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); + for(int o = 0; o < x128::size / f128::size; ++o) { + f128 s_db = load128(dbias_shared + global_index + o * f128::size); + f128 s_dw = load128(dweight_shared + global_index + o * f128::size); + for(int i = 0; i < f128::size; ++i) { + int x = o * f128::size + i; + dbias128[x] = (floatX)(s_db[i] + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw[i] + (float)dweight128[x]); + } + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); + } + } +} + +// ---------------------------------------------------------------------------- +// kernel launchers + +// similar to `fused_residual_forward5` +void layernorm_forward(floatX* out, float* mean, float* rstd, + floatX* inp, const floatX* weight, const floatX* bias, + int B, int T, int C, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 256; + int block_y = block_size / WARP_SIZE; + const int N = B * T; + const int grid_size = CEIL_DIV(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(layernorm_forward_kernel6, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaCheck(cudaGetLastError()); + if (status == cudaSuccess) { + layernorm_forward_kernel6<<>>(out, mean, rstd, inp, weight, bias, N, C); + } else { + // fall back to the version without shared memory + const int grid_size_fb = CEIL_DIV(N * WARP_SIZE, block_size); + layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); + } + cudaCheck(cudaGetLastError()); +} + +void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 256; + assert(N % (block_size * x128::size) == 0); + const int grid_size = CEIL_DIV(N, block_size * x128::size); + residual_forward_kernel<<>>(out, inp1, inp2); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward5(floatX* residual, floatX* normed, float* mean, float* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, cudaStream_t stream) { + const int block_size = 256; + int block_y = block_size / WARP_SIZE; + const int grid_size = CEIL_DIV(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaCheck(cudaGetLastError()); + if(status == cudaSuccess) { + fused_residual_forward_kernel5<<>>(residual, normed, + mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + residual_forward(residual, inp1, inp2, N*C, stream); + layernorm_forward(normed, mean, rstd, residual, weight, bias, N, 1, C, stream); + } + cudaCheck(cudaGetLastError()); +} + +void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, const float* mean, const float* rstd, + int B, int T, int C, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 512; + const int blocks_per_sm = 2; // supported on every architecture and less cache thrashing than 3 + const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; + size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size); + size_t shared_mem_size = (2 * rounded_C + 2 * (block_size - 32) * f128::size) * sizeof(float); + + cudaCheck(cudaMemsetAsync(scratch, 0, 1 * sizeof(float), stream)); // only need to reset the flag to 0 + layernorm_backward_kernel10<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + cudaCheck(cudaGetLastError()); +} diff --git a/CUDA/llmcpp/mfu.h b/CUDA/llmcpp/mfu.h new file mode 100644 index 0000000..1c40b7b --- /dev/null +++ b/CUDA/llmcpp/mfu.h @@ -0,0 +1,244 @@ +#ifndef MFU_H +#define MFU_H + +#include +#include +#include +#if __has_include() +#define USE_NVML 1 +#include +#else +#define USE_NVML 0 +#endif + +// tied to enum PrecisionMode, in a future refactor make them the same +#define MFUH_PRECISION_FP32 0 +#define MFUH_PRECISION_FP16 1 +#define MFUH_PRECISION_BF16 2 + +#if USE_NVML +inline void nvml_check(nvmlReturn_t status, const char *file, int line) { + if (status != NVML_SUCCESS) { + printf("[NVML ERROR] at file %s:%d:\n%s\n", file, line, nvmlErrorString(status)); + exit(EXIT_FAILURE); + } +}; +#define nvmlCheck(err) (nvml_check(err, __FILE__, __LINE__)) +#endif + + +typedef struct { + float TF_32; // tensor-core performance 32 bit + float BF_16_32; // bf16 with 32 bit accumulate + float FP_16_32; // fp16 with 32 bit accumulate + float FP_16_16; // fp16 with 16 bit accumulate + float FP_8_32; // and so on + float FP_8_16; + float CLOCK; // clock frequency from the spec sheet + float CORES; // #TCs from the spec sheet +} PerfData; + +// basic default data from the nvidia whitepapers +static const PerfData VOLTA = {125.0f, -1.f, 125.f, -1.f, -1.f, -1.f, 1530.f, 640.f}; +static const PerfData AMPERE_DATACENTER = {156.f, 312.f, 312.f, 312.f, -1.f, -1.f, 1410.f, 432.f}; +static const PerfData AMPERE_CONSUMER = {40.f, 80.f, 80.f, 160.f, -1.f, -1.f, 1860.f, 336.f}; +static const PerfData HOPPER = {378.f, 756.f, 756.f, 756.f, 1513.f, 1513.f, 1620.f, 456.f}; +static const PerfData ADA = {82.6f, 165.2f, 165.2f, 330.3f, 330.3f, 660.6f, 2520.f, 512.f}; + +typedef struct { + const char* name; + const PerfData* perf_data; + float new_cores; + float new_mhz; +} GPUEntry; + +// the overrides for each specific GPU +static GPUEntry gpu_db[] = { + {"Tesla V100-SXM2-16GB", &VOLTA, 640, 1530}, + {"Tesla V100-PCIE-32GB", &VOLTA, 640, 1530}, + {"NVIDIA A100-PCIE-40GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA A100-PCIE-80GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA A100-SXM4-40GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA A100-SXM4-80GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA RTX A2000", &ERE_CONSUMER, 104, 1200}, + {"NVIDIA RTX A4000", &ERE_CONSUMER, 192, 1560}, + {"NVIDIA RTX A4500", &ERE_CONSUMER, 224, 1650}, + {"NVIDIA RTX A5000", &ERE_CONSUMER, 256, 1695}, + {"NVIDIA RTX A5500", &ERE_CONSUMER, 320, 1770}, + {"NVIDIA RTX A6000", &ERE_CONSUMER, 336, 1800}, + {"NVIDIA GeForce RTX 3090 Ti", &ERE_CONSUMER, 336, 1860}, + {"NVIDIA GeForce RTX 3090", &ERE_CONSUMER, 328, 1695}, + {"NVIDIA GeForce RTX 3080 Ti", &ERE_CONSUMER, 320, 1665}, + {"NVIDIA GeForce RTX 3080", &ERE_CONSUMER, 272, 1710}, + {"NVIDIA GeForce RTX 3070 Ti", &ERE_CONSUMER, 192, 1770}, + {"NVIDIA GeForce RTX 3070", &ERE_CONSUMER, 184, 1725}, + {"NVIDIA GeForce RTX 3060 Ti", &ERE_CONSUMER, 152, 1665}, + {"NVIDIA GeForce RTX 3060", &ERE_CONSUMER, 112, 1777}, + {"NVIDIA RTX A2000 ADA", &ADA, 88, 2130}, + {"NVIDIA RTX A4000 ADA", &ADA, 192, 2175}, + {"NVIDIA RTX A4500 ADA", &ADA, 224, 2580}, + {"NVIDIA RTX A5000 ADA", &ADA, 400, 2550}, + {"NVIDIA RTX A5880 ADA", &ADA, 440, 2460}, + {"NVIDIA RTX A6000 ADA", &ADA, 568, 2505}, + {"NVIDIA GeForce RTX 4090", &ADA, 512, 2520}, + {"NVIDIA GeForce RTX 4080 SUPER", &ADA, 320, 2550}, + {"NVIDIA GeForce RTX 4080", &ADA, 304, 2505}, + {"NVIDIA GeForce RTX 4070 Ti SUPER", &ADA, 264, 2610}, + {"NVIDIA GeForce RTX 4070 Ti", &ADA, 240, 2610}, + {"NVIDIA GeForce RTX 4070 SUPER", &ADA, 224, 2475}, + {"NVIDIA GeForce RTX 4070", &ADA, 184, 2475}, + {"NVIDIA GeForce RTX 4070", &ADA, 184, 2475}, + {"NVIDIA GeForce RTX 4060 Ti", &ADA, 136, 2535}, + {"NVIDIA GeForce RTX 4060", &ADA, 96, 2460}, + {"NVIDIA H100 PCIe", &HOPPER, 456, 1620}, + {"NVIDIA H100 80GB HBM3", &HOPPER, 528, 1830}, // HBM3 = SXM5 +}; + +float get_flops_promised(const char* device, int precision_mode) { + /* + This function is used to estimate the Model Flops Utilization (MFU) + basically we have to figure out how many flops the GPU can do per second. + Note that this is not a simple endeavor and may well go wrong! The details are tricky. + The returned value is in units of 1e12. + + For the non-top models, actual performance numbers aren't that easy to find, e.g., + here https://www.techpowerup.com/gpu-specs/rtx-a4000.c3756, does "Theoretical Performance" + seems to be without tensor cores. + + So, instead we use that all these cards just use the same types of tensor cores in different + numbers and at different frequencies. Then we just need to look up these two easily accesible + numbers for all the other GPUs. + linear scaling seems to work: comparing spec sheet and calculation: + 4080: 304TCs, 2505 GHz; 97.5TFlops = 165.2/512*304 /2520 * 2505 + + Original numbers for the top GPUS are from. + https://resources.nvidia.com/en-us-tensor-core + https://images.nvidia.com/aem-dam/Solutions/geforce/ada/nvidia-ada-gpu-architecture.pdf + */ + + // validate the precision mode as one of the three possible values + if (!(precision_mode == MFUH_PRECISION_FP32 || precision_mode == MFUH_PRECISION_FP16 || precision_mode == MFUH_PRECISION_BF16)) { + fprintf(stderr, "Invalid precision mode: %d\n", precision_mode); + return -1.0f; + } + + // do a linear search until you find our GPU, then calculate the flops promised + int num_gpu_entries = sizeof(gpu_db) / sizeof(gpu_db[0]); + for (int i = 0; i < num_gpu_entries; i++) { + if (strcmp(gpu_db[i].name, device) == 0) { + const PerfData* perf_data = gpu_db[i].perf_data; + + // look up the default flops value for the given precision mode + float value = -1.0f; + if (precision_mode == MFUH_PRECISION_BF16) { value = perf_data->BF_16_32; } + if (precision_mode == MFUH_PRECISION_FP32) { value = perf_data->TF_32; } + if (precision_mode == MFUH_PRECISION_FP16) { value = perf_data->FP_16_32; } + + // we'd get here if we're e.g. trying to use BF16 on Volta GPU or something... + if (value < 0.0f) { + fprintf(stderr, "No data for GPU %s and precision mode %d\n", device, precision_mode); + return -1.0f; + } + + // adjust flops based on the specific core count and clock frequency of this GPU + float new_cores = gpu_db[i].new_cores; + float new_mhz = gpu_db[i].new_mhz; + float adjusted = value * (new_cores / perf_data->CORES) * (new_mhz / perf_data->CLOCK); + return adjusted; + } + } + + return -1.0f; // ¯\_(ツ)_/¯ +} + +struct GPUUtilInfo { + unsigned int clock; + unsigned int max_clock; + unsigned int power; + unsigned int power_limit; + unsigned int fan; + unsigned int temperature; + unsigned int temp_slowdown; + + float gpu_utilization; + float mem_utilization; + const char* throttle_reason; +}; + +// lazily initialize nvml and generate a handle to the GPU +#if USE_NVML +nvmlDevice_t nvml_get_device() { + static bool needs_init = true; + static nvmlDevice_t device; + if(needs_init) { + needs_init = false; + nvmlCheck(nvmlInit()); + nvmlCheck(nvmlDeviceGetHandleByIndex_v2(0, &device)); + } + return device; +} + +// convert throttle reason bitfield into a text reason. +// this is a lossy conversion; we just want to give some idea of what is happening +const char* get_throttle_reason(unsigned long long bits) { + if(bits & (nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown)) { + return "power cap"; + } else if (bits & (nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown)) { + return "thermal cap"; + } else if (bits & (nvmlClocksThrottleReasonAll)) { + return "other cap"; + } else { + return "no cap"; + } +} + +// gather data for a GPUUtilInfo object +GPUUtilInfo get_gpu_utilization_info() { + GPUUtilInfo info; + nvmlDevice_t device = nvml_get_device(); + // query different infos directly + nvmlCheck(nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &info.clock)); + nvmlCheck(nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &info.max_clock)); + nvmlCheck(nvmlDeviceGetPowerManagementLimit(device, &info.power_limit)); + nvmlCheck(nvmlDeviceGetPowerUsage(device, &info.power)); + nvmlCheck(nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info.temperature)); + nvmlCheck(nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &info.temp_slowdown)); + unsigned long long throttle; + nvmlCheck(nvmlDeviceGetCurrentClocksThrottleReasons(device, &throttle)); + info.throttle_reason = get_throttle_reason(throttle); + nvmlCheck(nvmlDeviceGetFanSpeed(device, &info.fan)); + + // for "utilization", we look at recorded samples. In principle, we could query the driver for how many samples + // to request, but then we'd need to dynamically allocate sufficient space. Let's just hard-code a limit of 128, + // and have no memory management required + constexpr const int BUFFER_LIMIT = 128; + nvmlSample_t buffer[BUFFER_LIMIT]; + nvmlValueType_t v_type; + unsigned int sample_count = BUFFER_LIMIT; + nvmlCheck(nvmlDeviceGetSamples(device, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer)); + float gpu_utilization = 0.f; + for(unsigned i = 0; i < sample_count; ++i) { + gpu_utilization += (float)buffer[i].sampleValue.uiVal; + } + gpu_utilization /= (float)sample_count; + + // sample count may have been modified by the query above; reset back to buffer size + sample_count = BUFFER_LIMIT; + nvmlCheck(nvmlDeviceGetSamples(device, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer)); + float mem_utilization = 0.f; + for(unsigned i = 0; i < sample_count; ++i) { + mem_utilization += (float)buffer[i].sampleValue.uiVal; + } + mem_utilization /= (float)sample_count; + + info.gpu_utilization = gpu_utilization; + info.mem_utilization = mem_utilization; + return info; +} +#else +GPUUtilInfo get_gpu_utilization_info() { + fprintf(stderr, "Error: Compiled without nvml support. Cannot perform additional GPU state tracking."); + exit(EXIT_FAILURE); +} +#endif +#endif // MFU_H diff --git a/CUDA/llmcpp/rand.h b/CUDA/llmcpp/rand.h new file mode 100644 index 0000000..b66aa04 --- /dev/null +++ b/CUDA/llmcpp/rand.h @@ -0,0 +1,240 @@ +/* +Mersenne Twisters implementation, numerically identical to torch. + +Example usage: + + mt19937_state state; + manual_seed(&state, 137); + printf("%u\n", randint32(&state)); + printf("%u\n", randint32(&state)); + printf("%u\n", randint32(&state)); + printf("%u\n", randint32(&state)); + printf("%u\n", randint32(&state)); + + float t8[8]; + normal_(t8, 8, 0, 1, &state); + for (int i = 0; i < 8; i++) { + printf("%f\n", t8[i]); + } + printf("%u\n", randint32(&state)); + + float t16[16]; + normal_(t16, 16, 0, 1, &state); + for (int i = 0; i < 16; i++) { + printf("%f\n", t16[i]); + } + printf("%u\n", randint32(&state)); + +PyTorch reference (producing identical results): + + import torch + torch.manual_seed(137) + print(torch.randint(0, 0xFFFFFFFF, [1]).item()) + print(torch.randint(0, 0xFFFFFFFF, [1]).item()) + print(torch.randint(0, 0xFFFFFFFF, [1]).item()) + print(torch.randint(0, 0xFFFFFFFF, [1]).item()) + print(torch.randint(0, 0xFFFFFFFF, [1]).item()) + t = torch.zeros(8); + t.normal_() + for i in range(len(t)) : + print(t[i].item()) + print(torch.randint(0, 0xFFFFFFFF, [1]).item()) + t = torch.zeros(16); + t.normal_() + for i in range(len(t)) : + print(t[i].item()) + print(torch.randint(0, 0xFFFFFFFF, [1]).item()) + +Both output: + + 4053805790 + 2173880614 + 380293709 + 1237255315 + 2986595568 + 0.7947664260864258 + 1.4369317293167114 + - 0.2292192131280899 + 0.47556325793266296 + - 0.6334410905838013 + - 0.5791953802108765 + - 0.0925704762339592 + - 0.8659197092056274 + 2186503452 + - 1.2813878059387207 + - 2.646395683288574 + - 0.06569503247737885 + 0.2180829495191574 + - 0.46536165475845337 + - 0.33108410239219666 + 2.5485482215881348 + 0.10425379872322083 + 0.8460659980773926 + 0.9462448358535767 + - 0.2913765013217926 + 0.34313806891441345 + - 1.1186704635620117 + - 0.18305328488349915 + - 2.3153159618377686 + 0.3961987793445587 + 2756748748 +*/ + +#ifndef RAND_H +#define RAND_H + +#include + +#define MERSENNE_STATE_M 397u +#define MERSENNE_STATE_N 624u + +#define LMASK 0x7ffffffful +#define UMASK 0x80000000ul + +// Copyright(c) Makoto Matsumoto and Takuji Nishimura + +// This implementation follows PyTorch so that we are numerically identical when running verification tests. + +typedef struct { + unsigned long long seed_; + int left_; + unsigned int next_; + unsigned int state_[MERSENNE_STATE_N]; + unsigned int MATRIX_A[2]; +} mt19937_state; + +void manual_seed(mt19937_state* state, unsigned int seed) { + state->MATRIX_A[0] = 0x0u; + state->MATRIX_A[1] = 0x9908b0df; + state->state_[0] = seed & 0xffffffff; + for (unsigned int j = 1; j < MERSENNE_STATE_N; j++) { + state->state_[j] = 1812433253 * (state->state_[j - 1] ^ (state->state_[j - 1] >> 30)) + j; + state->state_[j] &= 0xffffffff; + } + state->left_ = 1; + state->next_ = 0; +} + +void next_state(mt19937_state* state) { + state->left_ = MERSENNE_STATE_N; + state->next_ = 0; + unsigned int y, j; + for (j = 0; j < MERSENNE_STATE_N - MERSENNE_STATE_M; j++) { + y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK); + state->state_[j] = state->state_[j + MERSENNE_STATE_M] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; + } + for (; j < MERSENNE_STATE_N - 1; j++) { + y = (state->state_[j] & UMASK) | (state->state_[j + 1] & LMASK); + state->state_[j] = state->state_[j + (MERSENNE_STATE_M - MERSENNE_STATE_N)] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; + } + y = (state->state_[MERSENNE_STATE_N - 1] & UMASK) | (state->state_[0] & LMASK); + state->state_[MERSENNE_STATE_N - 1] = state->state_[MERSENNE_STATE_M - 1] ^ (y >> 1) ^ state->MATRIX_A[y & 0x1]; +} + +unsigned int randint32(mt19937_state* state) { + if (!state) return 0; + if (state->MATRIX_A[0] != 0 || state->MATRIX_A[1] != 0x9908b0df) manual_seed(state, 5489); // auto-initialize + if (--state->left_ <= 0) { + next_state(state); + } + unsigned int y = state->state_[state->next_++]; + y ^= y >> 11; + y ^= (y << 7) & 0x9d2c5680; + y ^= (y << 15) & 0xefc60000; + y ^= y >> 18; + return y; +} + +inline unsigned long long randint64(mt19937_state* state) { + return (((unsigned long long)(randint32(state)) << 32) | randint32(state)); +} + +inline float randfloat32(mt19937_state* state) { + return (randint32(state) & ((1ull << 24) - 1)) * (1.0f / (1ull << 24)); +} + +inline double randfloat64(mt19937_state* state) { + return (randint64(state) & ((1ull << 53) - 1)) * (1.0 / (1ull << 53)); +} + +void uniform_(float* data, unsigned int numel, float from, float to, mt19937_state* state) { + for (unsigned int t = 0; t < numel; t++) { + data[t] = randfloat32(state) * (to - from) + from; + } +} + +// Box-Muller transform: maps uniform random numbers to Gaussian distributed numbers +// https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform +void normal_fill_16(float* data, float mean, float std) { + #define EPSILONE 1e-12f + for (unsigned int t = 0; t < 8; t++) { + float u1 = 1 - data[t]; + float u2 = data[t + 8]; + float radius = sqrtf(-2 * logf(u1 + EPSILONE)); + float theta = (float) (2.0 * M_PI * u2); + data[t] = (radius * cosf(theta) * std + mean); + data[t + 8] = (radius * sinf(theta) * std + mean); + } +} + +void normal_fill(float* data, unsigned int numel, float mean, float std, mt19937_state* state) { + for (unsigned int t = 0; t < numel; t++) { + data[t] = randfloat32(state); + } + for (unsigned int i = 0; i < numel - 15; i += 16) { + normal_fill_16(data + i, mean, std); + } + if (numel % 16 != 0) { + // recompute the last 16 values + data = data + numel - 16; + for (unsigned int i = 0; i < 16; i++) { + data[i] = randfloat32(state); + } + normal_fill_16(data, mean, std); + } +} + +void normal_(float* data, unsigned int numel, float mean, float std, mt19937_state* state) { + #define EPSILONE 1e-12f + if (numel >= 16) { + normal_fill(data, numel, mean, std, state); + } + else { + double next_double_normal_sample = 0.0; // make compiler warning happy, won't be used + int has_next_double_normal_sample = 0; + for (unsigned int t = 0; t < numel; t++) { + if (has_next_double_normal_sample) { + data[t] = (float)(next_double_normal_sample * std + mean); + has_next_double_normal_sample = 0; + continue; + } + // for numel < 16 we draw a double (float64) + float u1 = (float) randfloat64(state); + float u2 = (float) randfloat64(state); + float radius = sqrtf(-2 * logf(1 - u2 + EPSILONE)); + float theta = (float) (2.0 * M_PI * u1); + next_double_normal_sample = radius * sinf(theta); + has_next_double_normal_sample = 1; + data[t] = (radius * cosf(theta) * std + mean); + } + } +} + +void init_identity_permutation(int *data, int numel) { + for (int i = 0; i < numel; i++) { + data[i] = i; + } +} + +void random_permutation(int* data, int numel, mt19937_state* state) { + for (int i = numel - 1; i > 0; i--) { + // pick an index j in [0, i] with equal probability + int j = randint32(state) % (i + 1); + // swap i <-> j + int tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } +} + +#endif \ No newline at end of file diff --git a/CUDA/llmcpp/utils.h b/CUDA/llmcpp/utils.h new file mode 100644 index 0000000..775534c --- /dev/null +++ b/CUDA/llmcpp/utils.h @@ -0,0 +1,223 @@ +/* + This file contains utilities shared between the different training scripts. + In particular, we define a series of macros xxxCheck that call the corresponding + C standard library function and check its return code. If an error was reported, + the program prints some debug information and exits. +*/ +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include +#include +// implementation of dirent for Windows is in dev/unistd.h +#ifndef _WIN32 +#include +#include +#endif + +// ---------------------------------------------------------------------------- +// fread convenience utils, with nice handling of error checking using macros +// simple replace fopen, fread, fclose, fseek +// with fopenCheck, freadCheck, fcloseCheck, fseekCheck + +extern inline FILE *fopen_check(const char *path, const char *mode, const char *file, int line) { + FILE *fp = fopen(path, mode); + if (fp == NULL) { + fprintf(stderr, "Error: Failed to open file '%s' at %s:%d\n", path, file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + fprintf(stderr, " Path: %s\n", path); + fprintf(stderr, " Mode: %s\n", mode); + fprintf(stderr, "---> HINT 1: dataset files/code have moved to dev/data recently (May 20, 2024). You may have to mv them from the legacy data/ dir to dev/data/(dataset), or re-run the data preprocessing script. Refer back to the main README\n"); + fprintf(stderr, "---> HINT 2: possibly try to re-run `python train_gpt2.py`\n"); + exit(EXIT_FAILURE); + } + return fp; +} + +#define fopenCheck(path, mode) fopen_check(path, mode, __FILE__, __LINE__) + +extern inline void fread_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char *file, int line) { + size_t result = fread(ptr, size, nmemb, stream); + if (result != nmemb) { + if (feof(stream)) { + fprintf(stderr, "Error: Unexpected end of file at %s:%d\n", file, line); + } else if (ferror(stream)) { + fprintf(stderr, "Error: File read error at %s:%d\n", file, line); + } else { + fprintf(stderr, "Error: Partial read at %s:%d. Expected %zu elements, read %zu\n", + file, line, nmemb, result); + } + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + fprintf(stderr, " Expected elements: %zu\n", nmemb); + fprintf(stderr, " Read elements: %zu\n", result); + exit(EXIT_FAILURE); + } +} + +#define freadCheck(ptr, size, nmemb, stream) fread_check(ptr, size, nmemb, stream, __FILE__, __LINE__) + +extern inline void fclose_check(FILE *fp, const char *file, int line) { + if (fclose(fp) != 0) { + fprintf(stderr, "Error: Failed to close file at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } +} + +#define fcloseCheck(fp) fclose_check(fp, __FILE__, __LINE__) + +extern inline void sclose_check(int sockfd, const char *file, int line) { + if (close(sockfd) != 0) { + fprintf(stderr, "Error: Failed to close socket at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } +} + +#define scloseCheck(sockfd) sclose_check(sockfd, __FILE__, __LINE__) + +#ifdef _WIN32 +extern inline void closesocket_check(int sockfd, const char *file, int line) { + if (closesocket(sockfd) != 0) { + fprintf(stderr, "Error: Failed to close socket at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } +} + +#define closesocketCheck(sockfd) closesocket_check(sockfd, __FILE__, __LINE__) +#endif + +extern inline void fseek_check(FILE *fp, long off, int whence, const char *file, int line) { + if (fseek(fp, off, whence) != 0) { + fprintf(stderr, "Error: Failed to seek in file at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " Offset: %ld\n", off); + fprintf(stderr, " Whence: %d\n", whence); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } +} + +#define fseekCheck(fp, off, whence) fseek_check(fp, off, whence, __FILE__, __LINE__) + +extern inline void fwrite_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char *file, int line) { + size_t result = fwrite(ptr, size, nmemb, stream); + if (result != nmemb) { + if (feof(stream)) { + fprintf(stderr, "Error: Unexpected end of file at %s:%d\n", file, line); + } else if (ferror(stream)) { + fprintf(stderr, "Error: File write error at %s:%d\n", file, line); + } else { + fprintf(stderr, "Error: Partial write at %s:%d. Expected %zu elements, wrote %zu\n", + file, line, nmemb, result); + } + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + fprintf(stderr, " Expected elements: %zu\n", nmemb); + fprintf(stderr, " Written elements: %zu\n", result); + exit(EXIT_FAILURE); + } +} + +#define fwriteCheck(ptr, size, nmemb, stream) fwrite_check(ptr, size, nmemb, stream, __FILE__, __LINE__) + +// ---------------------------------------------------------------------------- +// malloc error-handling wrapper util + +extern inline void *malloc_check(size_t size, const char *file, int line) { + void *ptr = malloc(size); + if (ptr == NULL) { + fprintf(stderr, "Error: Memory allocation failed at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + fprintf(stderr, " Size: %zu bytes\n", size); + exit(EXIT_FAILURE); + } + return ptr; +} + +#define mallocCheck(size) malloc_check(size, __FILE__, __LINE__) + + +// ---------------------------------------------------------------------------- +// check that all tokens are within range +extern inline void token_check(const int* tokens, int token_count, int vocab_size, const char *file, int line) { + for(int i = 0; i < token_count; i++) { + if(!(0 <= tokens[i] && tokens[i] < vocab_size)) { + fprintf(stderr, "Error: Token out of vocabulary at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + fprintf(stderr, " Token: %d\n", tokens[i]); + fprintf(stderr, " Position: %d\n", i); + fprintf(stderr, " Vocab: %d\n", vocab_size); + exit(EXIT_FAILURE); + } + } +} +#define tokenCheck(tokens, count, vocab) token_check(tokens, count, vocab, __FILE__, __LINE__) + +// ---------------------------------------------------------------------------- +// I/O ops + +extern inline void create_dir_if_not_exists(const char *dir) { + if (dir == NULL) { return; } + struct stat st = {0}; + if (stat(dir, &st) == -1) { + if (mkdir(dir, 0700) == -1) { + printf("ERROR: could not create directory: %s\n", dir); + exit(EXIT_FAILURE); + } + printf("created directory: %s\n", dir); + } +} + +extern inline int find_max_step(const char* output_log_dir) { + // find the DONE file in the log dir with highest step count + if (output_log_dir == NULL) { return -1; } + DIR* dir; + struct dirent* entry; + int max_step = -1; + dir = opendir(output_log_dir); + if (dir == NULL) { return -1; } + while ((entry = readdir(dir)) != NULL) { + if (strncmp(entry->d_name, "DONE_", 5) == 0) { + int step = atoi(entry->d_name + 5); + if (step > max_step) { + max_step = step; + } + } + } + closedir(dir); + return max_step; +} + +extern inline int ends_with_bin(const char* str) { + // checks if str ends with ".bin". could be generalized in the future. + if (str == NULL) { return 0; } + size_t len = strlen(str); + const char* suffix = ".bin"; + size_t suffix_len = strlen(suffix); + if (len < suffix_len) { return 0; } + int suffix_matches = strncmp(str + len - suffix_len, suffix, suffix_len) == 0; + return suffix_matches; +} + +#endif \ No newline at end of file diff --git a/CUDA/llmcpp/zero.cuh b/CUDA/llmcpp/zero.cuh new file mode 100644 index 0000000..e6c5b6e --- /dev/null +++ b/CUDA/llmcpp/zero.cuh @@ -0,0 +1,597 @@ +/* +Utilities for ZeRO sharding +*/ + +#ifndef LLMC_ZERO_CUH +#define LLMC_ZERO_CUH + +#include +#include +#include +#include +#include + +#ifdef MULTI_GPU +#include +#ifdef USE_MPI +#include +#endif +#endif + +// defines: fcloseCheck, fwriteCheck, scloseCheck, sclosesocketCheck +#include "utils.h" + +// ---------------------------------------------------------------------------- +// Multi-GPU related +#ifdef MULTI_GPU + +#if defined(ENABLE_FP32) +const ncclDataType_t ncclFloatX = ncclFloat; +#elif defined(ENABLE_FP16) +const ncclDataType_t ncclFloatX = ncclHalf; +#else // Default to bfloat16 +const ncclDataType_t ncclFloatX = ncclBfloat16; +#endif + +void nccl_check(ncclResult_t status, const char *file, int line) { + if (status != ncclSuccess) { + printf("[NCCL ERROR] at file %s:%d:\n%s\n", file, line, ncclGetErrorString(status)); + exit(EXIT_FAILURE); + } +} +#define ncclCheck(err) (nccl_check(err, __FILE__, __LINE__)) + +#ifdef USE_MPI +void mpi_check(int status, const char *file, int line) { + if (status != MPI_SUCCESS) { + char mpi_error[4096]; + int mpi_error_len = 0; + assert(MPI_Error_string(status, &mpi_error[0], &mpi_error_len) == MPI_SUCCESS); + printf("[MPI ERROR] at file %s:%d:\n%.*s\n", file, line, mpi_error_len, mpi_error); + exit(EXIT_FAILURE); + } +} +#define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__)) +#endif + +#endif // MULTI_GPU + +// ---------------------------------------------------------------------------- +// Parameters specific to training on multiple GPUs. +typedef struct { + int process_rank; // Rank of this process among all processes. 0 if no multi-GPU. + int num_processes; // Total number of processes. 1 if no multi-GPU. + int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU. + + // Zero Redundancy Optimizer stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html + // 0-Disabled + // 1-Optimizer State Sharding (OSS) + // 2-Optimizer + Gradient State Sharding (SDP) + // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP) + int zero_stage; + size_t shard_num_parameters; +#ifdef MULTI_GPU + ncclComm_t nccl_comm; // NCCL communication primitive, used for collective multi-GPU work. + cudaStream_t nccl_stream; // CUDA Stream to perform NCCL operations. + cudaEvent_t compute_nccl_sync; // Event used to synchronize NCCL with the compute + float* unified_buffer; +#endif +} MultiGpuConfig; + +// one global variable to hold the multi-GPU configuration for this process +// inline, so we can include this header multiple times without getting multiple definitions +inline MultiGpuConfig multi_gpu_config; + +#ifdef MULTI_GPU + +#ifdef _WIN32 +void send_nccl_id_to_clients_windows(ncclUniqueId *nccl_id, SOCKET client_sockets[], int num_clients) { + for (int i = 0; i < num_clients; ++i) { + if (send(client_sockets[i], (const char *)nccl_id, sizeof(*nccl_id), 0) == SOCKET_ERROR) { + printf("Failed to send nccl_id"); + WSACleanup(); + exit(EXIT_FAILURE); + } + closesocketCheck(client_sockets[i]); + } +} +#else +void send_nccl_id_to_clients(ncclUniqueId *nccl_id, int client_sockets[], int num_clients) { + for (int i = 0; i < num_clients; ++i) { + if (send(client_sockets[i], nccl_id, sizeof(*nccl_id), 0) == -1) { + printf("Failed to send nccl_id"); + exit(EXIT_FAILURE); + } + scloseCheck(client_sockets[i]); + } +} +#endif + +#ifdef _WIN32 +// Same as get_nccl_id_via_tcp but for Windows +ncclUniqueId get_nccl_id_via_tcp_windows(MultiGpuConfig* result, const char* server_ip) { + ncclUniqueId nccl_id; + + int SERVER_PORT = 12345; // hardcoded an arbitrary port number between 1024 and 49151 (registered ports) + WSADATA wsaData; + if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { + printf("WSAStartup failed"); + exit(EXIT_FAILURE); + } + + if (result->process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + + int MAX_CLIENTS = result->num_processes - 1; + SOCKET client_sockets[MAX_CLIENTS]; + int num_clients = 0; + SOCKET server_socket, new_socket; + struct sockaddr_in address; + int addrlen = sizeof(address); + + // Step 1) create a server TCP socket + if ((server_socket = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) { + printf("Socket failed"); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 2) set the server address and port + address.sin_family = AF_INET; // IPv4 + address.sin_addr.s_addr = inet_addr(server_ip); + address.sin_port = htons(SERVER_PORT); + + // Step 3) bind the socket to the address and port + if (bind(server_socket, (struct sockaddr *)&address, sizeof(address)) == SOCKET_ERROR) { + printf("Bind failed"); + closesocketCheck(server_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 4) MAX_CLIENTS specifies the maximum number of clients that can be queued for this server + if (listen(server_socket, MAX_CLIENTS) == SOCKET_ERROR) { + printf("Listen failed"); + closesocketCheck(server_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 5) accept connections from clients + printf("Waiting for clients to connect...\n"); + while (num_clients < MAX_CLIENTS) { + if ((new_socket = accept(server_socket, (struct sockaddr *)&address, &addrlen)) == INVALID_SOCKET) { + printf("Accept failed"); + closesocketCheck(server_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + client_sockets[num_clients++] = new_socket; + printf("Client %d connected\n", num_clients); + } + + // Step 6) send the NCCL ID to all clients + send_nccl_id_to_clients_windows(&nccl_id, client_sockets, num_clients); + printf("NCCL ID sent to all clients\n"); + + closesocketCheck(server_socket); + } else { + int num_connection_attempts = 5; + int time_to_sleep = 2; + SOCKET client_socket; + struct sockaddr_in serv_addr; + + // Step 1) create a client TCP socket + if ((client_socket = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) { + printf("Socket creation error"); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 2) set the server address and port + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(SERVER_PORT); + if (inet_pton(AF_INET, server_ip, &serv_addr.sin_addr) <= 0) { + printf("Invalid address or address not supported"); + closesocketCheck(client_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 3) Try to connect to the server - retry up to `num_connection_attempts` times if the connection fails + while (connect(client_socket, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) == SOCKET_ERROR) { + printf("%d Connection failed, retrying in %d seconds\n", result->process_rank, time_to_sleep); + if (--num_connection_attempts == 0) { + printf("Failed to connect to the server\n"); + closesocketCheck(client_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + Sleep(time_to_sleep * 1000); + } + + // Step 4) receive the NCCL ID from the server + if (recv(client_socket, (char *)&nccl_id, sizeof(nccl_id), 0) <= 0) { + printf("Failed to receive nccl_id"); + closesocketCheck(client_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + printf("Received NCCL ID\n"); + closesocketCheck(client_socket); + } + + WSACleanup(); + return nccl_id; +} +#else +ncclUniqueId get_nccl_id_via_tcp(MultiGpuConfig* result, const char* server_ip) { + ncclUniqueId nccl_id; + + int SERVER_PORT = 12345; // hardcoded an arbitrary port number between 1024 and 49151 (registered ports) + if (result->process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + + int MAX_CLIENTS = result->num_processes - 1; + int client_sockets[MAX_CLIENTS]; + int num_clients = 0; + int server_socket, new_socket; + struct sockaddr_in address; + int addrlen = sizeof(address); + int opt = 1; + + // Step 1) create a server TCP socket + if ((server_socket = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + printf("Socket failed"); + exit(EXIT_FAILURE); + } + + // Step 2) set socket options + // SOL_SOCKET - means that option is configured at socket level + // SO_REUSEADDR - allows to bind to an address which is in a TIME_WAIT state (already used by another socket) - useful when restarting the server + // SO_REUSEPORT - allows to bind to the same port multiple times + if (setsockopt(server_socket, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)) < 0) { + printf("Setsockopt failed"); + exit(EXIT_FAILURE); + } + + // Step 3) set the server address and port + address.sin_family = AF_INET; // IPv4 + address.sin_addr.s_addr = inet_addr(server_ip); // alternatively use INADDR_ANY to bind to all interfaces, currently we only allow ethernet + address.sin_port = htons(SERVER_PORT); + + // Step 4) bind the socket to the address and port + if (bind(server_socket, (struct sockaddr *)&address, sizeof(address)) < 0) { + printf("Bind failed"); + exit(EXIT_FAILURE); + } + + // Step 5) MAX_CLIENTS specifies the maximum number of clients that can be queued for this server + if (listen(server_socket, MAX_CLIENTS) < 0) { + printf("Listen failed"); + exit(EXIT_FAILURE); + } + + // Step 6) accept connections from clients + printf("Waiting for clients to connect...\n"); + while (num_clients < MAX_CLIENTS) { + if ((new_socket = accept(server_socket, (struct sockaddr *)&address, (socklen_t*)&addrlen)) < 0) { + printf("Accept failed"); + exit(EXIT_FAILURE); + } + client_sockets[num_clients++] = new_socket; + printf("Client %d connected\n", num_clients); + } + + // Step 7) send the NCCL ID to all clients + send_nccl_id_to_clients(&nccl_id, client_sockets, num_clients); + printf("NCCL ID sent to all clients\n"); + + scloseCheck(server_socket); + } else { + int num_connection_attempts = 5; + int time_to_sleep = 2; + int client_socket; + struct sockaddr_in serv_addr; + + // Step 1) create a client TCP socket + if ((client_socket = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + printf("Socket creation error"); + exit(EXIT_FAILURE); + } + + // Step 2) set the server address and port + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(SERVER_PORT); + if (inet_pton(AF_INET, server_ip, &serv_addr.sin_addr) <= 0) { + printf("Invalid address or address not supported"); + exit(EXIT_FAILURE); + } + + // Step 3) Try to connect to the server - retry up to `num_connection_attempts` times if the connection fails + while (connect(client_socket, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + printf("%d Connection failed, retrying in %d seconds\n", result->process_rank, time_to_sleep); + if (--num_connection_attempts == 0) { + printf("Failed to connect to the server\n"); + exit(EXIT_FAILURE); + } + sleep(time_to_sleep); + } + + // Step 4) receive the NCCL ID from the server + if (recv(client_socket, &nccl_id, sizeof(nccl_id), 0) <= 0) { + printf("Failed to receive nccl_id"); + exit(EXIT_FAILURE); + } + + printf("Received NCCL ID\n"); + scloseCheck(client_socket); + } + + return nccl_id; +} +#endif + +ncclUniqueId get_nccl_id_via_fs(MultiGpuConfig* result, char* fs_path) { + // Works assuming that the filesystem is shared among all processes + ncclUniqueId nccl_id; + FILE* idFile; + static char filename[1024]; + snprintf(filename, sizeof(filename), "%s/ncclUniqueId.sync", fs_path); + + if (result->process_rank != 0) { // client processse should wait for the server to write to the file + // This is a naive and not 100% robust way to synchronize the processes but it should work almost always + sleep(2); + } + + if (result->process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + idFile = fopen(filename, "wb"); + assert(idFile != NULL); + fwriteCheck(&nccl_id, sizeof(nccl_id), 1, idFile); + fcloseCheck(idFile); + } else { + // Other ranks wait until the file is available and read the unique ID + do { + sleep(1); // 1 second + idFile = fopen(filename, "rb"); + if (idFile != NULL) break; + } while (idFile == NULL); + freadCheck(&nccl_id, sizeof(nccl_id), 1, idFile); + fcloseCheck(idFile); + } + + return nccl_id; +} + +#ifdef USE_MPI +// Determine which GPU this process should use. +// Processes on the same machines use different GPU indicies. Processes on other machines don't. +// Copied from NCCL examples: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-2-one-device-per-process-or-thread +int multi_gpu_get_local_device_idx(int process_rank, int num_processes) { + char hostname[1024]; + hostname[1023] = '\0'; + // All processes on the same machine will share the same hostname. + gethostname(hostname, 1023); + for (int i=0; i < 1024; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + break; + } + } + uint64_t hostname_hash = 5381u; + for (int c = 0; hostname[c] != '\0'; c++){ hostname_hash = ((hostname_hash << 5u) + hostname_hash) ^ hostname[c]; } + + // Distribute all hostname hashes to all processes. + uint64_t* all_hostsname_hashes = (uint64_t*)malloc(num_processes * sizeof(uint64_t)); + all_hostsname_hashes[process_rank] = hostname_hash; + mpiCheck(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, all_hostsname_hashes, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); + + // Identify which GPU we need to use. + int local_device_idx = 0; + for (int current_process = 0; current_process < num_processes; ++current_process) { + if (current_process == process_rank) { + // Found my gpu, local_device_idx now has my target GPU index. + break; + } + if (all_hostsname_hashes[current_process] == all_hostsname_hashes[process_rank]) { + // This process ID runs on the same machine, but it's not me, skip this GPU + local_device_idx++; + } + } + + free(all_hostsname_hashes); + return local_device_idx; +} +#endif + +#endif + +MultiGpuConfig multi_gpu_config_init(int num_processes, int process_rank, int gpus_per_node, char* server_ip, char* fs_path, char* init_method) { +#ifdef MULTI_GPU + MultiGpuConfig result; + ncclUniqueId nccl_id; + // Get nccl_id using MPI, TCP, or FS (file system synchronization) methods + // On newer slurm versions (slurm-wlm package) PMIx is disabled so we can not use MPI for NCCL init in multi node setup + if (strcmp(init_method, "mpi") == 0) { + #ifdef USE_MPI + mpiCheck(MPI_Init(NULL, NULL)); + mpiCheck(MPI_Comm_rank(MPI_COMM_WORLD, &result.process_rank)); + mpiCheck(MPI_Comm_size(MPI_COMM_WORLD, &result.num_processes)); + result.local_device_idx = multi_gpu_get_local_device_idx(result.process_rank, result.num_processes); + if (result.process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + } + mpiCheck(MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD)); + #else + printf("MPI support is disabled. Please enable MPI support to use MPI-based NCCL-init method.\n"); + exit(EXIT_FAILURE); + #endif + } else { + result.process_rank = process_rank; + result.num_processes = num_processes; + result.local_device_idx = process_rank % gpus_per_node; + if (strcmp(init_method, "tcp") == 0) { + #ifdef _WIN32 + nccl_id = get_nccl_id_via_tcp_windows(&result, server_ip); + #else + nccl_id = get_nccl_id_via_tcp(&result, server_ip); + #endif + } else if (strcmp(init_method, "fs") == 0) { + nccl_id = get_nccl_id_via_fs(&result, fs_path); + } else { + printf("Invalid NCCL-init method\n"); + exit(EXIT_FAILURE); + } + } + cudaCheck(cudaSetDevice(result.local_device_idx)); + ncclCheck(ncclCommInitRank(&result.nccl_comm, result.num_processes, nccl_id, result.process_rank)); + cudaCheck(cudaStreamCreate(&result.nccl_stream)); + // event without timing for maximum performance + cudaCheck(cudaEventCreate(&result.compute_nccl_sync, cudaEventDisableTiming)); + nvtxNameCudaStreamA(result.nccl_stream, "nccl stream"); + nvtxNameCudaEventA(result.compute_nccl_sync, "nccl compute sync"); + cudaCheck(cudaMallocManaged(&result.unified_buffer, sizeof(float))); + return result; +#else + printf("Multi-GPU support is disabled. Using a single GPU.\n"); + cudaCheck(cudaSetDevice(0)); + MultiGpuConfig result; + result.process_rank = 0; + result.num_processes = 1; + result.local_device_idx = 0; + return result; +#endif +} + +void multi_gpu_config_free(MultiGpuConfig* config) { +#ifdef MULTI_GPU + ncclCheck(ncclCommDestroy(config->nccl_comm)); + cudaCheck(cudaStreamDestroy(config->nccl_stream)); + cudaCheck(cudaEventDestroy(config->compute_nccl_sync)); + cudaCheck(cudaFree(config->unified_buffer)); + #ifdef USE_MPI + mpiCheck(MPI_Finalize()); + #endif +#endif +} + +void multi_gpu_barrier(const MultiGpuConfig* config) { +#ifdef MULTI_GPU + if (config->num_processes > 1) { + ncclCheck(ncclAllReduce(config->unified_buffer, config->unified_buffer, sizeof(float), ncclFloat, ncclSum, config->nccl_comm, config->nccl_stream)); + } + cudaCheck(cudaDeviceSynchronize()); +#endif +} + +// Offset and size of a tensor shard +typedef struct { + ptrdiff_t offset; + size_t size; +} ShardInfo; + +// Get info about sharding for a tensor of elements many numbers +ShardInfo multi_gpu_get_shard_offset(size_t elements, const MultiGpuConfig* config, int shard_at_stage) { + const int nproc = config->num_processes; + if(config->zero_stage >= shard_at_stage) { + if (elements % nproc != 0) { + fprintf(stderr, "Number of elements %zu must be a multiple of the number of processes %d\n", elements, nproc); + exit(EXIT_FAILURE); + } + return {(ptrdiff_t) (config->process_rank * (elements / nproc)), elements / nproc}; + } else { + return {0, elements}; + } +} + +// Block NCCL stream until computations on compute_stream are done, then aggregate multiple pointers in an NCCL group. +// This can work either as an all-reduce (i.e., no ZeRo), or a reduce-scatter (ZeRO 1). +// The awkward `(&pointers)[N]` syntax ensures we are capturing the parameters as sized arrays, so that it becomes impossible +// to call this function if pointers and pointers_sizes do not match. +template +void multi_gpu_async_reduce_gradient( + floatX* const (&pointers)[N], const size_t (&pointers_sizes)[N], + MultiGpuConfig* config, cudaStream_t compute_stream) { + if (config->num_processes == 1) { + return; // no multi-GPU, just exit. + } + +#ifdef MULTI_GPU + NVTX_RANGE_FN(); + // mark an event on the compute stream, and immediately wait on this in the nccl stream + // this means that the nccl stream won't start executing before all compute kernels that + // have been submitted before this point have finished. + // by using an event instead of cudaSyncStream, we avoid having to synchronize the host, and + // can enqueue new work to the GPU right away. + cudaCheck(cudaEventRecord(config->compute_nccl_sync, compute_stream)); + cudaCheck(cudaStreamWaitEvent(config->nccl_stream, config->compute_nccl_sync)); + ncclCheck(ncclGroupStart()); // NCCL group: aggregate all pointers in a single NCCL GPU kernel. + for (int i = 0; i < N; ++i) { + if(config->zero_stage == 0) { + ncclCheck(ncclAllReduce( + pointers[i], pointers[i], + pointers_sizes[i], + ncclFloatX, ncclAvg, + config->nccl_comm, config->nccl_stream + )); + } else if(config->zero_stage == 1) { + assert(pointers_sizes[i] % config->num_processes == 0); + size_t shard_size = pointers_sizes[i] / config->num_processes; + ptrdiff_t shard_offset = (ptrdiff_t)shard_size * config->process_rank; + ncclCheck(ncclReduceScatter( + pointers[i], pointers[i] + shard_offset, + shard_size, + ncclFloatX, ncclAvg, + config->nccl_comm, config->nccl_stream + )); + } + } + ncclCheck(ncclGroupEnd()); +#endif +} + +// convenience macro that only prints if the rank of process is zero +#define printf0(...) if (::multi_gpu_config.process_rank == 0) { printf(__VA_ARGS__); } + +void set_zero_configs(MultiGpuConfig* config, int zero_stage, size_t total_parameters) { + config->zero_stage = 0; + config->shard_num_parameters = total_parameters; + // Check the Zero Stage and define sharding parameters + if (zero_stage == 0) { + printf0("| Zero Optimization is disabled |\n"); + } + else if (zero_stage == 1) { + if (total_parameters % config->num_processes != 0) { + printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); + config->zero_stage = 0; + } + else { + config->zero_stage = 1; + config->shard_num_parameters = total_parameters / config->num_processes; + } + } + else{ + printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + config->zero_stage = 0; + } +} + +// Compute sum of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. +float multi_gpu_cpu_float_sum(float value, MultiGpuConfig* config) { +#ifdef MULTI_GPU + if (config->num_processes == 1) return value; + + float* unified_buffer = config->unified_buffer; + *unified_buffer = value; + ncclCheck(ncclAllReduce(unified_buffer, unified_buffer, sizeof(float), ncclFloat, ncclSum, config->nccl_comm, config->nccl_stream)); + cudaCheck(cudaDeviceSynchronize()); + return *unified_buffer; +#else + return value; +#endif +} + +#endif + diff --git a/README.md b/README.md index 56f99cc..b441cf4 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,12 @@ -# Quadtrix.cpp +# Quadtrix.cpp (llm.cpp) -

+

image -

+


+ + + [![Release](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/release.yml/badge.svg)](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/release.yml) [![Package](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/docker-publish.yml) + [![CI](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/ci.yml/badge.svg)](https://github.com/Eamon2009/Quadtrix.cpp/actions/workflows/ci.yml) A local large language model with a modular, multi-path execution architecture. Train, run inference, and serve a chat interface — all from a single repository, across bare-metal C++, PyTorch, and a React frontend. diff --git a/benchmark/results/python_benchmark.csv b/benchmark/results/python_benchmark.csv new file mode 100644 index 0000000..c264086 --- /dev/null +++ b/benchmark/results/python_benchmark.csv @@ -0,0 +1,13 @@ +suite,name,backend,batch_size,sequence_length,tokens,avg_ms,median_ms,min_ms,max_ms,p90_ms,p95_ms,std_ms,tokens_per_sec,samples,loss,memory_mb,notes +data,tokenizer_encode,python,0,0,220975,169.76018999121152,164.62069997214712,124.44350001169369,211.44290000665933,204.09656001720577,207.76973001193255,29.81756930091779,1301689.165236207,10,,188.39453125, +data,batch_sample_to_device,python,4,32,128,0.34600000944919884,0.2575500402599573,0.2452000044286251,0.8668999653309584,0.48601999878883345,0.6764599820598955,0.1852238791693057,369942.18642873614,10,,189.2734375, +primitive,matmul_3d_1x16,python,1,16,16,0.028490001568570733,0.026749970857053995,0.024800014216452837,0.04350004019215703,0.03351001651026308,0.038505028351210044,0.005415998067507546,561600.5306805843,10,,181.234375, +primitive,matmul_3d_4x32,python,4,32,128,0.047069991705939174,0.043849984649568796,0.03890000516548753,0.07130001904442906,0.05185997579246759,0.06157999741844831,0.008578937902311791,2719354.632557738,10,,181.296875, +primitive,attention_scores_4x32,python,4,32,128,0.11958999675698578,0.10689999908208847,0.10410003596916795,0.20840001525357366,0.12946999049745497,0.16893500287551425,0.030181239200475163,1070323.6346774376,10,,181.93359375, +forward,batch1_seq8,python,1,8,8,16.073119995417073,15.318600024329498,14.594200009014457,20.715299993753433,17.489159997785464,19.102229995769445,1.798887105385644,497.7253950870173,10,10.797359466552734,166.43359375, +forward,batch1_seq32,python,1,32,32,21.528740011854097,21.653899981174618,20.405600022058934,22.147400013636798,22.095200035255402,22.1213000244461,0.548371285312407,1486.3851754622074,10,10.882255554199219,190.01171875, +forward,batch4_seq32,python,4,32,128,44.681840017437935,45.51370002445765,37.46199997840449,54.08870003884658,48.26489001279697,51.17679502582177,4.5654932173684655,2864.6984983171146,10,10.885703086853027,253.171875, +training,adamw_step_b4_s32,python,4,32,128,229.80256001465023,207.2436999878846,200.93890000134706,321.9230000395328,279.9100400414318,300.9165200404823,46.404669570312535,556.9998871720134,5,10.602718353271484,392.30078125, +generation,empty,python,1,1,32,563.3423800056335,548.9804000244476,466.00820001913235,704.8150000046007,643.7829400005285,674.2989700025645,72.57013670803387,56.80382150492566,10,,218.44140625, +generation,short,python,1,6,32,524.1239399998449,524.1038500098512,493.7280000303872,561.7482999805361,549.8817999905441,555.8150499855401,20.612269243289685,61.054261326070076,10,,218.47265625, +generation,long,python,1,32,32,561.3779200008139,560.0390000035986,545.9933000383899,574.2078999755904,570.0918399612419,572.1498699684162,7.699534842668483,57.00259817834233,10,,218.14453125, diff --git a/init.py b/init.py new file mode 100644 index 0000000..e870447 --- /dev/null +++ b/init.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import os +import signal +import subprocess +import sys +import time +import webbrowser +from pathlib import Path + + +ROOT = Path(__file__).resolve().parent +BACKEND = ROOT / "backend" +FRONTEND = ROOT / "frontend" +DEFAULT_CHECKPOINT = ROOT / "engine" / "best_model.pt" + + +def npm_command() -> str: + return "npm.cmd" if os.name == "nt" else "npm" + + +def python_command() -> str: + venv_python = ROOT / ".venv" / ("Scripts/python.exe" if os.name == "nt" else "bin/python") + return str(venv_python) if venv_python.exists() else sys.executable + + +def start_process(name: str, command: list[str], cwd: Path, env: dict[str, str]) -> subprocess.Popen: + print(f"[start] {name}: {' '.join(command)}") + return subprocess.Popen(command, cwd=str(cwd), env=env) + + +def stop_process(process: subprocess.Popen) -> None: + if process.poll() is not None: + return + if os.name == "nt": + process.terminate() + else: + process.send_signal(signal.SIGTERM) + try: + process.wait(timeout=8) + except subprocess.TimeoutExpired: + process.kill() + + +def main() -> int: + api_port = os.environ.get("API_PORT", "3001") + frontend_port = os.environ.get("FRONTEND_PORT", "5173") + checkpoint = Path(os.environ.get("TORCH_CHECKPOINT_PATH", str(DEFAULT_CHECKPOINT))).resolve() + + if not checkpoint.exists(): + print(f"[error] .pt checkpoint not found: {checkpoint}") + print(" Set TORCH_CHECKPOINT_PATH to your best_model.pt file.") + return 1 + + backend_env = os.environ.copy() + backend_env.update( + { + "API_PORT": api_port, + "CORS_ORIGINS": f"http://localhost:{frontend_port},http://127.0.0.1:{frontend_port}", + "TORCH_CHECKPOINT_PATH": str(checkpoint), + } + ) + + frontend_env = os.environ.copy() + frontend_env.update( + { + "VITE_API_BASE_URL": f"http://localhost:{api_port}", + "VITE_TORCH_ONLY": "1", + } + ) + + backend = start_process( + "backend (.pt)", + [python_command(), "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", api_port, "--reload"], + BACKEND, + backend_env, + ) + frontend = start_process( + "frontend", + [npm_command(), "run", "dev", "--", "--port", frontend_port], + FRONTEND, + frontend_env, + ) + + url = f"http://localhost:{frontend_port}" + print(f"[ready] frontend: {url}") + print(f"[ready] backend : http://localhost:{api_port}") + print("[mode] PyTorch .pt only") + print("[stop] Press Ctrl+C to stop both servers.") + + if os.environ.get("NO_BROWSER") != "1": + time.sleep(2) + webbrowser.open(url) + + try: + while True: + if backend.poll() is not None: + print(f"[exit] backend stopped with code {backend.returncode}") + return backend.returncode or 1 + if frontend.poll() is not None: + print(f"[exit] frontend stopped with code {frontend.returncode}") + return frontend.returncode or 1 + time.sleep(1) + except KeyboardInterrupt: + print("\n[stop] stopping servers...") + return 0 + finally: + stop_process(frontend) + stop_process(backend) + + +if __name__ == "__main__": + raise SystemExit(main())