diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..c8702b8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,370 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,pycharm,git,linux +# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos,pycharm,git,linux + +### Git ### +# Created by git for backups. To disable backups in Git: +# $ git config --global mergetool.keepBackup false +*.orig + +# Created by git when using merge tools for conflicts +*.BACKUP.* +*.BASE.* +*.LOCAL.* +*.REMOTE.* +*_BACKUP_*.txt +*_BASE_*.txt +*_LOCAL_*.txt +*_REMOTE_*.txt + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# Support for Project snippet scope
+.vscode/*.code-snippets
+
+# Ignore code-workspaces
+*.code-workspace
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,pycharm,git,linux
+
+.dockerignore
+Dockerfile
diff --git a/.github/workflows/build-container.yaml b/.github/workflows/build-container.yaml
new file mode 100644
index 0000000..c8702b8
--- /dev/null
+++ b/.github/workflows/build-container.yaml
@@ -0,0 +1,43 @@
+# A GitHub Actions workflow that builds a container image for the project.
+
+name: Build Container
+
+on:
+  push:
+    # This is the entire list of files that will trigger the workflow.
+    paths:
+      - Dockerfile
+      - pyproject.toml
+      - requirements-gpu.txt
+      - .github/workflows/build-container.yaml
+      - .github/workflows/compute-tag.yaml
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+jobs:
+  compute_tag:
+    uses: ./.github/workflows/compute-tag.yaml
+    # secrets: inherit is needed so the reusable workflow can read the Docker Hub secrets.
+    secrets: inherit
+
+  docker:
+    runs-on: ubuntu-latest
+    needs: compute_tag
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          push: true
+          # This is the name of the image that will be pushed to Docker Hub. If the branch is main, the image will be tagged as latest. Else, it will be tagged as the branch name.
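+          # cache-from pulls layer-cache metadata from the previously pushed
+          # image so unchanged layers are reused; cache-to: type=inline embeds
+          # that cache metadata in the pushed image itself.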
+          tags: ${{ secrets.DOCKERHUB_USERNAME }}/python_ml_project_template:${{ needs.compute_tag.outputs.image_tag }}
+          cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/python_ml_project_template:${{ needs.compute_tag.outputs.image_tag }}
+          cache-to: type=inline
diff --git a/.github/workflows/build-site.yaml b/.github/workflows/build-site.yaml
index eabaf89..3e47f36 100644
--- a/.github/workflows/build-site.yaml
+++ b/.github/workflows/build-site.yaml
@@ -7,31 +7,36 @@ jobs:
   build-docs:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+        with:
+          submodules: 'true'

       ##############################################
       # Skip caching if using a local runner.
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         if: ${{ !env.ACT }}
         with:
           python-version: '3.10'
           cache: 'pip'
           cache-dependency-path: "pyproject.toml"
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         if: ${{ env.ACT }}
         with:
           python-version: '3.10'
       ##############################################

-      - name: Install Dependencies
-        run: pip install -e ".[build_docs]"
+      - name: Install specific pip.
+        run: pip install pip==23.0.0
+
+      - name: Install doc requirements.
+        run: pip install mkdocs-material mkdocstrings[python]

       - name: Build mkdocs site
         working-directory: docs
         run: mkdocs build

       - name: Upload the built site.
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: ${{ !env.ACT }}
         with:
           name: site
diff --git a/.github/workflows/compute-tag.yaml b/.github/workflows/compute-tag.yaml
new file mode 100644
index 0000000..fccf6f2
--- /dev/null
+++ b/.github/workflows/compute-tag.yaml
@@ -0,0 +1,45 @@
+name: Compute the docker tag for this branch
+
+on:
+  workflow_call:
+    inputs:
+      latest_on_noexist:
+        description: 'If true, the tag will be latest if the docker image tag does not exist'
+        required: false
+        type: string
+        default: 'false'
+    outputs:
+      image_tag:
+        description: 'The tag to use for the docker image'
+        value: ${{ jobs.compute_tag.outputs.image_tag }}
+
+
+jobs:
+  compute_tag:
+    runs-on: ubuntu-latest
+    outputs:
+      image_tag: ${{ steps.set_tag.outputs.tag }}
+    steps:
+      - id: set_tag
+        run: |
+          branch_name="${{ github.head_ref }}"
+          if [[ "${{ github.ref }}" == "refs/heads/main" ]]; then
+            echo "tag=latest" >> $GITHUB_OUTPUT
+          elif [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            sanitized_branch_name="${branch_name//\//-}"
+            # If latest_on_noexist is true, set the tag to latest if the tag does not exist.
+            if [[ "${{ inputs.latest_on_noexist }}" == "true" ]]; then
+              # Check if the tag exists using docker manifest.
+              if ! docker manifest inspect ${{ secrets.DOCKERHUB_USERNAME }}/python_ml_project_template:${sanitized_branch_name} > /dev/null 2>&1; then
+                echo "tag=latest" >> $GITHUB_OUTPUT
+              else
+                echo "tag=${sanitized_branch_name}" >> $GITHUB_OUTPUT
+              fi
+            else
+              echo "tag=${sanitized_branch_name}" >> $GITHUB_OUTPUT
+            fi
+          else
+            sanitized_branch_name="${GITHUB_REF#refs/heads/}"
+            sanitized_branch_name="${sanitized_branch_name//\//-}"
+            echo "tag=${sanitized_branch_name}" >> $GITHUB_OUTPUT
+          fi
diff --git a/.github/workflows/deploy-site.yaml b/.github/workflows/deploy-site.yaml
index 98dd7a1..fb275cb 100644
--- a/.github/workflows/deploy-site.yaml
+++ b/.github/workflows/deploy-site.yaml
@@ -18,18 +18,18 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download Site Artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: site
           path: docs/site/

       - name: Setup Pages
         if: ${{ !env.ACT }}
-        uses: actions/configure-pages@v1
+        uses: actions/configure-pages@v5

       - name: Upload Artifact to Pages
         if: ${{ !env.ACT }}
-        uses: actions/upload-pages-artifact@v1
+        uses: actions/upload-pages-artifact@v3
         with:
           path: docs/site/
diff --git a/.github/workflows/merge-request.yaml b/.github/workflows/merge-request.yaml
index 8e227ad..d33dcf3 100644
--- a/.github/workflows/merge-request.yaml
+++ b/.github/workflows/merge-request.yaml
@@ -8,9 +8,18 @@ on:
   workflow_dispatch:

 jobs:
+  compute_tag:
+    uses: ./.github/workflows/compute-tag.yaml
+    secrets: inherit
+    with:
+      latest_on_noexist: 'true'
+
   test:
     uses: ./.github/workflows/run-tests.yaml
+    needs: compute_tag
     with:
       install_string: .[develop]
+      # Get the image tag from the compute_tag job.
+      image_tag: ${{ needs.compute_tag.outputs.image_tag }}
+
   build_site:
     uses: ./.github/workflows/build-site.yaml
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
index 2b14daa..857c021 100644
--- a/.github/workflows/push.yaml
+++ b/.github/workflows/push.yaml
@@ -8,10 +8,18 @@ on:
   workflow_dispatch:

 jobs:
+  compute_tag:
+    uses: ./.github/workflows/compute-tag.yaml
+    secrets: inherit
+    with:
+      latest_on_noexist: 'true'
   test:
     uses: ./.github/workflows/run-tests.yaml
+    needs: compute_tag
     with:
       install_string: .[develop]
+      # Get the image tag from the compute_tag job.
+      image_tag: ${{ needs.compute_tag.outputs.image_tag }}
+
   build_site:
     uses: ./.github/workflows/build-site.yaml
   deploy_site:
diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml
index 5426c1d..797e0f7 100644
--- a/.github/workflows/run-tests.yaml
+++ b/.github/workflows/run-tests.yaml
@@ -6,30 +6,33 @@ on:
       install_string:
         required: True
         type: string
+      # Optional so that the default of "latest" can actually apply.
+      image_tag:
+        required: False
+        type: string
+        default: "latest"

 jobs:
   test:
     runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
+    container:
+      # Image tag is "latest" if the branch is main, else it is the branch name.
+      image: beisner/python_ml_project_template:${{ inputs.image_tag }}

-      ##############################################
-      # Skip caching if using a local runner.
-      - uses: actions/setup-python@v4
-        if: ${{ !env.ACT }}
-        with:
-          python-version: '3.10'
-          cache: 'pip'
-          cache-dependency-path: "pyproject.toml"
-      - uses: actions/setup-python@v4
-        if: ${{ env.ACT }}
+    defaults:
+      run:
+        working-directory: /opt/baeisner/code
+
+    steps:
+      - uses: actions/checkout@v4
         with:
-          python-version: '3.10'
-      ##############################################
+          submodules: 'true'

-      - name: Install package
-        run: pip install "${{ inputs.install_string }}"
+      # Link the code from the default checkout directory to the correct directory.
+      # Use the github workspace variable to get the correct directory.
+      # Can't use the checkout action to checkout to a different directory, so we have to symlink.
+      - name: Move code to correct directory
+        run: rm -rf /opt/baeisner/code && ln -s $GITHUB_WORKSPACE /opt/baeisner/code

       - name: Code Quality
         run: python -m black src/ tests/ --check
diff --git a/.gitignore b/.gitignore
index 5047b86..12cce24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -367,3 +367,22 @@ cython_debug/
 # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,pycharm,git,linux

 .idea/
+
+# In general, data should be ignored.
+data/
+
+# These are generated by default by lightning, but our settings should prevent that.
+# THESE SHOULD NO LONGER BE GENERATED!
+# checkpoints/
+# lightning_logs/
+
+# Generated by wandb. Should be under logs, except for artifacts, which is toplevel so it can be shared.
+# wandb/
+wandb_artifacts/
+
+# Generated by hydra.
+# outputs/
+logs/
+
+# Generated for pushing to seuss.
+.singularity_images/
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 65b0961..9376e6b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,7 +1,10 @@
 {
     "editor.formatOnSave": true,
-    "python.formatting.provider": "black",
+    "python.formatting.provider": "none",
     "editor.codeActionsOnSave": {
-        "source.organizeImports": true
+        "source.organizeImports": "explicit"
+    },
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
     }
 }
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..56b5814
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,55 @@
+# Use the official Ubuntu 20.04 image as the base
+FROM ubuntu:20.04
+
+# Set environment variables to avoid interactive prompts during installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install necessary dependencies (the pyenv build prerequisites; note that on
+# Ubuntu 20.04 the package is python3-openssl, not python-openssl)
+RUN apt-get update && \
+    apt-get install -y curl git build-essential libssl-dev zlib1g-dev libbz2-dev \
+    libreadline-dev libsqlite3-dev wget llvm libncurses5-dev libncursesw5-dev \
+    xz-utils tk-dev libffi-dev liblzma-dev python3-openssl && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install pyenv
+ENV CODING_ROOT="/opt/baeisner"
+
+WORKDIR $CODING_ROOT
+RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv
+
+ENV PYENV_ROOT="$CODING_ROOT/.pyenv"
+ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH"
+
+# Install Python 3.10 using pyenv
+RUN pyenv install 3.10.0
+RUN pyenv global 3.10.0
+
+# Install PyTorch with CUDA support (make sure to adjust this depending on your CUDA version)
+RUN pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118/
+
+# Create the code directory and make it the working directory
+RUN mkdir $CODING_ROOT/code
+WORKDIR $CODING_ROOT/code
+
+# Only copy in the source code that is necessary for the dependencies to install
+COPY ./src $CODING_ROOT/code/src
+COPY ./setup.py $CODING_ROOT/code/setup.py
+COPY ./pyproject.toml $CODING_ROOT/code/pyproject.toml
+# pyproject.toml reads these files at build time.
+COPY ./README.md $CODING_ROOT/code/README.md
+COPY ./LICENSE.txt $CODING_ROOT/code/LICENSE.txt
+RUN pip install -e .[develop]
+
+# Changes to the configs and scripts will not require a rebuild
+COPY ./configs $CODING_ROOT/code/configs
+COPY ./scripts $CODING_ROOT/code/scripts
+
+RUN git config --global --add safe.directory $CODING_ROOT/code
+
+# Make a data directory.
+RUN mkdir $CODING_ROOT/data
+
+# Make a logs directory.
+RUN mkdir $CODING_ROOT/logs
+
+# Set up the entry point
+CMD ["python", "-c", "import torch; print(torch.cuda.is_available())"]
diff --git a/README.md b/README.md
index db2156e..9c24e63 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,12 @@
-# python_pkg_template
+# python_ml_project_template

-This is a template for a python package with the following features:
+This is a template for a Python Machine Learning project with the following features:
+
+* [Weights and Biases](https://wandb.ai) support, for experiment tracking and visualization
+* [Hydra](https://hydra.cc/) support, for configuration management
+* [Pytorch Lightning](https://www.pytorchlightning.ai/) support, for training and logging
+
+In addition, it contains all the good features from the original version of this repository (and is a proper Python package):

 * Installable via `pip install`. Anyone can point directly to this Github repository and install your project, either as a regular dependency or as an editable one.
 * Uses the new [PEP 518, officially-recommended pyproject.toml](https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/) structure for defining project structure and dependencies (instead of requirements.txt)
@@ -13,4 +19,75 @@ This is a template for a python package with the following features:
   * On a Pull Request: install dependencies, run style checks, run Python tests
   * After merge: same a Pull Request, but also deploy the docs site to the projects Github Pages URL!!!!

-All that needs doing is replacing all occurances of `python_pkg_template` and `python-pkg-template` with the name of your package(including the folder `src/python_pkg_template`), the rest should work out of the box!
+All that needs doing is replacing all occurrences of `python_ml_project_template` and `python-ml-project-template` with the name of your package (including the folder `src/python_ml_project_template`), the rest should work out of the box!
+
+## Installation
+
+First, we'll need to install platform-specific dependencies for Pytorch. See [here](https://pytorch.org/get-started/locally/) for more details. For example, to use PyTorch 2.0.1 with CUDA 11.8:
+
+```bash
+
+pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118/
+
+```
+
+Then, we can install the package itself:
+
+```bash
+
+pip install -e ".[develop,notebooks]"
+
+```
+
+Then we install pre-commit hooks:
+
+```bash
+
+pre-commit install
+
+```
+
+## Docker
+
+To build the docker image, run:
+
+```bash
+docker build -t <dockerhub_username>/python-ml-project-template .
+```
+
+To run the training script locally, run:
+
+```bash
+WANDB_API_KEY=<your_wandb_api_key>
+# Optional: mount current directory to run / test new code.
+# Mount data directory to access data.
+docker run \
+    -v $(pwd)/data:/opt/baeisner/data \
+    -v $(pwd)/logs:/opt/baeisner/logs \
+    --gpus all \
+    -e WANDB_API_KEY=$WANDB_API_KEY \
+    -e WANDB_DOCKER_IMAGE=python-ml-project-template \
+    <dockerhub_username>/python-ml-project-template python scripts/train.py \
+        dataset.data_dir=/opt/baeisner/data \
+        log_dir=/opt/baeisner/logs
+```
+
+To push this image:
+
+```bash
+docker push <dockerhub_username>/python-ml-project-template:latest
+```
+
+## Using the CI
+
+Set up pushing to Docker Hub:
+
+Put the following secrets in the Github repository:
+* `DOCKERHUB_USERNAME`: Your Dockerhub username
+* `DOCKERHUB_TOKEN`: Your Dockerhub token
+
+You'll also need to Ctrl-F replace instances of beisner and baeisner with appropriate usernames.
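+
+For reference, the push workflow wires these pieces together roughly as below
+(a sketch mirroring `.github/workflows/push.yaml`; `secrets: inherit` is what
+lets the reusable compute-tag workflow read the Docker Hub secrets):
+
+```yaml
+jobs:
+  compute_tag:
+    uses: ./.github/workflows/compute-tag.yaml
+    secrets: inherit
+    with:
+      latest_on_noexist: 'true'
+
+  test:
+    needs: compute_tag
+    uses: ./.github/workflows/run-tests.yaml
+    with:
+      install_string: .[develop]
+      image_tag: ${{ needs.compute_tag.outputs.image_tag }}
+```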
+
+## Running on Clusters
+
+* [Autobot](autobot.md)
diff --git a/autobot.md b/autobot.md
new file mode 100644
index 0000000..d039f5b
--- /dev/null
+++ b/autobot.md
@@ -0,0 +1,49 @@
+# Instructions for running this thing on Autobot.
+
+
+0. Before you do anything, make sure you've built your docker image and pushed it to dockerhub!!!
+
+1. ssh into autobot:
+
+    ```
+    ssh <username>@autobot.vision.cs.cmu.edu
+    ```
+
+    a. *YOU ONLY NEED TO DO THIS ONCE*: Add your wandb API key to your bashrc:
+
+    ```bash
+    echo 'export WANDB_API_KEY="your_api_key_here"' >> ~/.bashrc
+    source ~/.bashrc
+    ```
+
+2. Find a node on http://autobot.vision.cs.cmu.edu/mtcmon/ which has open GPUs.
+
+3. SSH into that node:
+
+    ```
+    ssh autobot-0-33
+    ```
+
+    a. *YOU ONLY NEED TO DO THIS ONCE*: Create some scratch directories for your data and logs.
+
+    ```bash
+    mkdir -p /scratch/$(whoami)/data
+    mkdir -p /scratch/$(whoami)/logs
+    ```
+4. Run a training job like so. Don't worry about building or installing. You can modify the paths here to map to whatever you want. In future iterations of this, we'll make this easier to do (e.g. by using a hydra singularity config file or something so you don't have to explicitly map paths as arguments).
+
+    You can also change which GPU you want access to using CUDA_VISIBLE_DEVICES below.
+
+    ```bash
+    SINGULARITYENV_CUDA_VISIBLE_DEVICES=0 \
+    SINGULARITYENV_WANDB_DOCKER_IMAGE=python-ml-project-template \
+    singularity exec \
+        --nv \
+        --pwd /opt/$(whoami)/code \
+        -B /scratch/$(whoami)/data:/opt/data \
+        -B /scratch/$(whoami)/logs:/opt/logs \
+        docker://beisner/python-ml-project-template \
+        python scripts/train.py \
+            dataset.data_dir=/opt/data \
+            log_dir=/opt/logs
+    ```
diff --git a/cluster/build_push_sif_seuss.bash b/cluster/build_push_sif_seuss.bash
new file mode 100755
index 0000000..fd31008
--- /dev/null
+++ b/cluster/build_push_sif_seuss.bash
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Build a docker image, convert it to a singularity image, and push it to the seuss cluster.
+# Right now, this is a total hack since we can't actually build the docker image on the cluster,
+# nor can we build the singularity image on the cluster. So we build the docker image locally,
+# convert it to a singularity image locally, and then push it to the cluster.
+
+# Whole script fails if any command fails.
+set -e
+
+# Set some variables.
+dockerhub_username=beisner
+project_name=python_ml_project_template
+scs_username=baeisner
+
+# Get paths.
+script_path=$(realpath $0)
+script_dir=$(dirname $script_path)
+root_dir=$(realpath ${script_dir}/..)
+
+# Compute a good tag for the image, which will be <dockerhub_username>/<project_name>:<branch>-scratch.
+sanitized_branch_name=`${script_dir}/sanitize_branch_name.bash`
+
+# Build the docker image.
+docker build -t ${dockerhub_username}/${project_name}:${sanitized_branch_name}-scratch .
+
+# Convert the docker image to a singularity image, and save it in the .singularity_images directory.
+mkdir -p ${root_dir}/.singularity_images
+sif_name=${root_dir}/.singularity_images/${project_name}_${sanitized_branch_name}-scratch.sif
+singularity build ${sif_name} docker-daemon://${dockerhub_username}/${project_name}:${sanitized_branch_name}-scratch
+
+# Rsync the singularity image to the seuss cluster.
+rsync -avz --progress ${sif_name} ${scs_username}@seuss.ri.cmu.edu:/home/${scs_username}/singularity_images/
diff --git a/cluster/sanitize_branch_name.bash b/cluster/sanitize_branch_name.bash
new file mode 100755
index 0000000..645302c
--- /dev/null
+++ b/cluster/sanitize_branch_name.bash
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Sanitize a branch name for use in a docker image tag.
+
+branch_name=$(git rev-parse --abbrev-ref HEAD)
+
+# Sanitize by replacing all slashes with underscores.
+sanitized_branch_name=$(echo $branch_name | sed 's/\//_/g')
+
+echo $sanitized_branch_name
diff --git a/configs/_logging.yaml b/configs/_logging.yaml
new file mode 100644
index 0000000..bec3e19
--- /dev/null
+++ b/configs/_logging.yaml
@@ -0,0 +1,32 @@
+# Where logs go, i.e. the top folder.
+log_dir: ${hydra:runtime.cwd}/logs
+
+output_dir: ${hydra:runtime.output_dir}
+
+# This has to come from above.
+job_type: ???
+
+hydra:
+  run:
+    dir: ${log_dir}/${hydra.job.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: ${log_dir}/${hydra.job.name}/sweep/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  job:
+    chdir: True
+    name: ${job_type}
+
+lightning:
+  checkpoint_dir: ${output_dir}/checkpoints
+
+wandb:
+  entity: r-pad
+  project: python_ml_project_template
+
+  # Group is for grouping runs together (i.e. a train run and an eval run).
+  group: ???
+
+  # Where to dump wandb logs, etc.
+  save_dir: ${output_dir}
+  # Put artifacts at the toplevel so that we don't have to re-download each time...
+  artifact_dir: ${hydra:runtime.cwd}/wandb_artifacts
diff --git a/configs/dataset/cifar10.yaml b/configs/dataset/cifar10.yaml
new file mode 100644
index 0000000..829725b
--- /dev/null
+++ b/configs/dataset/cifar10.yaml
@@ -0,0 +1,4 @@
+name: cifar10
+data_dir: ${hydra:runtime.cwd}/data
+image_size: 32
+num_classes: 10
diff --git a/configs/eval.yaml b/configs/eval.yaml
new file mode 100644
index 0000000..dfb94bc
--- /dev/null
+++ b/configs/eval.yaml
@@ -0,0 +1,39 @@
+mode: eval
+
+# This is somewhat arbitrary.
+job_type: ${mode}_${dataset.name}
+
+defaults:
+  # Each of these has its own configuration parameters.
+  - dataset: cifar10
+  - model: vit
+
+  # A set of inference settings for the model. Note that these may be different
+  # from, or a subset of, the training settings. This is so that we don't have
+  # to provide, e.g., a learning rate just to run eval.
+  - inference: ${dataset}_${model}
+
+  # Simple shared imports.
+  - _logging
+
+  # Override.
+  - _self_
+
+seed: 42
+
+# This is the checkpoint that we're evaluating. You can change this to whatever you need,
+# like if you want multiple checkpoints simultaneously, etc.
+checkpoint:
+  # If we want to load a model for a specific run, we can change that here.
+  run_id: ???
+  reference: ${wandb.entity}/${wandb.project}/model-${checkpoint.run_id}:best
+
+resources:
+  num_workers: 4
+  gpus:
+    - 0
+
+wandb:
+  # The group ***should*** be the same as the training group (so it can be
+  # bundled nicely in the UI). But you might have a one-off eval or something.
+  group: ???
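+
+# Example invocation (run id and group here are hypothetical; both ??? fields
+# must be supplied on the command line):
+#   python scripts/eval.py checkpoint.run_id=1a2b3c4d wandb.group=experiment-1a2b3c4d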
diff --git a/configs/inference/cifar10_vit.yaml b/configs/inference/cifar10_vit.yaml
new file mode 100644
index 0000000..1750891
--- /dev/null
+++ b/configs/inference/cifar10_vit.yaml
@@ -0,0 +1 @@
+batch_size: 128
diff --git a/configs/model/vit.yaml b/configs/model/vit.yaml
new file mode 100644
index 0000000..681b719
--- /dev/null
+++ b/configs/model/vit.yaml
@@ -0,0 +1,8 @@
+name: vit
+hidden_dim: 512
+num_heads: 8
+num_layers: 6
+patch_size: 4
+representation_size: 256
+mlp_dim: 2048
+dropout: 0.2
diff --git a/configs/train.yaml b/configs/train.yaml
new file mode 100644
index 0000000..356c535
--- /dev/null
+++ b/configs/train.yaml
@@ -0,0 +1,29 @@
+mode: train
+
+# This is somewhat arbitrary.
+job_type: ${mode}_${dataset.name}
+
+defaults:
+  # Each of these has its own configuration parameters.
+  - dataset: cifar10
+  - model: vit
+
+  # We assume a different training config for each dataset/model pair.
+  - training: ${dataset}_${model}
+
+  # Simple shared imports.
+  - _logging
+
+  # Override.
+  - _self_
+
+seed: 42
+
+resources:
+  num_workers: 4
+  gpus:
+    - 0
+
+wandb:
+  # If no group is provided, we will create a default one at runtime.
+  group: Null
diff --git a/configs/training/cifar10_vit.yaml b/configs/training/cifar10_vit.yaml
new file mode 100644
index 0000000..4324b00
--- /dev/null
+++ b/configs/training/cifar10_vit.yaml
@@ -0,0 +1,3 @@
+lr: 3e-4
+batch_size: 128
+epochs: 100
diff --git a/docs/docs/index.md b/docs/docs/index.md
index 155bb54..2b65e56 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -1,4 +1,4 @@
-# python_pkg_template
+# python_ml_project_template

 Some sample text for the website.
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index a2f913a..07fb1e8 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -1,4 +1,4 @@
-site_name: python_pkg_template
+site_name: python_ml_project_template
 theme:
   name: material
 plugins:
diff --git a/pyproject.toml b/pyproject.toml
index c930d6f..e0dcf4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,21 +1,24 @@
 [project]
-name = "python-pkg-template"
+name = "python-ml-project-template"
 version = "0.1.0"
 description = "A Python Package Template"
 readme = "README.md"
 requires-python = ">=3.6"
-license = {file = "LICENSE.txt"}
-authors = [
-    {email = "baeisner@andrew.cmu.edu", name = "Ben Eisner"}
+license = { file = "LICENSE.txt" }
+authors = [{ email = "baeisner@andrew.cmu.edu", name = "Ben Eisner" }]
+dependencies = [
+    "hydra-core == 1.3.2",
+    "lightning == 2.0.3",
+    "omegaconf == 2.3.0",
+    "pandas",
+    "torch == 2.0.1",        # CUDA 11.8
+    "torchmetrics",
+    "torchvision == 0.15.2", # CUDA 11.8
+    "wandb == 0.15.4",
 ]
-dependencies = []

 [build-system]
-requires = [
-    "setuptools >= 62.3.2",
-    "setuptools-scm",
-    "wheel",
-]
+requires = ["setuptools >= 62.3.2", "setuptools-scm", "wheel"]
 build-backend = "setuptools.build_meta"

 [project.optional-dependencies]
@@ -24,38 +27,42 @@ develop = [
     "black == 23.3.0",
     "isort == 5.12.0",
     "mypy == 1.3.0",
+    "pandas-stubs == 2.0.2.230605",
     "pylint == 2.17.4",
     "pytest == 7.3.2",
     "pre-commit == 3.3.3",
 ]
-notebooks = [
-    "jupyter",
-]
-build_docs = [
-    "mkdocs-material",
-    "mkdocstrings[python]",
-]
+notebooks = ["jupyter"]
+build_docs = ["mkdocs-material", "mkdocstrings[python]"]

 # This is required to allow us to have notebooks/ at the top level.
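+# (Package discovery is rooted at src/, so sibling folders like notebooks/ and
+# scripts/ are never picked up as importable packages.)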
 [tool.setuptools.packages.find]
 where = ["src"]

 [tool.setuptools.package-data]
-python_pkg_template = ["py.typed"]
+python_ml_project_template = ["py.typed"]

 [tool.isort]
 profile = "black"
+known_third_party = "wandb"

 [tool.mypy]
-python_version = 3.8
+python_version = "3.10"
 warn_return_any = true
 warn_unused_configs = true
 mypy_path = "src"
 namespace_packages = true
 explicit_package_bases = true

-# # Uncomment this when you have imports for mypy to ignore.
-# [[tool.mypy.overrides]]
-# module = [
-# ]
-# ignore_missing_imports = true
+[[tool.mypy.overrides]]
+module = ["torchvision.*"]
+ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+testpaths = "tests"
+
+[tool.pylint]
+known-third-party = "wandb"
+
+[tool.pylint.TYPECHECK]
+generated-members = 'torch.*'
diff --git a/scripts/eval.py b/scripts/eval.py
new file mode 100644
index 0000000..c112d12
--- /dev/null
+++ b/scripts/eval.py
@@ -0,0 +1,180 @@
+import hydra
+import lightning as L
+import omegaconf
+import torch
+import torch.utils._pytree as pytree
+import wandb
+
+from python_ml_project_template.datasets.cifar10 import CIFAR10DataModule
+from python_ml_project_template.metrics.classification import get_metrics
+from python_ml_project_template.models.classifier import ClassifierInferenceModule
+from python_ml_project_template.utils.script_utils import (
+    PROJECT_ROOT,
+    create_model,
+    flatten_outputs,
+    match_fn,
+)
+
+
+@torch.no_grad()
+@hydra.main(config_path="../configs", config_name="eval", version_base="1.3")
+def main(cfg):
+    ######################################################################
+    # Torch settings.
+    ######################################################################
+
+    # Make deterministic + reproducible.
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+    # Since most of us are training on 3090s+, we can use mixed precision.
+    torch.set_float32_matmul_precision("medium")
+
+    # Global seed for reproducibility.
+    L.seed_everything(cfg.seed)
+
+    ######################################################################
+    # Create the datamodule.
+    # Should be the same one as in training, but we're gonna use val+test
+    # dataloaders.
+    ######################################################################
+
+    datamodule = CIFAR10DataModule(
+        root=cfg.dataset.data_dir,
+        batch_size=cfg.inference.batch_size,
+        num_workers=cfg.resources.num_workers,
+    )
+    # Gotta call this in order to establish the dataloaders.
+    datamodule.setup("predict")
+
+    ######################################################################
+    # Set up logging in WandB.
+    # This is a different job type (eval), but we want it all grouped
+    # together. Notice that we use our own logging here (not lightning).
+    ######################################################################
+
+    # Create a run.
+    run = wandb.init(
+        entity=cfg.wandb.entity,
+        project=cfg.wandb.project,
+        dir=cfg.wandb.save_dir,
+        config=omegaconf.OmegaConf.to_container(
+            cfg, resolve=True, throw_on_missing=True
+        ),
+        job_type=cfg.job_type,
+        save_code=True,  # This just has the main script.
+        group=cfg.wandb.group,
+    )
+
+    # Log the code.
+    wandb.run.log_code(
+        root=PROJECT_ROOT,
+        include_fn=match_fn(
+            dirs=["configs", "scripts", "src"],
+            extensions=[".py", ".yaml"],
+        ),
+    )
+
+    ######################################################################
+    # Create the network(s) which will be evaluated (same as training).
+    # You might want to put this into a "create_network" function
+    # somewhere so train and eval can be the same.
+    #
+    # We'll also load the weights.
+    ######################################################################
+
+    network = create_model(
+        image_size=cfg.dataset.image_size,
+        num_classes=cfg.dataset.num_classes,
+        model_cfg=cfg.model,
+    )
+
+    # Get the checkpoint file. If it's a wandb reference, download.
+    # Otherwise look to disk.
+    checkpoint_reference = cfg.checkpoint.reference
+    if checkpoint_reference.startswith(cfg.wandb.entity):
+        # download checkpoint locally (if not already cached)
+        artifact_dir = cfg.wandb.artifact_dir
+        artifact = run.use_artifact(checkpoint_reference, type="model")
+        ckpt_file = artifact.get_path("model.ckpt").download(root=artifact_dir)
+    else:
+        ckpt_file = checkpoint_reference
+
+    # Load the network weights, stripping the "network." prefix that the
+    # training module added to each parameter name.
+    ckpt = torch.load(ckpt_file)
+    network.load_state_dict(
+        {k.partition(".")[2]: v for k, v in ckpt["state_dict"].items()}
+    )
+
+    ######################################################################
+    # Create an inference module, which is basically just a bare-bones
+    # class which runs the model. In this example, we only implement
+    # the "predict_step" function, which may not be the blessed
+    # way to do it vis-a-vis lightning, but whatever.
+    #
+    # If this is a downstream application or something, you might
+    # want to implement a different interface (like with a "predict"
+    # function), so you can pass in un-batched observations from an
+    # environment, for instance.
+    ######################################################################
+
+    model = ClassifierInferenceModule(network)
+
+    ######################################################################
+    # Create the trainer.
+    # Bit of a misnomer here, we're not doing training. But we are gonna
+    # use it to set up the model appropriately and do all the batching
+    # etc.
+    #
+    # If this is a different kind of downstream eval, chuck this block.
+    ######################################################################
+
+    trainer = L.Trainer(
+        accelerator="gpu",
+        devices=cfg.resources.gpus,
+        precision="16-mixed",
+        logger=False,
+    )
+
+    ######################################################################
+    # Run the model on the train/val/test sets.
+    # This outputs a list of dictionaries, one for each batch. This
+    # is annoying to work with, so later we'll flatten.
+    #
+    # If a downstream eval, you can swap it out with whatever the eval
+    # function is.
+    ######################################################################
+
+    train_outputs, val_outputs, test_outputs = trainer.predict(
+        model,
+        dataloaders=[
+            *datamodule.val_dataloader(),  # There are two different loaders (train_val and val).
+            datamodule.test_dataloader(),
+        ],
+    )
+
+    for outputs_list, name in [
+        (train_outputs, "train"),
+        (val_outputs, "val"),
+        (test_outputs, "test"),
+    ]:
+        # Put everything on CPU, and flatten a list of dicts into one dict.
+        out_cpu = [pytree.tree_map(lambda x: x.cpu(), o) for o in outputs_list]
+        outputs = flatten_outputs(out_cpu)

+        # Compute the metrics.
+        metrics = get_metrics(outputs["preds"], outputs["labels"])
+        global_acc = metrics["global_acc"]
+        macro_acc = metrics["macro_acc"]
+        acc_df = metrics["acc_df"]
+
+        # Log the metrics + table to wandb.
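+        # run.summary stores a single final value per key (shown in the runs
+        # table), whereas run.log appends a time-series point per call.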
+ run.summary[f"{name}_true_accuracy"] = global_acc + run.summary[f"{name}_class_balanced_accuracy"] = macro_acc + + table = wandb.Table(dataframe=acc_df) + run.log({f"{name}_accuracy_table": table}) + + +if __name__ == "__main__": + main() diff --git a/scripts/train.py b/scripts/train.py new file mode 100644 index 0000000..07e14d0 --- /dev/null +++ b/scripts/train.py @@ -0,0 +1,186 @@ +import json + +import hydra +import lightning as L +import omegaconf +import torch +import wandb +from lightning.pytorch.callbacks import ModelCheckpoint +from lightning.pytorch.loggers import WandbLogger + +from python_ml_project_template.datasets.cifar10 import CIFAR10DataModule +from python_ml_project_template.models.classifier import ClassifierTrainingModule +from python_ml_project_template.utils.script_utils import ( + PROJECT_ROOT, + LogPredictionSamplesCallback, + create_model, + match_fn, +) + + +@hydra.main(config_path="../configs", config_name="train", version_base="1.3") +def main(cfg): + print( + json.dumps( + omegaconf.OmegaConf.to_container(cfg, resolve=True, throw_on_missing=False), + sort_keys=True, + indent=4, + ) + ) + ###################################################################### + # Torch settings. + ###################################################################### + + # Make deterministic + reproducible. + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Since most of us are training on 3090s+, we can use mixed precision. + torch.set_float32_matmul_precision("medium") + + # Global seed for reproducibility. + L.seed_everything(cfg.seed) + + ###################################################################### + # Create the datamodule. + # The datamodule is responsible for all the data loading, including + # downloading the data, and splitting it into train/val/test. + # + # This could be swapped out for a different datamodule in-place, + # or with an if statement, or by using hydra.instantiate. + ###################################################################### + + datamodule = CIFAR10DataModule( + root=cfg.dataset.data_dir, + batch_size=cfg.training.batch_size, + num_workers=cfg.resources.num_workers, + ) + + ###################################################################### + # Create the network(s) which will be trained by the Training Module. + # The network should (ideally) be lightning-independent. This allows + # us to use the network in other projects, or in other training + # configurations. + # + # This might get a bit more complicated if we have multiple networks, + # but we can just customize the training module and the Hydra configs + # to handle that case. No need to over-engineer it. You might + # want to put this into a "create_network" function somewhere so train + # and eval can be the same. + # + # If it's a custom network, a good idea is to put the custom network + # in `python_ml_project_template.nets.my_net`. + ###################################################################### + + # Model architecture is dataset-dependent, so we have a helper + # function to create the model (while separating out relevant vals). + network = create_model( + image_size=cfg.dataset.image_size, + num_classes=cfg.dataset.num_classes, + model_cfg=cfg.model, + ) + + ###################################################################### + # Create the training module. 
+    # The training module is responsible for all the different parts of
+    # training, including the network, the optimizer, the loss function,
+    # and the logging.
+    ######################################################################
+
+    model = ClassifierTrainingModule(network, training_cfg=cfg.training)
+
+    ######################################################################
+    # Set up logging in WandB.
+    # This is a bit complicated, because we want to log the codebase,
+    # the model, and the checkpoints.
+    ######################################################################
+
+    # If no group is provided, then we should create a new one (so we can
+    # allocate evaluations to this group later).
+    if cfg.wandb.group is None:
+        run_id = wandb.util.generate_id()
+        group = "experiment-" + run_id
+    else:
+        group = cfg.wandb.group
+
+    logger = WandbLogger(
+        entity=cfg.wandb.entity,
+        project=cfg.wandb.project,
+        log_model=True,  # Only upload checkpoints to wandb at the END of training, not continuously.
+        save_dir=cfg.wandb.save_dir,
+        config=omegaconf.OmegaConf.to_container(
+            cfg, resolve=True, throw_on_missing=True
+        ),
+        job_type=cfg.job_type,
+        save_code=True,  # This just has the main script.
+        group=group,
+    )
+
+    ######################################################################
+    # Create the trainer.
+    # The trainer is responsible for running the training loop, and
+    # logging the results.
+    #
+    # There are a few callbacks (which we could customize):
+    # - LogPredictionSamplesCallback: Logs some examples from the dataset,
+    #       and the model's predictions.
+    # - ModelCheckpoint #1: Saves the latest model.
+    # - ModelCheckpoint #2: Saves the best model (according to validation
+    #       loss), and logs it to wandb.
+    ######################################################################
+
+    trainer = L.Trainer(
+        accelerator="gpu",
+        devices=cfg.resources.gpus,
+        precision="16-mixed",
+        max_epochs=cfg.training.epochs,
+        logger=logger,
+        callbacks=[
+            # Callback which logs whatever visuals (i.e. dataset examples, preds, etc.) we want.
+            LogPredictionSamplesCallback(logger),
+            # This checkpoint callback saves the latest model during training, i.e. so we can resume if it crashes.
+            # It saves everything, and you can load by referencing last.ckpt.
+            ModelCheckpoint(
+                dirpath=cfg.lightning.checkpoint_dir,
+                filename="{epoch}-{step}",
+                monitor="step",
+                mode="max",
+                save_weights_only=False,
+                save_last=True,
+            ),
+            # This checkpoint will get saved to WandB. The Callback mechanism in lightning is poorly designed, so we have to put it last.
+            ModelCheckpoint(
+                dirpath=cfg.lightning.checkpoint_dir,
+                filename="{epoch}-{step}-{val_loss:.2f}-weights-only",
+                monitor="val_loss",
+                mode="min",
+                save_weights_only=True,
+            ),
+        ],
+    )
+
+    ######################################################################
+    # Log the code to wandb.
+    # This is somewhat custom, you'll have to edit this to include whatever
+    # additional files you want, but basically it just logs all the files
+    # in the project root inside dirs, and with extensions.
+    ######################################################################
+
+    # Log the code used to train the model. Make sure not to log too much, because it will be too big.
+    wandb.run.log_code(
+        root=PROJECT_ROOT,
+        include_fn=match_fn(
+            dirs=["configs", "scripts", "src"],
+            extensions=[".py", ".yaml"],
+        ),
+    )
+
+    ######################################################################
+    # Train the model.
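+    # To resume an interrupted run, pass ckpt_path (e.g. a checkpoint file
+    # from checkpoint_dir, or "last") to trainer.fit below.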
+ ###################################################################### + + trainer.fit(model, datamodule=datamodule) + + +if __name__ == "__main__": + main() diff --git a/src/python_pkg_template/__init__.py b/src/python_ml_project_template/__init__.py similarity index 100% rename from src/python_pkg_template/__init__.py rename to src/python_ml_project_template/__init__.py diff --git a/src/python_pkg_template/py.typed b/src/python_ml_project_template/datasets/__init__.py similarity index 100% rename from src/python_pkg_template/py.typed rename to src/python_ml_project_template/datasets/__init__.py diff --git a/src/python_ml_project_template/datasets/cifar10.py b/src/python_ml_project_template/datasets/cifar10.py new file mode 100644 index 0000000..3cd7573 --- /dev/null +++ b/src/python_ml_project_template/datasets/cifar10.py @@ -0,0 +1,102 @@ +import lightning as L +import torch +import torch.utils.data as data +import torchvision as tv +from torchvision import transforms as T + + +class CIFAR10DataModule(L.LightningDataModule): + def __init__(self, root, batch_size, num_workers): + super().__init__() + self.root = root + self.batch_size = batch_size + self.num_workers = num_workers + + def prepare_data(self): + # Anything that needs to be done to download. + tv.datasets.CIFAR10(self.root, train=True, download=True) + tv.datasets.CIFAR10(self.root, train=False, download=True) + + def setup(self, stage: str): + # Set up data augmentation. + train_transform = T.Compose( + [ + T.RandomHorizontalFlip(), + T.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)), + T.ToTensor(), + T.Normalize( + [0.49139968, 0.48215841, 0.44653091], + [0.24703223, 0.24348513, 0.26158784], + ), + ] + ) + + test_transform = T.Compose( + [ + T.ToTensor(), + T.Normalize( + [0.49139968, 0.48215841, 0.44653091], + [0.24703223, 0.24348513, 0.26158784], + ), + ] + ) + + # We want to split the training set into train and val. But we don't want transforms on val. + # So we create two datasets, and make sure that the split is consistent between them. + train_dataset = tv.datasets.CIFAR10( + self.root, train=True, transform=train_transform + ) + val_dataset = tv.datasets.CIFAR10( + self.root, train=True, transform=test_transform + ) + generator = torch.Generator().manual_seed(42) + self.train_set, _ = torch.utils.data.random_split( + train_dataset, [45000, 5000], generator=generator + ) + train_val_set, val_set = torch.utils.data.random_split( + val_dataset, [45000, 5000], generator=generator + ) + self.train_val_set = train_val_set + self.val_set = val_set + + # Test set. 
+ self.test_set = tv.datasets.CIFAR10( + self.root, train=False, transform=test_transform + ) + + def train_dataloader(self): + return data.DataLoader( + self.train_set, + batch_size=self.batch_size, + shuffle=True, + drop_last=True, + pin_memory=True, + num_workers=self.num_workers, + ) + + def val_dataloader(self): + return [ + data.DataLoader( + self.train_val_set, + batch_size=self.batch_size, + shuffle=False, + drop_last=False, + num_workers=self.num_workers, + ), + data.DataLoader( + self.val_set, + batch_size=self.batch_size, + shuffle=False, + drop_last=False, + num_workers=self.num_workers, + ), + ] + + def test_dataloader(self): + return data.DataLoader( + self.test_set, + batch_size=self.batch_size, + shuffle=False, + drop_last=False, + num_workers=self.num_workers, + ) diff --git a/src/python_ml_project_template/metrics/__init__.py b/src/python_ml_project_template/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/python_ml_project_template/metrics/classification.py b/src/python_ml_project_template/metrics/classification.py new file mode 100644 index 0000000..68ca4bc --- /dev/null +++ b/src/python_ml_project_template/metrics/classification.py @@ -0,0 +1,22 @@ +import pandas as pd +import torchmetrics.functional.classification as tfc + + +def get_metrics(preds, labels): + # "True" accuracy, aka on the true distribution (without considering class imbalance). + global_acc = tfc.multiclass_accuracy(preds, labels, num_classes=10, average="micro") + + # Per-class accuracy, averaged over all classes with equal weight. + macro_acc = tfc.multiclass_accuracy(preds, labels, num_classes=10, average="macro") + + # Per-class accuracy, not averaged over all classes. + class_acc = tfc.multiclass_accuracy(preds, labels, num_classes=10, average="none") + + # Create a dataframe with the per-label accuracies. 
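+    # class_acc[None] adds a leading dimension, turning the shape-(10,) tensor
+    # into a single 1x10 row so the DataFrame gets one column per class.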
+    acc_df = pd.DataFrame(class_acc[None], columns=[str(i) for i in range(10)])
+
+    return {
+        "global_acc": global_acc,
+        "macro_acc": macro_acc,
+        "acc_df": acc_df,
+    }
diff --git a/src/python_ml_project_template/models/__init__.py b/src/python_ml_project_template/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python_ml_project_template/models/classifier.py b/src/python_ml_project_template/models/classifier.py
new file mode 100644
index 0000000..f567782
--- /dev/null
+++ b/src/python_ml_project_template/models/classifier.py
@@ -0,0 +1,61 @@
+from typing import Any
+
+import lightning as L
+import torch.nn.functional as F
+from torch import optim
+
+
+class ClassifierTrainingModule(L.LightningModule):
+    def __init__(self, network, training_cfg) -> None:
+        super().__init__()
+        self.network = network
+        self.lr = training_cfg.lr
+
+    def forward(self, x):
+        return self.network(x)
+
+    def configure_optimizers(self):
+        optimizer = optim.AdamW(self.parameters(), lr=self.lr)
+        lr_scheduler = optim.lr_scheduler.MultiStepLR(
+            optimizer, milestones=[100, 150], gamma=0.1
+        )
+        return [optimizer], [lr_scheduler]
+
+    def _calculate_loss(self, batch, mode="train"):
+        imgs, labels = batch
+        preds = self.network(imgs)
+        loss = F.cross_entropy(preds, labels)
+        acc = (preds.argmax(dim=-1) == labels).float().mean()
+
+        istrain = mode == "train"
+        self.log("%s_loss" % mode, loss, prog_bar=istrain, add_dataloader_idx=False)
+        self.log("%s_acc" % mode, acc, add_dataloader_idx=False)
+        return {"loss": loss, "acc": acc, "preds": preds}
+
+    def training_step(self, batch, batch_idx):
+        loss = self._calculate_loss(batch, mode="train")
+        return loss
+
+    def validation_step(self, batch, batch_idx, dataloader_idx=0):
+        if dataloader_idx == 0:
+            mode = "train_val"
+        else:
+            mode = "val"
+        return self._calculate_loss(batch, mode=mode)
+
+    def test_step(self, batch, batch_idx):
+        return self._calculate_loss(batch, mode="test")
+
+
+class ClassifierInferenceModule(L.LightningModule):
+    def __init__(self, network) -> None:
+        super().__init__()
+        self.network = network
+
+    def forward(self, x):
+        return self.network(x)
+
+    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
+        imgs, labels = batch
+        preds = self.network(imgs)
+        return {"preds": preds, "labels": labels}
diff --git a/src/python_ml_project_template/nets/__init__.py b/src/python_ml_project_template/nets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python_ml_project_template/py.typed b/src/python_ml_project_template/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/src/python_ml_project_template/utils/script_utils.py b/src/python_ml_project_template/utils/script_utils.py
new file mode 100644
index 0000000..ebb68b5
--- /dev/null
+++ b/src/python_ml_project_template/utils/script_utils.py
@@ -0,0 +1,95 @@
+import os
+import pathlib
+from typing import Dict, List, Sequence, Union, cast
+
+import torch
+import torch.utils._pytree as pytree
+import torchvision as tv
+import wandb
+from lightning.pytorch import Callback
+from lightning.pytorch.loggers import WandbLogger
+
+PROJECT_ROOT = str(pathlib.Path(__file__).parent.parent.parent.parent.resolve())
+
+
+def create_model(image_size, num_classes, model_cfg):
+    if model_cfg.name == "vit":
+        return tv.models.VisionTransformer(
+            image_size=image_size,
+            num_classes=num_classes,
+            hidden_dim=model_cfg.hidden_dim,
+            num_heads=model_cfg.num_heads,
+            num_layers=model_cfg.num_layers,
+            patch_size=model_cfg.patch_size,
+            representation_size=model_cfg.representation_size,
+            mlp_dim=model_cfg.mlp_dim,
+            dropout=model_cfg.dropout,
+        )
+    else:
+        raise ValueError(f"{model_cfg.name} is not a valid model name")
+
+
+# This matching function is used by wandb's log_code to decide which files
+# under the project root get uploaded.
+def match_fn(dirs: Sequence[str], extensions: Sequence[str], root: str = PROJECT_ROOT):
+    def _match_fn(path: pathlib.Path):
+        in_dir = any([str(path).startswith(os.path.join(root, d)) for d in dirs])
+
+        if not in_dir:
+            return False
+
+        if not any([str(path).endswith(e) for e in extensions]):
+            return False
+
+        return True
+
+    return _match_fn
+
+
+TorchTree = Dict[str, Union[torch.Tensor, "TorchTree"]]
+
+
+def flatten_outputs(outputs: List[TorchTree]) -> TorchTree:
+    """Flatten a list of dictionaries into a single dictionary."""
+
+    # Concatenate all leaf nodes in the trees.
+    flattened_outputs = [pytree.tree_flatten(output) for output in outputs]
+    flattened_list = [o[0] for o in flattened_outputs]
+    flattened_spec = flattened_outputs[0][1]  # Spec definitely should be the same...
+    cat_flat = [torch.cat(x) for x in list(zip(*flattened_list))]
+    output_dict = pytree.tree_unflatten(cat_flat, flattened_spec)
+    return cast(TorchTree, output_dict)
+
+
+class LogPredictionSamplesCallback(Callback):
+    def __init__(self, logger: WandbLogger):
+        self.logger = logger
+
+    def on_validation_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0
+    ):
+        """Called when the validation batch ends."""
+
+        # `outputs` comes from `LightningModule.validation_step`
+        # which corresponds to our model predictions in this case
+
+        # Let's log 20 sample image predictions from the first batch
+        if batch_idx == 0:
+            n = 20
+            x, y = batch
+            images = [img for img in x[:n]]
+            outs = outputs["preds"][:n].argmax(dim=1)
+            captions = [
+                f"Ground Truth: {y_i} - Prediction: {y_pred}"
+                for y_i, y_pred in zip(y[:n], outs)
+            ]
+
+            # Option 1: log images with `WandbLogger.log_image`
+            self.logger.log_image(key="sample_images", images=images, caption=captions)
+
+            # Option 2: log images and predictions as a W&B Table
+            columns = ["image", "ground truth", "prediction"]
+            data = [
+                [wandb.Image(x_i), y_i, y_pred]
+                for x_i, y_i, y_pred in list(zip(x[:n], y[:n], outs))
+            ]
+            self.logger.log_table(key="sample_table", columns=columns, data=data)
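+
+
+# LogPredictionSamplesCallback is registered in scripts/train.py, alongside the
+# ModelCheckpoint callbacks, in the L.Trainer(..., callbacks=[...]) list.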