From 61e668d7e1e33d22a125a30a7eae5e2a65e94a2a Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Fri, 16 Jun 2023 22:30:34 -0400
Subject: [PATCH 01/11] set up the repo for ml experiments

---
 README.md                                            | 12 +++++++++---
 docs/docs/index.md                                   |  2 +-
 docs/mkdocs.yml                                      |  2 +-
 pyproject.toml                                       |  4 ++--
 .../__init__.py                                      |  0
 .../py.typed                                         |  0
 6 files changed, 13 insertions(+), 7 deletions(-)
 rename src/{python_pkg_template => python_ml_project_template}/__init__.py (100%)
 rename src/{python_pkg_template => python_ml_project_template}/py.typed (100%)

diff --git a/README.md b/README.md
index db2156e..3ea56f7 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,12 @@
-# python_pkg_template
+# python_ml_project_template
 
-This is a template for a python package with the following features:
+This is a template for a Python Machine Learning project with the following features:
+
+* [Weights and Biases](wandb.ai) support, for experiment tracking and visualization
+* [Hydra](https://hydra.cc/) support, for configuration management
+* [Pytorch Lightning](https://www.pytorchlightning.ai/) support, for training and logging
+
+In addition, it contains all the good features from the original version of this repository (and is a proper Python package):
 
 * Installable via `pip install`. Anyone can point directly to this Github repository and install your project, either as a regular dependency or as an editable one.
 * Uses the new [PEP 518, officially-recommended pyproject.toml](https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/) structure for defining project structure and dependencies (instead of requirements.txt)
@@ -13,4 +19,4 @@ This is a template for a python package with the following features:
     * On a Pull Request: install dependencies, run style checks, run Python tests
     * After merge: same a Pull Request, but also deploy the docs site to the projects Github Pages URL!!!!
 
-All that needs doing is replacing all occurances of `python_pkg_template` and `python-pkg-template` with the name of your package(including the folder `src/python_pkg_template`), the rest should work out of the box!
+All that needs doing is replacing all occurances of `python_ml_project_template` and `python-ml-project-template` with the name of your package(including the folder `src/python_ml_project_template`), the rest should work out of the box!
diff --git a/docs/docs/index.md b/docs/docs/index.md
index 155bb54..2b65e56 100644
--- a/docs/docs/index.md
+++ b/docs/docs/index.md
@@ -1,4 +1,4 @@
-# python_pkg_template
+# python_ml_project_template
 
 Some sample text for the website.
 
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index a2f913a..07fb1e8 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -1,4 +1,4 @@
-site_name: python_pkg_template
+site_name: python_ml_project_template
 theme:
   name: material
 plugins:
diff --git a/pyproject.toml b/pyproject.toml
index c930d6f..d2dd489 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [project]
-name = "python-pkg-template"
+name = "python-ml-project-template"
 version = "0.1.0"
 description = "A Python Package Template"
 readme = "README.md"
@@ -41,7 +41,7 @@ build_docs = [
 where = ["src"]
 
 [tool.setuptools.package-data]
-python_pkg_template = ["py.typed"]
+python_ml_project_template = ["py.typed"]
 
 [tool.isort]
 profile = "black"
diff --git a/src/python_pkg_template/__init__.py b/src/python_ml_project_template/__init__.py
similarity index 100%
rename from src/python_pkg_template/__init__.py
rename to src/python_ml_project_template/__init__.py
diff --git a/src/python_pkg_template/py.typed b/src/python_ml_project_template/py.typed
similarity index 100%
rename from src/python_pkg_template/py.typed
rename to src/python_ml_project_template/py.typed

From 8e35e8f8a8f7fc5b7812e53566e6e1a2f8a166f0 Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Fri, 16 Jun 2023 22:49:53 -0400
Subject: [PATCH 02/11] first version of the training script on CIFAR10 with a
 ViT

First version of the training script. It started out on MNIST, which is probably not great because we want to demonstrate something a bit more complicated.

Switch the dataset to CIFAR10 and the model to a ViT.
---
 .gitignore            |   7 ++
 .vscode/settings.json |   5 +-
 README.md             |  26 +++++++
 pyproject.toml        |   7 +-
 scripts/train.py      | 155 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 198 insertions(+), 2 deletions(-)
 create mode 100644 scripts/train.py

diff --git a/.gitignore b/.gitignore
index 5047b86..8db63d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -367,3 +367,10 @@ cython_debug/
 # End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,pycharm,git,linux
 
 .idea/
+
+# In general, should be ignored.
+data/
+
+# These are generated by default by lightning, but bad. We want to restructure.
+checkpoints/
+lightning_logs/
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 65b0961..0c4b36b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,7 +1,10 @@
 {
     "editor.formatOnSave": true,
-    "python.formatting.provider": "black",
+    "python.formatting.provider": "none",
     "editor.codeActionsOnSave": {
         "source.organizeImports": true
+    },
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter"
     }
 }
diff --git a/README.md b/README.md
index 3ea56f7..e3fb3dc 100644
--- a/README.md
+++ b/README.md
@@ -20,3 +20,29 @@ In addition, it contains all the good features from the original version of this
     * After merge: same a Pull Request, but also deploy the docs site to the projects Github Pages URL!!!!
 
 All that needs doing is replacing all occurances of `python_ml_project_template` and `python-ml-project-template` with the name of your package(including the folder `src/python_ml_project_template`), the rest should work out of the box!
+
+## Installation
+
+First, we'll need to install platform-specific dependencies for Pytorch. See [here](https://pytorch.org/get-started/locally/) for more details. For example, if we want to use CUDA 11.8 with Pytorch 2.
+
+```bash
+
+pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118/
+
+```
+
+Then, we can install the package itself:
+
+```bash
+
+pip install -e ".[develop,notebook]"
+
+```
+
+Then we install pre-commit hooks:
+
+```bash
+
+pre-commit install
+
+```
diff --git a/pyproject.toml b/pyproject.toml
index d2dd489..8954769 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,12 @@ license = {file = "LICENSE.txt"}
 authors = [
   {email = "baeisner@andrew.cmu.edu", name = "Ben Eisner"}
 ]
-dependencies = []
+dependencies = [
+  "lightning == 2.0.3",
+  # CUDA 11.8
+  "torch == 2.0.1",
+  "torchvision == 0.15.2",
+]
 
 [build-system]
 requires = [
diff --git a/scripts/train.py b/scripts/train.py
new file mode 100644
index 0000000..bab9d87
--- /dev/null
+++ b/scripts/train.py
@@ -0,0 +1,155 @@
+# This file is based on https://github.com/Lightning-AI/lightning#hello-simple-model
+
+from typing import Any
+
+import lightning as L
+import torch
+import torch.nn.functional as F
+import torch.utils.data as data
+import torchvision as tv
+from torch import optim
+from torchvision import transforms as T
+
+# TODOs:
+# * Switch to CIFAR10
+# * Add a DataModule
+# * Add hydra configs
+# * Align the checkpoints and log files
+# * Add wandb, including saving the checkpoint, logging an image, and saving the codebase state.
+# * Add an eval script which loads from wandb, and outputs an artifact.
+
+
+class ClassifierModule(L.LightningModule):
+    def __init__(self, network, lr: float) -> None:
+        super().__init__()
+        self.network = network
+        self.lr = lr
+
+    def forward(self, x):
+        self.network(x)
+
+    def configure_optimizers(self) -> Any:
+        optimizer = optim.AdamW(self.parameters(), lr=self.lr)
+        lr_scheduler = optim.lr_scheduler.MultiStepLR(
+            optimizer, milestones=[100, 150], gamma=0.1
+        )
+        return [optimizer], [lr_scheduler]
+
+    def _calculate_loss(self, batch, mode="train"):
+        imgs, labels = batch
+        preds = self.network(imgs)
+        loss = F.cross_entropy(preds, labels)
+        acc = (preds.argmax(dim=-1) == labels).float().mean()
+
+        self.log("%s_loss" % mode, loss, prog_bar=mode == "train")
+        self.log("%s_acc" % mode, acc)
+        return loss
+
+    def training_step(self, batch, batch_idx):
+        loss = self._calculate_loss(batch, mode="train")
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        self._calculate_loss(batch, mode="val")
+
+    def test_step(self, batch, batch_idx):
+        self._calculate_loss(batch, mode="test")
+
+
+def main():
+    # Global seed for reproducibility.
+    L.seed_everything(42)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+    # Since most of us are training on 3090s+, we can use mixed precision.
+    torch.set_float32_matmul_precision("medium")
+
+    # Set up data augmentation.
+    train_transform = T.Compose(
+        [
+            T.RandomHorizontalFlip(),
+            T.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)),
+            T.ToTensor(),
+            T.Normalize(
+                [0.49139968, 0.48215841, 0.44653091],
+                [0.24703223, 0.24348513, 0.26158784],
+            ),
+        ]
+    )
+
+    test_transform = T.Compose(
+        [
+            T.ToTensor(),
+            T.Normalize(
+                [0.49139968, 0.48215841, 0.44653091],
+                [0.24703223, 0.24348513, 0.26158784],
+            ),
+        ]
+    )
+
+    root = "./data"
+
+    # We want to split the training set into train and val. But we don't want transforms on val.
+    L.seed_everything(42)
+    train_dataset = tv.datasets.CIFAR10(
+        root, train=True, transform=train_transform, download=True
+    )
+    L.seed_everything(42)
+    val_dataset = tv.datasets.CIFAR10(
+        root, train=True, transform=test_transform, download=True
+    )
+    L.seed_everything(42)
+    train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000])
+    L.seed_everything(42)
+    _, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000])
+
+    # Test set.
+    L.seed_everything(42)
+    test_set = tv.datasets.CIFAR10(
+        root, train=False, transform=test_transform, download=True
+    )
+
+    # Loaders.
+    train_loader = data.DataLoader(
+        train_set,
+        batch_size=128,
+        shuffle=True,
+        drop_last=True,
+        pin_memory=True,
+        num_workers=4,
+    )
+    val_loader = data.DataLoader(
+        val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4
+    )
+    test_loader = data.DataLoader(
+        test_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4
+    )
+
+    network = tv.models.VisionTransformer(
+        image_size=32,
+        hidden_dim=512,
+        num_heads=8,
+        num_layers=6,
+        patch_size=4,
+        # num_channels=3,
+        # num_patches=64,
+        num_classes=10,
+        representation_size=256,
+        mlp_dim=2048,
+        dropout=0.2,
+    )
+    model = ClassifierModule(network, lr=3e-4)
+
+    trainer = L.Trainer(
+        accelerator="gpu",
+        devices=1,
+        precision="16-mixed",
+        max_epochs=180,
+        logger=False,
+    )
+    trainer.fit(model, train_loader, val_loader)
+
+
+if __name__ == "__main__":
+    main()

From 7d6398b896afa7ced20faa782b9acf6f61a6c93b Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Sun, 18 Jun 2023 19:08:30 -0400
Subject: [PATCH 03/11] added logging to weights and biases

Also added logging of sample prediction images.
---
 .gitignore                                    |  3 +
 pyproject.toml                                |  5 ++
 scripts/train.py                              | 85 +++++++++++++++----
 .../utils/script_utils.py                     | 60 +++++++++++++
 4 files changed, 135 insertions(+), 18 deletions(-)
 create mode 100644 src/python_ml_project_template/utils/script_utils.py

diff --git a/.gitignore b/.gitignore
index 8db63d6..c65f631 100644
--- a/.gitignore
+++ b/.gitignore
@@ -374,3 +374,6 @@ data/
 # These are generated by default by lightning, but bad. We want to restructure.
 checkpoints/
 lightning_logs/
+
+# Generated by wandb, but we'll nuke this soon:
+wandb/
diff --git a/pyproject.toml b/pyproject.toml
index 8954769..23acb2e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
   # CUDA 11.8
   "torch == 2.0.1",
   "torchvision == 0.15.2",
+  "wandb == 0.15.4",
 ]
 
 [build-system]
@@ -50,6 +51,7 @@ python_ml_project_template = ["py.typed"]
 
 [tool.isort]
 profile = "black"
+known_third_party = "wandb"
 
 [tool.mypy]
 python_version = 3.8
@@ -64,3 +66,6 @@ explicit_package_bases = true
 # module = [
 # ]
 # ignore_missing_imports = true
+
+[tool.pylint]
+known-third-party = "wandb"
diff --git a/scripts/train.py b/scripts/train.py
index bab9d87..cb4bcd1 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -1,22 +1,30 @@
-# This file is based on https://github.com/Lightning-AI/lightning#hello-simple-model
-
-from typing import Any
-
 import lightning as L
 import torch
 import torch.nn.functional as F
 import torch.utils.data as data
 import torchvision as tv
+import wandb
+from lightning.pytorch.callbacks import ModelCheckpoint
+from lightning.pytorch.loggers import WandbLogger
 from torch import optim
 from torchvision import transforms as T
 
+from python_ml_project_template.utils.script_utils import (
+    PROJECT_ROOT,
+    LogPredictionSamplesCallback,
+    match_fn,
+)
+
 # TODOs:
-# * Switch to CIFAR10
-# * Add a DataModule
-# * Add hydra configs
-# * Align the checkpoints and log files
-# * Add wandb, including saving the checkpoint, logging an image, and saving the codebase state.
-# * Add an eval script which loads from wandb, and outputs an artifact.
+# [x] Switch to CIFAR10
+# [ ] Add wandb, including saving the checkpoint, logging an image, and saving the codebase state.
+#     - [x] Add a callback to save the model to wandb.
+#     - [x] Add a callback to save the codebase to wandb.
+#     - [ ] Add an image logging example.
+# [ ] Add a DataModule
+# [ ] Add an eval script which loads from wandb, and outputs an artifact.
+# [ ] Add hydra configs
+# [ ] Align the checkpoints and log files
 
 
 class ClassifierModule(L.LightningModule):
@@ -28,7 +36,7 @@ def __init__(self, network, lr: float) -> None:
     def forward(self, x):
         self.network(x)
 
-    def configure_optimizers(self) -> Any:
+    def configure_optimizers(self):
         optimizer = optim.AdamW(self.parameters(), lr=self.lr)
         lr_scheduler = optim.lr_scheduler.MultiStepLR(
             optimizer, milestones=[100, 150], gamma=0.1
@@ -43,17 +51,17 @@ def _calculate_loss(self, batch, mode="train"):
 
         self.log("%s_loss" % mode, loss, prog_bar=mode == "train")
         self.log("%s_acc" % mode, acc)
-        return loss
+        return {"loss": loss, "acc": acc, "preds": preds}
 
     def training_step(self, batch, batch_idx):
         loss = self._calculate_loss(batch, mode="train")
         return loss
 
     def validation_step(self, batch, batch_idx):
-        self._calculate_loss(batch, mode="val")
+        return self._calculate_loss(batch, mode="val")
 
     def test_step(self, batch, batch_idx):
-        self._calculate_loss(batch, mode="test")
+        return self._calculate_loss(batch, mode="test")
 
 
 def main():
@@ -132,8 +140,6 @@ def main():
         num_heads=8,
         num_layers=6,
         patch_size=4,
-        # num_channels=3,
-        # num_patches=64,
         num_classes=10,
         representation_size=256,
         mlp_dim=2048,
@@ -141,13 +147,56 @@ def main():
     )
     model = ClassifierModule(network, lr=3e-4)
 
+    save_dir = "./wandb"
+    checkpoint_dir = "./checkpoints"
+
+    logger = WandbLogger(
+        project="lightning-hydra-template",
+        entity="r-pad",
+        log_model=True,  # Only log the last checkpoint to wandb, and only the LAST model checkpoint.
+        save_dir=save_dir,
+        config={"testit": "wat"},
+    )
+
     trainer = L.Trainer(
         accelerator="gpu",
         devices=1,
         precision="16-mixed",
-        max_epochs=180,
-        logger=False,
+        max_epochs=500,
+        logger=logger,
+        callbacks=[
+            LogPredictionSamplesCallback(logger),
+            # This checkpoint callback saves the latest model during training, i.e. so we can resume if it crashes.
+            # It saves everything, and you can load by referencing last.ckpt.
+            ModelCheckpoint(
+                checkpoint_dir,
+                filename="{epoch}-{step}",
+                monitor="step",
+                mode="max",
+                save_weights_only=False,
+                save_last=True,
+            ),
+            # This checkpoint will get saved to WandB. The Callback mechanism in lightning is poorly designed, so we have to put it last.
+            ModelCheckpoint(
+                checkpoint_dir,
+                filename="{epoch}-{step}-{val_loss:.2f}-weights-only",
+                monitor="val_loss",
+                mode="min",
+                save_weights_only=True,
+            ),
+        ],
     )
+
+    # Log the code used to train the model. Make sure not to log too much, because it will be too big.
+    wandb.run.log_code(
+        root=PROJECT_ROOT,
+        include_fn=match_fn(
+            dirs=["configs", "scripts", "src"],
+            extensions=[".py", ".yaml"],
+        ),
+    )
+
+    # Run training.
     trainer.fit(model, train_loader, val_loader)
 
 
diff --git a/src/python_ml_project_template/utils/script_utils.py b/src/python_ml_project_template/utils/script_utils.py
new file mode 100644
index 0000000..80bbd50
--- /dev/null
+++ b/src/python_ml_project_template/utils/script_utils.py
@@ -0,0 +1,60 @@
+import os
+import pathlib
+from typing import Sequence
+
+import wandb
+from lightning.pytorch import Callback
+from pytorch_lightning.loggers import WandbLogger
+
+PROJECT_ROOT = str(pathlib.Path(__file__).parent.parent.parent.parent.resolve())
+
+
+# This matching function
+def match_fn(dirs: Sequence[str], extensions: Sequence[str], root: str = PROJECT_ROOT):
+    def _match_fn(path: pathlib.Path):
+        in_dir = any([str(path).startswith(os.path.join(root, d)) for d in dirs])
+
+        if not in_dir:
+            return False
+
+        if not any([str(path).endswith(e) for e in extensions]):
+            return False
+
+        return True
+
+    return _match_fn
+
+
+class LogPredictionSamplesCallback(Callback):
+    def __init__(self, logger: WandbLogger):
+        self.logger = logger
+
+    def on_validation_batch_end(
+        self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0
+    ):
+        """Called when the validation batch ends."""
+
+        # `outputs` comes from `LightningModule.validation_step`
+        # which corresponds to our model predictions in this case
+
+        # Let's log 20 sample image predictions from the first batch
+        if batch_idx == 0:
+            n = 20
+            x, y = batch
+            images = [img for img in x[:n]]
+            outs = outputs["preds"][:n].argmax(dim=1)
+            captions = [
+                f"Ground Truth: {y_i} - Prediction: {y_pred}"
+                for y_i, y_pred in zip(y[:n], outs)
+            ]
+
+            # Option 1: log images with `WandbLogger.log_image`
+            self.logger.log_image(key="sample_images", images=images, caption=captions)
+
+            # Option 2: log images and predictions as a W&B Table
+            columns = ["image", "ground truth", "prediction"]
+            data = [
+                [wandb.Image(x_i), y_i, y_pred]
+                for x_i, y_i, y_pred in list(zip(x[:n], y[:n], outs))
+            ]
+            self.logger.log_table(key="sample_table", columns=columns, data=data)

From c46bce1f1ac96c3c352795eaa3a85f9be880f90f Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Tue, 20 Jun 2023 15:09:35 -0400
Subject: [PATCH 04/11] refactor into datamodule

---
 scripts/train.py | 155 ++++++++++++++++++++++++++++-------------------
 1 file changed, 92 insertions(+), 63 deletions(-)

diff --git a/scripts/train.py b/scripts/train.py
index cb4bcd1..074d49f 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -17,16 +17,102 @@
 
 # TODOs:
 # [x] Switch to CIFAR10
-# [ ] Add wandb, including saving the checkpoint, logging an image, and saving the codebase state.
+# [x] Add wandb, including saving the checkpoint, logging an image, and saving the codebase state.
 #     - [x] Add a callback to save the model to wandb.
 #     - [x] Add a callback to save the codebase to wandb.
-#     - [ ] Add an image logging example.
-# [ ] Add a DataModule
+#     - [x] Add an image logging example.
+# [x] Add a DataModule
 # [ ] Add an eval script which loads from wandb, and outputs an artifact.
 # [ ] Add hydra configs
 # [ ] Align the checkpoints and log files
 
 
+class CIFAR10DataModule(L.LightningDataModule):
+    def __init__(self, root, batch_size, num_workers):
+        super().__init__()
+        self.root = root
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+
+    def prepare_data(self):
+        # Anything that needs to be done to download.
+        tv.datasets.CIFAR10(self.root, train=True, download=True)
+        tv.datasets.CIFAR10(self.root, train=False, download=True)
+
+    def setup(self, stage: str):
+        # Set up data augmentation.
+        train_transform = T.Compose(
+            [
+                T.RandomHorizontalFlip(),
+                T.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)),
+                T.ToTensor(),
+                T.Normalize(
+                    [0.49139968, 0.48215841, 0.44653091],
+                    [0.24703223, 0.24348513, 0.26158784],
+                ),
+            ]
+        )
+
+        test_transform = T.Compose(
+            [
+                T.ToTensor(),
+                T.Normalize(
+                    [0.49139968, 0.48215841, 0.44653091],
+                    [0.24703223, 0.24348513, 0.26158784],
+                ),
+            ]
+        )
+
+        # We want to split the training set into train and val. But we don't want transforms on val.
+        # So we create two datasets, and make sure that the split is consistent between them.
+        train_dataset = tv.datasets.CIFAR10(
+            self.root, train=True, transform=train_transform
+        )
+        val_dataset = tv.datasets.CIFAR10(
+            self.root, train=True, transform=test_transform
+        )
+        generator = torch.Generator().manual_seed(42)
+        self.train_set, _ = torch.utils.data.random_split(
+            train_dataset, [45000, 5000], generator=generator
+        )
+        _, self.val_set = torch.utils.data.random_split(
+            val_dataset, [45000, 5000], generator=generator
+        )
+
+        # Test set.
+        self.test_set = tv.datasets.CIFAR10(
+            self.root, train=False, transform=test_transform
+        )
+
+    def train_dataloader(self):
+        return data.DataLoader(
+            self.train_set,
+            batch_size=self.batch_size,
+            shuffle=True,
+            drop_last=True,
+            pin_memory=True,
+            num_workers=self.num_workers,
+        )
+
+    def val_dataloader(self):
+        return data.DataLoader(
+            self.val_set,
+            batch_size=self.batch_size,
+            shuffle=False,
+            drop_last=False,
+            num_workers=self.num_workers,
+        )
+
+    def test_dataloader(self):
+        return data.DataLoader(
+            self.test_set,
+            batch_size=self.batch_size,
+            shuffle=False,
+            drop_last=False,
+            num_workers=self.num_workers,
+        )
+
+
 class ClassifierModule(L.LightningModule):
     def __init__(self, network, lr: float) -> None:
         super().__init__()
@@ -73,67 +159,8 @@ def main():
     # Since most of us are training on 3090s+, we can use mixed precision.
     torch.set_float32_matmul_precision("medium")
 
-    # Set up data augmentation.
-    train_transform = T.Compose(
-        [
-            T.RandomHorizontalFlip(),
-            T.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)),
-            T.ToTensor(),
-            T.Normalize(
-                [0.49139968, 0.48215841, 0.44653091],
-                [0.24703223, 0.24348513, 0.26158784],
-            ),
-        ]
-    )
-
-    test_transform = T.Compose(
-        [
-            T.ToTensor(),
-            T.Normalize(
-                [0.49139968, 0.48215841, 0.44653091],
-                [0.24703223, 0.24348513, 0.26158784],
-            ),
-        ]
-    )
-
     root = "./data"
 
-    # We want to split the training set into train and val. But we don't want transforms on val.
-    L.seed_everything(42)
-    train_dataset = tv.datasets.CIFAR10(
-        root, train=True, transform=train_transform, download=True
-    )
-    L.seed_everything(42)
-    val_dataset = tv.datasets.CIFAR10(
-        root, train=True, transform=test_transform, download=True
-    )
-    L.seed_everything(42)
-    train_set, _ = torch.utils.data.random_split(train_dataset, [45000, 5000])
-    L.seed_everything(42)
-    _, val_set = torch.utils.data.random_split(val_dataset, [45000, 5000])
-
-    # Test set.
-    L.seed_everything(42)
-    test_set = tv.datasets.CIFAR10(
-        root, train=False, transform=test_transform, download=True
-    )
-
-    # Loaders.
-    train_loader = data.DataLoader(
-        train_set,
-        batch_size=128,
-        shuffle=True,
-        drop_last=True,
-        pin_memory=True,
-        num_workers=4,
-    )
-    val_loader = data.DataLoader(
-        val_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4
-    )
-    test_loader = data.DataLoader(
-        test_set, batch_size=128, shuffle=False, drop_last=False, num_workers=4
-    )
-
     network = tv.models.VisionTransformer(
         image_size=32,
         hidden_dim=512,
@@ -147,6 +174,8 @@ def main():
     )
     model = ClassifierModule(network, lr=3e-4)
 
+    datamodule = CIFAR10DataModule(root, batch_size=128, num_workers=4)
+
     save_dir = "./wandb"
     checkpoint_dir = "./checkpoints"
 
@@ -197,7 +226,7 @@ def main():
     )
 
     # Run training.
-    trainer.fit(model, train_loader, val_loader)
+    trainer.fit(model, datamodule=datamodule)
 
 
 if __name__ == "__main__":

From f350768c4a5b92bcd74602da653747bcbeeb1977 Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Wed, 21 Jun 2023 14:58:29 -0400
Subject: [PATCH 05/11] add an eval script

---
 .gitignore                                    |   1 +
 pyproject.toml                                |   2 +
 scripts/eval.py                               | 159 ++++++++++++++++++
 scripts/train.py                              | 118 +++----------
 .../datasets/__init__.py                      |   0
 .../datasets/cifar10.py                       | 102 +++++++++++
 6 files changed, 286 insertions(+), 96 deletions(-)
 create mode 100644 scripts/eval.py
 create mode 100644 src/python_ml_project_template/datasets/__init__.py
 create mode 100644 src/python_ml_project_template/datasets/cifar10.py

diff --git a/.gitignore b/.gitignore
index c65f631..be7fef9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -376,4 +376,5 @@ checkpoints/
 lightning_logs/
 
 # Generated by wandb, but we'll nuke this soon:
+artifacts/
 wandb/
diff --git a/pyproject.toml b/pyproject.toml
index 23acb2e..a7b98d2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,8 +10,10 @@ authors = [
 ]
 dependencies = [
   "lightning == 2.0.3",
+  "pandas",
   # CUDA 11.8
   "torch == 2.0.1",
+  "torchmetrics",
   "torchvision == 0.15.2",
   "wandb == 0.15.4",
 ]
diff --git a/scripts/eval.py b/scripts/eval.py
new file mode 100644
index 0000000..991c10c
--- /dev/null
+++ b/scripts/eval.py
@@ -0,0 +1,159 @@
+from typing import Any
+
+import lightning as L
+import pandas as pd
+import torch
+import torchmetrics.functional.classification as tfc
+import torchvision as tv
+import wandb
+
+from python_ml_project_template.datasets.cifar10 import CIFAR10DataModule
+from python_ml_project_template.utils.script_utils import PROJECT_ROOT, match_fn
+
+
+class ClassifierEvalModule(L.LightningModule):
+    def __init__(self, network) -> None:
+        super().__init__()
+        self.network = network
+
+    def forward(self, x):
+        self.network(x)
+
+    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
+        imgs, labels = batch
+        preds = self.network(imgs)
+        return {"preds": preds, "labels": labels}
+
+
+@torch.no_grad()
+def main():
+    # Global seed for reproducibility.
+    L.seed_everything(42)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+    # Since most of us are training on 3090s+, we can use mixed precision.
+    torch.set_float32_matmul_precision("medium")
+    # run_id = "pjf0nfg6"
+    run_id = "0v36p8tn"
+    checkpoint_reference = f"r-pad/lightning-hydra-template/model-{run_id}:v0"
+
+    # download checkpoint locally (if not already cached)
+    run = wandb.init(
+        entity="r-pad",
+        project="lightning-hydra-template",
+        job_type="eval",
+        group=f"experiment-{run_id}",
+        config={"eval config": "wat"},
+    )
+
+    wandb.run.log_code(
+        root=PROJECT_ROOT,
+        include_fn=match_fn(
+            dirs=["configs", "scripts", "src"],
+            extensions=[".py", ".yaml"],
+        ),
+    )
+
+    network = tv.models.VisionTransformer(
+        image_size=32,
+        hidden_dim=512,
+        num_heads=8,
+        num_layers=6,
+        patch_size=4,
+        num_classes=10,
+        representation_size=256,
+        mlp_dim=2048,
+        dropout=0.2,
+    )
+
+    artifact = run.use_artifact(checkpoint_reference, type="model")
+    # artifact_dir = artifact.download()
+    ckpt_file = artifact.get_path("model.ckpt").download()
+
+    ckpt = torch.load(ckpt_file)
+
+    network.load_state_dict(
+        {k.partition(".")[2]: v for k, v, in ckpt["state_dict"].items()}
+    )
+
+    model = ClassifierEvalModule(network)
+
+    root = "./data"
+    datamodule = CIFAR10DataModule(root, batch_size=128, num_workers=4)
+    # Gotta call this in order to establish the dataloaders.
+    datamodule.setup("predict")
+
+    trainer = L.Trainer(
+        accelerator="gpu",
+        devices=1,
+        precision="16-mixed",
+        max_epochs=1,
+        logger=False,
+    )
+
+    train_preds, val_preds, test_preds = trainer.predict(
+        model,
+        dataloaders=[
+            *datamodule.val_dataloader(),  # There are two different loaders (train_val and val).
+            datamodule.test_dataloader(),
+        ],
+    )
+
+    # Each of these is a list of dictionaries, where each dictionary is the output of the predict_step method.
+    # We can use the `preds` and `labels` keys to calculate metrics.
+    # For example, we can calculate the accuracy like so:
+
+    for pred_list, name in [
+        (train_preds, "train"),
+        (val_preds, "val"),
+        (test_preds, "test"),
+    ]:
+        pass
+
+        all_preds = torch.cat([x["preds"].cpu() for x in pred_list])
+        all_labels = torch.cat([x["labels"].cpu() for x in pred_list])
+        global_acc = (
+            tfc.multiclass_accuracy(
+                all_preds,
+                all_labels,
+                num_classes=10,
+                average="micro",
+            )
+            .numpy()
+            .item()
+        )
+
+        macro_acc = (
+            tfc.multiclass_accuracy(
+                all_preds,
+                all_labels,
+                num_classes=10,
+                average="micro",
+            )
+            .numpy()
+            .item()
+        )
+
+        # We also want to log per-label accuracies.
+        acc_per_label = tfc.multiclass_accuracy(
+            all_preds,
+            all_labels,
+            num_classes=10,
+            average="none",
+        ).numpy()
+
+        # Create a dataframe with the per-label accuracies, as well as the global and macro accuracies.
+        # The columns of the table should be the labels, and there should only be a single row.
+        acc_df = pd.DataFrame(acc_per_label[None], columns=[str(i) for i in range(10)])
+
+        # Log the dataframe to wandb.
+        table = wandb.Table(dataframe=acc_df)
+        run.log({f"{name}_accuracy_table": table})
+
+        run.summary[f"{name}_true_accuracy"] = global_acc
+        run.summary[f"{name}_class_balanced_accuracy"] = macro_acc
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/train.py b/scripts/train.py
index 074d49f..2dab2bd 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -1,14 +1,13 @@
 import lightning as L
 import torch
 import torch.nn.functional as F
-import torch.utils.data as data
 import torchvision as tv
 import wandb
 from lightning.pytorch.callbacks import ModelCheckpoint
 from lightning.pytorch.loggers import WandbLogger
 from torch import optim
-from torchvision import transforms as T
 
+from python_ml_project_template.datasets.cifar10 import CIFAR10DataModule
 from python_ml_project_template.utils.script_utils import (
     PROJECT_ROOT,
     LogPredictionSamplesCallback,
@@ -22,98 +21,13 @@
 #     - [x] Add a callback to save the codebase to wandb.
 #     - [x] Add an image logging example.
 # [x] Add a DataModule
-# [ ] Add an eval script which loads from wandb, and outputs an artifact.
+# [x] Add an eval script which loads from wandb, and outputs a table.
+# [ ] Fix grouping stuff...
 # [ ] Add hydra configs
 # [ ] Align the checkpoints and log files
 
 
-class CIFAR10DataModule(L.LightningDataModule):
-    def __init__(self, root, batch_size, num_workers):
-        super().__init__()
-        self.root = root
-        self.batch_size = batch_size
-        self.num_workers = num_workers
-
-    def prepare_data(self):
-        # Anything that needs to be done to download.
-        tv.datasets.CIFAR10(self.root, train=True, download=True)
-        tv.datasets.CIFAR10(self.root, train=False, download=True)
-
-    def setup(self, stage: str):
-        # Set up data augmentation.
-        train_transform = T.Compose(
-            [
-                T.RandomHorizontalFlip(),
-                T.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)),
-                T.ToTensor(),
-                T.Normalize(
-                    [0.49139968, 0.48215841, 0.44653091],
-                    [0.24703223, 0.24348513, 0.26158784],
-                ),
-            ]
-        )
-
-        test_transform = T.Compose(
-            [
-                T.ToTensor(),
-                T.Normalize(
-                    [0.49139968, 0.48215841, 0.44653091],
-                    [0.24703223, 0.24348513, 0.26158784],
-                ),
-            ]
-        )
-
-        # We want to split the training set into train and val. But we don't want transforms on val.
-        # So we create two datasets, and make sure that the split is consistent between them.
-        train_dataset = tv.datasets.CIFAR10(
-            self.root, train=True, transform=train_transform
-        )
-        val_dataset = tv.datasets.CIFAR10(
-            self.root, train=True, transform=test_transform
-        )
-        generator = torch.Generator().manual_seed(42)
-        self.train_set, _ = torch.utils.data.random_split(
-            train_dataset, [45000, 5000], generator=generator
-        )
-        _, self.val_set = torch.utils.data.random_split(
-            val_dataset, [45000, 5000], generator=generator
-        )
-
-        # Test set.
-        self.test_set = tv.datasets.CIFAR10(
-            self.root, train=False, transform=test_transform
-        )
-
-    def train_dataloader(self):
-        return data.DataLoader(
-            self.train_set,
-            batch_size=self.batch_size,
-            shuffle=True,
-            drop_last=True,
-            pin_memory=True,
-            num_workers=self.num_workers,
-        )
-
-    def val_dataloader(self):
-        return data.DataLoader(
-            self.val_set,
-            batch_size=self.batch_size,
-            shuffle=False,
-            drop_last=False,
-            num_workers=self.num_workers,
-        )
-
-    def test_dataloader(self):
-        return data.DataLoader(
-            self.test_set,
-            batch_size=self.batch_size,
-            shuffle=False,
-            drop_last=False,
-            num_workers=self.num_workers,
-        )
-
-
-class ClassifierModule(L.LightningModule):
+class ClassifierTrainingModule(L.LightningModule):
     def __init__(self, network, lr: float) -> None:
         super().__init__()
         self.network = network
@@ -135,16 +49,21 @@ def _calculate_loss(self, batch, mode="train"):
         loss = F.cross_entropy(preds, labels)
         acc = (preds.argmax(dim=-1) == labels).float().mean()
 
-        self.log("%s_loss" % mode, loss, prog_bar=mode == "train")
-        self.log("%s_acc" % mode, acc)
+        istrain = mode == "train"
+        self.log("%s_loss" % mode, loss, prog_bar=istrain, add_dataloader_idx=False)
+        self.log("%s_acc" % mode, acc, add_dataloader_idx=False)
         return {"loss": loss, "acc": acc, "preds": preds}
 
     def training_step(self, batch, batch_idx):
         loss = self._calculate_loss(batch, mode="train")
         return loss
 
-    def validation_step(self, batch, batch_idx):
-        return self._calculate_loss(batch, mode="val")
+    def validation_step(self, batch, batch_idx, dataloader_idx=0):
+        if dataloader_idx == 0:
+            mode = "train_val"
+        else:
+            mode = "val"
+        return self._calculate_loss(batch, mode=mode)
 
     def test_step(self, batch, batch_idx):
         return self._calculate_loss(batch, mode="test")
@@ -172,26 +91,33 @@ def main():
         mlp_dim=2048,
         dropout=0.2,
     )
-    model = ClassifierModule(network, lr=3e-4)
+    model = ClassifierTrainingModule(network, lr=3e-4)
 
     datamodule = CIFAR10DataModule(root, batch_size=128, num_workers=4)
 
     save_dir = "./wandb"
     checkpoint_dir = "./checkpoints"
 
+    # Create a new wandb run.
+    id = wandb.util.generate_id()
+    group = "experiment-" + id
+
     logger = WandbLogger(
         project="lightning-hydra-template",
         entity="r-pad",
         log_model=True,  # Only log the last checkpoint to wandb, and only the LAST model checkpoint.
         save_dir=save_dir,
         config={"testit": "wat"},
+        job_type="train",
+        group=group,
+        id=id,
     )
 
     trainer = L.Trainer(
         accelerator="gpu",
         devices=1,
         precision="16-mixed",
-        max_epochs=500,
+        max_epochs=1,
         logger=logger,
         callbacks=[
             LogPredictionSamplesCallback(logger),
diff --git a/src/python_ml_project_template/datasets/__init__.py b/src/python_ml_project_template/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python_ml_project_template/datasets/cifar10.py b/src/python_ml_project_template/datasets/cifar10.py
new file mode 100644
index 0000000..3cd7573
--- /dev/null
+++ b/src/python_ml_project_template/datasets/cifar10.py
@@ -0,0 +1,102 @@
+import lightning as L
+import torch
+import torch.utils.data as data
+import torchvision as tv
+from torchvision import transforms as T
+
+
+class CIFAR10DataModule(L.LightningDataModule):
+    def __init__(self, root, batch_size, num_workers):
+        super().__init__()
+        self.root = root
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+
+    def prepare_data(self):
+        # Anything that needs to be done to download.
+        tv.datasets.CIFAR10(self.root, train=True, download=True)
+        tv.datasets.CIFAR10(self.root, train=False, download=True)
+
+    def setup(self, stage: str):
+        """Create the train (augmented), train_val, val and test datasets."""
+        train_transform = T.Compose(
+            [
+                T.RandomHorizontalFlip(),
+                T.RandomResizedCrop((32, 32), scale=(0.8, 1.0), ratio=(0.9, 1.1)),
+                T.ToTensor(),
+                T.Normalize(
+                    [0.49139968, 0.48215841, 0.44653091],
+                    [0.24703223, 0.24348513, 0.26158784],
+                ),
+            ]
+        )
+
+        test_transform = T.Compose(
+            [
+                T.ToTensor(),
+                T.Normalize(
+                    [0.49139968, 0.48215841, 0.44653091],
+                    [0.24703223, 0.24348513, 0.26158784],
+                ),
+            ]
+        )
+
+        # Split the training data 45000/5000 into train/val. Val must not be augmented,
+        # so we wrap the data twice and split both copies with the same seed.
+        train_dataset = tv.datasets.CIFAR10(
+            self.root, train=True, transform=train_transform
+        )
+        val_dataset = tv.datasets.CIFAR10(
+            self.root, train=True, transform=test_transform
+        )
+        generator = torch.Generator().manual_seed(42)
+        self.train_set, _ = torch.utils.data.random_split(
+            train_dataset, [45000, 5000], generator=generator
+        )
+        # random_split advances the generator; re-seed so both splits share indices.
+        generator.manual_seed(42)
+        self.train_val_set, self.val_set = torch.utils.data.random_split(
+            val_dataset, [45000, 5000], generator=generator
+        )
+
+        # Test set.
+        self.test_set = tv.datasets.CIFAR10(
+            self.root, train=False, transform=test_transform
+        )
+
+    def train_dataloader(self):
+        return data.DataLoader(
+            self.train_set,
+            batch_size=self.batch_size,
+            shuffle=True,
+            drop_last=True,
+            pin_memory=True,
+            num_workers=self.num_workers,
+        )
+
+    def val_dataloader(self):
+        return [
+            data.DataLoader(
+                self.train_val_set,
+                batch_size=self.batch_size,
+                shuffle=False,
+                drop_last=False,
+                num_workers=self.num_workers,
+            ),
+            data.DataLoader(
+                self.val_set,
+                batch_size=self.batch_size,
+                shuffle=False,
+                drop_last=False,
+                num_workers=self.num_workers,
+            ),
+        ]
+
+    def test_dataloader(self):
+        return data.DataLoader(
+            self.test_set,
+            batch_size=self.batch_size,
+            shuffle=False,
+            drop_last=False,
+            num_workers=self.num_workers,
+        )

From b62d7af9011bbd8d3506a46e4879c2b1d90ca9cd Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Wed, 21 Jun 2023 16:48:59 -0400
Subject: [PATCH 06/11] Add in Hydra configs and pretty things up

add some explanation

add in some nice hydra configurations
---
 .gitignore                                    |  19 +-
 configs/_logging.yaml                         |  32 +++
 configs/dataset/cifar10.yaml                  |   4 +
 configs/eval.yaml                             |  39 +++
 configs/inference/cifar10_vit.yaml            |   1 +
 configs/model/vit.yaml                        |   8 +
 configs/train.yaml                            |  29 +++
 configs/training/cifar10_vit.yaml             |   3 +
 pyproject.toml                                |  10 +-
 scripts/eval.py                               | 231 ++++++++++--------
 scripts/train.py                              | 207 +++++++++-------
 .../metrics/__init__.py                       |   0
 .../metrics/classification.py                 |  22 ++
 .../models/__init__.py                        |   0
 .../models/classifier.py                      |  61 +++++
 .../nets/__init__.py                          |   0
 .../utils/script_utils.py                     |  37 ++-
 17 files changed, 497 insertions(+), 206 deletions(-)
 create mode 100644 configs/_logging.yaml
 create mode 100644 configs/dataset/cifar10.yaml
 create mode 100644 configs/eval.yaml
 create mode 100644 configs/inference/cifar10_vit.yaml
 create mode 100644 configs/model/vit.yaml
 create mode 100644 configs/train.yaml
 create mode 100644 configs/training/cifar10_vit.yaml
 create mode 100644 src/python_ml_project_template/metrics/__init__.py
 create mode 100644 src/python_ml_project_template/metrics/classification.py
 create mode 100644 src/python_ml_project_template/models/__init__.py
 create mode 100644 src/python_ml_project_template/models/classifier.py
 create mode 100644 src/python_ml_project_template/nets/__init__.py

diff --git a/.gitignore b/.gitignore
index be7fef9..5623d3f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -371,10 +371,15 @@ cython_debug/
 # In general, should be ignored.
 data/
 
-# These are generated by default by lightning, but bad. We want to restructure.
-checkpoints/
-lightning_logs/
-
-# Generated by wandb, but we'll nuke this soon:
-artifacts/
-wandb/
+# These are generated by default by lightning, but our settings should suppress them.
+# THESE SHOULD NO LONGER BE GENERATED!
+# checkpoints/
+# lightning_logs/
+
+# Generated by wandb. Should be under logs, except for artifacts which is toplevel so as to be shared.
+# wandb/
+wandb_artifacts/
+
+# Generated by hydra.
+# outputs/
+logs/
diff --git a/configs/_logging.yaml b/configs/_logging.yaml
new file mode 100644
index 0000000..bec3e19
--- /dev/null
+++ b/configs/_logging.yaml
@@ -0,0 +1,32 @@
+# Where logs go, i.e. the top folder.
+log_dir: ${hydra:runtime.cwd}/logs
+
+output_dir: ${hydra:runtime.output_dir}
+
+# This has to come from above.
+job_type: ???
+
+hydra:
+  run:
+    dir: ${log_dir}/${hydra.job.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: ${log_dir}/${hydra.job.name}/sweep/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  job:
+    chdir: True
+    name: ${job_type}
+
+lightning:
+  checkpoint_dir: ${output_dir}/checkpoints
+
+wandb:
+  entity: r-pad
+  project: python_ml_project_template
+
+  # Group is for grouping runs together (i.e. a train run and an eval run).
+  group: ???
+
+  # Where to dump wandb logs, etc.
+  save_dir: ${output_dir}
+  # Put artifacts at the toplevel so that we don't have to re-download each time...
+  artifact_dir: ${hydra:runtime.cwd}/wandb_artifacts
diff --git a/configs/dataset/cifar10.yaml b/configs/dataset/cifar10.yaml
new file mode 100644
index 0000000..829725b
--- /dev/null
+++ b/configs/dataset/cifar10.yaml
@@ -0,0 +1,4 @@
+name: cifar10
+data_dir: ${hydra:runtime.cwd}/data
+image_size: 32
+num_classes: 10
diff --git a/configs/eval.yaml b/configs/eval.yaml
new file mode 100644
index 0000000..dfb94bc
--- /dev/null
+++ b/configs/eval.yaml
@@ -0,0 +1,39 @@
+mode: eval
+
+# This is somewhat arbitrary.
+job_type: ${mode}_${dataset.name}
+
+defaults:
+  # Each of these has its own configuration parameters.
+  - dataset: cifar10
+  - model: vit
+
+  # A set of inference settings for the model. Note that these may be different
+  # from / or a subset of the training settings. This is so that we don't have to
+  # provide, e.g., a learning rate to eval.
+  - inference: ${dataset}_${model}
+
+  # Simple shared imports.
+  - _logging
+
+  # Override.
+  - _self_
+
+seed: 42
+
+# This is the checkpoint that we're evaluating. You can change this to whatever you need,
+# like if you want multiple checkpoints simultaneously, etc.
+checkpoint:
+  # If we want to load a model for a specific run, we can change that here.
+  run_id: ???
+  reference: ${wandb.entity}/${wandb.project}/model-${checkpoint.run_id}:best
+
+resources:
+  num_workers: 4
+  gpus:
+    - 0
+
+wandb:
+  # The group ***should*** be the same as the training group (so it can be bundled
+  # nicely in the UI). But you might have a one-off eval or something.
+  group: ???
diff --git a/configs/inference/cifar10_vit.yaml b/configs/inference/cifar10_vit.yaml
new file mode 100644
index 0000000..1750891
--- /dev/null
+++ b/configs/inference/cifar10_vit.yaml
@@ -0,0 +1 @@
+batch_size: 128
diff --git a/configs/model/vit.yaml b/configs/model/vit.yaml
new file mode 100644
index 0000000..681b719
--- /dev/null
+++ b/configs/model/vit.yaml
@@ -0,0 +1,8 @@
+name: vit
+hidden_dim: 512
+num_heads: 8
+num_layers: 6
+patch_size: 4
+representation_size: 256
+mlp_dim: 2048
+dropout: 0.2
diff --git a/configs/train.yaml b/configs/train.yaml
new file mode 100644
index 0000000..356c535
--- /dev/null
+++ b/configs/train.yaml
@@ -0,0 +1,29 @@
+mode: train
+
+# This is somewhat arbitrary.
+job_type: ${mode}_${dataset.name}
+
+defaults:
+  # Each of these has its own configuration parameters.
+  - dataset: cifar10
+  - model: vit
+
+  # We assume a different training config for each dataset/model pair.
+  - training: ${dataset}_${model}
+
+  # Simple shared imports.
+  - _logging
+
+  # Override.
+  - _self_
+
+seed: 42
+
+resources:
+  num_workers: 4
+  gpus:
+    - 0
+
+wandb:
+  # Assume no group provided, we will create a default one.
+  group: Null
diff --git a/configs/training/cifar10_vit.yaml b/configs/training/cifar10_vit.yaml
new file mode 100644
index 0000000..4324b00
--- /dev/null
+++ b/configs/training/cifar10_vit.yaml
@@ -0,0 +1,3 @@
+lr: 3e-4
+batch_size: 128
+epochs: 100
diff --git a/pyproject.toml b/pyproject.toml
index a7b98d2..9cc3b18 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,12 +9,13 @@ authors = [
   {email = "baeisner@andrew.cmu.edu", name = "Ben Eisner"}
 ]
 dependencies = [
+  "hydra-core == 1.3.2",
   "lightning == 2.0.3",
+  "omegaconf == 2.3.0",
   "pandas",
-  # CUDA 11.8
-  "torch == 2.0.1",
+  "torch == 2.0.1", # CUDA 11.8
   "torchmetrics",
-  "torchvision == 0.15.2",
+  "torchvision == 0.15.2", # CUDA 11.8
   "wandb == 0.15.4",
 ]
 
@@ -71,3 +72,6 @@ explicit_package_bases = true
 
 [tool.pylint]
 known-third-party = "wandb"
+
+[tool.pylint.TYPECHECK]
+generated-members = 'torch.*'
diff --git a/scripts/eval.py b/scripts/eval.py
index 991c10c..c112d12 100644
--- a/scripts/eval.py
+++ b/scripts/eval.py
@@ -1,52 +1,72 @@
-from typing import Any
-
+import hydra
 import lightning as L
-import pandas as pd
+import omegaconf
 import torch
-import torchmetrics.functional.classification as tfc
-import torchvision as tv
+import torch.utils._pytree as pytree
 import wandb
 
 from python_ml_project_template.datasets.cifar10 import CIFAR10DataModule
-from python_ml_project_template.utils.script_utils import PROJECT_ROOT, match_fn
-
-
-class ClassifierEvalModule(L.LightningModule):
-    def __init__(self, network) -> None:
-        super().__init__()
-        self.network = network
-
-    def forward(self, x):
-        self.network(x)
-
-    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
-        imgs, labels = batch
-        preds = self.network(imgs)
-        return {"preds": preds, "labels": labels}
+from python_ml_project_template.metrics.classification import get_metrics
+from python_ml_project_template.models.classifier import ClassifierInferenceModule
+from python_ml_project_template.utils.script_utils import (
+    PROJECT_ROOT,
+    create_model,
+    flatten_outputs,
+    match_fn,
+)
 
 
 @torch.no_grad()
-def main():
-    # Global seed for reproducibility.
-    L.seed_everything(42)
+@hydra.main(config_path="../configs", config_name="eval", version_base="1.3")
+def main(cfg):
+    ######################################################################
+    # Torch settings.
+    ######################################################################
+
+    # Make deterministic + reproducible.
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
     # Since most of us are training on 3090s+, we can use mixed precision.
     torch.set_float32_matmul_precision("medium")
-    # run_id = "pjf0nfg6"
-    run_id = "0v36p8tn"
-    checkpoint_reference = f"r-pad/lightning-hydra-template/model-{run_id}:v0"
 
-    # download checkpoint locally (if not already cached)
+    # Global seed for reproducibility.
+    L.seed_everything(42)
+
+    ######################################################################
+    # Create the datamodule.
+    # Should be the same one as in training, but we're gonna use val+test
+    # dataloaders.
+    ######################################################################
+
+    datamodule = CIFAR10DataModule(
+        root=cfg.dataset.data_dir,
+        batch_size=cfg.inference.batch_size,
+        num_workers=cfg.resources.num_workers,
+    )
+    # Gotta call this in order to establish the dataloaders.
+    datamodule.setup("predict")
+
+    ######################################################################
+    # Set up logging in WandB.
+    # This is a different job type (eval), but we want it all grouped
+    # together. Notice that we use our own logging here (not lightning).
+    ######################################################################
+
+    # Create a run.
     run = wandb.init(
-        entity="r-pad",
-        project="lightning-hydra-template",
-        job_type="eval",
-        group=f"experiment-{run_id}",
-        config={"eval config": "wat"},
+        entity=cfg.wandb.entity,
+        project=cfg.wandb.project,
+        dir=cfg.wandb.save_dir,
+        config=omegaconf.OmegaConf.to_container(
+            cfg, resolve=True, throw_on_missing=True
+        ),
+        job_type=cfg.job_type,
+        save_code=True,  # This just has the main script.
+        group=cfg.wandb.group,
     )
 
+    # Log the code.
     wandb.run.log_code(
         root=PROJECT_ROOT,
         include_fn=match_fn(
@@ -55,44 +75,77 @@ def main():
         ),
     )
 
-    network = tv.models.VisionTransformer(
-        image_size=32,
-        hidden_dim=512,
-        num_heads=8,
-        num_layers=6,
-        patch_size=4,
-        num_classes=10,
-        representation_size=256,
-        mlp_dim=2048,
-        dropout=0.2,
+    ######################################################################
+    # Create the network(s) which will be evaluated (same as training).
+    # You might want to put this into a "create_network" function
+    # somewhere so train and eval can be the same.
+    #
+    # We'll also load the weights.
+    ######################################################################
+
+    network = create_model(
+        image_size=cfg.dataset.image_size,
+        num_classes=cfg.dataset.num_classes,
+        model_cfg=cfg.model,
     )
 
-    artifact = run.use_artifact(checkpoint_reference, type="model")
-    # artifact_dir = artifact.download()
-    ckpt_file = artifact.get_path("model.ckpt").download()
-
+    # Get the checkpoint file. If it's a wandb reference, download.
+    # Otherwise look to disk.
+    checkpoint_reference = cfg.checkpoint.reference
+    if checkpoint_reference.startswith(cfg.wandb.entity):
+        # download checkpoint locally (if not already cached)
+        artifact_dir = cfg.wandb.artifact_dir
+        artifact = run.use_artifact(checkpoint_reference, type="model")
+        ckpt_file = artifact.get_path("model.ckpt").download(root=artifact_dir)
+    else:
+        ckpt_file = checkpoint_reference
+
+    # Load the network weights.
     ckpt = torch.load(ckpt_file)
-
     network.load_state_dict(
         {k.partition(".")[2]: v for k, v, in ckpt["state_dict"].items()}
     )
 
-    model = ClassifierEvalModule(network)
-
-    root = "./data"
-    datamodule = CIFAR10DataModule(root, batch_size=128, num_workers=4)
-    # Gotta call this in order to establish the dataloaders.
-    datamodule.setup("predict")
+    ######################################################################
+    # Create an inference module, which is basically just a bare-bones
+    # class which runs the model. In this example, we only implement
+    # the "predict_step" function, which may not be the blessed
+    # way to do it vis a vis lightning, but whatever.
+    #
+    # If this is a downstream application or something, you might
+    # want to implement a different interface (like with a "predict"
+    # function), so you can pass in un-batched observations from an
+    # environment, for instance.
+    ######################################################################
+
+    model = ClassifierInferenceModule(network)
+
+    ######################################################################
+    # Create the trainer.
+    # Bit of a misnomer here, we're not doing training. But we are gonna
+    # use it to set up the model appropriately and do all the batching
+    # etc.
+    #
+    # If this is a different kind of downstream eval, chuck this block.
+    ######################################################################
 
     trainer = L.Trainer(
         accelerator="gpu",
-        devices=1,
+        devices=cfg.resources.gpus,
         precision="16-mixed",
-        max_epochs=1,
         logger=False,
     )
 
-    train_preds, val_preds, test_preds = trainer.predict(
+    ######################################################################
+    # Run the model on the train/val/test sets.
+    # This outputs a list of dictionaries, one for each batch. This
+    # is annoying to work with, so later we'll flatten.
+    #
+    # If a downstream eval, you can swap it out with whatever the eval
+    # function is.
+    ######################################################################
+
+    train_outputs, val_outputs, test_outputs = trainer.predict(
         model,
         dataloaders=[
             *datamodule.val_dataloader(),  # There are two different loaders (train_val and val).
@@ -100,60 +153,28 @@ def main():
         ],
     )
 
-    # Each of these is a list of dictionaries, where each dictionary is the output of the predict_step method.
-    # We can use the `preds` and `labels` keys to calculate metrics.
-    # For example, we can calculate the accuracy like so:
-
-    for pred_list, name in [
-        (train_preds, "train"),
-        (val_preds, "val"),
-        (test_preds, "test"),
+    for outputs_list, name in [
+        (train_outputs, "train"),
+        (val_outputs, "val"),
+        (test_outputs, "test"),
     ]:
-        pass
-
-        all_preds = torch.cat([x["preds"].cpu() for x in pred_list])
-        all_labels = torch.cat([x["labels"].cpu() for x in pred_list])
-        global_acc = (
-            tfc.multiclass_accuracy(
-                all_preds,
-                all_labels,
-                num_classes=10,
-                average="micro",
-            )
-            .numpy()
-            .item()
-        )
-
-        macro_acc = (
-            tfc.multiclass_accuracy(
-                all_preds,
-                all_labels,
-                num_classes=10,
-                average="micro",
-            )
-            .numpy()
-            .item()
-        )
-
-        # We also want to log per-label accuracies.
-        acc_per_label = tfc.multiclass_accuracy(
-            all_preds,
-            all_labels,
-            num_classes=10,
-            average="none",
-        ).numpy()
-
-        # Create a dataframe with the per-label accuracies, as well as the global and macro accuracies.
-        # The columns of the table should be the labels, and there should only be a single row.
-        acc_df = pd.DataFrame(acc_per_label[None], columns=[str(i) for i in range(10)])
-
-        # Log the dataframe to wandb.
-        table = wandb.Table(dataframe=acc_df)
-        run.log({f"{name}_accuracy_table": table})
+        # Put everything on CPU, and flatten a list of dicts into one dict.
+        out_cpu = [pytree.tree_map(lambda x: x.cpu(), o) for o in outputs_list]
+        outputs = flatten_outputs(out_cpu)
+
+        # Compute the metrics.
+        metrics = get_metrics(outputs["preds"], outputs["labels"])
+        global_acc = metrics["global_acc"]
+        macro_acc = metrics["macro_acc"]
+        acc_df = metrics["acc_df"]
 
+        # Log the metrics + table to wandb.
         run.summary[f"{name}_true_accuracy"] = global_acc
         run.summary[f"{name}_class_balanced_accuracy"] = macro_acc
 
+        table = wandb.Table(dataframe=acc_df)
+        run.log({f"{name}_accuracy_table": table})
+
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/train.py b/scripts/train.py
index 2dab2bd..07e14d0 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -1,130 +1,147 @@
+import json
+
+import hydra
 import lightning as L
+import omegaconf
 import torch
-import torch.nn.functional as F
-import torchvision as tv
 import wandb
 from lightning.pytorch.callbacks import ModelCheckpoint
 from lightning.pytorch.loggers import WandbLogger
-from torch import optim
 
 from python_ml_project_template.datasets.cifar10 import CIFAR10DataModule
+from python_ml_project_template.models.classifier import ClassifierTrainingModule
 from python_ml_project_template.utils.script_utils import (
     PROJECT_ROOT,
     LogPredictionSamplesCallback,
+    create_model,
     match_fn,
 )
 
-# TODOs:
-# [x] Switch to CIFAR10
-# [x] Add wandb, including saving the checkpoint, logging an image, and saving the codebase state.
-#     - [x] Add a callback to save the model to wandb.
-#     - [x] Add a callback to save the codebase to wandb.
-#     - [x] Add an image logging example.
-# [x] Add a DataModule
-# [x] Add an eval script which loads from wandb, and outputs a table.
-# [ ] Fix grouping stuff...
-# [ ] Add hydra configs
-# [ ] Align the checkpoints and log files
-
-
-class ClassifierTrainingModule(L.LightningModule):
-    def __init__(self, network, lr: float) -> None:
-        super().__init__()
-        self.network = network
-        self.lr = lr
-
-    def forward(self, x):
-        self.network(x)
-
-    def configure_optimizers(self):
-        optimizer = optim.AdamW(self.parameters(), lr=self.lr)
-        lr_scheduler = optim.lr_scheduler.MultiStepLR(
-            optimizer, milestones=[100, 150], gamma=0.1
-        )
-        return [optimizer], [lr_scheduler]
-
-    def _calculate_loss(self, batch, mode="train"):
-        imgs, labels = batch
-        preds = self.network(imgs)
-        loss = F.cross_entropy(preds, labels)
-        acc = (preds.argmax(dim=-1) == labels).float().mean()
-
-        istrain = mode == "train"
-        self.log("%s_loss" % mode, loss, prog_bar=istrain, add_dataloader_idx=False)
-        self.log("%s_acc" % mode, acc, add_dataloader_idx=False)
-        return {"loss": loss, "acc": acc, "preds": preds}
-
-    def training_step(self, batch, batch_idx):
-        loss = self._calculate_loss(batch, mode="train")
-        return loss
-
-    def validation_step(self, batch, batch_idx, dataloader_idx=0):
-        if dataloader_idx == 0:
-            mode = "train_val"
-        else:
-            mode = "val"
-        return self._calculate_loss(batch, mode=mode)
-
-    def test_step(self, batch, batch_idx):
-        return self._calculate_loss(batch, mode="test")
 
+@hydra.main(config_path="../configs", config_name="train", version_base="1.3")
+def main(cfg):
+    print(
+        json.dumps(
+            omegaconf.OmegaConf.to_container(cfg, resolve=True, throw_on_missing=False),
+            sort_keys=True,
+            indent=4,
+        )
+    )
+    ######################################################################
+    # Torch settings.
+    ######################################################################
 
-def main():
-    # Global seed for reproducibility.
-    L.seed_everything(42)
+    # Make deterministic + reproducible.
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
     # Since most of us are training on 3090s+, we can use mixed precision.
     torch.set_float32_matmul_precision("medium")
 
-    root = "./data"
-
-    network = tv.models.VisionTransformer(
-        image_size=32,
-        hidden_dim=512,
-        num_heads=8,
-        num_layers=6,
-        patch_size=4,
-        num_classes=10,
-        representation_size=256,
-        mlp_dim=2048,
-        dropout=0.2,
+    # Global seed for reproducibility.
+    L.seed_everything(cfg.seed)
+
+    ######################################################################
+    # Create the datamodule.
+    # The datamodule is responsible for all the data loading, including
+    # downloading the data, and splitting it into train/val/test.
+    #
+    # This could be swapped out for a different datamodule in-place,
+    # or with an if statement, or by using hydra.instantiate.
+    ######################################################################
+
+    datamodule = CIFAR10DataModule(
+        root=cfg.dataset.data_dir,
+        batch_size=cfg.training.batch_size,
+        num_workers=cfg.resources.num_workers,
     )
-    model = ClassifierTrainingModule(network, lr=3e-4)
 
-    datamodule = CIFAR10DataModule(root, batch_size=128, num_workers=4)
-
-    save_dir = "./wandb"
-    checkpoint_dir = "./checkpoints"
+    ######################################################################
+    # Create the network(s) which will be trained by the Training Module.
+    # The network should (ideally) be lightning-independent. This allows
+    # us to use the network in other projects, or in other training
+    # configurations.
+    #
+    # This might get a bit more complicated if we have multiple networks,
+    # but we can just customize the training module and the Hydra configs
+    # to handle that case. No need to over-engineer it. You might
+    # want to put this into a "create_network" function somewhere so train
+    # and eval can be the same.
+    #
+    # If it's a custom network, a good idea is to put the custom network
+    # in `python_ml_project_template.nets.my_net`.
+    ######################################################################
+
+    # Model architecture is dataset-dependent, so we have a helper
+    # function to create the model (while separating out relevant vals).
+    network = create_model(
+        image_size=cfg.dataset.image_size,
+        num_classes=cfg.dataset.num_classes,
+        model_cfg=cfg.model,
+    )
 
-    # Create a new wandb run.
-    id = wandb.util.generate_id()
-    group = "experiment-" + id
+    ######################################################################
+    # Create the training module.
+    # The training module is responsible for all the different parts of
+    # training, including the network, the optimizer, the loss function,
+    # and the logging.
+    ######################################################################
+
+    model = ClassifierTrainingModule(network, training_cfg=cfg.training)
+
+    ######################################################################
+    # Set up logging in WandB.
+    # This is a bit complicated, because we want to log the codebase,
+    # the model, and the checkpoints.
+    ######################################################################
+
+    # If no group is provided, then we should create a new one (so we can allocate
+    # evaluations to this group later).
+    if cfg.wandb.group is None:
+        id = wandb.util.generate_id()
+        group = "experiment-" + id
+    else:
+        group = cfg.wandb.group
 
     logger = WandbLogger(
-        project="lightning-hydra-template",
-        entity="r-pad",
+        entity=cfg.wandb.entity,
+        project=cfg.wandb.project,
         log_model=True,  # Only log the last checkpoint to wandb, and only the LAST model checkpoint.
-        save_dir=save_dir,
-        config={"testit": "wat"},
-        job_type="train",
+        save_dir=cfg.wandb.save_dir,
+        config=omegaconf.OmegaConf.to_container(
+            cfg, resolve=True, throw_on_missing=True
+        ),
+        job_type=cfg.job_type,
+        save_code=True,  # This just has the main script.
         group=group,
-        id=id,
     )
 
+    ######################################################################
+    # Create the trainer.
+    # The trainer is responsible for running the training loop, and
+    # logging the results.
+    #
+    # There are a few callbacks (which we could customize):
+    # - LogPredictionSamplesCallback: Logs some examples from the dataset,
+    #       and the model's predictions.
+    # - ModelCheckpoint #1: Saves the latest model.
+    # - ModelCheckpoint #2: Saves the best model (according to validation
+    #       loss), and logs it to wandb.
+    ######################################################################
+
     trainer = L.Trainer(
         accelerator="gpu",
-        devices=1,
+        devices=cfg.resources.gpus,
         precision="16-mixed",
-        max_epochs=1,
+        max_epochs=cfg.training.epochs,
         logger=logger,
         callbacks=[
+            # Callback which logs whatever visuals (i.e. dataset examples, preds, etc.) we want.
             LogPredictionSamplesCallback(logger),
             # This checkpoint callback saves the latest model during training, i.e. so we can resume if it crashes.
             # It saves everything, and you can load by referencing last.ckpt.
             ModelCheckpoint(
-                checkpoint_dir,
+                dirpath=cfg.lightning.checkpoint_dir,
                 filename="{epoch}-{step}",
                 monitor="step",
                 mode="max",
@@ -133,7 +150,7 @@ def main():
             ),
             # This checkpoint will get saved to WandB. The Callback mechanism in lightning is poorly designed, so we have to put it last.
             ModelCheckpoint(
-                checkpoint_dir,
+                dirpath=cfg.lightning.checkpoint_dir,
                 filename="{epoch}-{step}-{val_loss:.2f}-weights-only",
                 monitor="val_loss",
                 mode="min",
@@ -142,6 +159,13 @@ def main():
         ],
     )
 
+    ######################################################################
+    # Log the code to wandb.
+    # This is somewhat custom, you'll have to edit this to include whatever
+    # additional files you want, but basically it just logs all the files
+    # in the project root inside dirs, and with extensions.
+    ######################################################################
+
     # Log the code used to train the model. Make sure not to log too much, because it will be too big.
     wandb.run.log_code(
         root=PROJECT_ROOT,
@@ -151,7 +175,10 @@ def main():
         ),
     )
 
-    # Run training.
+    ######################################################################
+    # Train the model.
+    ######################################################################
+
     trainer.fit(model, datamodule=datamodule)
 
 
diff --git a/src/python_ml_project_template/metrics/__init__.py b/src/python_ml_project_template/metrics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python_ml_project_template/metrics/classification.py b/src/python_ml_project_template/metrics/classification.py
new file mode 100644
index 0000000..68ca4bc
--- /dev/null
+++ b/src/python_ml_project_template/metrics/classification.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import torchmetrics.functional.classification as tfc
+
+
+def get_metrics(preds, labels, num_classes=10):
+    """Compute global, macro and per-class accuracy for a set of predictions.
+
+    Args:
+        preds: Predictions, in any form `tfc.multiclass_accuracy` accepts.
+        labels: Ground-truth labels, passed to `tfc.multiclass_accuracy` as target.
+        num_classes: Number of classes. Defaults to 10, the value this function
+            previously hard-coded.
+
+    Returns:
+        Dict with "global_acc" (micro average), "macro_acc" (macro average)
+        and "acc_df", a one-row DataFrame with one column per class index.
+    """
+    # "True" accuracy, aka on the true distribution (without considering class imbalance).
+    global_acc = tfc.multiclass_accuracy(
+        preds, labels, num_classes=num_classes, average="micro"
+    )
+
+    # Per-class accuracy, averaged over all classes with equal weight.
+    macro_acc = tfc.multiclass_accuracy(
+        preds, labels, num_classes=num_classes, average="macro"
+    )
+
+    # Per-class accuracy, not averaged over all classes.
+    class_acc = tfc.multiclass_accuracy(
+        preds, labels, num_classes=num_classes, average="none"
+    )
+
+    # Create a dataframe with the per-label accuracies. Convert to a host numpy
+    # array first, since pandas cannot read a tensor that lives on the GPU.
+    acc_df = pd.DataFrame(
+        class_acc[None].detach().cpu().numpy(),
+        columns=[str(i) for i in range(num_classes)],
+    )
+
+    return {
+        "global_acc": global_acc,
+        "macro_acc": macro_acc,
+        "acc_df": acc_df,
+    }
diff --git a/src/python_ml_project_template/models/__init__.py b/src/python_ml_project_template/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python_ml_project_template/models/classifier.py b/src/python_ml_project_template/models/classifier.py
new file mode 100644
index 0000000..f567782
--- /dev/null
+++ b/src/python_ml_project_template/models/classifier.py
@@ -0,0 +1,85 @@
+from typing import Any
+
+import lightning as L
+import torch.nn.functional as F
+from torch import optim
+
+
+class ClassifierTrainingModule(L.LightningModule):
+    """Lightning module that trains a classifier with cross-entropy loss.
+
+    Args:
+        network: Module that maps a batch of images to per-class scores.
+        training_cfg: Config object; only its `lr` attribute is read here.
+    """
+
+    def __init__(self, network, training_cfg) -> None:
+        super().__init__()
+        self.network = network
+        self.lr = training_cfg.lr
+
+    def forward(self, x):
+        """Run the wrapped network on `x` and return its output."""
+        return self.network(x)
+
+    def configure_optimizers(self):
+        """Return AdamW and a MultiStepLR scheduler (gamma 0.1 at 100 and 150)."""
+        optimizer = optim.AdamW(self.parameters(), lr=self.lr)
+        lr_scheduler = optim.lr_scheduler.MultiStepLR(
+            optimizer, milestones=[100, 150], gamma=0.1
+        )
+        return [optimizer], [lr_scheduler]
+
+    def _calculate_loss(self, batch, mode="train"):
+        """Compute and log the cross-entropy loss and accuracy of one batch.
+
+        Metrics are logged as `<mode>_loss` and `<mode>_acc`; the loss appears in
+        the progress bar only when `mode` is "train".
+
+        Returns:
+            Dict with keys "loss", "acc" and "preds" (the raw network output).
+        """
+        imgs, labels = batch
+        preds = self.network(imgs)
+        loss = F.cross_entropy(preds, labels)
+        acc = (preds.argmax(dim=-1) == labels).float().mean()
+
+        istrain = mode == "train"
+        self.log("%s_loss" % mode, loss, prog_bar=istrain, add_dataloader_idx=False)
+        self.log("%s_acc" % mode, acc, add_dataloader_idx=False)
+        return {"loss": loss, "acc": acc, "preds": preds}
+
+    def training_step(self, batch, batch_idx):
+        """Return the metrics dict (including "loss") for a training batch."""
+        loss = self._calculate_loss(batch, mode="train")
+        return loss
+
+    def validation_step(self, batch, batch_idx, dataloader_idx=0):
+        """Log metrics as "train_val" for dataloader 0 and "val" for any other."""
+        if dataloader_idx == 0:
+            mode = "train_val"
+        else:
+            mode = "val"
+        return self._calculate_loss(batch, mode=mode)
+
+    def test_step(self, batch, batch_idx):
+        """Compute and log the metrics of a test batch under the "test" prefix."""
+        return self._calculate_loss(batch, mode="test")
+
+
+class ClassifierInferenceModule(L.LightningModule):
+    """Lightning module that wraps a network for prediction only."""
+
+    def __init__(self, network) -> None:
+        super().__init__()
+        self.network = network
+
+    def forward(self, x):
+        """Run the wrapped network on `x` and return its output."""
+        return self.network(x)
+
+    def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any:
+        """Return the network output and labels of one `(imgs, labels)` batch."""
+        imgs, labels = batch
+        preds = self.network(imgs)
+        return {"preds": preds, "labels": labels}
diff --git a/src/python_ml_project_template/nets/__init__.py b/src/python_ml_project_template/nets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/python_ml_project_template/utils/script_utils.py b/src/python_ml_project_template/utils/script_utils.py
index 80bbd50..6a51de3 100644
--- a/src/python_ml_project_template/utils/script_utils.py
+++ b/src/python_ml_project_template/utils/script_utils.py
@@ -1,7 +1,10 @@
 import os
 import pathlib
-from typing import Sequence
+from typing import Dict, List, Sequence, Union
 
+import torch
+import torch.utils._pytree as pytree
+import torchvision as tv
 import wandb
 from lightning.pytorch import Callback
 from pytorch_lightning.loggers import WandbLogger
@@ -9,6 +12,23 @@
 PROJECT_ROOT = str(pathlib.Path(__file__).parent.parent.parent.parent.resolve())
 
 
+def create_model(image_size, num_classes, model_cfg):
+    """Build the network named by `model_cfg.name` (only "vit" is supported)."""
+    if model_cfg.name != "vit":
+        raise ValueError(f"not a valid model name: {model_cfg.name!r}")
+    return tv.models.VisionTransformer(
+        image_size=image_size,
+        num_classes=num_classes,
+        hidden_dim=model_cfg.hidden_dim,
+        num_heads=model_cfg.num_heads,
+        num_layers=model_cfg.num_layers,
+        patch_size=model_cfg.patch_size,
+        representation_size=model_cfg.representation_size,
+        mlp_dim=model_cfg.mlp_dim,
+        dropout=model_cfg.dropout,
+    )
+
+
 # This matching function
 def match_fn(dirs: Sequence[str], extensions: Sequence[str], root: str = PROJECT_ROOT):
     def _match_fn(path: pathlib.Path):
@@ -25,6 +45,21 @@ def _match_fn(path: pathlib.Path):
     return _match_fn
 
 
+TorchTree = Dict[str, Union[torch.Tensor, "TorchTree"]]
+
+
+def flatten_outputs(outputs: List[TorchTree]) -> TorchTree:
+    """Flatten a list of dictionaries into a single dictionary."""
+
+    # Concatenate all leaf nodes in the trees.
+    flattened_outputs = [pytree.tree_flatten(output) for output in outputs]
+    flattened_list = [o[0] for o in flattened_outputs]
+    flattened_spec = flattened_outputs[0][1]  # Spec definitely should be the same...
+    cat_flat = [torch.cat(x) for x in list(zip(*flattened_list))]
+    output_dict = pytree.tree_unflatten(cat_flat, flattened_spec)
+    return output_dict
+
+
 class LogPredictionSamplesCallback(Callback):
     def __init__(self, logger: WandbLogger):
         self.logger = logger

From 87feb0829df1922f8d4aed361acb460d454dfb9f Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Thu, 22 Jun 2023 16:59:09 -0400
Subject: [PATCH 07/11] fix typing

---
 pyproject.toml                                       | 11 ++++++-----
 src/python_ml_project_template/utils/script_utils.py |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9cc3b18..523127f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ develop = [
   "black == 23.3.0",
   "isort == 5.12.0",
   "mypy == 1.3.0",
+  "pandas-stubs == 2.0.2.230605",
   "pylint == 2.17.4",
   "pytest == 7.3.2",
   "pre-commit == 3.3.3",
@@ -64,11 +65,11 @@ mypy_path = "src"
 namespace_packages = true
 explicit_package_bases = true
 
-# # Uncomment this when you have imports for mypy to ignore.
-# [[tool.mypy.overrides]]
-# module = [
-# ]
-# ignore_missing_imports = true
+[[tool.mypy.overrides]]
+module = [
+  "torchvision.*",
+]
+ignore_missing_imports = true
 
 [tool.pylint]
 known-third-party = "wandb"
diff --git a/src/python_ml_project_template/utils/script_utils.py b/src/python_ml_project_template/utils/script_utils.py
index 6a51de3..ebb68b5 100644
--- a/src/python_ml_project_template/utils/script_utils.py
+++ b/src/python_ml_project_template/utils/script_utils.py
@@ -1,6 +1,6 @@
 import os
 import pathlib
-from typing import Dict, List, Sequence, Union
+from typing import Dict, List, Sequence, Union, cast
 
 import torch
 import torch.utils._pytree as pytree
@@ -57,7 +57,7 @@ def flatten_outputs(outputs: List[TorchTree]) -> TorchTree:
     flattened_spec = flattened_outputs[0][1]  # Spec definitely should be the same...
     cat_flat = [torch.cat(x) for x in list(zip(*flattened_list))]
     output_dict = pytree.tree_unflatten(cat_flat, flattened_spec)
-    return output_dict
+    return cast(TorchTree, output_dict)
 
 
 class LogPredictionSamplesCallback(Callback):

From 7fb123ae1c4ccf1780a84ed11a6f59a65bb22aa3 Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Fri, 4 Aug 2023 01:27:41 -0400
Subject: [PATCH 08/11] add a simple Dockerfile which can be used to train

---
 .dockerignore | 370 ++++++++++++++++++++++++++++++++++++++++++++++++++
 Dockerfile    |  55 ++++++++
 README.md     |  25 ++++
 3 files changed, 450 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 Dockerfile

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..c8702b8
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,370 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,pycharm,git,linux
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos,pycharm,git,linux
+
+### Git ###
+# Created by git for backups. To disable backups in Git:
+# $ git config --global mergetool.keepBackup false
+*.orig
+
+# Created by git when using merge tools for conflicts
+*.BACKUP.*
+*.BASE.*
+*.LOCAL.*
+*.REMOTE.*
+*_BACKUP_*.txt
+*_BASE_*.txt
+*_LOCAL_*.txt
+*_REMOTE_*.txt
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+
+### PyCharm ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### PyCharm Patch ###
+# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
+
+# *.iml
+# modules.xml
+# .idea/misc.xml
+# *.ipr
+
+# Sonarlint plugin
+# https://plugins.jetbrains.com/plugin/7973-sonarlint
+.idea/**/sonarlint/
+
+# SonarQube Plugin
+# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
+.idea/**/sonarIssues.xml
+
+# Markdown Navigator plugin
+# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
+.idea/**/markdown-navigator.xml
+.idea/**/markdown-navigator-enh.xml
+.idea/**/markdown-navigator/
+
+# Cache file creation bug
+# See https://youtrack.jetbrains.com/issue/JBR-2257
+.idea/$CACHE_FILE$
+
+# CodeStream plugin
+# https://plugins.jetbrains.com/plugin/12206-codestream
+.idea/codestream.xml
+
+# Azure Toolkit for IntelliJ plugin
+# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
+.idea/**/azureSettings.xml
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# Support for Project snippet scope
+.vscode/*.code-snippets
+
+# Ignore code-workspaces
+*.code-workspace
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos,pycharm,git,linux
+
+.dockerignore
+Dockerfile
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..eb217db
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,55 @@
+# Use the official Ubuntu 20.04 image as the base
+FROM ubuntu:20.04
+
+# Set environment variables to avoid interactive prompts during installation
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install necessary dependencies
+RUN apt-get update && \
+    apt-get install -y curl git build-essential libssl-dev zlib1g-dev libbz2-dev \
+    git \
+    libreadline-dev libsqlite3-dev wget llvm libncurses5-dev libncursesw5-dev \
+    xz-utils tk-dev libffi-dev liblzma-dev python-openssl && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install pyenv
+ENV HOME="/root"
+
+WORKDIR $HOME
+RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv
+
+ENV PYENV_ROOT="$HOME/.pyenv"
+ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH"
+
+# Install Python 3.10 using pyenv
+RUN pyenv install 3.10.0
+RUN pyenv global 3.10.0
+
+# Install PyTorch with CUDA support (make sure to adjust this depending on your CUDA version)
+RUN pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118/
+
+# Make the working directory the home directory
+RUN mkdir $HOME/code
+WORKDIR $HOME/code
+
+# Only copy in the source code that is necessary for the dependencies to install
+COPY ./src $HOME/code/src
+COPY ./setup.py $HOME/code/setup.py
+COPY ./pyproject.toml $HOME/code/pyproject.toml
+RUN pip install -e .
+
+# Changes to the configs and scripts will not require a rebuild
+COPY ./configs $HOME/code/configs
+COPY ./scripts $HOME/code/scripts
+
+RUN git config --global --add safe.directory /root/code
+
+# Make a data directory.
+RUN mkdir /root/data
+
+# Make a logs directory.
+RUN mkdir /root/logs
+
+# Set up the entry point
+CMD ["python", "-c", "import torch; print(torch.cuda.is_available())"]
diff --git a/README.md b/README.md
index e3fb3dc..fbb7e41 100644
--- a/README.md
+++ b/README.md
@@ -46,3 +46,28 @@ Then we install pre-commit hooks:
 pre-commit install
 
 ```
+
+## Docker
+
+To build the docker image, run:
+
+```bash
+docker build -t python-ml-project-template .
+```
+
+To run the training script, run:
+
+```bash
+WANDB_API_KEY=<API_KEY>
+# Optional: mount current directory to run / test new code.
+# Mount data directory to access data.
+docker run \
+    -v $(pwd)/data:/root/data \
+    -v $(pwd)/logs:/root/logs \
+    --gpus all \
+    -e WANDB_API_KEY=$WANDB_API_KEY \
+    -e WANDB_DOCKER_IMAGE=python-ml-project-template \
+    python-ml-project-template python scripts/train.py \
+        dataset.data_dir=/root/data \
+        log_dir=/root/logs
+```

From c821ec2f419f59675a058be400970f6c00e975c1 Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Fri, 4 Aug 2023 18:11:19 -0400
Subject: [PATCH 09/11] update the Dockerfile so that things work on
 singularity smh

---
 Dockerfile | 24 ++++++++++++------------
 README.md  | 18 ++++++++++++++----
 autobot.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+), 16 deletions(-)
 create mode 100644 autobot.md

diff --git a/Dockerfile b/Dockerfile
index eb217db..b6be688 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,12 +14,12 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/*
 
 # Install pyenv
-ENV HOME="/root"
+ENV CODING_ROOT="/opt/baeisner"
 
-WORKDIR $HOME
+WORKDIR $CODING_ROOT
 RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv
 
-ENV PYENV_ROOT="$HOME/.pyenv"
+ENV PYENV_ROOT="$CODING_ROOT/.pyenv"
 ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH"
 
 # Install Python 3.10 using pyenv
@@ -30,26 +30,26 @@ RUN pyenv global 3.10.0
 RUN pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118/
 
 # Make the working directory the home directory
-RUN mkdir $HOME/code
-WORKDIR $HOME/code
+RUN mkdir $CODING_ROOT/code
+WORKDIR $CODING_ROOT/code
 
 # Only copy in the source code that is necessary for the dependencies to install
-COPY ./src $HOME/code/src
-COPY ./setup.py $HOME/code/setup.py
-COPY ./pyproject.toml $HOME/code/pyproject.toml
+COPY ./src $CODING_ROOT/code/src
+COPY ./setup.py $CODING_ROOT/code/setup.py
+COPY ./pyproject.toml $CODING_ROOT/code/pyproject.toml
 RUN pip install -e .
 
 # Changes to the configs and scripts will not require a rebuild
-COPY ./configs $HOME/code/configs
-COPY ./scripts $HOME/code/scripts
+COPY ./configs $CODING_ROOT/code/configs
+COPY ./scripts $CODING_ROOT/code/scripts
 
 RUN git config --global --add safe.directory /root/code
 
 # Make a data directory.
-RUN mkdir /root/data
+RUN mkdir $CODING_ROOT/data
 
 # Make a logs directory.
-RUN mkdir /root/logs
+RUN mkdir $CODING_ROOT/logs
 
 # Set up the entry point
 CMD ["python", "-c", "import torch; print(torch.cuda.is_available())"]
diff --git a/README.md b/README.md
index fbb7e41..44dab1d 100644
--- a/README.md
+++ b/README.md
@@ -52,18 +52,18 @@ pre-commit install
 To build the docker image, run:
 
 ```bash
-docker build -t python-ml-project-template .
+docker build -t <my_dockerhub_username>/python-ml-project-template .
 ```
 
-To run the training script, run:
+To run the training script locally, run:
 
 ```bash
 WANDB_API_KEY=<API_KEY>
 # Optional: mount current directory to run / test new code.
 # Mount data directory to access data.
 docker run \
-    -v $(pwd)/data:/root/data \
-    -v $(pwd)/logs:/root/logs \
+    -v $(pwd)/data:/opt/baeisner/data \
+    -v $(pwd)/logs:/opt/baeisner/logs \
     --gpus all \
     -e WANDB_API_KEY=$WANDB_API_KEY \
     -e WANDB_DOCKER_IMAGE=python-ml-project-template \
@@ -71,3 +71,13 @@ docker run \
-        dataset.data_dir=/root/data \
-        log_dir=/root/logs
+        dataset.data_dir=/opt/baeisner/data \
+        log_dir=/opt/baeisner/logs
 ```
+
+To push this:
+
+```bash
+docker push <my_dockerhub_username>/python-ml-project-template:latest
+```
+
+## Running on Clusters
+
+* [Autobot](autobot.md)
diff --git a/autobot.md b/autobot.md
new file mode 100644
index 0000000..d039f5b
--- /dev/null
+++ b/autobot.md
@@ -0,0 +1,49 @@
+# Instructions for running this thing on Autobot.
+
+
+0. Before you do anything, make sure you've built your docker image and pushed it to dockerhub!!!
+
+1. ssh into autobot:
+
+    ```
+    ssh <SCS_username>@autobot.vision.cs.cmu.edu
+    ```
+
+    a. *YOU ONLY NEED TO DO THIS ONCE*: Add your wandb API key to your bashrc:
+
+        ```bash
+        echo 'export WANDB_API_KEY="your_api_key_here"' >> ~/.bashrc
+        source ~/.bashrc
+        ```
+
+2. Find a node on http://autobot.vision.cs.cmu.edu/mtcmon/ which has open GPUs.
+
+3. SSH into that node:
+
+    ```
+    ssh autobot-0-33
+    ```
+
+    a. *YOU ONLY NEED TO DO THIS ONCE*: Create some scratch directories for your data and logs.
+
+        ```bash
+        mkdir -p /scratch/$(whoami)/data
+        mkdir -p /scratch/$(whoami)/logs
+        ```
+4. Run a training job like so. Don't worry about building or installing. You can modify the files here to map to whatever you want. In future iterations of this, we'll make this easier to do (aka by using a hydra singularity config file or something so you don't have to explicitly map as arguments).
+
+    You can also change which GPU you want access to using CUDA_VISIBLE_DEVICES below.
+
+    ```bash
+    SINGULARITYENV_CUDA_VISIBLE_DEVICES=0 \
+    SINGULARITYENV_WANDB_DOCKER_IMAGE=python-ml-project-template \
+    singularity exec \
+    --nv \
+    --pwd /opt/baeisner/code \
+    -B /scratch/$(whoami)/data:/opt/data \
+    -B /scratch/$(whoami)/logs:/opt/logs \
+    docker://beisner/python-ml-project-template \
+    python scripts/train.py \
+        dataset.data_dir=/opt/data \
+        log_dir=/opt/logs
+    ```

From 6d48fcd943292ac3376f3d9eef4d2f4657d3248c Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Wed, 2 Oct 2024 16:19:10 -0400
Subject: [PATCH 10/11] first attempt to get the github actions to run

---
 .github/workflows/build-container.yaml | 43 ++++++++++++++++++++++++
 .github/workflows/build-site.yaml      | 17 ++++++----
 .github/workflows/compute-tag.yaml     | 45 ++++++++++++++++++++++++++
 .github/workflows/deploy-site.yaml     |  6 ++--
 .github/workflows/merge-request.yaml   |  9 ++++++
 .github/workflows/push.yaml            |  8 +++++
 .github/workflows/run-tests.yaml       | 35 +++++++++++---------
 .vscode/settings.json                  |  2 +-
 Dockerfile                             |  2 +-
 README.md                              | 10 ++++++
 pyproject.toml                         | 32 +++++++-----------
 11 files changed, 161 insertions(+), 48 deletions(-)
 create mode 100644 .github/workflows/build-container.yaml
 create mode 100644 .github/workflows/compute-tag.yaml

diff --git a/.github/workflows/build-container.yaml b/.github/workflows/build-container.yaml
new file mode 100644
index 0000000..44d878a
--- /dev/null
+++ b/.github/workflows/build-container.yaml
@@ -0,0 +1,43 @@
+# A github action that builds a container image for the project.
+
+name: Build Container
+
+on:
+  # Allows you to run this workflow manually from the Actions tab.
+  workflow_dispatch:
+
+  push:
+    # Only rebuild on pushes that touch a file affecting the image or this pipeline.
+    paths:
+      # This is the entire list of files that will trigger the workflow.
+      - Dockerfile
+      - pyproject.toml
+      - requirements-gpu.txt
+      - .github/workflows/build-container.yaml
+      - .github/workflows/compute-tag.yaml
+
+jobs:
+  compute_tag:
+    uses: ./.github/workflows/compute-tag.yaml
+
+  docker:
+    runs-on: ubuntu-latest
+    needs: compute_tag
+    steps:
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          push: true
+          # This is the name of the image that will be pushed to Docker Hub. If the branch is main, the image will be tagged as latest. Else, it will be tagged as the branch name.
+          tags: ${{ secrets.DOCKERHUB_USERNAME }}/python_ml_project_template:${{ needs.compute_tag.outputs.image_tag }}
+          cache-from: type=registry,ref=${{ secrets.DOCKERHUB_USERNAME }}/python_ml_project_template:${{ needs.compute_tag.outputs.image_tag }}
+          cache-to: type=inline
diff --git a/.github/workflows/build-site.yaml b/.github/workflows/build-site.yaml
index eabaf89..3e47f36 100644
--- a/.github/workflows/build-site.yaml
+++ b/.github/workflows/build-site.yaml
@@ -7,31 +7,36 @@ jobs:
   build-docs:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+        with:
+          submodules: 'true'
 
       ##############################################
       # Skip caching if using a local runner.
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         if: ${{ !env.ACT }}
         with:
           python-version: '3.10'
           cache: 'pip'
           cache-dependency-path: "pyproject.toml"
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         if: ${{ env.ACT }}
         with:
           python-version: '3.10'
       ##############################################
 
-      - name: Install Dependencies
-        run: pip install -e ".[build_docs]"
+      - name: Install specific pip.
+        run: pip install pip==23.0.0
+
+      - name: Install doc requirements.
+        run: pip install mkdocs-material mkdocstrings[python]
 
       - name: Build mkdocs site
         working-directory: docs
         run: mkdocs build
 
       - name: Upload the built site.
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         if: ${{ !env.ACT }}
         with:
           name: site
diff --git a/.github/workflows/compute-tag.yaml b/.github/workflows/compute-tag.yaml
new file mode 100644
index 0000000..fccf6f2
--- /dev/null
+++ b/.github/workflows/compute-tag.yaml
@@ -0,0 +1,45 @@
+name: Compute the docker tag for this branch
+
+on:
+  workflow_call:
+    inputs:
+      # description: 'If true, the tag will be latest if the docker image tag does not exist'
+      latest_on_noexist:
+        required: false
+        type: string
+        default: 'false'
+    outputs:
+      image_tag:
+        description: 'The tag to use for the docker image'
+        value: ${{ jobs.compute_tag.outputs.image_tag }}
+
+
+jobs:
+  compute_tag:
+    runs-on: ubuntu-latest
+    outputs:
+      image_tag: ${{ steps.set_tag.outputs.tag }}
+    steps:
+      - id: set_tag
+        # Hand event data to the script through env vars rather than interpolating it into
+        # the shell text, so a crafted branch name cannot inject commands.
+        env:
+          HEAD_REF: ${{ github.head_ref }}
+          EVENT_NAME: ${{ github.event_name }}
+          LATEST_ON_NOEXIST: ${{ inputs.latest_on_noexist }}
+          IMAGE_REPO: ${{ secrets.DOCKERHUB_USERNAME }}/python_ml_project_template
+        run: |
+          if [[ "${GITHUB_REF}" == "refs/heads/main" ]]; then
+            tag="latest"
+          elif [[ "${EVENT_NAME}" == "pull_request" ]]; then
+            tag="${HEAD_REF//\//-}"  # slashes -> dashes
+            # If latest_on_noexist is true, fall back to latest when no image exists for this tag.
+            # NOTE(review): IMAGE_REPO needs the caller to pass secrets to this workflow - confirm.
+            if [[ "${LATEST_ON_NOEXIST}" == "true" ]] && ! docker manifest inspect "${IMAGE_REPO}:${tag}" > /dev/null 2>&1; then
+              tag="latest"
+            fi
+          else
+            tag="${GITHUB_REF#refs/heads/}"
+            tag="${tag//\//-}"
+          fi
+          echo "tag=${tag}" >> "$GITHUB_OUTPUT"
diff --git a/.github/workflows/deploy-site.yaml b/.github/workflows/deploy-site.yaml
index 98dd7a1..fb275cb 100644
--- a/.github/workflows/deploy-site.yaml
+++ b/.github/workflows/deploy-site.yaml
@@ -18,18 +18,18 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download Site Artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: site
           path: docs/site/
 
       - name: Setup Pages
         if: ${{ !env.ACT }}
-        uses: actions/configure-pages@v1
+        uses: actions/configure-pages@v5
 
       - name: Upload Artifact to Pages
         if: ${{ !env.ACT }}
-        uses: actions/upload-pages-artifact@v1
+        uses: actions/upload-pages-artifact@v3
         with:
           path: docs/site/
 
diff --git a/.github/workflows/merge-request.yaml b/.github/workflows/merge-request.yaml
index 8e227ad..d33dcf3 100644
--- a/.github/workflows/merge-request.yaml
+++ b/.github/workflows/merge-request.yaml
@@ -8,9 +8,18 @@ on:
   workflow_dispatch:
 
 jobs:
+  compute_tag:
+    uses: ./.github/workflows/compute-tag.yaml
+    with:
+      latest_on_noexist: 'true'
+
   test:
     uses: ./.github/workflows/run-tests.yaml
+    needs: compute_tag
     with:
       install_string: .[develop]
+      # Get the image tag from the compute_tag job.
+      image_tag: ${{ needs.compute_tag.outputs.image_tag }}
+
   build_site:
     uses: ./.github/workflows/build-site.yaml
diff --git a/.github/workflows/push.yaml b/.github/workflows/push.yaml
index 2b14daa..857c021 100644
--- a/.github/workflows/push.yaml
+++ b/.github/workflows/push.yaml
@@ -8,10 +8,18 @@ on:
   workflow_dispatch:
 
 jobs:
+  compute_tag:
+    uses: ./.github/workflows/compute-tag.yaml
+    with:
+      latest_on_noexist: 'true'
   test:
     uses: ./.github/workflows/run-tests.yaml
+    needs: compute_tag
     with:
       install_string: .[develop]
+      # Get the image tag from the compute_tag job.
+      image_tag: ${{ needs.compute_tag.outputs.image_tag }}
+
   build_site:
     uses: ./.github/workflows/build-site.yaml
   deploy_site:
diff --git a/.github/workflows/run-tests.yaml b/.github/workflows/run-tests.yaml
index 5426c1d..797e0f7 100644
--- a/.github/workflows/run-tests.yaml
+++ b/.github/workflows/run-tests.yaml
@@ -6,30 +6,33 @@ on:
       install_string:
         required: True
         type: string
+      image_tag:
+        required: False  # optional: callers that omit it get the default below
+        type: string
+        default: "latest"
 
 jobs:
   test:
     runs-on: ubuntu-latest
 
-    steps:
-      - uses: actions/checkout@v3
+    container:
+      # Image tag is "latest" if the branch is main, else it is the branch name.
+      image: beisner/python_ml_project_template:${{ inputs.image_tag }}
 
-      ##############################################
-      # Skip caching if using a local runner.
-      - uses: actions/setup-python@v4
-        if: ${{ !env.ACT }}
-        with:
-          python-version: '3.10'
-          cache: 'pip'
-          cache-dependency-path: "pyproject.toml"
-      - uses: actions/setup-python@v4
-        if: ${{ env.ACT }}
+    defaults:
+      run:
+        working-directory: /opt/baeisner/code
+
+    steps:
+      - uses: actions/checkout@v4
         with:
-          python-version: '3.10'
-      ##############################################
+          submodules: 'true'
 
-      - name: Install package
-        run: pip install "${{ inputs.install_string }}"
+      # Link the code from the default checkout directory to the correct directory.
+      # Use the github workspace variable to get the correct directory.
+      # Can't use the checkout action to checkout to a different directory, so we have to symlink.
+      - name: Move code to correct directory
+        run: rm -rf /opt/baeisner/code && ln -s $GITHUB_WORKSPACE /opt/baeisner/code
 
       - name: Code Quality
         run: python -m black src/ tests/ --check
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 0c4b36b..9376e6b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -2,7 +2,7 @@
     "editor.formatOnSave": true,
     "python.formatting.provider": "none",
     "editor.codeActionsOnSave": {
-        "source.organizeImports": true
+        "source.organizeImports": "explicit"
     },
     "[python]": {
         "editor.defaultFormatter": "ms-python.black-formatter"
diff --git a/Dockerfile b/Dockerfile
index b6be688..56b5814 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ WORKDIR $CODING_ROOT/code
 COPY ./src $CODING_ROOT/code/src
 COPY ./setup.py $CODING_ROOT/code/setup.py
 COPY ./pyproject.toml $CODING_ROOT/code/pyproject.toml
-RUN pip install -e .
+RUN pip install -e .[develop]
 
 # Changes to the configs and scripts will not require a rebuild
 COPY ./configs $CODING_ROOT/code/configs
diff --git a/README.md b/README.md
index 44dab1d..9c24e63 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,16 @@ To push this:
 docker push <my_dockerhub_username>/python-ml-project-template:latest
 ```
 
+## Using the CI
+
+Set up pushing to docker:
+
+Put the following secrets in the Github repository:
+* `DOCKERHUB_USERNAME`: Your Dockerhub username
+* `DOCKERHUB_TOKEN`: Your Dockerhub token
+
+You'll also need to find and replace every instance of `beisner` and `baeisner` in the repository with the appropriate usernames.
+
 ## Running on Clusters
 
 * [Autobot](autobot.md)
diff --git a/pyproject.toml b/pyproject.toml
index 523127f..e0dcf4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,27 +4,21 @@ version = "0.1.0"
 description = "A Python Package Template"
 readme = "README.md"
 requires-python = ">=3.6"
-license = {file = "LICENSE.txt"}
-authors = [
-  {email = "baeisner@andrew.cmu.edu", name = "Ben Eisner"}
-]
+license = { file = "LICENSE.txt" }
+authors = [{ email = "baeisner@andrew.cmu.edu", name = "Ben Eisner" }]
 dependencies = [
   "hydra-core == 1.3.2",
   "lightning == 2.0.3",
   "omegaconf == 2.3.0",
   "pandas",
-  "torch == 2.0.1", # CUDA 11.8
+  "torch == 2.0.1",        # CUDA 11.8
   "torchmetrics",
   "torchvision == 0.15.2", # CUDA 11.8
   "wandb == 0.15.4",
 ]
 
 [build-system]
-requires = [
-  "setuptools >= 62.3.2",
-  "setuptools-scm",
-  "wheel",
-]
+requires = ["setuptools >= 62.3.2", "setuptools-scm", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project.optional-dependencies]
@@ -38,13 +32,8 @@ develop = [
   "pytest == 7.3.2",
   "pre-commit == 3.3.3",
 ]
-notebooks = [
-  "jupyter",
-]
-build_docs = [
-  "mkdocs-material",
-  "mkdocstrings[python]",
-]
+notebooks = ["jupyter"]
+build_docs = ["mkdocs-material", "mkdocstrings[python]"]
 
 # This is required to allow us to have notebooks/ at the top level.
 [tool.setuptools.packages.find]
@@ -58,7 +47,7 @@ profile = "black"
 known_third_party = "wandb"
 
 [tool.mypy]
-python_version = 3.8
+python_version = "3.10"
 warn_return_any = true
 warn_unused_configs = true
 mypy_path = "src"
@@ -66,11 +55,12 @@ namespace_packages = true
 explicit_package_bases = true
 
 [[tool.mypy.overrides]]
-module = [
-  "torchvision.*",
-]
+module = ["torchvision.*"]
 ignore_missing_imports = true
 
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+
 [tool.pylint]
 known-third-party = "wandb"
 

From 33cbbd1e9f455831e661506ecf0065de77344e01 Mon Sep 17 00:00:00 2001
From: Ben Eisner <ben.a.eisner@gmail.com>
Date: Wed, 2 Oct 2024 18:18:34 -0400
Subject: [PATCH 11/11] set up pushing sif files to seuss

---
 .gitignore                        |  3 +++
 cluster/build_push_sif_seuss.bash | 33 +++++++++++++++++++++++++++++++
 cluster/sanitize_branch_name.bash | 10 ++++++++++
 3 files changed, 46 insertions(+)
 create mode 100755 cluster/build_push_sif_seuss.bash
 create mode 100755 cluster/sanitize_branch_name.bash

diff --git a/.gitignore b/.gitignore
index 5623d3f..12cce24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -383,3 +383,6 @@ wandb_artifacts/
 # Generated by hydra.
 # outputs/
 logs/
+
+# Generated for pushing to seuss.
+.singularity_images/
diff --git a/cluster/build_push_sif_seuss.bash b/cluster/build_push_sif_seuss.bash
new file mode 100755
index 0000000..fd31008
--- /dev/null
+++ b/cluster/build_push_sif_seuss.bash
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Build a docker image, convert it to as singularity image, and push it to the seuss cluster.
+# Right now, this is a total hack since we can't actually build the docker image on the cluster,
+# nor can we build the singularity image on the cluster. So we build the docker image locally,
+# convert it to a singularity image locally, and then push it to the cluster.
+
+# Whole script fails if any command fails.
+
+set -e
+
+# Set some variables.
+dockerhub_username=beisner
+project_name=python_ml_project_template
+scs_username=baeisner
+
+# Get paths.
+script_path=$(realpath $0)
+script_dir=$(dirname $script_path)
+root_dir=$(realpath ${script_dir}/..)
+
+# Compute a good tag for the image, which will be <dockerhub_username>/<project_name>:<branch-name>-scratch.
+sanitized_branch_name=`${script_dir}/sanitize_branch_name.bash`
+
+# Build the docker image.
+docker build -t ${dockerhub_username}/${project_name}:${sanitized_branch_name}-scratch .
+
+# Convert the docker image to a singularity image, and save it in the .singularity_images directory.
+mkdir -p ${root_dir}/.singularity_images
+sif_name=${root_dir}/.singularity_images/${project_name}_${sanitized_branch_name}-scratch.sif
+singularity build ${sif_name} docker-daemon://$dockerhub_username}/${project_name}:${sanitized_branch_name}-scratch
+
+# Rsync the singularity image to the seuss cluster.
+rsync -avz --progress ${sif_name} ${scs_username}@seuss.ri.cmu.edu:/home/${scs_username}/singularity_images/
diff --git a/cluster/sanitize_branch_name.bash b/cluster/sanitize_branch_name.bash
new file mode 100755
index 0000000..645302c
--- /dev/null
+++ b/cluster/sanitize_branch_name.bash
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Sanitize a branch name for use in a docker image tag.
+
+branch_name=$(git branch | grep \* | cut -d ' ' -f2)
+
+# Sanitize by replacing all slashes with underscores.
+sanitized_branch_name=$(echo $branch_name | sed 's/\//_/g')
+
+echo $sanitized_branch_name