Update dependency versions for running on H100s and sidestep dataloader crash #55

Merged 5 commits · Aug 10, 2023
13 changes: 13 additions & 0 deletions README.md
@@ -47,6 +47,19 @@ Here are the system settings we recommend to start training your own diffusion models
- Ubuntu Version: 20.04
- Use a system with NVIDIA GPUs

- For running on NVIDIA H100s, use a Docker image with PyTorch 2.0+, e.g. [MosaicML's PyTorch base image](https://hub.docker.com/r/mosaicml/pytorch/tags) (a quick environment sanity-check sketch follows below)
- Recommended tag: `mosaicml/pytorch_vision:2.0.1_cu118-python3.10-ubuntu20.04`
- This image comes pre-configured with the following dependencies:
- PyTorch Version: 2.0.1
- CUDA Version: 11.8
- Python Version: 3.10
- Ubuntu Version: 20.04
- Depending on the training config, an additional install of `xformers` may be needed:
```
pip install -U ninja
pip install -U git+https://github.com/facebookresearch/xformers
```
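
To confirm the container actually provides these versions before launching a run, a minimal sanity-check sketch could look like the following (the `check_h100_environment` helper is illustrative, not part of this repo):

```python
# Illustrative sanity check: print the detected PyTorch / CUDA / GPU versions and
# warn if the visible device is not an H100 (compute capability 9.x).
import sys

import torch


def check_h100_environment() -> None:
    print(f'PyTorch: {torch.__version__}')        # expect 2.0.1 with the image above
    print(f'CUDA (build): {torch.version.cuda}')  # expect 11.8 with the image above
    if not torch.cuda.is_available():
        sys.exit('No CUDA device visible; H100 training needs a GPU-enabled container.')
    print(f'GPU 0: {torch.cuda.get_device_name(0)}')
    major, minor = torch.cuda.get_device_capability(0)
    if major < 9:
        print(f'Warning: compute capability {major}.{minor} detected; this is not an H100.')


if __name__ == '__main__':
    check_h100_environment()
```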

# How many GPUs do I need?

We benchmarked the U-Net training throughput as we scale the number of A100 GPUs from 8 to 128. Our time estimates are based on training Stable Diffusion 2.0 base on 1,126,400,000 images at 256x256 resolution and 1,740,800,000 images at 512x512 resolution. Our cost estimates are based on $2 / A100-hour. Since the time and cost estimates are for the U-Net only, these only hold if the VAE and CLIP latents are computed before training. It took 3,784 A100-hours (cost of $7,600) to pre-compute the VAE and CLIP latents offline. If you are computing VAE and CLIP latents while training, expect a 1.4x increase in time and cost.
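
As a rough illustration of how these numbers compose, here is a back-of-the-envelope sketch; the function and names are assumptions for illustration, not code from this repo:

```python
# Illustrative arithmetic for the estimates quoted above: $2 / A100-hour, with
# roughly 1.4x more time/cost if VAE and CLIP latents are computed during training.
A100_DOLLARS_PER_HOUR = 2.0


def unet_training_cost(a100_hours: float, latents_precomputed: bool = True) -> float:
    """Dollar cost of U-Net training; ~1.4x higher if latents are computed online."""
    multiplier = 1.0 if latents_precomputed else 1.4
    return a100_hours * A100_DOLLARS_PER_HOUR * multiplier


# Pre-computing the VAE and CLIP latents took 3,784 A100-hours: 3784 * 2 = $7,568 (~$7,600).
print(3784 * A100_DOLLARS_PER_HOUR)
```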
7 changes: 7 additions & 0 deletions diffusion/train.py
@@ -4,6 +4,7 @@
"""Train model."""

import operator
import time
from collections.abc import Iterable
from typing import Any, Dict, List, Optional, Union

@@ -38,6 +39,8 @@ def train(config: DictConfig) -> None:
config.dataset.train_dataset,
batch_size=config.dataset.train_batch_size // dist.get_world_size(),
)
# Need to sleep for a bit to avoid dataloader crash
time.sleep(10)

# Composer can take dataloaders, dataspecs, evaluators, or list of evaluators
eval_set: Optional[Union[DataSpec, List[Evaluator]]] = None
@@ -52,13 +55,17 @@
config.dataset.eval_batch_size // dist.get_world_size(),
)
evaluator = hydra.utils.instantiate(eval_conf.evaluator, dataloader=eval_dataloader)
# Need to sleep for a bit to avoid dataloader crash
time.sleep(10)
evaluators.append(evaluator)

eval_set = evaluators

else:
eval_set = hydra.utils.instantiate(config.dataset.eval_dataset,
batch_size=config.dataset.eval_batch_size // dist.get_world_size())
# Need to sleep for a bit to avoid dataloader crash
time.sleep(10)

# Build list of loggers, callbacks, and algorithms to pass to trainer
logger: List[LoggerDestination] = []
10 changes: 5 additions & 5 deletions setup.py
@@ -6,16 +6,16 @@
 from setuptools import find_packages, setup

 install_requires = [
-    'mosaicml@git+https://github.com/mosaicml/composer.git@6cf3d3a1aa300834c650f89460b5ac9bbc5a1e46',
+    'mosaicml==0.15.1',
     'mosaicml-streaming>=0.4.0,<1.0',
     'hydra-core>=1.2',
     'hydra-colorlog>=1.1.0',
-    'diffusers[torch]==0.16.0',
-    'transformers[torch]==4.29.2',
+    'diffusers[torch]==0.19.3',
+    'transformers[torch]==4.31.0',
     'wandb==0.15.4',
-    'xformers==0.0.16',
+    'xformers==0.0.20',
     'triton==2.0.0',
-    'torchmetrics[image]==0.11.3',
+    'torchmetrics[image]==0.11.4',
     'clean-fid',
     'clip@git+https://github.com/openai/CLIP.git',
 ]
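
To confirm an existing environment already matches the new pins, a quick check like the following could be used (illustrative, not part of the repo):

```python
# Illustrative check: compare installed package versions against the pins
# introduced in this setup.py change.
from importlib.metadata import PackageNotFoundError, version

PINS = {
    'mosaicml': '0.15.1',
    'diffusers': '0.19.3',
    'transformers': '4.31.0',
    'xformers': '0.0.20',
    'torchmetrics': '0.11.4',
}

for package, expected in PINS.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f'{package}: not installed (expected {expected})')
        continue
    status = 'ok' if installed == expected else f'mismatch (expected {expected})'
    print(f'{package}: {installed} {status}')
```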