92 changes: 68 additions & 24 deletions .lightning/workflows/pytorch.yml
@@ -8,65 +8,109 @@ timeout: "55" # minutes
parametrize:
matrix: {}
include:
# note that this is setting also all oldest requirements which is linked to Torch == 2.1
- image: "pytorchlightning/pytorch_lightning:base-cuda12.1.1-py3.10-torch2.1"
# note that this also sets oldest requirements which are linked to Python == 3.10
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
PACKAGE_NAME: "pytorch"
python_version: "3.10"
machine: "A100_X_2"
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
PACKAGE_NAME: "pytorch"
python_version: "3.12"
machine: "L4_X_2"
# - image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.7"
# - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
# PACKAGE_NAME: "pytorch"
- image: "pytorchlightning/pytorch_lightning:base-cuda12.6.3-py3.12-torch2.8"
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
PACKAGE_NAME: "lightning"
python_version: "3.12"
machine: "L4_X_2"
exclude: []

env:
TZ: "Etc/UTC"
DEBIAN_FRONTEND: "noninteractive"
CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda"
MKL_THREADING_LAYER: "GNU"
CUDA_LAUNCH_BLOCKING: "1"
NCCL_DEBUG: "INFO"
TORCHDYNAMO_VERBOSE: "1"
FREEZE_REQUIREMENTS: "1"
RUN_ONLY_CUDA_TESTS: "1"

run: |
# Install Python and UV
apt-get update -qq --fix-missing
apt-get install -q -y software-properties-common curl
# Add deadsnakes PPA for newer Python versions if needed
add-apt-repository ppa:deadsnakes/ppa -y
apt-get update -qq --fix-missing
apt-get install -q -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
pkg-config \
cmake \
ca-certificates \
libopenmpi-dev \
openmpi-bin \
ninja-build \
libnccl2 \
libnccl-dev

apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev
ln -sf /usr/bin/python${python_version} /usr/bin/python
curl -LsSf https://astral.sh/uv/install.sh | sh

# Source the environment and ensure UV is in PATH
[ -f "$HOME/.local/bin/env" ] && . "$HOME/.local/bin/env"
export PATH="$HOME/.local/bin:$PATH"
source $HOME/.cargo/env 2>/dev/null || true
export PATH="$HOME/.cargo/bin:$PATH"

# Verify UV installation
command -v uv || (echo "UV not found in PATH" && exit 1)
# Create and activate a local uv virtual environment
uv venv .venv -p "/usr/bin/python${python_version}" || uv venv .venv -p "python${python_version}" || uv venv .venv
. .venv/bin/activate
hash -r

whereis nvidia
nvidia-smi
python --version
pip --version
pip install -q fire wget packaging
pip list
uv --version
uv pip list
set -ex

CUDA_VERSION="${image##*cuda}" # Remove everything up to and including "cuda"
# Parse CUDA version from image tag, e.g., "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
IMAGE_TAG="${image##*:}" # "12.6.3-runtime-ubuntu22.04"
CUDA_VERSION="${IMAGE_TAG%%-*}" # "12.6.3"
echo "Using CUDA version: ${CUDA_VERSION}"
CUDA_VERSION_M_M="${cuda_version%.*}" # Get major.minor by removing the last dot and everything after
CUDA_VERSION_MM="${CUDA_VERSION_M_M//'.'/''}"
TORCH_URL="https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html"
echo "Torch URL: ${TORCH_URL}"
CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6"
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126"
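# UV_TORCH_BACKEND tells uv which PyTorch wheel index (e.g. cu126) to use when resolving torch packages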
export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM}
COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))')
echo "collecting coverage for: ${COVERAGE_SOURCE}"
TORCH_VER=$(python -c "import torch; print(torch.__version__.rsplit('.', 1)[0])")

if [ "${TORCH_VER}" == "2.1" ]; then
uv pip install -q fire wget packaging "lightning-utilities[cli]"
if [ "${python_version}" == "3.10" ]; then
echo "Set oldest versions"
pip uninstall -y deepspeed
pip install -U "lightning-utilities[cli]"
cd requirements/pytorch
python -m lightning_utilities.cli requirements set-oldest --req_files "['base.txt', 'extra.txt', 'strategies.txt', 'examples.txt']"
python -m lightning_utilities.cli requirements prune-pkgs --packages deepspeed --req_files strategies.txt
cd ../..
pip install "cython<3.0" wheel # for compatibility
uv pip install "cython<3.0" wheel # for compatibility
fi

# install the base so we can adjust other packages
uv pip install .
echo "Adjust torch versions in requirements files"
PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
pip install -q wget packaging
uv pip install -q wget packaging
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
for fpath in `ls requirements/**/*.txt`; do \
python ./adjust-torch-versions.py $fpath ${PYTORCH_VERSION}; \
done

if [ "${PACKAGE_NAME}" == "pytorch" ]; then
echo "Adjust PL imports"
pip install -U -q -r .actions/requirements.txt
uv pip install --upgrade -r .actions/requirements.txt
python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_pytorch" \
--source_import="lightning.fabric,lightning.pytorch" \
--target_import="lightning_fabric,pytorch_lightning"
@@ -76,14 +120,14 @@ run: |
fi

extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))")
pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}"
uv pip install -e ".[${extra}dev]" --upgrade

if [ "${PACKAGE_NAME}" == "pytorch" ]; then
echo "uninstall lightning to have just single package"
pip uninstall -y lightning
uv pip uninstall lightning
elif [ "${PACKAGE_NAME}" == "lightning" ]; then
echo "uninstall PL to have just single package"
pip uninstall -y pytorch-lightning
uv pip uninstall pytorch-lightning
fi

python requirements/collect_env_details.py
@@ -112,7 +156,7 @@ run: |
echo "Testing: fabric standalone"
export PL_USE_MOCKED_MNIST=1
export PL_RUN_STANDALONE_TESTS=1
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
bash ./run_standalone_tests.sh "tests_pytorch"
export PL_RUN_STANDALONE_TESTS=0

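For context, a minimal sketch (an illustration, not part of the committed workflow) of how the reworked run script derives the uv torch backend from the base image tag, replacing the old torch_stable.html extra index URL; the values assume the cuda 12.6.3 matrix entries:

# illustration only: mirrors the parsing added in the run script above
image="nvidia/cuda:12.6.3-runtime-ubuntu22.04"   # example value from the matrix
IMAGE_TAG="${image##*:}"                         # "12.6.3-runtime-ubuntu22.04"
CUDA_VERSION="${IMAGE_TAG%%-*}"                  # "12.6.3"
CUDA_VERSION_M_M="${CUDA_VERSION%.*}"            # "12.6"
CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}"        # "126"
export UV_TORCH_BACKEND="cu${CUDA_VERSION_MM}"   # uv targets the cu126 torch wheel index
echo "UV_TORCH_BACKEND=${UV_TORCH_BACKEND}"

With this variable exported, the later "uv pip install -e .[${extra}dev]" should pull CUDA-matched torch wheels without the previous --extra-index-url="${TORCH_URL}" argument.
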
8 changes: 6 additions & 2 deletions tests/tests_pytorch/plugins/precision/test_fsdp.py
@@ -74,8 +74,8 @@ def test_fsdp_precision_scaler_with_bf16():


@RunIf(min_cuda_gpus=1)
def test_fsdp_precision_forward_context():
"""Test to ensure that the context manager correctly is set to bfloat16."""
def test_fsdp_precision_forward_context_f16():
"""Test to ensure that the context manager correctly is set to float16."""
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler

precision = FSDPPrecision(precision="16-mixed")
@@ -94,6 +94,10 @@ def test_fsdp_precision_forward_context():
assert isinstance(precision.forward_context(), _DtypeContextManager)
assert precision.forward_context()._new_dtype == torch.float16


@RunIf(min_cuda_gpus=1, bf16_cuda=True)
def test_fsdp_precision_forward_context_bf16():
"""Test to ensure that the context manager correctly is set to bfloat16."""
precision = FSDPPrecision(precision="bf16-mixed")
assert precision.scaler is None
with precision.forward_context():