
Commit 6b1c431

Add DCP load benchmarking support (#357)
Extended the distributed checkpointing benchmarks to support both save and load operations for DDP and FSDP training scenarios. Previously only checkpoint saving was supported.

The implementation adds load benchmark scripts for the DDP and FSDP scenarios, extends the run scripts with --load/--save flags for operation selection, and creates dedicated load configuration files with a checkpoint suffix parameter. The common utilities now include reader support functions while maintaining backward compatibility with existing save operations.

This lets users benchmark checkpoint loading performance alongside the existing save benchmarks, providing a complete picture of checkpoint operation performance.
1 parent 7337692 commit 6b1c431

File tree

16 files changed, +404 -30 lines changed


CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,8 @@
 * Fix SequentialS3Reader seek beyond EOF to clamp position to object size (#362)

 ### Other changes
-* Added thread_count parameter to S3StorageWriter
+* Add benchmark to run DCP Loading Workloads (#357)
+* Add thread_count parameter to S3StorageWriter (#370)

 ## v1.4.3 (July 25, 2025)

s3torchbenchmarking/README.md

Lines changed: 9 additions & 3 deletions
@@ -112,10 +112,16 @@ vim ./conf/lightning_checkpointing.yaml # 1. edit config
 ./utils/run_lightning_benchmarks.sh # 2. run scenario

 # PyTorch’s Distributed Checkpointing (DCP) benchmarks
-vim ./conf/dcp_ddp.yaml # 1. edit config
-vim ./conf/dcp_fsdp.yaml
-./utils/run_dcp_ddp_benchmarks.sh # 2. run scenario
+vim ./conf/dcp_ddp_load.yaml # 1. edit config
+vim ./conf/dcp_fsdp_load.yaml
+vim ./conf/dcp_ddp_save.yaml
+vim ./conf/dcp_fsdp_save.yaml
+# Saving Checkpoint
+./utils/run_dcp_ddp_benchmarks.sh # 2. run scenario for saving checkpoint
 ./utils/run_dcp_fsdp_benchmarks.sh
+# Loading Checkpoint
+./utils/run_dcp_ddp_benchmarks.sh --load # 3. run scenario for loading checkpoint after saving
+./utils/run_dcp_fsdp_benchmarks.sh --load
 ```

 > [!NOTE]

s3torchbenchmarking/conf/dcp_ddp_load.yaml

Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
defaults:
  - hydra/callbacks/collate_results
  - aws/dynamodb # save run results to DynamoDB -- comment me if not required
  - _self_

# S3 bucket to use to save checkpoints.
# NOTE: a non-existing bucket will fail the benchmarks.
s3:
  region: ??? # e.g., eu-west-1
  uri: ??? # e.g., s3://my-bucket/
# Number of iterations for "saving" a model's checkpoint.
# NOTE: this does not affect model training, as no actual training occurs in these benchmarks.
epochs: 4

hydra:
  mode: MULTIRUN
  sweep:
    dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S}
  sweeper:
    params:
      # Short name of a pre-trained model (from Hugging Face), listed in `models.py`.
      +model: ???
      # Type of Torch distributed backend (valid options: "nccl", "gloo").
      +backend: nccl
      # Number of workers.
      +world_size: 8
      # Checkpoint storage location (valid options: "disk", "s3").
      +checkpoint.storage: disk, s3
      # Checkpoint storage suffix location generated by save benchmarks, e.g., 2025-09-23-11-05-zmuZ/
      +checkpoint.suffix: ???
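The `checkpoint.suffix` value ties a load run to the output of a previous save run. Below is a minimal sketch of how the suffix is expected to combine with `s3.uri` via the `build_checkpoint_uri` helper that `dcp_common.py` imports; the exact joining behaviour is an assumption here, not shown in this diff.

```python
# Sketch only: how checkpoint.suffix is expected to resolve into a checkpoint
# location. The exact behaviour of build_checkpoint_uri is assumed, not shown here.
from s3torchbenchmarking.benchmark_utils import build_checkpoint_uri

s3_uri = "s3://my-bucket/"          # from `s3.uri` in dcp_ddp_load.yaml
suffix = "2025-09-23-11-05-zmuZ/"   # from `checkpoint.suffix`, produced by a save run

# Expected to yield something like "s3://my-bucket/2025-09-23-11-05-zmuZ/".
print(build_checkpoint_uri(s3_uri, suffix))
```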
s3torchbenchmarking/conf/dcp_ddp.yaml renamed to s3torchbenchmarking/conf/dcp_ddp_save.yaml

Lines changed: 4 additions & 3 deletions
@@ -23,8 +23,9 @@ hydra:
       # Type of Torch distributed backend (valid options: "nccl", "gloo").
       +backend: nccl
       # Number of workers.
-      +world_size: 4
+      +world_size: 8
       # Number of threads to use for saving the checkpoints.
-      +thread_count: 4
+      +thread_count: 8
       # Checkpoint storage location (valid options: "disk", "s3").
-      +checkpoint.storage: disk, s3
+      +checkpoint.storage: disk, s3
+
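As context for the `thread_count` sweep above: the save path hands this value to the checkpoint writer. The following is a hypothetical sketch only; it assumes `S3StorageWriter` mirrors `S3StorageReader`'s `(region, uri)` positional arguments and accepts the `thread_count` keyword referenced in #370, since `get_writer` itself is not part of this diff.

```python
# Hypothetical sketch -- not code from this commit. Assumes S3StorageWriter takes
# (region, uri) positionally, like S3StorageReader, plus the thread_count keyword
# added in #370.
from s3torchconnector.dcp import S3StorageWriter


def make_s3_writer(region: str, uri: str, thread_count: int) -> S3StorageWriter:
    # A higher thread_count increases upload parallelism when saving large checkpoints.
    return S3StorageWriter(region, uri, thread_count=thread_count)
```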

s3torchbenchmarking/conf/dcp_fsdp_load.yaml

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
defaults:
  - hydra/callbacks/collate_results
  - aws/dynamodb # save run results to DynamoDB -- comment me if not required
  - _self_

# S3 bucket to use to save checkpoints.
# NOTE: a non-existing bucket will fail the benchmarks.
s3:
  region: ??? # e.g., eu-west-1
  uri: ??? # e.g., s3://my-bucket/
# Number of iterations for "saving" a model's checkpoint.
# NOTE: this does not affect model training, as no actual training occurs in these benchmarks.
epochs: 4

hydra:
  mode: MULTIRUN
  sweep:
    dir: multirun/${hydra.job.config_name}/${now:%Y-%m-%d_%H-%M-%S}
  sweeper:
    params:
      # Short name of a pre-trained llama v2 model (valid options: "L7b", "L13b", "L30b", "L65b", "L70b").
      +model: ???
      # Type of Torch distributed backend (valid options: "nccl", "gloo").
      +backend: nccl
      # Number of workers.
      +world_size: 8
      # Checkpoint storage location (valid options: "disk", "s3").
      +checkpoint.storage: disk, s3
      # Sharding strategy (valid options: "full", "hybrid").
      +checkpoint.sharding_strategy: full
      # Controls whether files are forcibly synced to disk (only relevant for "disk" storage).
      # NOTE: We disabled this option to improve performance since FSDP checkpointing with
      # forced syncing (maximum durability) was significantly slower than storage throughput.
      # This setting has no effect when using "s3" storage.
      +checkpoint.sync_files: false
      # Checkpoint storage suffix location generated by save benchmarks, e.g., 2025-09-23-11-05-zmuZ/
      +checkpoint.suffix: ???

s3torchbenchmarking/src/s3torchbenchmarking/dcp_common.py

Lines changed: 17 additions & 2 deletions
@@ -14,13 +14,14 @@
 import torch.distributed.checkpoint as dcp
 from omegaconf import DictConfig
 from torch import multiprocessing as mp
-from torch.distributed.checkpoint import FileSystemWriter
+from torch.distributed.checkpoint import FileSystemWriter, FileSystemReader
+

 from s3torchbenchmarking.benchmark_utils import (
     build_random_suffix,
     build_checkpoint_uri,
 )
-from s3torchconnector.dcp import S3StorageWriter
+from s3torchconnector.dcp import S3StorageWriter, S3StorageReader

 Timestamps = Tuple[float, float]
 logger = logging.getLogger(__name__)

@@ -49,6 +50,20 @@ def get_writer(cfg: DictConfig, suffix: str) -> FileSystemWriter:
     raise ValueError(f"Storage writer {cfg.checkpoint.storage} not supported")


+def get_reader(cfg: DictConfig) -> FileSystemReader:
+    """Instantiate a checkpoint reader based on the input config."""
+    suffix = cfg.checkpoint.suffix
+    if cfg.checkpoint.storage == "disk":
+        local_path = Path(cfg.path) / suffix
+        logger.info("Loading checkpoint from %s (disk)...", local_path)
+        return dcp.FileSystemReader(local_path)
+    elif cfg.checkpoint.storage == "s3":
+        uri = build_checkpoint_uri(cfg.s3.uri, suffix)
+        logger.info("Loading checkpoint from %s (S3)...", uri)
+        return S3StorageReader(cfg.s3.region, uri)
+    raise ValueError(f"Storage reader {cfg.checkpoint.storage} not supported")
+
+
 def benchmark_common_runner(
     cfg: DictConfig,
     run_fn,
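A minimal usage sketch (not part of the diff) of how the new `get_reader` helper pairs with `dcp.load`, assuming `cfg` is a Hydra config resolved from one of the load configuration files above:

```python
# Minimal sketch: wiring get_reader into a load, assuming `cfg` comes from
# dcp_ddp_load.yaml (checkpoint.storage and checkpoint.suffix must be set).
import torch.distributed.checkpoint as dcp

from s3torchbenchmarking.dcp_common import get_reader


def load_into(model, cfg):
    # get_reader returns dcp.FileSystemReader for "disk" or S3StorageReader for "s3",
    # pointing at the location written by an earlier save run (cfg.checkpoint.suffix).
    storage_reader = get_reader(cfg)
    state_dict = model.state_dict()
    # dcp.load restores the checkpoint in place into the provided state_dict.
    dcp.load(state_dict, storage_reader=storage_reader)
    return state_dict
```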

s3torchbenchmarking/src/s3torchbenchmarking/dcp_ddp/README.md

Lines changed: 27 additions & 6 deletions
@@ -10,21 +10,42 @@ where memory requirements per GPU are manageable.

 ### Purpose

-These benchmarks focus on testing the "save" mechanism of PyTorch DCP (`torch.distributed.checkpoint.save`). The primary
-objectives are to evaluate the `s3torchconnector` library's performance against other libraries and local storage
-options, by measuring the following metrics:
+These benchmarks test both the "save" and "load" mechanisms of PyTorch DCP (`torch.distributed.checkpoint.save` and `torch.distributed.checkpoint.load`). The primary objective is to evaluate the `s3torchconnector` library's performance against other libraries and local storage options, by measuring the following metrics:

-- Checkpoint saving throughput (in MiB/s);
-- Checkpoint "corrected" save durations (in seconds), which exclude the influence of model load duration on the device.
+**Save Benchmarks:**
+- Checkpoint saving throughput (in MiB/s)
+- Checkpoint "corrected" save durations (in seconds), which exclude the influence of model load duration on the device
+
+**Load Benchmarks:**
+- Checkpoint loading throughput (in MiB/s)
+- Checkpoint "corrected" load durations (in seconds), which exclude the influence of process setup and of loading the model to the device

 ### Configuration

-The benchmark runs can be customized through the [`dcp_ddp.yaml`](../../../conf/dcp_ddp.yaml) file.
+The benchmark runs can be customized through configuration files:
+
+- **Save benchmarks**: [`dcp_ddp_save.yaml`](../../../conf/dcp_ddp_save.yaml)
+- **Load benchmarks**: [`dcp_ddp_load.yaml`](../../../conf/dcp_ddp_load.yaml)
+
+The load configuration includes a `checkpoint.suffix` parameter that specifies which saved checkpoint to load.

 > [!IMPORTANT]
 > A `+path` option is passed to the running script ([`run_dcp_ddp_benchmarks.sh`](../../../utils/run_dcp_ddp_benchmarks.sh)),
 > and will be used only if `checkpoint.storage` key includes `disk`.

+### Usage
+
+**Save benchmarks (default):**
+```bash
+./utils/run_dcp_ddp_benchmarks.sh
+./utils/run_dcp_ddp_benchmarks.sh --save
+```
+
+**Load benchmarks:**
+```bash
+./utils/run_dcp_ddp_benchmarks.sh --load
+```
+
 ### References

 - https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html
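For illustration only, a sketch of how the "corrected" duration and throughput metrics could be derived from the `(begin, corrected_end, model_size)` tuples the benchmark workers put on the results queue; the commit's actual collation lives in the shared benchmark utilities and may differ, and model sizes are assumed here to be expressed in MiB.

```python
# Assumption-labelled sketch: aggregating worker timestamps into the reported metrics.
# Not the commit's collation code; model_size is assumed to be expressed in MiB.
from typing import List, Tuple


def summarize(results: List[Tuple[float, float, float]]) -> Tuple[float, float]:
    """Return (corrected duration in seconds, throughput in MiB/s).

    Each tuple is (begin, corrected_end, model_size); workers already subtract
    process setup and model-to-device time from corrected_end.
    """
    begin = min(r[0] for r in results)
    end = max(r[1] for r in results)
    total_mib = sum(r[2] for r in results)
    corrected_duration = end - begin
    return corrected_duration, total_mib / corrected_duration
```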
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# // SPDX-License-Identifier: BSD

import logging
from multiprocessing.queues import Queue
from time import perf_counter
from typing import Tuple

import hydra
import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp
from omegaconf import DictConfig
from torch.nn.parallel import DistributedDataParallel

from s3torchbenchmarking.dcp_common import setup, get_reader, benchmark_common_runner
from s3torchbenchmarking.models import get_benchmark_model, BenchmarkModel

Timestamps = Tuple[float, float]
logger = logging.getLogger(__name__)


# TODO: add Structured Config (https://hydra.cc/docs/tutorials/structured_config/intro/)
@hydra.main(version_base=None)
def run_benchmark(cfg: DictConfig) -> dict:
    """DCP benchmarks entry point."""
    benchmark_model = get_benchmark_model(cfg.model)

    return benchmark_common_runner(cfg, run_ddp_load, (cfg, benchmark_model))


def run_ddp_load(
    rank: int,  # needs to be passed first (provided by `multiprocessing.spawn` automatically)
    cfg: DictConfig,
    proxy_model: BenchmarkModel,
    suffix: str,
    load_timestamps: Queue,
) -> None:
    """Execute the actual code for checkpoint loading.

    This function is meant to be executed in subprocesses."""
    begin_process = perf_counter()
    # Override random suffix with suffix from config
    storage_reader = get_reader(cfg)
    model_size = proxy_model.size
    model = proxy_model.model

    setup(cfg.backend, world_size=cfg.world_size, rank=rank)
    if cfg.backend == "nccl":
        device_id = rank % torch.cuda.device_count()
        torch.cuda.set_device(device_id)
        model.to(device_id)
        model = DistributedDataParallel(model, device_ids=[device_id])
    else:
        device_id = rank % torch.cpu.device_count()
        torch.cpu.set_device(device_id)
        model.to(device=torch.device("cpu"))
        model = DistributedDataParallel(model)

    state_dict = model.state_dict()

    begin_load = perf_counter()  # also "end_process"
    dcp.load(state_dict, storage_reader=storage_reader)
    end_load = perf_counter()

    # Record the load times excluding the influence of the process setup and model loading to device.
    load_timestamps.put(
        (begin_process, end_load - (begin_load - begin_process), model_size)
    )

    dist.destroy_process_group()


if __name__ == "__main__":
    run_benchmark()

s3torchbenchmarking/src/s3torchbenchmarking/dcp_ddp/benchmark.py renamed to s3torchbenchmarking/src/s3torchbenchmarking/dcp_ddp/save_benchmark.py

Lines changed: 2 additions & 2 deletions
@@ -26,10 +26,10 @@ def run_benchmark(cfg: DictConfig) -> dict:
     """DCP benchmarks entry point."""
     benchmark_model = get_benchmark_model(cfg.model)

-    return benchmark_common_runner(cfg, run_ddp, (cfg, benchmark_model))
+    return benchmark_common_runner(cfg, run_ddp_save, (cfg, benchmark_model))


-def run_ddp(
+def run_ddp_save(
     rank: int,  # needs to be passed first (provided by `multiprocessing.spawn` automatically)
     cfg: DictConfig,
     proxy_model: BenchmarkModel,
