
Commit 7edace9

Authored by yao-matrix, with github-actions[bot] and hlky as co-authors
fix CPU offloading related fail cases on XPU (#11288)
* fix CPU offloading related fail cases on XPU
* fix style
* Apply style fixes
* trigger tests
* test_pipe_same_device_id_offload

Signed-off-by: YAO Matrix <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: hlky <[email protected]>
1 parent 6e80d24 commit 7edace9

File tree

3 files changed: +38 -13 lines changed

src/diffusers/pipelines/pipeline_utils.py (+23 -12)

@@ -65,7 +65,7 @@
     numpy_to_pil,
 )
 from ..utils.hub_utils import _check_legacy_sharding_variant_format, load_or_create_model_card, populate_model_card
-from ..utils.torch_utils import is_compiled_module
+from ..utils.torch_utils import get_device, is_compiled_module
 
 
 if is_torch_npu_available():
@@ -1084,19 +1084,20 @@ def remove_all_hooks(self):
                 accelerate.hooks.remove_hook_from_module(model, recurse=True)
         self._all_hooks = []
 
-    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the accelerator when its
+        `forward` method is called, and the model remains in accelerator until the next model runs. Memory savings are
+        lower than with `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution
+        of the `unet`.
 
         Arguments:
             gpu_id (`int`, *optional*):
                 The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+            device (`torch.Device` or `str`, *optional*, defaults to None):
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                default to "cuda".
+                automatically detect the available accelerator and use.
         """
         self._maybe_raise_error_if_group_offload_active(raise_error=True)
 
@@ -1118,6 +1119,11 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
 
         self.remove_all_hooks()
 
+        if device is None:
+            device = get_device()
+            if device == "cpu":
+                raise RuntimeError("`enable_model_cpu_offload` requires accelerator, but not found")
+
         torch_device = torch.device(device)
         device_index = torch_device.index
 
@@ -1196,20 +1202,20 @@ def maybe_free_model_hooks(self):
         # make sure the model is in the same state as before calling it
         self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda"))
 
-    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
         r"""
         Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state
         dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU
-        and then moved to `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward`
-        method called. Offloading happens on a submodule basis. Memory savings are higher than with
+        and then moved to `torch.device('meta')` and loaded to accelerator only when their specific submodule has its
+        `forward` method called. Offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
 
         Arguments:
             gpu_id (`int`, *optional*):
                 The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+            device (`torch.Device` or `str`, *optional*, defaults to None):
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                default to "cuda".
+                automatically detect the available accelerator and use.
         """
         self._maybe_raise_error_if_group_offload_active(raise_error=True)
 
@@ -1225,6 +1231,11 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
             )
 
+        if device is None:
+            device = get_device()
+            if device == "cpu":
+                raise RuntimeError("`enable_sequential_cpu_offload` requires accelerator, but not found")
+
         torch_device = torch.device(device)
         device_index = torch_device.index
 
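
For context, a minimal usage sketch of the new default behavior; the checkpoint name and the available hardware are illustrative assumptions, not part of this commit:

    import torch
    from diffusers import DiffusionPipeline

    # Illustrative checkpoint; any diffusers pipeline behaves the same way.
    pipe = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
    )

    # No `device` argument: the pipeline now calls `get_device()` and resolves to
    # "cuda" or "xpu", whichever accelerator is present. On a CPU-only machine this
    # raises RuntimeError instead of assuming CUDA.
    pipe.enable_model_cpu_offload()

    # Passing a device explicitly still works, e.g. on an Intel GPU host:
    # pipe.enable_model_cpu_offload(device="xpu")
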
src/diffusers/utils/torch_utils.py (+9)

@@ -159,3 +159,12 @@ def get_torch_cuda_device_capability():
         return float(compute_capability)
     else:
         return None
+
+
+def get_device():
+    if torch.cuda.is_available():
+        return "cuda"
+    elif hasattr(torch, "xpu") and torch.xpu.is_available():
+        return "xpu"
+    else:
+        return "cpu"

tests/pipelines/test_pipelines.py (+6 -1)

@@ -1816,7 +1816,12 @@ def test_pipe_same_device_id_offload(self):
             feature_extractor=self.dummy_extractor,
         )
 
-        sd.enable_model_cpu_offload(gpu_id=5)
+        # `enable_model_cpu_offload` detects device type when not passed
+        # `enable_model_cpu_offload` raises ValueError if detected device is `cpu`
+        # This test only checks whether `_offload_gpu_id` is set correctly
+        # So the device passed can be any supported `torch.device` type
+        # This allows us to keep the test under `PipelineFastTests`
+        sd.enable_model_cpu_offload(gpu_id=5, device="cuda")
         assert sd._offload_gpu_id == 5
         sd.maybe_free_model_hooks()
         assert sd._offload_gpu_id == 5
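
As a side note, the auto-detection error path could be exercised on a CPU-only runner by patching the helper; this is only a sketch using the `sd` pipeline constructed above, not part of the commit:

    from unittest import mock

    # Force `get_device()` (as imported into pipeline_utils) to report "cpu", so the
    # device=None path hits the RuntimeError guard added in this commit.
    with mock.patch("diffusers.pipelines.pipeline_utils.get_device", return_value="cpu"):
        try:
            sd.enable_model_cpu_offload(gpu_id=5)
        except RuntimeError as err:
            assert "requires accelerator" in str(err)
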
