     numpy_to_pil,
 )
 from ..utils.hub_utils import _check_legacy_sharding_variant_format, load_or_create_model_card, populate_model_card
-from ..utils.torch_utils import is_compiled_module
+from ..utils.torch_utils import get_device, is_compiled_module
 
 
 if is_torch_npu_available():
@@ -1084,19 +1084,20 @@ def remove_all_hooks(self):
                 accelerate.hooks.remove_hook_from_module(model, recurse=True)
         self._all_hooks = []
 
-    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
-        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
-        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
-        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the accelerator when its
+        `forward` method is called, and the model remains on the accelerator until the next model runs. Memory
+        savings are lower than with `enable_sequential_cpu_offload`, but performance is much better due to the
+        iterative execution of the `unet`.
 
         Arguments:
             gpu_id (`int`, *optional*):
                 The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+            device (`torch.Device` or `str`, *optional*, defaults to None):
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                default to "cuda".
+                automatically detect the available accelerator and use it.
         """
         self._maybe_raise_error_if_group_offload_active(raise_error=True)
 
@@ -1118,6 +1119,11 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t
 
         self.remove_all_hooks()
 
+        if device is None:
+            device = get_device()
+            if device == "cpu":
+                raise RuntimeError("`enable_model_cpu_offload` requires an accelerator, but none was found")
+
         torch_device = torch.device(device)
         device_index = torch_device.index
 
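With this change, callers no longer need to pass `device="cuda"` explicitly: when `device` is left unset, the offload hook resolves the accelerator via `get_device()`. A minimal usage sketch of the new behavior (the checkpoint name and prompt are illustrative only, not part of this PR):

```python
import torch
from diffusers import DiffusionPipeline

# Any pipeline works the same way; this checkpoint is only an example.
pipe = DiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# No `device` argument: the accelerator (CUDA, XPU, MPS, ...) is detected
# automatically instead of being hard-coded to "cuda".
pipe.enable_model_cpu_offload()

# An explicit device or gpu_id still works exactly as before, e.g.:
# pipe.enable_model_cpu_offload(gpu_id=1)

image = pipe("an astronaut riding a horse").images[0]
```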
@@ -1196,20 +1202,20 @@ def maybe_free_model_hooks(self):
         # make sure the model is in the same state as before calling it
         self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda"))
 
-    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = None):
         r"""
         Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state
         dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU
-        and then moved to `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward`
-        method called. Offloading happens on a submodule basis. Memory savings are higher than with
+        and then moved to `torch.device('meta')` and loaded to the accelerator only when their specific submodule has
+        its `forward` method called. Offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
 
         Arguments:
             gpu_id (`int`, *optional*):
                 The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
-            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+            device (`torch.Device` or `str`, *optional*, defaults to None):
                 The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
-                default to "cuda".
+                automatically detect the available accelerator and use it.
         """
         self._maybe_raise_error_if_group_offload_active(raise_error=True)
 
@@ -1225,6 +1231,11 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un
                 "It seems like you have activated a device mapping strategy on the pipeline so calling `enable_sequential_cpu_offload() isn't allowed. You can call `reset_device_map()` first and then call `enable_sequential_cpu_offload()`."
             )
 
+        if device is None:
+            device = get_device()
+            if device == "cpu":
+                raise RuntimeError("`enable_sequential_cpu_offload` requires an accelerator, but none was found")
+
         torch_device = torch.device(device)
         device_index = torch_device.index
 
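The same guard is applied to `enable_sequential_cpu_offload`. For readers who want to see the detection step in isolation, here is a rough, self-contained sketch of what the `device is None` fallback amounts to; `detect_accelerator` and `resolve_offload_device` below are hypothetical stand-ins for illustration, not the actual `get_device()` implementation:

```python
from typing import Optional

import torch


def detect_accelerator() -> str:
    # Hypothetical stand-in for diffusers' `get_device()` helper: report the
    # first available accelerator backend, or "cpu" when none is found.
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def resolve_offload_device(device: Optional[str] = None) -> torch.device:
    # Mirrors the new guard in both offload methods: auto-detect only when the
    # caller did not pass a device, and refuse to offload on a CPU-only host.
    if device is None:
        device = detect_accelerator()
        if device == "cpu":
            raise RuntimeError("CPU offloading requires an accelerator, but none was found")
    return torch.device(device)


print(resolve_offload_device("cuda"))  # explicit device, used as-is
print(resolve_offload_device())        # auto-detected, or RuntimeError on CPU-only hosts
```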