From 61287def48e3b6153ac23e353de3e7dd5d24ba32 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 12 Apr 2025 12:39:34 +0530 Subject: [PATCH 1/3] docs: promote the usage of automodel. --- docs/source/en/quantization/torchao.md | 25 ++++++++++++------- docs/source/en/quicktour.md | 6 +++++ docs/source/en/training/adapt_a_model.md | 4 +-- .../en/training/distributed_inference.md | 4 +-- .../en/tutorials/inference_with_big_models.md | 8 +++--- .../en/using-diffusers/loading_adapters.md | 4 +-- docs/source/en/using-diffusers/merge_loras.md | 6 ++--- 7 files changed, 35 insertions(+), 22 deletions(-) diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 19a8970fa9df..8008e87251be 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -26,13 +26,13 @@ The example below only quantizes the weights to int8. ```python import torch -from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig +from diffusers import FluxPipeline, AutoModel, TorchAoConfig model_id = "black-forest-labs/FLUX.1-dev" dtype = torch.bfloat16 quantization_config = TorchAoConfig("int8wo") -transformer = FluxTransformer2DModel.from_pretrained( +transformer = AutoModel.from_pretrained( model_id, subfolder="transformer", quantization_config=quantization_config, @@ -99,10 +99,11 @@ To serialize a quantized model in a given dtype, first load the model with the d ```python import torch -from diffusers import FluxTransformer2DModel, TorchAoConfig +from diffusers import AutoModel, TorchAoConfig quantization_config = TorchAoConfig("int8wo") -transformer = FluxTransformer2DModel.from_pretrained( +transformer = from diffusers import AutoModel, TorchAoConfig +.from_pretrained( "black-forest-labs/Flux.1-Dev", subfolder="transformer", quantization_config=quantization_config, @@ -115,9 +116,9 @@ To load a serialized quantized model, use the [`~ModelMixin.from_pretrained`] me ```python import torch -from diffusers import FluxPipeline, FluxTransformer2DModel +from diffusers import FluxPipeline, AutoModel -transformer = FluxTransformer2DModel.from_pretrained("/path/to/flux_int8wo", torch_dtype=torch.bfloat16, use_safetensors=False) +transformer = AutoModel.from_pretrained("/path/to/flux_int8wo", torch_dtype=torch.bfloat16, use_safetensors=False) pipe = FluxPipeline.from_pretrained("black-forest-labs/Flux.1-Dev", transformer=transformer, torch_dtype=torch.bfloat16) pipe.to("cuda") @@ -131,10 +132,10 @@ If you are using `torch<=2.6.0`, some quantization methods, such as `uint4wo`, c ```python import torch from accelerate import init_empty_weights -from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig +from diffusers import FluxPipeline, AutoModel, TorchAoConfig # Serialize the model -transformer = FluxTransformer2DModel.from_pretrained( +transformer = AutoModel.from_pretrained( "black-forest-labs/Flux.1-Dev", subfolder="transformer", quantization_config=TorchAoConfig("uint4wo"), @@ -146,10 +147,16 @@ transformer.save_pretrained("/path/to/flux_uint4wo", safe_serialization=False, m # Load the model state_dict = torch.load("/path/to/flux_uint4wo/diffusion_pytorch_model.bin", weights_only=False, map_location="cpu") with init_empty_weights(): - transformer = FluxTransformer2DModel.from_config("/path/to/flux_uint4wo/config.json") + transformer = AutoModel.from_config("/path/to/flux_uint4wo/config.json") transformer.load_state_dict(state_dict, strict=True, assign=True) ``` + + +With Torch 2.6 or higher, you can directly do: 
`transformer = AutoModel.from_pretrained("/path/to/flux_uint4wo/")`. + + + ## Resources - [TorchAO Quantization API](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md) diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 2d9f7fe3736a..12e3d71fd5a1 100644 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -163,6 +163,12 @@ Models are initiated with the [`~ModelMixin.from_pretrained`] method which also >>> model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True) ``` + + +Use `AutoModel` from `from diffusers import AutoModel` in case you are unsure which model class to use. + + + To access the model parameters, call `model.config`: ```py diff --git a/docs/source/en/training/adapt_a_model.md b/docs/source/en/training/adapt_a_model.md index e6a088675a34..f528c8bfb656 100644 --- a/docs/source/en/training/adapt_a_model.md +++ b/docs/source/en/training/adapt_a_model.md @@ -31,10 +31,10 @@ To adapt your text-to-image model for inpainting, you'll need to change the numb Initialize a [`UNet2DConditionModel`] with the pretrained text-to-image model weights, and change `in_channels` to 9. Changing the number of `in_channels` means you need to set `ignore_mismatched_sizes=True` and `low_cpu_mem_usage=False` to avoid a size mismatch error because the shape is different now. ```py -from diffusers import UNet2DConditionModel +from diffusers import AutoModel model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5" -unet = UNet2DConditionModel.from_pretrained( +unet = AutoModel.from_pretrained( model_id, subfolder="unet", in_channels=9, diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 79b4f785f30c..cfb83119bd98 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -165,10 +165,10 @@ flush() Load the diffusion transformer next which has 12.5B parameters. This time, set `device_map="auto"` to automatically distribute the model across two 16GB GPUs. The `auto` strategy is backed by [Accelerate](https://hf.co/docs/accelerate/index) and available as a part of the [Big Model Inference](https://hf.co/docs/accelerate/concept_guides/big_model_inference) feature. It starts by distributing a model across the fastest device first (GPU) before moving to slower devices like the CPU and hard drive if needed. The trade-off of storing model parameters on slower devices is slower inference latency. 
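(For reference, a minimal sketch of the `AutoModel` + `device_map="auto"` loading pattern this hunk switches to, using the same Flux checkpoint referenced throughout the PR; the trailing `hf_device_map` check is illustrative and assumes Accelerate attaches that attribute to the loaded model.)

```py
import torch
from diffusers import AutoModel

# "auto" lets Accelerate's Big Model Inference place submodules on the fastest
# devices first (GPUs), spilling over to CPU and disk only if needed.
transformer = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Inspect where each submodule ended up.
print(transformer.hf_device_map)
```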
```py -from diffusers import FluxTransformer2DModel +from diffusers import AutoModel import torch -transformer = FluxTransformer2DModel.from_pretrained( +transformer = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", device_map="auto", diff --git a/docs/source/en/tutorials/inference_with_big_models.md b/docs/source/en/tutorials/inference_with_big_models.md index 6af2e9bd3253..a2620e95ba29 100644 --- a/docs/source/en/tutorials/inference_with_big_models.md +++ b/docs/source/en/tutorials/inference_with_big_models.md @@ -32,9 +32,9 @@ The denoiser checkpoint can also have multiple shards and supports inference tha For example, let's save a sharded checkpoint for the [SDXL UNet](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/unet): ```python -from diffusers import UNet2DConditionModel +from diffusers import AutoModel -unet = UNet2DConditionModel.from_pretrained( +unet = AutoModel.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet" ) unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB") @@ -43,10 +43,10 @@ unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB") The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]: ```python -from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline +from diffusers import AutoModel, StableDiffusionXLPipeline import torch -unet = UNet2DConditionModel.from_pretrained( +unet = AutoModel.from_pretrained( "sayakpaul/sdxl-unet-sharded", torch_dtype=torch.float16 ) pipeline = StableDiffusionXLPipeline.from_pretrained( diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index 7522996b2424..b6d606afa44c 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -134,7 +134,7 @@ The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads L - the LoRA weights don't have separate identifiers for the UNet and text encoder - the LoRA weights have separate identifiers for the UNet and text encoder -To directly load (and save) a LoRA adapter at the *model-level*, use [`~PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder. +To directly load (and save) a LoRA adapter at the *model-level*, use [`~loaders.PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`loaders.PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder. Use the `weight_name` parameter to specify the specific weight file and the `prefix` parameter to filter for the appropriate state dicts (`"unet"` in this case) to load. @@ -155,7 +155,7 @@ image -Save an adapter with [`~PeftAdapterMixin.save_lora_adapter`]. 
+Save an adapter with [`~loaders.PeftAdapterMixin.save_lora_adapter`]. To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights: diff --git a/docs/source/en/using-diffusers/merge_loras.md b/docs/source/en/using-diffusers/merge_loras.md index eb7d7d57ef3d..e3ade4b01cf0 100644 --- a/docs/source/en/using-diffusers/merge_loras.md +++ b/docs/source/en/using-diffusers/merge_loras.md @@ -66,10 +66,10 @@ Let's dive deeper into what these steps entail. 1. Load a UNet that corresponds to the UNet in the LoRA checkpoint. In this case, both LoRAs use the SDXL UNet as their base model. ```python -from diffusers import UNet2DConditionModel +from diffusers import AutoModel import torch -unet = UNet2DConditionModel.from_pretrained( +unet = AutoModel.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, @@ -136,7 +136,7 @@ feng_peft_model.load_state_dict(original_state_dict, strict=True) ```python from peft import PeftModel -base_unet = UNet2DConditionModel.from_pretrained( +base_unet = AutoModel.from_pretrained( "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, From 7b09d62e72160ec0d1b851908ae65635501c7685 Mon Sep 17 00:00:00 2001 From: sayakpaul Date: Sat, 12 Apr 2025 12:41:25 +0530 Subject: [PATCH 2/3] bitsandbytes --- docs/source/en/quantization/bitsandbytes.md | 32 ++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md index 266daa01935e..744351c9b15e 100644 --- a/docs/source/en/quantization/bitsandbytes.md +++ b/docs/source/en/quantization/bitsandbytes.md @@ -49,7 +49,7 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig -from diffusers import FluxTransformer2DModel +from diffusers import AutoModel from transformers import T5EncoderModel quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True,) @@ -63,7 +63,7 @@ text_encoder_2_8bit = T5EncoderModel.from_pretrained( quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True,) -transformer_8bit = FluxTransformer2DModel.from_pretrained( +transformer_8bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quant_config, @@ -74,7 +74,7 @@ transformer_8bit = FluxTransformer2DModel.from_pretrained( By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter. ```diff -transformer_8bit = FluxTransformer2DModel.from_pretrained( +transformer_8bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quant_config, @@ -133,7 +133,7 @@ For Ada and higher-series GPUs. 
we recommend changing `torch_dtype` to `torch.bf from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig -from diffusers import FluxTransformer2DModel +from diffusers import AutoModel from transformers import T5EncoderModel quant_config = TransformersBitsAndBytesConfig(load_in_4bit=True,) @@ -147,7 +147,7 @@ text_encoder_2_4bit = T5EncoderModel.from_pretrained( quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True,) -transformer_4bit = FluxTransformer2DModel.from_pretrained( +transformer_4bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quant_config, @@ -158,7 +158,7 @@ transformer_4bit = FluxTransformer2DModel.from_pretrained( By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter. ```diff -transformer_4bit = FluxTransformer2DModel.from_pretrained( +transformer_4bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quant_config, @@ -217,11 +217,11 @@ print(model.get_memory_footprint()) Quantized models can be loaded from the [`~ModelMixin.from_pretrained`] method without needing to specify the `quantization_config` parameters: ```py -from diffusers import FluxTransformer2DModel, BitsAndBytesConfig +from diffusers import AutoModel, BitsAndBytesConfig quantization_config = BitsAndBytesConfig(load_in_4bit=True) -model_4bit = FluxTransformer2DModel.from_pretrained( +model_4bit = AutoModel.from_pretrained( "hf-internal-testing/flux.1-dev-nf4-pkg", subfolder="transformer" ) ``` @@ -243,13 +243,13 @@ An "outlier" is a hidden state value greater than a certain threshold, and these To find the best threshold for your model, we recommend experimenting with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]: ```py -from diffusers import FluxTransformer2DModel, BitsAndBytesConfig +from diffusers import AutoModel, BitsAndBytesConfig quantization_config = BitsAndBytesConfig( load_in_8bit=True, llm_int8_threshold=10, ) -model_8bit = FluxTransformer2DModel.from_pretrained( +model_8bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quantization_config, @@ -305,7 +305,7 @@ NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig -from diffusers import FluxTransformer2DModel +from diffusers import AutoModel from transformers import T5EncoderModel quant_config = TransformersBitsAndBytesConfig( @@ -325,7 +325,7 @@ quant_config = DiffusersBitsAndBytesConfig( bnb_4bit_quant_type="nf4", ) -transformer_4bit = FluxTransformer2DModel.from_pretrained( +transformer_4bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quant_config, @@ -343,7 +343,7 @@ Nested quantization is a technique that can save additional memory at no additio from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig -from diffusers import FluxTransformer2DModel +from diffusers import AutoModel from transformers import T5EncoderModel quant_config = TransformersBitsAndBytesConfig( @@ -363,7 +363,7 @@ quant_config = 
DiffusersBitsAndBytesConfig( bnb_4bit_use_double_quant=True, ) -transformer_4bit = FluxTransformer2DModel.from_pretrained( +transformer_4bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quant_config, @@ -379,7 +379,7 @@ Once quantized, you can dequantize a model to its original precision, but this m from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig -from diffusers import FluxTransformer2DModel +from diffusers import AutoModel from transformers import T5EncoderModel quant_config = TransformersBitsAndBytesConfig( @@ -399,7 +399,7 @@ quant_config = DiffusersBitsAndBytesConfig( bnb_4bit_use_double_quant=True, ) -transformer_4bit = FluxTransformer2DModel.from_pretrained( +transformer_4bit = AutoModel.from_pretrained( "black-forest-labs/FLUX.1-dev", subfolder="transformer", quantization_config=quant_config, From 9d36ebc227f48c6714a8f1efa005baae0544751b Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 15 Apr 2025 09:15:33 +0530 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/quantization/torchao.md | 10 +++------- docs/source/en/quicktour.md | 7 ++----- docs/source/en/using-diffusers/loading_adapters.md | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index 8008e87251be..a493cc830b47 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -102,8 +102,7 @@ import torch from diffusers import AutoModel, TorchAoConfig quantization_config = TorchAoConfig("int8wo") -transformer = from diffusers import AutoModel, TorchAoConfig -.from_pretrained( +transformer = AutoModel.from_pretrained( "black-forest-labs/Flux.1-Dev", subfolder="transformer", quantization_config=quantization_config, @@ -151,11 +150,8 @@ with init_empty_weights(): transformer.load_state_dict(state_dict, strict=True, assign=True) ``` - - -With Torch 2.6 or higher, you can directly do: `transformer = AutoModel.from_pretrained("/path/to/flux_uint4wo/")`. - - +> [!TIP] +> The [`AutoModel`] API is supported for PyTorch >= 2.6 as shown in the examples below. ## Resources diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 12e3d71fd5a1..14f567d46163 100644 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -163,11 +163,8 @@ Models are initiated with the [`~ModelMixin.from_pretrained`] method which also >>> model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True) ``` - - -Use `AutoModel` from `from diffusers import AutoModel` in case you are unsure which model class to use. - - +> [!TIP] +> Use the [`AutoModel`] API to automatically select a model class if you're unsure of which one to use. 
To access the model parameters, call `model.config`: diff --git a/docs/source/en/using-diffusers/loading_adapters.md b/docs/source/en/using-diffusers/loading_adapters.md index b6d606afa44c..3400774e6b6a 100644 --- a/docs/source/en/using-diffusers/loading_adapters.md +++ b/docs/source/en/using-diffusers/loading_adapters.md @@ -134,7 +134,7 @@ The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads L - the LoRA weights don't have separate identifiers for the UNet and text encoder - the LoRA weights have separate identifiers for the UNet and text encoder -To directly load (and save) a LoRA adapter at the *model-level*, use [`~loaders.PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`loaders.PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder. +To directly load (and save) a LoRA adapter at the *model-level*, use [`~loaders.PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`~loaders.PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder. Use the `weight_name` parameter to specify the specific weight file and the `prefix` parameter to filter for the appropriate state dicts (`"unet"` in this case) to load.
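(For reference, a minimal sketch combining the `AutoModel` loading this PR promotes with the model-level `load_lora_adapter` call documented in the hunk above; the LoRA repository id and weight file name are illustrative placeholders, and the keyword names follow the surrounding prose rather than a verified signature.)

```py
import torch
from diffusers import AutoModel

# AutoModel resolves the concrete class (here the SDXL UNet) from the
# checkpoint's config, so the class name no longer has to be spelled out.
unet = AutoModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    subfolder="unet",
    torch_dtype=torch.float16,
)

# Model-level LoRA loading: `prefix="unet"` filters the state dict down to the
# UNet keys and `weight_name` picks the file inside the (placeholder) repo.
unet.load_lora_adapter(
    "your-username/sdxl-lora",                       # placeholder repo id
    weight_name="pytorch_lora_weights.safetensors",  # placeholder file name
    prefix="unet",
)
```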