diff --git a/comfyui/cogvideox_fun/nodes.py b/comfyui/cogvideox_fun/nodes.py index 61b2bf6e..06a9264d 100755 --- a/comfyui/cogvideox_fun/nodes.py +++ b/comfyui/cogvideox_fun/nodes.py @@ -92,7 +92,7 @@ def loadmodel(self, GPU_memory_mode, model, model_type, precision): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar diff --git a/comfyui/comfyui_nodes.py b/comfyui/comfyui_nodes.py index b648eb28..a5a3704a 100755 --- a/comfyui/comfyui_nodes.py +++ b/comfyui/comfyui_nodes.py @@ -16,7 +16,8 @@ LoadCogVideoXFunModel) from .comfyui_utils import script_directory from .qwenimage.nodes import (CombineQwenImagePipeline, LoadQwenImageLora, - LoadQwenImageModel, LoadQwenImageProcessor, + LoadQwenImageModel, LoadQwenImageProcessor, LoadQwenImageControlNetInPipeline, + LoadQwenImageControlNetInModel, QwenImageControlSampler, QwenImageEditPlusSampler, LoadQwenImageTextEncoderModel, LoadQwenImageTransformerModel, LoadQwenImageVAEModel, QwenImageEditSampler, @@ -461,10 +462,14 @@ def run(self,camera_pose,fx,fy,cx,cy): "LoadQwenImageVAEModel": LoadQwenImageVAEModel, "LoadQwenImageProcessor": LoadQwenImageProcessor, "CombineQwenImagePipeline": CombineQwenImagePipeline, + "LoadQwenImageControlNetInPipeline": LoadQwenImageControlNetInPipeline, + "LoadQwenImageControlNetInModel": LoadQwenImageControlNetInModel, "LoadQwenImageModel": LoadQwenImageModel, "QwenImageT2VSampler": QwenImageT2VSampler, "QwenImageEditSampler": QwenImageEditSampler, + "QwenImageEditPlusSampler": QwenImageEditPlusSampler, + "QwenImageControlSampler": QwenImageControlSampler, "LoadZImageLora": LoadZImageLora, "LoadZImageTextEncoderModel": LoadZImageTextEncoderModel, @@ -549,10 +554,14 @@ def run(self,camera_pose,fx,fy,cx,cy): "LoadQwenImageVAEModel": "Load QwenImage VAE Model", "LoadQwenImageProcessor": "Load QwenImage Processor", "CombineQwenImagePipeline": "Combine QwenImage Pipeline", + "LoadQwenImageControlNetInPipeline": "Load QwenImage ControlNet In Pipeline", + "LoadQwenImageControlNetInModel": "Load QwenImage ControlNet In Model", "LoadQwenImageModel": "Load QwenImage Model", "QwenImageT2VSampler": "QwenImage T2V Sampler", "QwenImageEditSampler": "QwenImage Edit Sampler", + "QwenImageEditPlusSampler": "QwenImage Edit Plus Sampler", + "QwenImageControlSampler": "QwenImage Control Sampler", "LoadZImageLora": "Load ZImage Lora", "LoadZImageTextEncoderModel": "Load ZImage TextEncoder Model", diff --git a/comfyui/qwenimage/README.md b/comfyui/qwenimage/README.md index a0eb1537..025a2779 100644 --- a/comfyui/qwenimage/README.md +++ b/comfyui/qwenimage/README.md @@ -2,13 +2,85 @@ ## a. Model Links and Storage Locations +**Chunked loading is recommended** as it better aligns with ComfyUI's standard workflow. + +### 1. Chunked Loading Weights (Recommended) + +For chunked loading, it is recommended to directly download the Z-Image weights provided by ComfyUI official. 
Please organize the files according to the following directory structure: + +**Core Model Files:** + +| Component | File Name | +|-----------|-----------| +| Text Encoder | [`qwen_2.5_vl_7b_fp8_scaled.safetensors`](https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors) | +| Diffusion Model | [`qwen_image_fp8_e4m3fn.safetensors`](https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/diffusion_models/qwen_image_fp8_e4m3fn.safetensors) | +| VAE | [`qwen_image_vae.safetensors`](https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/vae/qwen_image_vae.safetensors) | +| tokenizer | [`tokenizer`](https://huggingface.co/Qwen/Qwen-Image-Edit/tree/main/tokenizer) | +| processor | [`processor`](https://huggingface.co/Qwen/Qwen-Image-Edit/tree/main/processor) | + +**ControlNet Model Files:** + +| Name | Storage | Hugging Face | Model Scope | Description | +|--|--|--|--|--| +| Qwen-Image-2512-Fun-Controlnet-Union | - | [🤗Link](https://huggingface.co/alibaba-pai/Qwen-Image-2512-Fun-Controlnet-Union) | [😄Link](https://modelscope.cn/models/PAI/Qwen-Image-2512-Fun-Controlnet-Union) | ControlNet weights for Qwen-Image-2512, supporting multiple control conditions such as Canny, Depth, Pose, MLSD, Scribble, etc. | + +**Storage Location:** + +``` +📂 ComfyUI/ +├── 📂 models/ +│ ├── 📂 text_encoders/ +│ │ └── qwen_2.5_vl_7b_fp8_scaled.safetensors +│ ├── 📂 diffusion_models/ +│ │ └── qwen_image_fp8_e4m3fn.safetensors` +│ ├── 📂 vae/ +│ │ └── qwen_image_vae.safetensors +│ ├── 📂 Fun_Models/ +│ │ ├── qwen2_tokenizer/ +│ │ └── qwen2_processor/ +│ └── 📂 model_patches/ +│ └── Qwen-Image-2512-Fun-Controlnet-Union.safetensors +``` + +### 2. Preprocessing Weights (Optional) + +If you want to use the control preprocessing nodes, you can download the preprocessing weights to `ComfyUI/custom_nodes/Fun_Models/Third_Party/`. + +**Required Files:** + +| File Name | Download Link | Purpose | +|-----------|---------------|---------| +| `yolox_l.onnx` | [Download](https://huggingface.co/yzd-v/DWPose/resolve/main/yolox_l.onnx) | YOLO Detection Model | +| `dw-ll_ucoco_384.onnx` | [Download](https://huggingface.co/yzd-v/DWPose/resolve/main/dw-ll_ucoco_384.onnx) | DWPose Pose Estimation Model | +| `ZoeD_M12_N.pt` | [Download](https://huggingface.co/lllyasviel/Annotators/resolve/main/ZoeD_M12_N.pt) | ZoeDepth Depth Estimation Model | + +**Storage Location:** + +``` +📂 ComfyUI/ +├── 📂 models/ +│ └── 📂 Fun_Models/ +│ └── 📂 Third_Party +│ ├── yolox_l.onnx +│ ├── dw-ll_ucoco_384.onnx +│ └── ZoeD_M12_N.pt +``` + +### 3. Full Model Loading (Optional) + +If you prefer full model loading, you can directly download the diffusers weights. 
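+For example, one way to fetch a diffusers-format repository from the table below into `ComfyUI/models/Fun_Models/` (a minimal sketch assuming `huggingface_hub` is installed; adjust the repo id to the model you need):
+
+```python
+# Illustrative only: download a diffusers-format Qwen-Image checkpoint for full model loading.
+from huggingface_hub import snapshot_download
+
+snapshot_download(
+    repo_id="Qwen/Qwen-Image",                         # or any repo from the table below
+    local_dir="ComfyUI/models/Fun_Models/Qwen-Image",  # storage location described below
+)
+```
+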
+ **Required Files:** | Name | Storage | Hugging Face | Model Scope | Description | |--|--|--|--|--| | Qwen-Image | [🤗Link](https://huggingface.co/Qwen/Qwen-Image) | [😄Link](https://modelscope.cn/models/Qwen/Qwen-Image) | Official Qwen-Image weights | +| Qwen-Image-2512 | [🤗Link](https://huggingface.co/Qwen/Qwen-Image-2512) | [😄Link](https://modelscope.cn/models/Qwen/Qwen-Image-2512) | Official Qwen-Image weights | | Qwen-Image-Edit | [🤗Link](https://huggingface.co/Qwen/Qwen-Image-Edit) | [😄Link](https://modelscope.cn/models/Qwen/Qwen-Image-Edit) | Official Qwen-Image-Edit weights | | Qwen-Image-Edit-2509 | [🤗Link](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) | [😄Link](https://modelscope.cn/models/Qwen/Qwen-Image-Edit-2509) | Official Qwen-Image-Edit-2509 weights | +| Qwen-Image-Edit-2511 | [🤗Link](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) | [😄Link](https://modelscope.cn/models/Qwen/Qwen-Image-Edit-2511) | Official Qwen-Image-Edit-2511 weights | + +For full model loading, use the diffusers version of Qwen-Image Turbo and place the model in `ComfyUI/models/Fun_Models/`. **Storage Location:** @@ -26,10 +98,26 @@ [Qwen-Image Text to Image](v1/qwenimage_chunked_loading_workflow_t2i.json) +[Qwen-Image Text to Image Control](v1/qwenimage_chunked_loading_workflow_t2i_control.json) + +[Qwen-Image Text to Image Inpaint](v1/qwenimage_chunked_loading_workflow_t2i_inpaint.json) + [Qwen-Image Edit](v1/qwenimage_chunked_loading_workflow_edit.json) +[Qwen-Image Edit 2509](v1/qwenimage_chunked_loading_workflow_edit_2509.json) + +[Qwen-Image Edit 2511](v1/qwenimage_chunked_loading_workflow_edit_2511.json) + ### 2. Full Model Loading (Optional) [Qwen-Image Text to Image](v1/qwenimage_workflow_t2i.json) -[Qwen-Image Edit](v1/qwenimage_workflow_edit.json) \ No newline at end of file +[Qwen-Image Text to Image Control](v1/qwenimage_workflow_t2i_control.json) + +[Qwen-Image Text to Image Inpaint](v1/qwenimage_workflow_t2i_inpaint.json) + +[Qwen-Image Edit](v1/qwenimage_workflow_edit.json) + +[Qwen-Image Edit 2509](v1/qwenimage_workflow_edit_2509.json) + +[Qwen-Image Edit 2511](v1/qwenimage_workflow_edit_2511.json) \ No newline at end of file diff --git a/comfyui/qwenimage/nodes.py b/comfyui/qwenimage/nodes.py index 1189b683..12542867 100644 --- a/comfyui/qwenimage/nodes.py +++ b/comfyui/qwenimage/nodes.py @@ -6,6 +6,7 @@ import json import os +import accelerate import comfy.model_management as mm import cv2 import folder_paths @@ -13,18 +14,32 @@ import torch from comfy.utils import ProgressBar, load_torch_file from diffusers import FlowMatchEulerDiscreteScheduler +from diffusers import __version__ as diffusers_version from einops import rearrange from omegaconf import OmegaConf -from PIL import Image +from safetensors.torch import load_file + +if diffusers_version >= "0.33.0": + from diffusers.models.model_loading_utils import load_model_dict_into_meta +else: + from diffusers.models.modeling_utils import \ + load_model_dict_into_meta from ...videox_fun.data.bucket_sampler import (ASPECT_RATIO_512, get_closest_ratio) from ...videox_fun.models import (AutoencoderKLQwenImage, Qwen2_5_VLConfig, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor, + QwenImageControlTransformer2DModel, QwenImageTransformer2DModel) from ...videox_fun.models.cache_utils import get_teacache_coefficients -from ...videox_fun.pipeline import QwenImageEditPipeline, QwenImagePipeline +from ...videox_fun.pipeline import (QwenImageControlPipeline, + QwenImageEditPipeline, + QwenImageEditPlusPipeline, + 
QwenImagePipeline) +from ...videox_fun.utils import (register_auto_device_hook, + safe_enable_group_offload, + safe_remove_group_offloading) from ...videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler from ...videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler from ...videox_fun.utils.fp8_optimization import ( @@ -32,7 +47,7 @@ replace_parameters_by_name, undo_convert_weight_dtype_wrapper) from ...videox_fun.utils.lora_utils import merge_lora, unmerge_lora from ...videox_fun.utils.utils import (filter_kwargs, get_autocast_dtype, - get_image) + get_image, get_image_latent) from ..comfyui_utils import (eas_cache_dir, script_directory, search_model_in_possible_folders, search_sub_dir_in_possible_folders, to_pil) @@ -77,7 +92,10 @@ def INPUT_TYPES(s): "required": { "model_name": ( folder_paths.get_filename_list("diffusion_models"), - {"default": "Wan2_1-T2V-1_3B_bf16.safetensors,"}, + {"default": "qwen_image_fp8_e4m3fn.safetensors",}, + ), + "zero_cond_t":( + [False, True], {"default": False,} ), "precision": (["fp16", "bf16"], {"default": "bf16"} @@ -89,14 +107,14 @@ def INPUT_TYPES(s): FUNCTION = "loadmodel" CATEGORY = "CogVideoXFUNWrapper" - def loadmodel(self, model_name, precision): + def loadmodel(self, model_name, zero_cond_t, precision): # Init weight_dtype and device device = mm.get_torch_device() offload_device = mm.unet_offload_device() weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() transformer = None @@ -118,14 +136,43 @@ def loadmodel(self, model_name, precision): "num_layers": 60, "out_channels": 16, "patch_size": 2, - "pooled_projection_dim": 768 + "zero_cond_t": zero_cond_t, } sig = inspect.signature(QwenImageTransformer2DModel) accepted = {k: v for k, v in kwargs.items() if k in sig.parameters} - transformer = QwenImageTransformer2DModel(**accepted) - transformer.load_state_dict(transformer_state_dict) - transformer = transformer.eval().to(device=offload_device, dtype=weight_dtype) + with accelerate.init_empty_weights(): + transformer = QwenImageTransformer2DModel(**accepted) + + new_state_dict = {} + for key, value in transformer_state_dict.items(): + if key.startswith('model.diffusion_model.'): + new_key = key.replace('model.diffusion_model.', '') + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + transformer_state_dict = new_state_dict + + if diffusers_version >= "0.33.0": + # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit: + # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785. 
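+                # Note: the call below omits the `device` argument and the deprecated-attention-block
+                # conversion that the legacy (< 0.33.0) branch still performs before loading.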
+ load_model_dict_into_meta( + transformer, + transformer_state_dict, + dtype=weight_dtype, + model_name_or_path="", + ) + else: + transformer._convert_deprecated_attention_blocks(transformer_state_dict) + unexpected_keys = load_model_dict_into_meta( + transformer, + transformer_state_dict, + device=offload_device, + dtype=weight_dtype, + model_name_or_path="", + ) + + transformer = transformer.eval() return (transformer, model_name_in_pipeline) class LoadQwenImageVAEModel: @@ -135,7 +182,7 @@ def INPUT_TYPES(s): "required": { "model_name": ( folder_paths.get_filename_list("vae"), - {"default": "QwenImage2.1_VAE.pth"} + {"default": "qwen_image_vae.safetensors"} ), "precision": (["fp16", "bf16"], {"default": "bf16"} @@ -238,7 +285,7 @@ def INPUT_TYPES(s): "required": { "model_name": ( folder_paths.get_filename_list("text_encoders"), - {"default": "models_t5_umt5-xxl-enc-bf16.pth"} + {"default": "qwen_2.5_vl_7b_fp8_scaled.safetensors", } ), "precision": (["fp16", "bf16"], {"default": "bf16"} @@ -396,11 +443,31 @@ def loadmodel(self, model_name, precision,): } config = Qwen2_5_VLConfig(**kwargs) text_encoder = Qwen2_5_VLForConditionalGeneration._from_config(config) - def transform_key(key): - key = key.replace("model.", "model.language_model.") - key = key.replace("visual.", "model.visual.") - return key - text_state_dict = {transform_key(k): v for k, v in text_state_dict.items()} + + new_state_dict = {} + scale_dict = {} + for key, value in text_state_dict.items(): + if 'scale_input' in key or 'scale_weight' in key: + scale_dict[key] = value + + for key, value in text_state_dict.items(): + if 'scale_input' in key or 'scale_weight' in key or key == 'scaled_fp8': + continue + if key.startswith('visual.'): + new_key = 'model.' + key + elif key.startswith('model.layers.') or key.startswith('model.embed_tokens.') or key.startswith('model.norm.'): + new_key = 'model.language_' + key + else: + new_key = key + + if '.weight' in key and value.dtype == torch.float8_e4m3fn: + scale_key = key.replace('.weight', '.scale_weight') + if scale_key in scale_dict: + value = value.float() * scale_dict[scale_key].float() + + new_state_dict[new_key] = value + + text_state_dict = new_state_dict text_encoder.load_state_dict(text_state_dict) text_encoder = text_encoder.eval().to(device=offload_device, dtype=weight_dtype) @@ -461,7 +528,9 @@ def INPUT_TYPES(s): "tokenizer": ("Tokenizer",), "model_name": ("STRING",), "GPU_memory_mode":( - ["model_full_load", "model_full_load_and_qfloat8","model_cpu_offload", "model_cpu_offload_and_qfloat8", "sequential_cpu_offload"], + [ + "model_full_load", "model_full_load_and_qfloat8", "model_cpu_offload", + "model_cpu_offload_and_qfloat8", "model_group_offload", "sequential_cpu_offload"], { "default": "model_cpu_offload", } @@ -484,17 +553,31 @@ def loadmodel(self, model_name, GPU_memory_mode, transformer, vae, text_encoder, offload_device = mm.unet_offload_device() # Get pipeline - model_type = "Inpaint" + if hasattr(transformer, "control_layers"): + model_type = "Control" + else: + model_type = "Inpaint" + if model_type == "Inpaint": if processor is not None: - pipeline = QwenImageEditPipeline( - vae=vae, - tokenizer=tokenizer, - text_encoder=text_encoder, - transformer=transformer, - scheduler=None, - processor=processor, - ) + if "2509" in model_name or "2511" in model_name: + pipeline = QwenImageEditPlusPipeline( + vae=vae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=None, + processor=processor, + ) + else: + pipeline = 
QwenImageEditPipeline( + vae=vae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=None, + processor=processor, + ) else: pipeline = QwenImagePipeline( vae=vae, @@ -504,15 +587,24 @@ def loadmodel(self, model_name, GPU_memory_mode, transformer, vae, text_encoder, scheduler=None, ) else: - raise ValueError("Not supported now.") + pipeline = QwenImageControlPipeline( + vae=vae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=None, + ) pipeline.remove_all_hooks() + safe_remove_group_offloading(pipeline) undo_convert_weight_dtype_wrapper(transformer) - pipeline.to(device=offload_device) transformer = transformer.to(weight_dtype) if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) + elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device=offload_device, offload_type="leaf_level", use_stream=True) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) @@ -528,6 +620,7 @@ def loadmodel(self, model_name, GPU_memory_mode, transformer, vae, text_encoder, funmodels = { 'pipeline': pipeline, + 'GPU_memory_mode': GPU_memory_mode, 'dtype': weight_dtype, 'model_name': model_name, 'model_type': model_type, @@ -544,14 +637,19 @@ def INPUT_TYPES(s): "model": ( [ 'Qwen-Image', + 'Qwen-Image-2512', 'Qwen-Image-Edit', + 'Qwen-Image-Edit-2509', + 'Qwen-Image-Edit-2511', ], { "default": 'Qwen-Image', } ), "GPU_memory_mode":( - ["model_full_load", "model_full_load_and_qfloat8","model_cpu_offload", "model_cpu_offload_and_qfloat8", "sequential_cpu_offload"], + [ + "model_full_load", "model_full_load_and_qfloat8", "model_cpu_offload", + "model_cpu_offload_and_qfloat8", "model_group_offload", "sequential_cpu_offload"], { "default": "model_cpu_offload", } @@ -577,7 +675,7 @@ def loadmodel(self, GPU_memory_mode, model, precision): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar @@ -640,14 +738,24 @@ def loadmodel(self, GPU_memory_mode, model, precision): model_type = "Inpaint" if model_type == "Inpaint": if need_processor: - pipeline = QwenImageEditPipeline( - vae=vae, - tokenizer=tokenizer, - text_encoder=text_encoder, - transformer=transformer, - scheduler=None, - processor=processor, - ) + if "2509" in model_name or "2511" in model_name: + pipeline = QwenImageEditPlusPipeline( + vae=vae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=None, + processor=processor, + ) + else: + pipeline = QwenImageEditPipeline( + vae=vae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=None, + processor=processor, + ) else: pipeline = QwenImagePipeline( vae=vae, @@ -664,6 +772,9 @@ def loadmodel(self, GPU_memory_mode, model, precision): if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) + elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device=offload_device, offload_type="leaf_level", use_stream=True) elif 
GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) @@ -679,6 +790,7 @@ def loadmodel(self, GPU_memory_mode, model, precision): funmodels = { 'pipeline': pipeline, + 'GPU_memory_mode': GPU_memory_mode, 'dtype': weight_dtype, 'model_name': model_name, 'model_type': model_type, @@ -713,6 +825,240 @@ def load_lora(self, funmodels, lora_name, strength_model, lora_cache): new_funmodels['lora_cache'] = lora_cache return (new_funmodels,) +class LoadQwenImageControlNetInPipeline: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "config": ( + [ + "qwenimage/qwenimage_control.yaml", + ], + { + "default": "qwenimage/qwenimage_control.yaml", + } + ), + "model_name": ( + folder_paths.get_filename_list("model_patches"), + {"default": "Qwen-Image-2512-Fun-Controlnet-Union.safetensors", }, + ), + "sub_transformer_name":( + ["transformer", "transformer_2"], + { + "default": "transformer", + } + ), + "funmodels": ("FunModels",), + }, + } + + RETURN_TYPES = ("FunModels",) + RETURN_NAMES = ("funmodels",) + FUNCTION = "loadmodel" + CATEGORY = "CogVideoXFUNWrapper" + + def loadmodel(self, config, model_name, sub_transformer_name, funmodels): + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + GPU_memory_mode = funmodels["GPU_memory_mode"] + weight_dtype = funmodels['dtype'] + + # Get Transformer + transformer = getattr(funmodels["pipeline"], sub_transformer_name) + transformer = transformer.cpu() + + # Remove hooks + funmodels["pipeline"].remove_all_hooks() + safe_remove_group_offloading(funmodels["pipeline"]) + + # Get state_dict + transformer_state_dict = transformer.state_dict() + del transformer + mm.soft_empty_cache() + gc.collect() + + # Load config + config_path = f"{script_directory}/config/{config}" + config = OmegaConf.load(config_path) + kwargs = { + "attention_head_dim": 128, + "axes_dims_rope": [ + 16, + 56, + 56 + ], + "guidance_embeds": False, + "in_channels": 64, + "joint_attention_dim": 3584, + "num_attention_heads": 24, + "num_layers": 60, + "out_channels": 16, + "patch_size": 2, + "pooled_projection_dim": 768 + } + kwargs.update(OmegaConf.to_container(config['transformer_additional_kwargs'])) + + # Get Model + sig = inspect.signature(QwenImageControlTransformer2DModel) + accepted = {k: v for k, v in kwargs.items() if k in sig.parameters} + with accelerate.init_empty_weights(): + control_transformer = QwenImageControlTransformer2DModel(**accepted).to(weight_dtype) + print(f"Load Control Transformer") + + # Load Control state_dict + control_model_path = folder_paths.get_full_path("model_patches", model_name) + if control_model_path.endswith(".safetensors"): + control_state_dict = load_file(control_model_path) + else: + control_state_dict = torch.load(control_model_path) + + state_dict = {**transformer_state_dict, **control_state_dict} + if diffusers_version >= "0.33.0": + # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit: + # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785. 
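+            # `state_dict` merges the base transformer weights with the ControlNet patch weights and is
+            # materialized here onto the meta-initialized control transformer; as in the transformer loader
+            # above, only the legacy (< 0.33.0) branch passes `device=` and converts deprecated attention blocks.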
+ load_model_dict_into_meta( + control_transformer, + state_dict, + dtype=weight_dtype, + model_name_or_path="", + ) + else: + control_transformer._convert_deprecated_attention_blocks(state_dict) + load_model_dict_into_meta( + control_transformer, + state_dict, + device=offload_device, + dtype=weight_dtype, + model_name_or_path="", + ) + + pipeline = QwenImageControlPipeline( + vae=funmodels["pipeline"].vae, + tokenizer=funmodels["pipeline"].tokenizer, + text_encoder=funmodels["pipeline"].text_encoder, + transformer=control_transformer, + scheduler=funmodels["pipeline"].scheduler, + ) + del funmodels["pipeline"] + mm.soft_empty_cache() + gc.collect() + + if GPU_memory_mode == "sequential_cpu_offload": + pipeline.enable_sequential_cpu_offload(device=device) + elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device=offload_device, offload_type="leaf_level", use_stream=True) + elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": + convert_model_weight_to_float8(control_transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) + convert_weight_dtype_wrapper(control_transformer, weight_dtype) + pipeline.enable_model_cpu_offload(device=device) + elif GPU_memory_mode == "model_cpu_offload": + pipeline.enable_model_cpu_offload(device=device) + elif GPU_memory_mode == "model_full_load_and_qfloat8": + convert_model_weight_to_float8(control_transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) + convert_weight_dtype_wrapper(control_transformer, weight_dtype) + pipeline.to(device=device) + else: + pipeline.to(device=device) + funmodels["pipeline"] = pipeline + funmodels["model_type"] = "Control" + return (funmodels, ) + +class LoadQwenImageControlNetInModel: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "config": ( + [ + "qwenimage/qwenimage_control.yaml", + ], + { + "default": "qwenimage/qwenimage_control.yaml", + } + ), + "model_name": ( + folder_paths.get_filename_list("model_patches"), + {"default": "Qwen-Image-2512-Fun-Controlnet-Union.safetensors", }, + ), + "transformer": ("TransformerModel",), + }, + } + + RETURN_TYPES = ("TransformerModel",) + RETURN_NAMES = ("transformer",) + FUNCTION = "loadmodel" + CATEGORY = "CogVideoXFUNWrapper" + + def loadmodel(self, config, model_name, transformer): + offload_device = mm.unet_offload_device() + dtype = transformer.dtype + + # Get Transformer + transformer = transformer.cpu() + + # Get state_dict + transformer_state_dict = transformer.state_dict() + del transformer + mm.soft_empty_cache() + gc.collect() + + # Load config + config_path = f"{script_directory}/config/{config}" + config = OmegaConf.load(config_path) + kwargs = { + "attention_head_dim": 128, + "axes_dims_rope": [ + 16, + 56, + 56 + ], + "guidance_embeds": False, + "in_channels": 64, + "joint_attention_dim": 3584, + "num_attention_heads": 24, + "num_layers": 60, + "out_channels": 16, + "patch_size": 2, + "pooled_projection_dim": 768 + } + kwargs.update(OmegaConf.to_container(config['transformer_additional_kwargs'])) + + # Get Model + sig = inspect.signature(QwenImageControlTransformer2DModel) + accepted = {k: v for k, v in kwargs.items() if k in sig.parameters} + with accelerate.init_empty_weights(): + control_transformer = QwenImageControlTransformer2DModel(**accepted).to(dtype) + print(f"Load Control Transformer") + + # Load Control state_dict + control_model_path = 
folder_paths.get_full_path("model_patches", model_name) + if control_model_path.endswith(".safetensors"): + control_state_dict = load_file(control_model_path) + else: + control_state_dict = torch.load(control_model_path) + + state_dict = {**transformer_state_dict, **control_state_dict} + if diffusers_version >= "0.33.0": + # Diffusers has refactored `load_model_dict_into_meta` since version 0.33.0 in this commit: + # https://github.com/huggingface/diffusers/commit/f5929e03060d56063ff34b25a8308833bec7c785. + load_model_dict_into_meta( + control_transformer, + state_dict, + dtype=dtype, + model_name_or_path="", + ) + else: + control_transformer._convert_deprecated_attention_blocks(state_dict) + load_model_dict_into_meta( + control_transformer, + state_dict, + device=offload_device, + dtype=dtype, + model_name_or_path="", + ) + return (control_transformer, ) + class QwenImageT2VSampler: @classmethod def INPUT_TYPES(s): @@ -894,7 +1240,156 @@ def INPUT_TYPES(s): "INT", {"default": 1, "min": 1, "max": 100, "step": 1} ), "teacache_threshold": ( - "FLOAT", {"default": 0.10, "min": 0.00, "max": 1.00, "step": 0.005} + "FLOAT", {"default": 0.250, "min": 0.00, "max": 1.00, "step": 0.005} + ), + "enable_teacache":( + [False, True], {"default": True,} + ), + "num_skip_start_steps": ( + "INT", {"default": 5, "min": 0, "max": 50, "step": 1} + ), + "teacache_offload":( + [False, True], {"default": True,} + ), + "cfg_skip_ratio":( + "FLOAT", {"default": 0, "min": 0, "max": 1, "step": 0.01} + ), + }, + "optional":{ + "image": ("IMAGE",), + }, + } + + RETURN_TYPES = ("IMAGE",) + RETURN_NAMES =("images",) + FUNCTION = "process" + CATEGORY = "CogVideoXFUNWrapper" + + def process(self, funmodels, prompt, negative_prompt, width, height, seed, steps, cfg, scheduler, shift, teacache_threshold, enable_teacache, num_skip_start_steps, teacache_offload, cfg_skip_ratio, image=None): + global transformer_cpu_cache + global lora_path_before + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + + mm.soft_empty_cache() + gc.collect() + + # Get Pipeline + pipeline = funmodels['pipeline'] + model_name = funmodels['model_name'] + weight_dtype = funmodels['dtype'] + + # Load Sampler + pipeline.scheduler = get_qwen_scheduler(scheduler, shift) + + coefficients = get_teacache_coefficients(model_name) if enable_teacache else None + if coefficients is not None: + print(f"Enable TeaCache with threshold {teacache_threshold} and skip the first {num_skip_start_steps} steps.") + pipeline.transformer.enable_teacache( + coefficients, steps, teacache_threshold, num_skip_start_steps=num_skip_start_steps, offload=teacache_offload + ) + else: + pipeline.transformer.disable_teacache() + + if cfg_skip_ratio is not None: + print(f"Enable cfg_skip_ratio {cfg_skip_ratio}.") + pipeline.transformer.enable_cfg_skip(cfg_skip_ratio, steps) + + generator= torch.Generator(device).manual_seed(seed) + + with torch.no_grad(): + # Apply lora + if funmodels.get("lora_cache", False): + if len(funmodels.get("loras", [])) != 0: + # Save the original weights to cpu + if len(transformer_cpu_cache) == 0: + print('Save transformer state_dict to cpu memory') + transformer_state_dict = pipeline.transformer.state_dict() + for key in transformer_state_dict: + transformer_cpu_cache[key] = transformer_state_dict[key].clone().cpu() + + lora_path_now = str(funmodels.get("loras", []) + funmodels.get("strength_model", [])) + if lora_path_now != lora_path_before: + print('Merge Lora with Cache') + lora_path_before = copy.deepcopy(lora_path_now) + 
pipeline.transformer.load_state_dict(transformer_cpu_cache) + for _lora_path, _lora_weight in zip(funmodels.get("loras", []), funmodels.get("strength_model", [])): + pipeline = merge_lora(pipeline, _lora_path, _lora_weight, device=device, dtype=weight_dtype) + + else: + print('Merge Lora') + # Clear lora when switch from lora_cache=True to lora_cache=False. + if len(transformer_cpu_cache) != 0: + pipeline.transformer.load_state_dict(transformer_cpu_cache) + transformer_cpu_cache = {} + lora_path_before = "" + gc.collect() + + for _lora_path, _lora_weight in zip(funmodels.get("loras", []), funmodels.get("strength_model", [])): + pipeline = merge_lora(pipeline, _lora_path, _lora_weight, device=device, dtype=weight_dtype) + + image = [to_pil(image) for image in image] + image = get_image(image[0]) if image is not None else image + + sample = pipeline( + image = image, + prompt = prompt, + negative_prompt = negative_prompt, + height = height, + width = width, + generator = generator, + true_cfg_scale = cfg, + num_inference_steps = steps, + comfyui_progressbar = True, + ).images + image = torch.Tensor(np.array(sample[0])).unsqueeze(0) / 255 + + if not funmodels.get("lora_cache", False): + print('Unmerge Lora') + for _lora_path, _lora_weight in zip(funmodels.get("loras", []), funmodels.get("strength_model", [])): + pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight, device=device, dtype=weight_dtype) + return (image,) + +class QwenImageEditPlusSampler: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "funmodels": ( + "FunModels", + ), + "prompt": ( + "STRING_PROMPT", + ), + "negative_prompt": ( + "STRING_PROMPT", + ), + "width": ( + "INT", {"default": 1344, "min": 64, "max": 2048, "step": 16} + ), + "height": ( + "INT", {"default": 768, "min": 64, "max": 2048, "step": 16} + ), + "seed": ( + "INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff} + ), + "steps": ( + "INT", {"default": 50, "min": 1, "max": 200, "step": 1} + ), + "cfg": ( + "FLOAT", {"default": 4.0, "min": 1.0, "max": 20.0, "step": 0.01} + ), + "scheduler": ( + ["Flow", "Flow_Unipc", "Flow_DPM++"], + { + "default": 'Flow' + } + ), + "shift": ( + "INT", {"default": 1, "min": 1, "max": 100, "step": 1} + ), + "teacache_threshold": ( + "FLOAT", {"default": 0.250, "min": 0.00, "max": 1.00, "step": 0.005} ), "enable_teacache":( [False, True], {"default": True,} @@ -933,6 +1428,17 @@ def process(self, funmodels, prompt, negative_prompt, width, height, seed, steps model_name = funmodels['model_name'] weight_dtype = funmodels['dtype'] + # Change to QwenImageEditPlusPipeline + if not isinstance(pipeline, QwenImageEditPlusPipeline): + pipeline = QwenImageEditPlusPipeline( + vae=pipeline.vae, + tokenizer=pipeline.tokenizer, + text_encoder=pipeline.text_encoder, + transformer=pipeline.transformer, + processor=pipeline.processor, + scheduler=pipeline.scheduler, + ) + # Load Sampler pipeline.scheduler = get_qwen_scheduler(scheduler, shift) @@ -1003,3 +1509,174 @@ def process(self, funmodels, prompt, negative_prompt, width, height, seed, steps for _lora_path, _lora_weight in zip(funmodels.get("loras", []), funmodels.get("strength_model", [])): pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight, device=device, dtype=weight_dtype) return (image,) + +class QwenImageControlSampler: + @classmethod + def INPUT_TYPES(s): + return { + "required": { + "funmodels": ( + "FunModels", + ), + "prompt": ( + "STRING_PROMPT", + ), + "negative_prompt": ( + "STRING_PROMPT", + ), + "width": ( + "INT", {"default": 1568, "min": 64, 
"max": 20480, "step": 16} + ), + "height": ( + "INT", {"default": 1184, "min": 64, "max": 20480, "step": 16} + ), + "seed": ( + "INT", {"default": 43, "min": 0, "max": 0xffffffffffffffff} + ), + "steps": ( + "INT", {"default": 40, "min": 1, "max": 200, "step": 1} + ), + "cfg": ( + "FLOAT", {"default": 4.0, "min": 0.0, "max": 20.0, "step": 0.01} + ), + "scheduler": ( + ["Flow", "Flow_Unipc", "Flow_DPM++"], + { + "default": 'Flow' + } + ), + "shift": ( + "INT", {"default": 3, "min": 1, "max": 100, "step": 1} + ), + "teacache_threshold": ( + "FLOAT", {"default": 0.250, "min": 0.00, "max": 1.00, "step": 0.005} + ), + "enable_teacache":( + [False, True], {"default": True,} + ), + "num_skip_start_steps": ( + "INT", {"default": 5, "min": 0, "max": 50, "step": 1} + ), + "teacache_offload":( + [False, True], {"default": True,} + ), + "cfg_skip_ratio":( + "FLOAT", {"default": 0, "min": 0, "max": 1, "step": 0.01} + ), + "control_context_scale": ( + "FLOAT", {"default": 0.80, "min": 0.0, "max": 2.0, "step": 0.01} + ), + }, + "optional":{ + "control_image": ("IMAGE",), + "inpaint_image": ("IMAGE",), + "mask_image": ("IMAGE",), + }, + } + + RETURN_TYPES = ("IMAGE",) + RETURN_NAMES =("images",) + FUNCTION = "process" + CATEGORY = "CogVideoXFUNWrapper" + + def process(self, funmodels, prompt, negative_prompt, width, height, seed, steps, cfg, scheduler, shift, teacache_threshold, enable_teacache, num_skip_start_steps, teacache_offload, cfg_skip_ratio, control_context_scale, control_image=None, inpaint_image=None, mask_image=None): + global transformer_cpu_cache + global lora_path_before + device = mm.get_torch_device() + offload_device = mm.unet_offload_device() + + mm.soft_empty_cache() + gc.collect() + + # Get Pipeline + pipeline = funmodels['pipeline'] + model_name = funmodels['model_name'] + weight_dtype = funmodels['dtype'] + sample_size = [height, width] + + # Load Sampler + pipeline.scheduler = get_qwen_scheduler(scheduler, shift) + + coefficients = get_teacache_coefficients(model_name) if enable_teacache else None + if coefficients is not None: + print(f"Enable TeaCache with threshold {teacache_threshold} and skip the first {num_skip_start_steps} steps.") + pipeline.transformer.enable_teacache( + coefficients, steps, teacache_threshold, num_skip_start_steps=num_skip_start_steps, offload=teacache_offload + ) + else: + pipeline.transformer.disable_teacache() + + if cfg_skip_ratio is not None: + print(f"Enable cfg_skip_ratio {cfg_skip_ratio}.") + pipeline.transformer.enable_cfg_skip(cfg_skip_ratio, steps) + + generator= torch.Generator(device).manual_seed(seed) + + with torch.no_grad(): + # Apply lora + if funmodels.get("lora_cache", False): + if len(funmodels.get("loras", [])) != 0: + # Save the original weights to cpu + if len(transformer_cpu_cache) == 0: + print('Save transformer state_dict to cpu memory') + transformer_state_dict = pipeline.transformer.state_dict() + for key in transformer_state_dict: + transformer_cpu_cache[key] = transformer_state_dict[key].clone().cpu() + + lora_path_now = str(funmodels.get("loras", []) + funmodels.get("strength_model", [])) + if lora_path_now != lora_path_before: + print('Merge Lora with Cache') + lora_path_before = copy.deepcopy(lora_path_now) + pipeline.transformer.load_state_dict(transformer_cpu_cache) + for _lora_path, _lora_weight in zip(funmodels.get("loras", []), funmodels.get("strength_model", [])): + pipeline = merge_lora(pipeline, _lora_path, _lora_weight, device=device, dtype=weight_dtype) + + else: + print('Merge Lora') + # Clear lora when switch 
from lora_cache=True to lora_cache=False. + if len(transformer_cpu_cache) != 0: + pipeline.transformer.load_state_dict(transformer_cpu_cache) + transformer_cpu_cache = {} + lora_path_before = "" + gc.collect() + + for _lora_path, _lora_weight in zip(funmodels.get("loras", []), funmodels.get("strength_model", [])): + pipeline = merge_lora(pipeline, _lora_path, _lora_weight, device=device, dtype=weight_dtype) + + if inpaint_image is not None: + inpaint_image = [to_pil(inpaint_image) for inpaint_image in inpaint_image][0] + inpaint_image = get_image_latent(inpaint_image, sample_size=sample_size)[:, :, 0] + else: + inpaint_image = torch.zeros([1, 3, sample_size[0], sample_size[1]]) + + if mask_image is not None: + mask_image = [to_pil(mask_image) for mask_image in mask_image][0] + mask_image = get_image_latent(mask_image, sample_size=sample_size)[:, :1, 0] + else: + mask_image = torch.ones([1, 1, sample_size[0], sample_size[1]]) * 255 + + if control_image is not None: + control_image = [to_pil(control_image) for control_image in control_image][0] + control_image = get_image_latent(control_image, sample_size=sample_size)[:, :, 0] + + sample = pipeline( + prompt, + negative_prompt = negative_prompt, + height = height, + width = width, + generator = generator, + guidance_scale = cfg, + num_inference_steps = steps, + image = inpaint_image, + mask_image = mask_image, + control_image = control_image, + control_context_scale = control_context_scale, + comfyui_progressbar = True, + ).images + image = torch.Tensor(np.array(sample[0])).unsqueeze(0) / 255 + + if not funmodels.get("lora_cache", False): + print('Unmerge Lora') + for _lora_path, _lora_weight in zip(funmodels.get("loras", []), funmodels.get("strength_model", [])): + pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight, device=device, dtype=weight_dtype) + return (image,) diff --git a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit.json b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit.json index 1243fee5..7198aaaa 100644 --- a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit.json +++ b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit.json @@ -133,7 +133,7 @@ -275.1779479980469 ], "size": [ - 217.32675170898438, + 226.6099609375, 26 ], "flags": {}, @@ -287,6 +287,7 @@ }, "widgets_values": [ "Qwen-Image-Edit_bf16.safetensors", + false, "bf16" ] }, @@ -373,8 +374,8 @@ "Node name for S&R": "QwenImageEditSampler" }, "widgets_values": [ - 1344, - 768, + 1728, + 992, 373336117071181, "randomize", 40, @@ -452,7 +453,7 @@ }, "widgets_values": [ "", - "model_cpu_offload_and_qfloat8" + "model_group_offload" ] } ], @@ -579,18 +580,19 @@ "ds": { "scale": 0.6905497838871149, "offset": [ - 351.20689397709714, - 562.4271762478439 + 397.7447678565184, + 611.2107698221871 ] }, - "frontendVersion": "1.25.11", + "frontendVersion": "1.36.14", "workspace_info": { "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" }, "node_versions": { - "CogVideoX-Fun": "a97dd425909c3c3719fbbcb99e78061e2f0a237c", - "comfy-core": "0.3.57" - } + "CogVideoX-Fun": "ac114cc14285c8e0073a3e08e27525263d1264a7", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" }, "version": 0.4 } \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit_2509.json b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit_2509.json new file mode 100644 index 00000000..fa59ec79 --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit_2509.json @@ -0,0 +1,598 @@ +{ + "id": 
"dcf2fcac-6293-4a86-b30b-f63e420177f2", + "revision": 0, + "last_node_id": 101, + "last_link_id": 89, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 84 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 98, + "type": "LoadImage", + "pos": [ + 312.6856384277344, + 418.9110107421875 + ], + "size": [ + 315, + 314.0000305175781 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 85 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "ref_1.png", + "image" + ] + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 83 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 94, + "type": "CombineQwenImagePipeline", + "pos": [ + 945.2576293945312, + -330.913330078125 + ], + "size": [ + 321.2720642089844, + 162 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "link": 88 + }, + { + "name": "vae", + "type": "VAEModel", + "link": 68 + }, + { + "name": "text_encoder", + "type": "TextEncoderModel", + "link": 70 + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "link": 73 + }, + { + "name": "processor", + "shape": 7, + "type": "Processor", + "link": 74 + }, + { + "name": "model_name", + "type": "STRING", + "widget": { + "name": "model_name" + }, + "link": 89 + } + ], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "CombineQwenImagePipeline" + }, + "widgets_values": [ + "", + "model_group_offload" + ] + }, + { + "id": 93, + "type": "LoadQwenImageVAEModel", + "pos": [ + 775.0554809570312, + -470.7688293457031 + ], + "size": [ + 377.8583984375, + 84.69844055175781 + ], + 
"flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "vae", + "type": "VAEModel", + "links": [ + 68 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageVAEModel" + }, + "widgets_values": [ + "qwen_image_vae.safetensors", + "bf16" + ] + }, + { + "id": 91, + "type": "LoadQwenImageTextEncoderModel", + "pos": [ + 283.53765869140625, + -280.6837463378906 + ], + "size": [ + 407.4130859375, + 102 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "text_encoder", + "type": "TextEncoderModel", + "links": [ + 70 + ] + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "links": [ + 73 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTextEncoderModel" + }, + "widgets_values": [ + "qwen_2.5_vl_7b_fp8_scaled.safetensors", + "bf16" + ] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 80 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "把相机转变成西瓜" + ] + }, + { + "id": 99, + "type": "QwenImageEditPlusSampler", + "pos": [ + 722.5752102270133, + -64.70894250180959 + ], + "size": [ + 298.1490234375, + 406 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 82 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 80 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 84 + }, + { + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": 85 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 83 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageEditPlusSampler" + }, + "widgets_values": [ + 1728, + 992, + 275685855283225, + "randomize", + 50, + 4, + "Flow", + 1, + 0.25, + true, + 5, + true, + 0 + ] + }, + { + "id": 101, + "type": "LoadQwenImageTransformerModel", + "pos": [ + 268.4111302692554, + -465.72558354643127 + ], + "size": [ + 452.9282513511349, + 126 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "links": [ + 88 + ] + }, + { + "name": "model_name", + "type": "STRING", + "links": [ + 89 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTransformerModel" + }, + "widgets_values": [ + "qwen_image_edit_2509_fp8_e4m3fn.safetensors", + false, + "bf16" + ] + }, + { + "id": 96, + "type": "LoadQwenImageProcessor", + "pos": [ + 706.5747258956637, + -287.7528469446806 + ], + "size": [ + 226.6099609375, + 26 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "processor", + "type": "Processor", + "links": [ + 74 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageProcessor" + }, + "widgets_values": [] + } + ], + "links": [ + [ + 68, + 93, + 0, + 94, + 1, + "VAEModel" + ], + [ + 70, + 91, + 0, + 94, + 2, + "TextEncoderModel" + ], + [ + 73, + 91, + 1, + 94, + 3, + "Tokenizer" + ], + [ + 74, + 96, + 0, + 94, + 4, + "Processor" + ], + [ + 80, + 75, + 0, + 99, + 1, + "STRING_PROMPT" + ], + [ + 82, + 94, + 0, + 99, + 0, + "FunModels" + ], + [ + 83, + 99, + 0, + 88, + 0, + "IMAGE" + ], + [ + 84, + 73, + 0, + 99, + 2, + "STRING_PROMPT" + ], + [ + 85, + 98, + 0, + 99, + 3, + "IMAGE" + 
], + [ + 88, + 101, + 0, + 94, + 0, + "TransformerModel" + ], + [ + 89, + 101, + 1, + 94, + 5, + "STRING" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 227.96267700195312, + -546.4359741210938, + 1053.5875244140625, + 397.3387756347656 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.6905497838871149, + "offset": [ + 553.0897408085864, + 691.4796929228297 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": "13a802b574c3a4397e193a0b8ca4e90c480cc217", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit_2511.json b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit_2511.json new file mode 100644 index 00000000..e938ee09 --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_edit_2511.json @@ -0,0 +1,598 @@ +{ + "id": "dcf2fcac-6293-4a86-b30b-f63e420177f2", + "revision": 0, + "last_node_id": 101, + "last_link_id": 89, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. 
When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 84 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 98, + "type": "LoadImage", + "pos": [ + 312.6856384277344, + 418.9110107421875 + ], + "size": [ + 315, + 314.0000305175781 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 85 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "ref_1.png", + "image" + ] + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 83 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 94, + "type": "CombineQwenImagePipeline", + "pos": [ + 945.2576293945312, + -330.913330078125 + ], + "size": [ + 321.2720642089844, + 162 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "link": 88 + }, + { + "name": "vae", + "type": "VAEModel", + "link": 68 + }, + { + "name": "text_encoder", + "type": "TextEncoderModel", + "link": 70 + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "link": 73 + }, + { + "name": "processor", + "shape": 7, + "type": "Processor", + "link": 74 + }, + { + "name": "model_name", + "type": "STRING", + "widget": { + "name": "model_name" + }, + "link": 89 + } + ], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "CombineQwenImagePipeline" + }, + "widgets_values": [ + "", + "model_group_offload" + ] + }, + { + "id": 93, + "type": "LoadQwenImageVAEModel", + "pos": [ + 775.0554809570312, + -470.7688293457031 + ], + "size": [ + 377.8583984375, + 84.69844055175781 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "vae", + "type": "VAEModel", + "links": [ + 68 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageVAEModel" + }, + "widgets_values": [ + "qwen_image_vae.safetensors", + "bf16" + ] + }, + { + "id": 91, + "type": "LoadQwenImageTextEncoderModel", + "pos": [ + 283.53765869140625, + -280.6837463378906 + ], + "size": [ + 407.4130859375, + 102 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "text_encoder", + "type": "TextEncoderModel", + "links": [ + 70 + ] + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "links": [ + 73 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTextEncoderModel" + }, + "widgets_values": [ + 
"qwen_2.5_vl_7b_fp8_scaled.safetensors", + "bf16" + ] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 80 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "把相机转变成西瓜" + ] + }, + { + "id": 99, + "type": "QwenImageEditPlusSampler", + "pos": [ + 722.5752102270133, + -64.70894250180959 + ], + "size": [ + 298.1490234375, + 406 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 82 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 80 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 84 + }, + { + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": 85 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 83 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageEditPlusSampler" + }, + "widgets_values": [ + 1728, + 992, + 275685855283225, + "randomize", + 50, + 4, + "Flow", + 1, + 0.25, + true, + 5, + true, + 0 + ] + }, + { + "id": 96, + "type": "LoadQwenImageProcessor", + "pos": [ + 706.5747258956637, + -287.7528469446806 + ], + "size": [ + 226.6099609375, + 26 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "processor", + "type": "Processor", + "links": [ + 74 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageProcessor" + }, + "widgets_values": [] + }, + { + "id": 101, + "type": "LoadQwenImageTransformerModel", + "pos": [ + 268.4111302692554, + -465.72558354643127 + ], + "size": [ + 452.9282513511349, + 126 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "links": [ + 88 + ] + }, + { + "name": "model_name", + "type": "STRING", + "links": [ + 89 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTransformerModel" + }, + "widgets_values": [ + "qwen_image_edit_2511_bf16.safetensors", + true, + "bf16" + ] + } + ], + "links": [ + [ + 68, + 93, + 0, + 94, + 1, + "VAEModel" + ], + [ + 70, + 91, + 0, + 94, + 2, + "TextEncoderModel" + ], + [ + 73, + 91, + 1, + 94, + 3, + "Tokenizer" + ], + [ + 74, + 96, + 0, + 94, + 4, + "Processor" + ], + [ + 80, + 75, + 0, + 99, + 1, + "STRING_PROMPT" + ], + [ + 82, + 94, + 0, + 99, + 0, + "FunModels" + ], + [ + 83, + 99, + 0, + 88, + 0, + "IMAGE" + ], + [ + 84, + 73, + 0, + 99, + 2, + "STRING_PROMPT" + ], + [ + 85, + 98, + 0, + 99, + 3, + "IMAGE" + ], + [ + 88, + 101, + 0, + 94, + 0, + "TransformerModel" + ], + [ + 89, + 101, + 1, + 94, + 5, + "STRING" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 227.96267700195312, + -546.4359741210938, + 1053.5875244140625, + 397.3387756347656 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.6905497838871149, + "offset": [ + 497.11079345101393, + 668.6095550725204 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": 
"13a802b574c3a4397e193a0b8ca4e90c480cc217", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i.json b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i.json index 14be7655..688f680c 100644 --- a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i.json +++ b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i.json @@ -120,116 +120,6 @@ "color": "#432", "bgcolor": "#653" }, - { - "id": 92, - "type": "LoadQwenImageTransformerModel", - "pos": [ - 275.9798278808594, - -465.2391052246094 - ], - "size": [ - 416.3677673339844, - 106.13789367675781 - ], - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [], - "outputs": [ - { - "name": "transformer", - "type": "TransformerModel", - "links": [ - 78 - ] - }, - { - "name": "model_name", - "type": "STRING", - "links": [ - 82 - ] - } - ], - "properties": { - "Node name for S&R": "LoadQwenImageTransformerModel" - }, - "widgets_values": [ - "Qwen-Image_bf16.safetensors", - "bf16" - ] - }, - { - "id": 93, - "type": "LoadQwenImageVAEModel", - "pos": [ - 775.0554809570312, - -470.7688293457031 - ], - "size": [ - 377.8583984375, - 84.69844055175781 - ], - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [], - "outputs": [ - { - "name": "vae", - "type": "VAEModel", - "links": [ - 79 - ] - } - ], - "properties": { - "Node name for S&R": "LoadQwenImageVAEModel" - }, - "widgets_values": [ - "Qwen-Image-vae_bf16.safetensors", - "bf16" - ] - }, - { - "id": 91, - "type": "LoadQwenImageTextEncoderModel", - "pos": [ - 283.53765869140625, - -280.6837463378906 - ], - "size": [ - 407.4130859375, - 102 - ], - "flags": {}, - "order": 6, - "mode": 0, - "inputs": [], - "outputs": [ - { - "name": "text_encoder", - "type": "TextEncoderModel", - "links": [ - 80 - ] - }, - { - "name": "tokenizer", - "type": "Tokenizer", - "links": [ - 81 - ] - } - ], - "properties": { - "Node name for S&R": "LoadQwenImageTextEncoderModel" - }, - "widgets_values": [ - "Qwen-Image-text_encoder_bf16.safetensors", - "bf16" - ] - }, { "id": 88, "type": "PreviewImage", @@ -301,8 +191,8 @@ "Node name for S&R": "QwenImageT2VSampler" }, "widgets_values": [ - 1344, - 768, + 1728, + 992, 867934328802019, "randomize", 40, @@ -380,7 +270,118 @@ }, "widgets_values": [ "", - "model_cpu_offload_and_qfloat8" + "model_group_offload" + ] + }, + { + "id": 92, + "type": "LoadQwenImageTransformerModel", + "pos": [ + 275.9798278808594, + -465.2391052246094 + ], + "size": [ + 416.3677673339844, + 126.29115625000003 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "links": [ + 78 + ] + }, + { + "name": "model_name", + "type": "STRING", + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTransformerModel" + }, + "widgets_values": [ + "qwen_image_edit_fp8_e4m3fn.safetensors", + false, + "bf16" + ] + }, + { + "id": 93, + "type": "LoadQwenImageVAEModel", + "pos": [ + 775.0554809570312, + -470.7688293457031 + ], + "size": [ + 377.8583984375, + 84.69844055175781 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "vae", + "type": "VAEModel", + "links": [ + 79 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageVAEModel" + }, + "widgets_values": [ + "qwen_image_vae.safetensors", + "bf16" + ] + }, + { + "id": 91, + "type": "LoadQwenImageTextEncoderModel", + "pos": [ + 
283.53765869140625, + -280.6837463378906 + ], + "size": [ + 407.4130859375, + 102 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "text_encoder", + "type": "TextEncoderModel", + "links": [ + 80 + ] + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "links": [ + 81 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTextEncoderModel" + }, + "widgets_values": [ + "qwen_2.5_vl_7b_fp8_scaled.safetensors", + "bf16" ] } ], @@ -495,14 +496,15 @@ 640.3416144465855 ] }, - "frontendVersion": "1.25.11", + "frontendVersion": "1.36.14", "workspace_info": { "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" }, "node_versions": { - "CogVideoX-Fun": "a97dd425909c3c3719fbbcb99e78061e2f0a237c", - "comfy-core": "0.3.57" - } + "CogVideoX-Fun": "13a802b574c3a4397e193a0b8ca4e90c480cc217", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" }, "version": 0.4 } \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i_control.json b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i_control.json new file mode 100644 index 00000000..948412b6 --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i_control.json @@ -0,0 +1,620 @@ +{ + "id": "dcf2fcac-6293-4a86-b30b-f63e420177f2", + "revision": 0, + "last_node_id": 102, + "last_link_id": 103, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 101 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. 
When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 91, + "type": "LoadQwenImageTextEncoderModel", + "pos": [ + 283.53765869140625, + -280.6837463378906 + ], + "size": [ + 407.4130859375, + 102 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "text_encoder", + "type": "TextEncoderModel", + "links": [ + 80 + ] + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "links": [ + 81 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTextEncoderModel" + }, + "widgets_values": [ + "qwen_2.5_vl_7b_fp8_scaled.safetensors", + "bf16" + ] + }, + { + "id": 92, + "type": "LoadQwenImageTransformerModel", + "pos": [ + 275.9798278808594, + -465.2391052246094 + ], + "size": [ + 416.3677673339844, + 106.13789367675781 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "links": [ + 87 + ] + }, + { + "name": "model_name", + "type": "STRING", + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTransformerModel" + }, + "widgets_values": [ + "qwen_image_2512_fp8_e4m3fn.safetensors", + false, + "bf16" + ] + }, + { + "id": 98, + "type": "LoadQwenImageControlNetInModel", + "pos": [ + 753.1154601481974, + -466.61377999428464 + ], + "size": [ + 543.8293619798113, + 82 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "link": 87 + } + ], + "outputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "links": [ + 88 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageControlNetInModel" + }, + "widgets_values": [ + "qwenimage/qwenimage_control.yaml", + "Qwen-Image-2512-Fun-Controlnet-Union.safetensors" + ] + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 103 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 100, + "type": "LoadImage", + "pos": [ + 408.7071040151891, + 403.5792288936819 + ], + "size": [ + 270, + 314 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 102 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "a7kXeQ5l9Dhspes7q3x3G (1).png", + "image" + ] + }, + { + "id": 93, + "type": "LoadQwenImageVAEModel", + "pos": [ + 1133.7959245028942, + -257.8118537612475 + ], + "size": [ + 377.8583984375, + 84.69844055175781 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "vae", + "type": "VAEModel", + "links": [ + 79 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageVAEModel" + }, + "widgets_values": [ + "qwen_image_vae.safetensors", + "bf16" + ] + }, + { + "id": 96, + "type": "CombineQwenImagePipeline", + "pos": [ + 754.4458784421624, + -332.72602712957854 + ], + "size": [ + 342.5804748535156, + 162 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": 
"transformer", + "type": "TransformerModel", + "link": 88 + }, + { + "name": "vae", + "type": "VAEModel", + "link": 79 + }, + { + "name": "text_encoder", + "type": "TextEncoderModel", + "link": 80 + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "link": 81 + }, + { + "name": "processor", + "shape": 7, + "type": "Processor", + "link": null + }, + { + "name": "model_name", + "type": "STRING", + "widget": { + "name": "model_name" + }, + "link": 82 + } + ], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 99 + ] + } + ], + "properties": { + "Node name for S&R": "CombineQwenImagePipeline" + }, + "widgets_values": [ + "", + "model_group_offload" + ] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 100 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "A photo of Sakura, a 17-year-old high school student from Japan, captured in a candid, high-fidelity cinematic moment on a rainy evening. She is squatting low on the rain-slicked asphalt of an urban sidewalk, holding a transparent vinyl umbrella with a white handle resting over her shoulder in one hand, her other hand resting on her knee. The clear plastic canopy is streaked with rivulets of water and beaded with droplets that catch the ambient city light. A profound, silent interaction defines the scene: Sakura is looking directly downward, her expression gentle and focused, locking eyes with a small black cat sitting on the wet ground in front of her.\\n\\nSakura has long, lustrous black hair styled in a precise hime cut with blunt bangs across her forehead and sidelocks framing her cheeks, damp strands clinging subtly to her jacket, with a single red ribbon tied on the left side. Her visible pores on her nose, and a soft sheen of moisture on her cheeks. She wears a dark navy sailor-style school uniform (seifuku) featuring a white collar with red linear detailing and a bright red necktie loosely knotted at the chest; a simple black choker encircles her neck. The uniform jacket has oversized sleeves. Her lower body features a short, dark pleated miniskirt that fans slightly over clean white ankle socks that provide a stark contrast to the wet asphalt, ending in dark leather loafers that gleam with moisture.\\n\\nThe black cat sits upright in a shallow puddle, its short fur slicked by the rain, tilting its head back to stare intently up into Sakura's face, establishing a clear line of sight. The background is anchored by a large, illuminated red vending machine standing against the darkness, its cool bluish-white interior light spilling onto Sakura's profile and the umbrella. The ground reflects the red chassis and the neon streetlights in distorted patches on the wet pavement. Additional cool rain streaks fall through the frame, some caught in sharp focus and others blurred into vertical lines against the background lights. The scene is rendered with a wide-aperture lens creating a shallow depth of field, keeping the girl and cat in sharp focus while softening the background into gentle bokeh, with the texture of fine-grain 35mm film stock." 
+ ] + }, + { + "id": 102, + "type": "QwenImageControlSampler", + "pos": [ + 743.3433452139876, + -72.05094395600588 + ], + "size": [ + 289.5494140625, + 470 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 99 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 100 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 101 + }, + { + "name": "control_image", + "shape": 7, + "type": "IMAGE", + "link": 102 + }, + { + "name": "inpaint_image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "name": "mask_image", + "shape": 7, + "type": "IMAGE", + "link": null + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 103 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageControlSampler" + }, + "widgets_values": [ + 1184, + 1568, + 563555207707640, + "randomize", + 40, + 4, + "Flow", + 3, + 0.25, + true, + 5, + true, + 0, + 0.8 + ] + } + ], + "links": [ + [ + 79, + 93, + 0, + 96, + 1, + "VAEModel" + ], + [ + 80, + 91, + 0, + 96, + 2, + "TextEncoderModel" + ], + [ + 81, + 91, + 1, + 96, + 3, + "Tokenizer" + ], + [ + 82, + 92, + 1, + 96, + 5, + "STRING" + ], + [ + 87, + 92, + 0, + 98, + 0, + "TransformerModel" + ], + [ + 88, + 98, + 0, + 96, + 0, + "TransformerModel" + ], + [ + 99, + 96, + 0, + 102, + 0, + "FunModels" + ], + [ + 100, + 75, + 0, + 102, + 1, + "STRING_PROMPT" + ], + [ + 101, + 73, + 0, + 102, + 2, + "STRING_PROMPT" + ], + [ + 102, + 100, + 0, + 102, + 3, + "IMAGE" + ], + [ + 103, + 102, + 0, + 88, + 0, + "IMAGE" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 227.96267700195312, + -546.4359741210938, + 1350.7047455077704, + 391.2301145558972 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.7034334668812453, + "offset": [ + 519.2334278821429, + 649.1136351768466 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": "ac114cc14285c8e0073a3e08e27525263d1264a7", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i_inpaint.json b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i_inpaint.json new file mode 100644 index 00000000..ca98e78b --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_chunked_loading_workflow_t2i_inpaint.json @@ -0,0 +1,710 @@ +{ + "id": "dcf2fcac-6293-4a86-b30b-f63e420177f2", + "revision": 0, + "last_node_id": 105, + "last_link_id": 108, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 101 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for 
S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 91, + "type": "LoadQwenImageTextEncoderModel", + "pos": [ + 283.53765869140625, + -280.6837463378906 + ], + "size": [ + 407.4130859375, + 102 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "text_encoder", + "type": "TextEncoderModel", + "links": [ + 80 + ] + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "links": [ + 81 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTextEncoderModel" + }, + "widgets_values": [ + "qwen_2.5_vl_7b_fp8_scaled.safetensors", + "bf16" + ] + }, + { + "id": 92, + "type": "LoadQwenImageTransformerModel", + "pos": [ + 275.9798278808594, + -465.2391052246094 + ], + "size": [ + 416.3677673339844, + 106.13789367675781 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "links": [ + 87 + ] + }, + { + "name": "model_name", + "type": "STRING", + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageTransformerModel" + }, + "widgets_values": [ + "qwen_image_2512_fp8_e4m3fn.safetensors", + false, + "bf16" + ] + }, + { + "id": 98, + "type": "LoadQwenImageControlNetInModel", + "pos": [ + 753.1154601481974, + -466.61377999428464 + ], + "size": [ + 543.8293619798113, + 82 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "link": 87 + } + ], + "outputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "links": [ + 88 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageControlNetInModel" + }, + "widgets_values": [ + "qwenimage/qwenimage_control.yaml", + "Qwen-Image-2512-Fun-Controlnet-Union.safetensors" + ] + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 103 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 93, + "type": "LoadQwenImageVAEModel", + "pos": [ + 1133.7959245028942, + -257.8118537612475 + ], + "size": [ + 377.8583984375, + 84.69844055175781 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "vae", + "type": "VAEModel", + "links": [ + 79 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageVAEModel" + }, + "widgets_values": [ + "qwen_image_vae.safetensors", + "bf16" + ] + }, + { + "id": 96, + "type": "CombineQwenImagePipeline", + "pos": [ + 754.4458784421624, + -332.72602712957854 + ], + "size": [ + 342.5804748535156, + 
162 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "transformer", + "type": "TransformerModel", + "link": 88 + }, + { + "name": "vae", + "type": "VAEModel", + "link": 79 + }, + { + "name": "text_encoder", + "type": "TextEncoderModel", + "link": 80 + }, + { + "name": "tokenizer", + "type": "Tokenizer", + "link": 81 + }, + { + "name": "processor", + "shape": 7, + "type": "Processor", + "link": null + }, + { + "name": "model_name", + "type": "STRING", + "widget": { + "name": "model_name" + }, + "link": 82 + } + ], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 99 + ] + } + ], + "properties": { + "Node name for S&R": "CombineQwenImagePipeline" + }, + "widgets_values": [ + "", + "model_group_offload" + ] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 100 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "A photo of Sakura, a 17-year-old high school student from Japan, captured in a candid, high-fidelity cinematic moment on a rainy evening. She is squatting low on the rain-slicked asphalt of an urban sidewalk, holding a transparent vinyl umbrella with a white handle resting over her shoulder in one hand, her other hand resting on her knee. The clear plastic canopy is streaked with rivulets of water and beaded with droplets that catch the ambient city light. A profound, silent interaction defines the scene: Sakura is looking directly downward, her expression gentle and focused, locking eyes with a small black cat sitting on the wet ground in front of her.\\n\\nSakura has long, lustrous black hair styled in a precise hime cut with blunt bangs across her forehead and sidelocks framing her cheeks, damp strands clinging subtly to her jacket, with a single red ribbon tied on the left side. Her visible pores on her nose, and a soft sheen of moisture on her cheeks. She wears a dark navy sailor-style school uniform (seifuku) featuring a white collar with red linear detailing and a bright red necktie loosely knotted at the chest; a simple black choker encircles her neck. The uniform jacket has oversized sleeves. Her lower body features a short, dark pleated miniskirt that fans slightly over clean white ankle socks that provide a stark contrast to the wet asphalt, ending in dark leather loafers that gleam with moisture.\\n\\nThe black cat sits upright in a shallow puddle, its short fur slicked by the rain, tilting its head back to stare intently up into Sakura's face, establishing a clear line of sight. The background is anchored by a large, illuminated red vending machine standing against the darkness, its cool bluish-white interior light spilling onto Sakura's profile and the umbrella. The ground reflects the red chassis and the neon streetlights in distorted patches on the wet pavement. Additional cool rain streaks fall through the frame, some caught in sharp focus and others blurred into vertical lines against the background lights. The scene is rendered with a wide-aperture lens creating a shallow depth of field, keeping the girl and cat in sharp focus while softening the background into gentle bokeh, with the texture of fine-grain 35mm film stock." 
+ ] + }, + { + "id": 104, + "type": "MaskToImage", + "pos": [ + 665.9127469276586, + 458.2047154396486 + ], + "size": [ + 140, + 26 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "mask", + "type": "MASK", + "link": 105 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 104, + 106 + ] + } + ], + "properties": { + "Node name for S&R": "MaskToImage" + }, + "widgets_values": [] + }, + { + "id": 105, + "type": "LoadImage", + "pos": [ + 360.8304806417201, + 460.85158208210487 + ], + "size": [ + 270, + 314.00000000000006 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 108 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": [ + 105 + ] + } + ], + "properties": { + "Node name for S&R": "LoadImage", + "image": "clipspace/clipspace-painted-masked-1766731857414.png [input]" + }, + "widgets_values": [ + "clipspace/clipspace-painted-masked-1766731857414.png [input]", + "image" + ] + }, + { + "id": 103, + "type": "PreviewImage", + "pos": [ + 839.792105488772, + 453.4970846240923 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 104 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 102, + "type": "QwenImageControlSampler", + "pos": [ + 743.3433452139876, + -72.05094395600588 + ], + "size": [ + 289.5494140625, + 470 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 99 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 100 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 101 + }, + { + "name": "control_image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "name": "inpaint_image", + "shape": 7, + "type": "IMAGE", + "link": 108 + }, + { + "name": "mask_image", + "shape": 7, + "type": "IMAGE", + "link": 106 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 103 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageControlSampler" + }, + "widgets_values": [ + 1184, + 1568, + 284014972131965, + "randomize", + 40, + 4, + "Flow", + 3, + 0.25, + true, + 5, + true, + 0, + 0.8 + ] + } + ], + "links": [ + [ + 79, + 93, + 0, + 96, + 1, + "VAEModel" + ], + [ + 80, + 91, + 0, + 96, + 2, + "TextEncoderModel" + ], + [ + 81, + 91, + 1, + 96, + 3, + "Tokenizer" + ], + [ + 82, + 92, + 1, + 96, + 5, + "STRING" + ], + [ + 87, + 92, + 0, + 98, + 0, + "TransformerModel" + ], + [ + 88, + 98, + 0, + 96, + 0, + "TransformerModel" + ], + [ + 99, + 96, + 0, + 102, + 0, + "FunModels" + ], + [ + 100, + 75, + 0, + 102, + 1, + "STRING_PROMPT" + ], + [ + 101, + 73, + 0, + 102, + 2, + "STRING_PROMPT" + ], + [ + 103, + 102, + 0, + 88, + 0, + "IMAGE" + ], + [ + 104, + 104, + 0, + 103, + 0, + "IMAGE" + ], + [ + 105, + 105, + 1, + 104, + 0, + "MASK" + ], + [ + 106, + 104, + 0, + 102, + 5, + "IMAGE" + ], + [ + 108, + 105, + 0, + 102, + 4, + "IMAGE" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 227.96267700195312, + -546.4359741210938, + 1350.7047455077704, + 391.2301145558972 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": 
{}, + "extra": { + "ds": { + "scale": 0.7034334668812453, + "offset": [ + 333.2705669168081, + 543.6154736676693 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": "ac114cc14285c8e0073a3e08e27525263d1264a7", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_workflow_edit.json b/comfyui/qwenimage/v1/qwenimage_workflow_edit.json index 34b151bc..9507a5f7 100644 --- a/comfyui/qwenimage/v1/qwenimage_workflow_edit.json +++ b/comfyui/qwenimage/v1/qwenimage_workflow_edit.json @@ -201,9 +201,9 @@ "Node name for S&R": "QwenImageEditSampler" }, "widgets_values": [ - 1344, - 768, - 686934831068040, + 1728, + 992, + 578345222670916, "randomize", 40, 4, @@ -278,7 +278,7 @@ }, "widgets_values": [ "Qwen-Image-Edit", - "model_cpu_offload_and_qfloat8", + "model_group_offload", "bf16" ] } @@ -358,18 +358,19 @@ "ds": { "scale": 0.6905497838871149, "offset": [ - 542.3419639311205, - 433.4877940663054 + 454.62313443699367, + 551.1306970771619 ] }, - "frontendVersion": "1.25.11", + "frontendVersion": "1.36.14", "workspace_info": { "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" }, "node_versions": { - "CogVideoX-Fun": "36287cdcab8d5b6972bb6a2d208539c6e4bd81e2", - "comfy-core": "0.3.57" - } + "CogVideoX-Fun": "ac114cc14285c8e0073a3e08e27525263d1264a7", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" }, "version": 0.4 } \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_workflow_edit_2509.json b/comfyui/qwenimage/v1/qwenimage_workflow_edit_2509.json new file mode 100644 index 00000000..e310ff55 --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_workflow_edit_2509.json @@ -0,0 +1,376 @@ +{ + "id": "0afeb9a9-c8d6-4e64-b303-61e02e44da9e", + "revision": 0, + "last_node_id": 100, + "last_link_id": 86, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. 
When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 84 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 98, + "type": "LoadImage", + "pos": [ + 312.6856384277344, + 418.9110107421875 + ], + "size": [ + 315, + 314.0000305175781 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 83 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "ref_1.png", + "image" + ] + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 82 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 85 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "把相机转变成西瓜" + ] + }, + { + "id": 99, + "type": "LoadQwenImageModel", + "pos": [ + 294.5816650390625, + -309.4810485839844 + ], + "size": [ + 318.1009826660156, + 106 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 86 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageModel" + }, + "widgets_values": [ + "Qwen-Image-Edit-2509", + "model_group_offload", + "bf16" + ] + }, + { + "id": 100, + "type": "QwenImageEditPlusSampler", + "pos": [ + 732.0558807778726, + -67.107393762887 + ], + "size": [ + 298.1490234375, + 406 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 86 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 85 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 84 + }, + { + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": 83 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageEditPlusSampler" + }, + "widgets_values": [ + 1728, + 992, + 437532779396225, + "randomize", + 50, + 4, + "Flow", + 1, + 0.25, + true, + 5, + true, + 0 + ] + } + ], + "links": [ + [ + 82, + 100, + 0, + 88, + 0, + "IMAGE" + ], + [ + 83, + 98, + 0, + 100, + 3, + "IMAGE" + ], + [ + 84, + 73, + 0, + 100, + 2, + "STRING_PROMPT" + ], + [ 
+ 85, + 75, + 0, + 100, + 1, + "STRING_PROMPT" + ], + [ + 86, + 99, + 0, + 100, + 0, + "FunModels" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 226.02244567871094, + -405.3177185058594, + 440.6474914550781, + 238.3169403076172 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.6905497838871149, + "offset": [ + 456.52379392690324, + 557.1551088532143 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": "13a802b574c3a4397e193a0b8ca4e90c480cc217", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_workflow_edit_2511.json b/comfyui/qwenimage/v1/qwenimage_workflow_edit_2511.json new file mode 100644 index 00000000..fa0cd2f3 --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_workflow_edit_2511.json @@ -0,0 +1,376 @@ +{ + "id": "9a274935-92d9-49d6-bcd9-f4dbdc9c9b60", + "revision": 0, + "last_node_id": 100, + "last_link_id": 86, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. 
When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 84 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 98, + "type": "LoadImage", + "pos": [ + 312.6856384277344, + 418.9110107421875 + ], + "size": [ + 315, + 314.0000305175781 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 83 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "ref_1.png", + "image" + ] + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 82 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 85 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "把相机转变成西瓜" + ] + }, + { + "id": 100, + "type": "QwenImageEditPlusSampler", + "pos": [ + 732.0558807778726, + -67.107393762887 + ], + "size": [ + 298.1490234375, + 406 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 86 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 85 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 84 + }, + { + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": 83 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageEditPlusSampler" + }, + "widgets_values": [ + 1728, + 992, + 496807707597563, + "randomize", + 50, + 4, + "Flow", + 1, + 0.25, + true, + 5, + true, + 0 + ] + }, + { + "id": 99, + "type": "LoadQwenImageModel", + "pos": [ + 294.5816650390625, + -309.4810485839844 + ], + "size": [ + 318.1009826660156, + 106 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 86 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageModel" + }, + "widgets_values": [ + "Qwen-Image-Edit-2511", + "model_group_offload", + "bf16" + ] + } + ], + "links": [ + [ + 82, + 100, + 0, + 88, + 0, + "IMAGE" + ], + [ + 83, + 98, + 0, + 100, + 3, + "IMAGE" + ], + [ + 84, + 73, + 0, + 100, + 2, + "STRING_PROMPT" + ], + [ 
+ 85, + 75, + 0, + 100, + 1, + "STRING_PROMPT" + ], + [ + 86, + 99, + 0, + 100, + 0, + "FunModels" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 226.02244567871094, + -405.3177185058594, + 440.6474914550781, + 238.3169403076172 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.6905497838871149, + "offset": [ + 462.22577239663184, + 589.4946038050374 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": "13a802b574c3a4397e193a0b8ca4e90c480cc217", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_workflow_t2i.json b/comfyui/qwenimage/v1/qwenimage_workflow_t2i.json index 75cc7cef..db1620e6 100644 --- a/comfyui/qwenimage/v1/qwenimage_workflow_t2i.json +++ b/comfyui/qwenimage/v1/qwenimage_workflow_t2i.json @@ -149,7 +149,7 @@ }, "widgets_values": [ "Qwen-Image", - "model_cpu_offload_and_qfloat8", + "model_group_offload", "bf16" ] }, @@ -224,8 +224,8 @@ "Node name for S&R": "QwenImageT2VSampler" }, "widgets_values": [ - 1344, - 768, + 1728, + 992, 43, "randomize", 40, diff --git a/comfyui/qwenimage/v1/qwenimage_workflow_t2i_control.json b/comfyui/qwenimage/v1/qwenimage_workflow_t2i_control.json new file mode 100644 index 00000000..b5ac99c0 --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_workflow_t2i_control.json @@ -0,0 +1,436 @@ +{ + "id": "dcf2fcac-6293-4a86-b30b-f63e420177f2", + "revision": 0, + "last_node_id": 94, + "last_link_id": 77, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 75 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. 
When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 77 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 86, + "type": "LoadQwenImageModel", + "pos": [ + 314.6495666503906, + -281.51666259765625 + ], + "size": [ + 276.705078125, + 106 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 71 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageModel" + }, + "widgets_values": [ + "Qwen-Image-2512", + "model_group_offload", + "bf16" + ] + }, + { + "id": 92, + "type": "LoadQwenImageControlNetInPipeline", + "pos": [ + 644.9080724681564, + -284.9326871492188 + ], + "size": [ + 513.0712528489155, + 106 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 71 + } + ], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 73 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageControlNetInPipeline" + }, + "widgets_values": [ + "qwenimage/qwenimage_control.yaml", + "Qwen-Image-2512-Fun-Controlnet-Union.safetensors", + "transformer" + ] + }, + { + "id": 94, + "type": "LoadImage", + "pos": [ + 348.0102300009322, + 406.45848378043814 + ], + "size": [ + 270, + 314 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 76 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "a7kXeQ5l9Dhspes7q3x3G (1).png", + "image" + ] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 74 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "A photo of Sakura, a 17-year-old high school student from Japan, captured in a candid, high-fidelity cinematic moment on a rainy evening. She is squatting low on the rain-slicked asphalt of an urban sidewalk, holding a transparent vinyl umbrella with a white handle resting over her shoulder in one hand, her other hand resting on her knee. The clear plastic canopy is streaked with rivulets of water and beaded with droplets that catch the ambient city light. A profound, silent interaction defines the scene: Sakura is looking directly downward, her expression gentle and focused, locking eyes with a small black cat sitting on the wet ground in front of her.\\n\\nSakura has long, lustrous black hair styled in a precise hime cut with blunt bangs across her forehead and sidelocks framing her cheeks, damp strands clinging subtly to her jacket, with a single red ribbon tied on the left side. 
Her visible pores on her nose, and a soft sheen of moisture on her cheeks. She wears a dark navy sailor-style school uniform (seifuku) featuring a white collar with red linear detailing and a bright red necktie loosely knotted at the chest; a simple black choker encircles her neck. The uniform jacket has oversized sleeves. Her lower body features a short, dark pleated miniskirt that fans slightly over clean white ankle socks that provide a stark contrast to the wet asphalt, ending in dark leather loafers that gleam with moisture.\\n\\nThe black cat sits upright in a shallow puddle, its short fur slicked by the rain, tilting its head back to stare intently up into Sakura's face, establishing a clear line of sight. The background is anchored by a large, illuminated red vending machine standing against the darkness, its cool bluish-white interior light spilling onto Sakura's profile and the umbrella. The ground reflects the red chassis and the neon streetlights in distorted patches on the wet pavement. Additional cool rain streaks fall through the frame, some caught in sharp focus and others blurred into vertical lines against the background lights. The scene is rendered with a wide-aperture lens creating a shallow depth of field, keeping the girl and cat in sharp focus while softening the background into gentle bokeh, with the texture of fine-grain 35mm film stock." + ] + }, + { + "id": 93, + "type": "QwenImageControlSampler", + "pos": [ + 723.0842521497987, + -74.3739187955474 + ], + "size": [ + 289.5494140625, + 470 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 73 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 74 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 75 + }, + { + "name": "control_image", + "shape": 7, + "type": "IMAGE", + "link": 76 + }, + { + "name": "inpaint_image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "name": "mask_image", + "shape": 7, + "type": "IMAGE", + "link": null + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 77 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageControlSampler" + }, + "widgets_values": [ + 1184, + 1568, + 733079601689805, + "randomize", + 40, + 4, + "Flow", + 3, + 0.25, + true, + 5, + true, + 0, + 0.8 + ] + } + ], + "links": [ + [ + 71, + 86, + 0, + 92, + 0, + "FunModels" + ], + [ + 73, + 92, + 0, + 93, + 0, + "FunModels" + ], + [ + 74, + 75, + 0, + 93, + 1, + "STRING_PROMPT" + ], + [ + 75, + 73, + 0, + 93, + 2, + "STRING_PROMPT" + ], + [ + 76, + 94, + 0, + 93, + 3, + "IMAGE" + ], + [ + 77, + 93, + 0, + 88, + 0, + "IMAGE" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 220, + -380, + 954.3592031237638, + 226.9206439292882 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.7828871306993743, + "offset": [ + 428.8268866172707, + 570.3944941409308 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": "ac114cc14285c8e0073a3e08e27525263d1264a7", + "comfy-core": "0.9.2" + }, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/qwenimage/v1/qwenimage_workflow_t2i_inpaint.json 
b/comfyui/qwenimage/v1/qwenimage_workflow_t2i_inpaint.json new file mode 100644 index 00000000..ed8b8434 --- /dev/null +++ b/comfyui/qwenimage/v1/qwenimage_workflow_t2i_inpaint.json @@ -0,0 +1,526 @@ +{ + "id": "dcf2fcac-6293-4a86-b30b-f63e420177f2", + "revision": 0, + "last_node_id": 97, + "last_link_id": 81, + "nodes": [ + { + "id": 78, + "type": "Note", + "pos": [ + 18, + -46 + ], + "size": [ + 210, + 88 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "You can write prompt here\n(你可以在此填写提示词)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 73, + "type": "FunTextBox", + "pos": [ + 250, + 160 + ], + "size": [ + 383.7149963378906, + 183.83506774902344 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + "links": [ + 75 + ] + } + ], + "title": "Negtive Prompt(反向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" + ] + }, + { + "id": 80, + "type": "Note", + "pos": [ + -92, + -294 + ], + "size": [ + 351.1499938964844, + 130.12660217285156 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "When using the 1.3B model, you can set GPU_memory_mode to model_cpu_offload for faster generation. When using the 20B model, you can use sequential_cpu_offload to save GPU memory during generation.\n(在使用1.3B模型时,可以设置GPU_memory_mode为model_cpu_offload进行更快速度的生成,在使用20B模型时,可以使用sequential_cpu_offload节省显存,进行生成。)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 88, + "type": "PreviewImage", + "pos": [ + 1070.207763671875, + -73.63389587402344 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 77 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 86, + "type": "LoadQwenImageModel", + "pos": [ + 314.6495666503906, + -281.51666259765625 + ], + "size": [ + 276.705078125, + 106 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 71 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageModel" + }, + "widgets_values": [ + "Qwen-Image-2512", + "model_group_offload", + "bf16" + ] + }, + { + "id": 92, + "type": "LoadQwenImageControlNetInPipeline", + "pos": [ + 644.9080724681564, + -284.9326871492188 + ], + "size": [ + 513.0712528489155, + 106 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 71 + } + ], + "outputs": [ + { + "name": "funmodels", + "type": "FunModels", + "links": [ + 73 + ] + } + ], + "properties": { + "Node name for S&R": "LoadQwenImageControlNetInPipeline" + }, + "widgets_values": [ + "qwenimage/qwenimage_control.yaml", + "Qwen-Image-2512-Fun-Controlnet-Union.safetensors", + "transformer" + ] + }, + { + "id": 75, + "type": "FunTextBox", + "pos": [ + 250, + -50 + ], + "size": [ + 383.54010009765625, + 156.71620178222656 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "prompt", + "type": "STRING_PROMPT", + "slot_index": 0, + 
"links": [ + 74 + ] + } + ], + "title": "Positive Prompt(正向提示词)", + "properties": { + "Node name for S&R": "FunTextBox" + }, + "widgets_values": [ + "A photo of Sakura, a 17-year-old high school student from Japan, captured in a candid, high-fidelity cinematic moment on a rainy evening. She is squatting low on the rain-slicked asphalt of an urban sidewalk, holding a transparent vinyl umbrella with a white handle resting over her shoulder in one hand, her other hand resting on her knee. The clear plastic canopy is streaked with rivulets of water and beaded with droplets that catch the ambient city light. A profound, silent interaction defines the scene: Sakura is looking directly downward, her expression gentle and focused, locking eyes with a small black cat sitting on the wet ground in front of her.\\n\\nSakura has long, lustrous black hair styled in a precise hime cut with blunt bangs across her forehead and sidelocks framing her cheeks, damp strands clinging subtly to her jacket, with a single red ribbon tied on the left side. Her visible pores on her nose, and a soft sheen of moisture on her cheeks. She wears a dark navy sailor-style school uniform (seifuku) featuring a white collar with red linear detailing and a bright red necktie loosely knotted at the chest; a simple black choker encircles her neck. The uniform jacket has oversized sleeves. Her lower body features a short, dark pleated miniskirt that fans slightly over clean white ankle socks that provide a stark contrast to the wet asphalt, ending in dark leather loafers that gleam with moisture.\\n\\nThe black cat sits upright in a shallow puddle, its short fur slicked by the rain, tilting its head back to stare intently up into Sakura's face, establishing a clear line of sight. The background is anchored by a large, illuminated red vending machine standing against the darkness, its cool bluish-white interior light spilling onto Sakura's profile and the umbrella. The ground reflects the red chassis and the neon streetlights in distorted patches on the wet pavement. Additional cool rain streaks fall through the frame, some caught in sharp focus and others blurred into vertical lines against the background lights. The scene is rendered with a wide-aperture lens creating a shallow depth of field, keeping the girl and cat in sharp focus while softening the background into gentle bokeh, with the texture of fine-grain 35mm film stock." 
+ ] + }, + { + "id": 95, + "type": "PreviewImage", + "pos": [ + 726.8217221845972, + 452.93109914645987 + ], + "size": [ + 366.56134033203125, + 415.4429626464844 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 78 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 93, + "type": "QwenImageControlSampler", + "pos": [ + 723.0842521497987, + -74.3739187955474 + ], + "size": [ + 289.5494140625, + 470 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "funmodels", + "type": "FunModels", + "link": 73 + }, + { + "name": "prompt", + "type": "STRING_PROMPT", + "link": 74 + }, + { + "name": "negative_prompt", + "type": "STRING_PROMPT", + "link": 75 + }, + { + "name": "control_image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "name": "inpaint_image", + "shape": 7, + "type": "IMAGE", + "link": 80 + }, + { + "name": "mask_image", + "shape": 7, + "type": "IMAGE", + "link": 81 + } + ], + "outputs": [ + { + "name": "images", + "type": "IMAGE", + "links": [ + 77 + ] + } + ], + "properties": { + "Node name for S&R": "QwenImageControlSampler" + }, + "widgets_values": [ + 1184, + 1568, + 427877921479533, + "randomize", + 40, + 4, + "Flow", + 3, + 0.25, + true, + 5, + true, + 0, + 0.8 + ] + }, + { + "id": 96, + "type": "MaskToImage", + "pos": [ + 552.9423636234835, + 457.6387299620162 + ], + "size": [ + 140, + 26 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "mask", + "type": "MASK", + "link": 79 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 78, + 81 + ] + } + ], + "properties": { + "Node name for S&R": "MaskToImage" + }, + "widgets_values": [] + }, + { + "id": 97, + "type": "LoadImage", + "pos": [ + 247.8600973375456, + 460.28559660447246 + ], + "size": [ + 270, + 314.00000000000006 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 80 + ] + }, + { + "name": "MASK", + "type": "MASK", + "links": [ + 79 + ] + } + ], + "properties": { + "Node name for S&R": "LoadImage", + "image": "clipspace/clipspace-painted-masked-1766731857414.png [input]" + }, + "widgets_values": [ + "clipspace/clipspace-painted-masked-1766731857414.png [input]", + "image" + ] + } + ], + "links": [ + [ + 71, + 86, + 0, + 92, + 0, + "FunModels" + ], + [ + 73, + 92, + 0, + 93, + 0, + "FunModels" + ], + [ + 74, + 75, + 0, + 93, + 1, + "STRING_PROMPT" + ], + [ + 75, + 73, + 0, + 93, + 2, + "STRING_PROMPT" + ], + [ + 77, + 93, + 0, + 88, + 0, + "IMAGE" + ], + [ + 78, + 96, + 0, + 95, + 0, + "IMAGE" + ], + [ + 79, + 97, + 1, + 96, + 0, + "MASK" + ], + [ + 80, + 97, + 0, + 93, + 4, + "IMAGE" + ], + [ + 81, + 96, + 0, + 93, + 5, + "IMAGE" + ] + ], + "groups": [ + { + "id": 1, + "title": "Load Model", + "bounding": [ + 220, + -380, + 954.3592031237638, + 226.9206439292882 + ], + "color": "#b06634", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Prompts", + "bounding": [ + 218, + -127, + 450, + 483 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.7117155733630676, + "offset": [ + 359.457252578857, + 482.3639645185654 + ] + }, + "frontendVersion": "1.36.14", + "workspace_info": { + "id": "776b62b4-bd17-4ed3-9923-b7aad000b1ea" + }, + "node_versions": { + "CogVideoX-Fun": "ac114cc14285c8e0073a3e08e27525263d1264a7", + "comfy-core": "0.9.2" + 
}, + "workflowRendererVersion": "LG" + }, + "version": 0.4 +} \ No newline at end of file diff --git a/comfyui/wan2_1/nodes.py b/comfyui/wan2_1/nodes.py index 2be06c8b..4dc72485 100755 --- a/comfyui/wan2_1/nodes.py +++ b/comfyui/wan2_1/nodes.py @@ -93,11 +93,7 @@ def loadmodel(self, model_name, precision): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() - mm.soft_empty_cache() - - mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() transformer = None @@ -501,7 +497,7 @@ def loadmodel(self, GPU_memory_mode, model, precision, config): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar diff --git a/comfyui/wan2_1_fun/nodes.py b/comfyui/wan2_1_fun/nodes.py index bf8a46dd..8801924e 100755 --- a/comfyui/wan2_1_fun/nodes.py +++ b/comfyui/wan2_1_fun/nodes.py @@ -105,7 +105,7 @@ def loadmodel(self, GPU_memory_mode, model_type, model, precision, config): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar diff --git a/comfyui/wan2_2/nodes.py b/comfyui/wan2_2/nodes.py index c44a5194..172da35a 100755 --- a/comfyui/wan2_2/nodes.py +++ b/comfyui/wan2_2/nodes.py @@ -73,7 +73,7 @@ def loadmodel(self, model_name, precision): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() transformer = None @@ -318,7 +318,7 @@ def loadmodel(self, GPU_memory_mode, model, precision, config): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar diff --git a/comfyui/wan2_2_fun/nodes.py b/comfyui/wan2_2_fun/nodes.py index 0b1832f0..385096a3 100755 --- a/comfyui/wan2_2_fun/nodes.py +++ b/comfyui/wan2_2_fun/nodes.py @@ -106,7 +106,7 @@ def loadmodel(self, GPU_memory_mode, model_type, model, precision, config): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar diff --git a/comfyui/wan2_2_vace_fun/nodes.py b/comfyui/wan2_2_vace_fun/nodes.py index 9988314d..43d404ae 100644 --- a/comfyui/wan2_2_vace_fun/nodes.py +++ b/comfyui/wan2_2_vace_fun/nodes.py @@ -70,7 +70,7 @@ def loadmodel(self, model_name, precision): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() transformer = None @@ -267,7 +267,7 @@ def loadmodel(self, GPU_memory_mode, model, precision, config, model_type="Contr weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar diff --git a/comfyui/z_image/nodes.py b/comfyui/z_image/nodes.py index 18d6c8fb..9286453d 100644 --- a/comfyui/z_image/nodes.py +++ b/comfyui/z_image/nodes.py @@ -70,7 +70,7 @@ def INPUT_TYPES(s): "required": { "model_name": ( folder_paths.get_filename_list("diffusion_models"), - 
{"default": "Wan2_1-T2V-1_3B_bf16.safetensors,"}, + {"default": "z_image_turbo_bf16.safetensors", }, ), "precision": (["fp16", "bf16"], {"default": "bf16"} @@ -89,7 +89,7 @@ def loadmodel(self, model_name, precision): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() transformer = None @@ -196,7 +196,7 @@ def INPUT_TYPES(s): "required": { "model_name": ( folder_paths.get_filename_list("vae"), - {"default": "ZImage2.1_VAE.pth"} + {"default": "ae.safetensors", } ), "precision": (["fp16", "bf16"], {"default": "bf16"} @@ -371,7 +371,7 @@ def INPUT_TYPES(s): "required": { "model_name": ( folder_paths.get_filename_list("text_encoders"), - {"default": "models_t5_umt5-xxl-enc-bf16.pth"} + {"default": "qwen_3_4b.safetensors", } ), "precision": (["fp16", "bf16"], {"default": "bf16"} @@ -569,7 +569,7 @@ def loadmodel(self, GPU_memory_mode, model, precision): weight_dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision] mm.unload_all_models() - mm.cleanup_models() + mm.cleanup_models_gc() mm.soft_empty_cache() # Init processbar @@ -726,10 +726,14 @@ def INPUT_TYPES(s): def loadmodel(self, config, model_name, sub_transformer_name, funmodels): device = mm.get_torch_device() offload_device = mm.unet_offload_device() + # Get Transformer transformer = getattr(funmodels["pipeline"], sub_transformer_name) transformer = transformer.cpu() + # Remove hooks + funmodels["pipeline"].remove_all_hooks() + # Load config config_path = f"{script_directory}/config/{config}" config = OmegaConf.load(config_path) @@ -797,14 +801,14 @@ def loadmodel(self, config, model_name, sub_transformer_name, funmodels): if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": - convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) - convert_weight_dtype_wrapper(transformer, weight_dtype) + convert_model_weight_to_float8(control_transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) + convert_weight_dtype_wrapper(control_transformer, weight_dtype) pipeline.enable_model_cpu_offload(device=device) elif GPU_memory_mode == "model_cpu_offload": pipeline.enable_model_cpu_offload(device=device) elif GPU_memory_mode == "model_full_load_and_qfloat8": - convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) - convert_weight_dtype_wrapper(transformer, weight_dtype) + convert_model_weight_to_float8(control_transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) + convert_weight_dtype_wrapper(control_transformer, weight_dtype) pipeline.to(device=device) else: pipeline.to(device=device) @@ -830,7 +834,7 @@ def INPUT_TYPES(s): ), "model_name": ( folder_paths.get_filename_list("model_patches"), - {"default": "Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors",}, + {"default": "Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors", }, ), "transformer": ("TransformerModel",), }, diff --git a/examples/qwenimage/predict_t2i.py b/examples/qwenimage/predict_t2i.py index 16da8c44..e77ec6f9 100644 --- a/examples/qwenimage/predict_t2i.py +++ b/examples/qwenimage/predict_t2i.py @@ -16,6 +16,8 @@ Qwen2Tokenizer, QwenImageTransformer2DModel) from videox_fun.models.cache_utils import get_teacache_coefficients from videox_fun.pipeline 
import QwenImagePipeline +from videox_fun.utils import (register_auto_device_hook, + safe_enable_group_offload) from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, @@ -33,9 +35,12 @@ # model_cpu_offload_and_qfloat8 indicates that the entire model will be moved to the CPU after use, # and the transformer model has been quantized to float8, which can save more GPU memory. # +# model_group_offload transfers internal layer groups between CPU/CUDA, +# balancing memory efficiency and speed between full-module and leaf-level offloading methods. +# # sequential_cpu_offload means that each layer of the model will be moved to the CPU after use, # resulting in slower speeds but saving a large amount of GPU memory. -GPU_memory_mode = "model_cpu_offload_and_qfloat8" +GPU_memory_mode = "model_group_offload" # Multi GPUs config # Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used. # For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4. @@ -177,6 +182,9 @@ if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) +elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device="cpu", offload_type="leaf_level", use_stream=True) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) diff --git a/examples/qwenimage/predict_t2i_edit.py b/examples/qwenimage/predict_t2i_edit.py index 58b8faa3..6b4c6d3e 100644 --- a/examples/qwenimage/predict_t2i_edit.py +++ b/examples/qwenimage/predict_t2i_edit.py @@ -17,6 +17,8 @@ QwenImageTransformer2DModel) from videox_fun.models.cache_utils import get_teacache_coefficients from videox_fun.pipeline import QwenImageEditPipeline +from videox_fun.utils import (register_auto_device_hook, + safe_enable_group_offload) from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, @@ -35,9 +37,12 @@ # model_cpu_offload_and_qfloat8 indicates that the entire model will be moved to the CPU after use, # and the transformer model has been quantized to float8, which can save more GPU memory. # +# model_group_offload transfers internal layer groups between CPU/CUDA, +# balancing memory efficiency and speed between full-module and leaf-level offloading methods. +# # sequential_cpu_offload means that each layer of the model will be moved to the CPU after use, # resulting in slower speeds but saving a large amount of GPU memory. -GPU_memory_mode = "model_cpu_offload_and_qfloat8" +GPU_memory_mode = "model_group_offload" # Multi GPUs config # Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used. # For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4. 
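Each of the example scripts touched here applies the same three-line change: switch `GPU_memory_mode` to `"model_group_offload"` and add the corresponding branch before inference. A minimal sketch of that dispatch, assuming `pipeline` has already been built as in these scripts and exposes a `.transformer` attribute (the helper names come from the new `videox_fun.utils` exports in this diff; `apply_memory_mode` itself is only an illustrative wrapper, not part of the change):

```python
import torch

from videox_fun.utils import register_auto_device_hook, safe_enable_group_offload


def apply_memory_mode(pipeline, GPU_memory_mode: str, device: torch.device) -> None:
    """Illustrative dispatch mirroring the offload branches in the example scripts."""
    if GPU_memory_mode == "model_group_offload":
        # Prepare the transformer for group offloading, then stream leaf-level
        # layer groups between CPU and the accelerator during the forward pass.
        register_auto_device_hook(pipeline.transformer)
        safe_enable_group_offload(
            pipeline,
            onload_device=device,
            offload_device="cpu",
            offload_type="leaf_level",
            use_stream=True,
        )
    elif GPU_memory_mode == "sequential_cpu_offload":
        pipeline.enable_sequential_cpu_offload(device=device)
    else:
        # The scripts also support model_cpu_offload and the qfloat8 variants;
        # the simplest fallback is to keep everything on the accelerator.
        pipeline.to(device=device)
```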
@@ -188,6 +193,9 @@ if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) +elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device="cpu", offload_type="leaf_level", use_stream=True) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) diff --git a/examples/qwenimage/predict_t2i_edit_plus.py b/examples/qwenimage/predict_t2i_edit_plus.py index edded60e..60a60ea6 100644 --- a/examples/qwenimage/predict_t2i_edit_plus.py +++ b/examples/qwenimage/predict_t2i_edit_plus.py @@ -17,6 +17,8 @@ QwenImageTransformer2DModel) from videox_fun.models.cache_utils import get_teacache_coefficients from videox_fun.pipeline import QwenImageEditPlusPipeline +from videox_fun.utils import (register_auto_device_hook, + safe_enable_group_offload) from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, @@ -35,9 +37,12 @@ # model_cpu_offload_and_qfloat8 indicates that the entire model will be moved to the CPU after use, # and the transformer model has been quantized to float8, which can save more GPU memory. # +# model_group_offload transfers internal layer groups between CPU/CUDA, +# balancing memory efficiency and speed between full-module and leaf-level offloading methods. +# # sequential_cpu_offload means that each layer of the model will be moved to the CPU after use, # resulting in slower speeds but saving a large amount of GPU memory. -GPU_memory_mode = "model_cpu_offload_and_qfloat8" +GPU_memory_mode = "model_group_offload" # Multi GPUs config # Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used. # For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4. 
@@ -188,6 +193,9 @@ if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) +elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device="cpu", offload_type="leaf_level", use_stream=True) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) diff --git a/examples/qwenimage_fun/predict_i2i_inpaint.py b/examples/qwenimage_fun/predict_i2i_inpaint.py index 3ee4be0a..e7722b4e 100644 --- a/examples/qwenimage_fun/predict_i2i_inpaint.py +++ b/examples/qwenimage_fun/predict_i2i_inpaint.py @@ -2,9 +2,8 @@ import sys import torch - +from diffusers import FlowMatchEulerDiscreteScheduler from omegaconf import OmegaConf -from diffusers import (FlowMatchEulerDiscreteScheduler) current_file_path = os.path.abspath(__file__) project_roots = [os.path.dirname(current_file_path), os.path.dirname(os.path.dirname(current_file_path)), os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))] @@ -14,17 +13,18 @@ from videox_fun.dist import set_multi_gpus_devices, shard_model from videox_fun.models import (AutoencoderKLQwenImage, Qwen2_5_VLForConditionalGeneration, - Qwen2Tokenizer, QwenImageControlTransformer2DModel) + Qwen2Tokenizer, + QwenImageControlTransformer2DModel) from videox_fun.models.cache_utils import get_teacache_coefficients from videox_fun.pipeline import QwenImageControlPipeline +from videox_fun.utils import (register_auto_device_hook, + safe_enable_group_offload) from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, convert_weight_dtype_wrapper) from videox_fun.utils.lora_utils import merge_lora, unmerge_lora -from videox_fun.utils.utils import (filter_kwargs, get_image_to_video_latent, get_image_latent, get_image, - get_video_to_video_latent, - save_videos_grid) +from videox_fun.utils.utils import get_image_latent, save_videos_grid # GPU memory mode, which can be chosen in [model_full_load, model_full_load_and_qfloat8, model_cpu_offload, model_cpu_offload_and_qfloat8, sequential_cpu_offload]. # model_full_load means that the entire model will be moved to the GPU. @@ -37,9 +37,12 @@ # model_cpu_offload_and_qfloat8 indicates that the entire model will be moved to the CPU after use, # and the transformer model has been quantized to float8, which can save more GPU memory. # +# model_group_offload transfers internal layer groups between CPU/CUDA, +# balancing memory efficiency and speed between full-module and leaf-level offloading methods. +# # sequential_cpu_offload means that each layer of the model will be moved to the CPU after use, # resulting in slower speeds but saving a large amount of GPU memory. -GPU_memory_mode = "model_cpu_offload_and_qfloat8" +GPU_memory_mode = "model_group_offload" # Multi GPUs config # Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used. # For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4. 
@@ -179,6 +182,7 @@ print("Add FSDP DIT") if fsdp_text_encoder: from functools import partial + from videox_fun.dist import set_multi_gpus_devices, shard_model shard_fn = partial(shard_model, device_id=device, param_dtype=weight_dtype, module_to_wrapper=text_encoder.language_model.layers) text_encoder = shard_fn(text_encoder) @@ -191,6 +195,9 @@ if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) +elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device="cpu", offload_type="leaf_level", use_stream=True) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) diff --git a/examples/qwenimage_fun/predict_t2i_control.py b/examples/qwenimage_fun/predict_t2i_control.py index a9837898..828748bd 100644 --- a/examples/qwenimage_fun/predict_t2i_control.py +++ b/examples/qwenimage_fun/predict_t2i_control.py @@ -2,9 +2,8 @@ import sys import torch - +from diffusers import FlowMatchEulerDiscreteScheduler from omegaconf import OmegaConf -from diffusers import (FlowMatchEulerDiscreteScheduler) current_file_path = os.path.abspath(__file__) project_roots = [os.path.dirname(current_file_path), os.path.dirname(os.path.dirname(current_file_path)), os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))] @@ -14,17 +13,18 @@ from videox_fun.dist import set_multi_gpus_devices, shard_model from videox_fun.models import (AutoencoderKLQwenImage, Qwen2_5_VLForConditionalGeneration, - Qwen2Tokenizer, QwenImageControlTransformer2DModel) + Qwen2Tokenizer, + QwenImageControlTransformer2DModel) from videox_fun.models.cache_utils import get_teacache_coefficients from videox_fun.pipeline import QwenImageControlPipeline +from videox_fun.utils import (register_auto_device_hook, + safe_enable_group_offload) from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, convert_weight_dtype_wrapper) from videox_fun.utils.lora_utils import merge_lora, unmerge_lora -from videox_fun.utils.utils import (filter_kwargs, get_image_to_video_latent, get_image_latent, get_image, - get_video_to_video_latent, - save_videos_grid) +from videox_fun.utils.utils import get_image_latent, save_videos_grid # GPU memory mode, which can be chosen in [model_full_load, model_full_load_and_qfloat8, model_cpu_offload, model_cpu_offload_and_qfloat8, sequential_cpu_offload]. # model_full_load means that the entire model will be moved to the GPU. @@ -37,9 +37,12 @@ # model_cpu_offload_and_qfloat8 indicates that the entire model will be moved to the CPU after use, # and the transformer model has been quantized to float8, which can save more GPU memory. # +# model_group_offload transfers internal layer groups between CPU/CUDA, +# balancing memory efficiency and speed between full-module and leaf-level offloading methods. +# # sequential_cpu_offload means that each layer of the model will be moved to the CPU after use, # resulting in slower speeds but saving a large amount of GPU memory. 
-GPU_memory_mode = "model_cpu_offload_and_qfloat8" +GPU_memory_mode = "model_group_offload" # Multi GPUs config # Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used. # For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4. @@ -179,6 +182,7 @@ print("Add FSDP DIT") if fsdp_text_encoder: from functools import partial + from videox_fun.dist import set_multi_gpus_devices, shard_model shard_fn = partial(shard_model, device_id=device, param_dtype=weight_dtype, module_to_wrapper=text_encoder.language_model.layers) text_encoder = shard_fn(text_encoder) @@ -191,6 +195,9 @@ if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) +elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device="cpu", offload_type="leaf_level", use_stream=True) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) diff --git a/examples/qwenimage_instantx/predict_t2i_control.py b/examples/qwenimage_instantx/predict_t2i_control.py index f4a88e3f..d64155b0 100644 --- a/examples/qwenimage_instantx/predict_t2i_control.py +++ b/examples/qwenimage_instantx/predict_t2i_control.py @@ -17,6 +17,8 @@ Qwen2Tokenizer, QwenImageTransformer2DModel) from videox_fun.models.cache_utils import get_teacache_coefficients from videox_fun.pipeline import QwenImageControlNetPipeline +from videox_fun.utils import (register_auto_device_hook, + safe_enable_group_offload) from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8, @@ -37,9 +39,12 @@ # model_cpu_offload_and_qfloat8 indicates that the entire model will be moved to the CPU after use, # and the transformer model has been quantized to float8, which can save more GPU memory. # +# model_group_offload transfers internal layer groups between CPU/CUDA, +# balancing memory efficiency and speed between full-module and leaf-level offloading methods. +# # sequential_cpu_offload means that each layer of the model will be moved to the CPU after use, # resulting in slower speeds but saving a large amount of GPU memory. -GPU_memory_mode = "model_cpu_offload_and_qfloat8" +GPU_memory_mode = "model_group_offload" # Multi GPUs config # Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used. # For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4. 
@@ -207,6 +212,9 @@ if GPU_memory_mode == "sequential_cpu_offload": pipeline.enable_sequential_cpu_offload(device=device) +elif GPU_memory_mode == "model_group_offload": + register_auto_device_hook(pipeline.transformer) + safe_enable_group_offload(pipeline, onload_device=device, offload_device="cpu", offload_type="leaf_level", use_stream=True) elif GPU_memory_mode == "model_cpu_offload_and_qfloat8": convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device) convert_weight_dtype_wrapper(transformer, weight_dtype) diff --git a/pyproject.toml b/pyproject.toml index c0adf3c1..86e8befe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,43 @@ [project] name = "videox-fun" -description = "VideoX-Fun is a video generation pipeline that can be used to generate AI images and videos, as well as to train baseline and Lora models for Diffusion Transformer. We support direct prediction from pre-trained baseline models to generate videos with different resolutions, durations, and FPS. Additionally, we also support users in training their own baseline and Lora models to perform specific style transformations." version = "1.0.0" -license = {file = "LICENSE"} -dependencies = ["Pillow", "einops", "safetensors", "timm", "tomesd", "torch>=2.1.2", "torchdiffeq", "torchsde", "decord", "datasets", "numpy", "scikit-image", "opencv-python", "omegaconf", "SentencePiece", "albumentations", "imageio[ffmpeg]", "imageio[pyav]", "tensorboard", "beautifulsoup4", "ftfy", "func_timeout", "accelerate>=0.25.0", "gradio>=3.41.2,<=3.48.0", "diffusers>=0.30.1,<=0.31.0", "transformers>=4.46.2"] +description = "VideoX-Fun is a video generation pipeline that can be used to generate AI images and videos, as well as to train baseline and Lora models for Diffusion Transformer. We support direct prediction from pre-trained baseline models to generate videos with different resolutions, durations, and FPS. Additionally, we also support users in training their own baseline and Lora models to perform specific style transformations." 
+license = { file = "LICENSE" } +dependencies = [ + "Pillow", + "einops", + "safetensors", + "timm", + "tomesd", + "torch>=2.1.2", + "torchdiffeq", + "torchsde", + "decord", + "datasets", + "numpy", + "scikit-image", + "opencv-python", + "omegaconf", + "SentencePiece", + "albumentations", + "imageio[ffmpeg]", + "imageio[pyav]", + "tensorboard", + "beautifulsoup4", + "ftfy", + "func_timeout", + "accelerate>=0.25.0", + "gradio>=3.41.2", + "diffusers>=0.30.1", + "transformers>=4.46.2", +] [project.urls] Repository = "https://github.com/aigc-apps/VideoX-Fun" -# Used by Comfy Registry https://comfyregistry.org +# Used by Comfy Registry https://comfyregistry.org + +[tool.setuptools] +packages = ["videox_fun"] [tool.comfy] PublisherId = "bubbliiiing" diff --git a/videox_fun/models/qwenimage_transformer2d.py b/videox_fun/models/qwenimage_transformer2d.py index c4c022ac..abbcafa5 100644 --- a/videox_fun/models/qwenimage_transformer2d.py +++ b/videox_fun/models/qwenimage_transformer2d.py @@ -34,17 +34,11 @@ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin from diffusers.loaders.single_file_model import FromOriginalModelMixin from diffusers.models.attention import Attention, FeedForward -from diffusers.models.attention_processor import ( - Attention, AttentionProcessor, CogVideoXAttnProcessor2_0, - FusedCogVideoXAttnProcessor2_0) -from diffusers.models.embeddings import (CogVideoXPatchEmbed, - TimestepEmbedding, Timesteps, - get_3d_sincos_pos_embed) +from diffusers.models.attention_processor import Attention, AttentionProcessor +from diffusers.models.embeddings import TimestepEmbedding, Timesteps from diffusers.models.modeling_outputs import Transformer2DModelOutput from diffusers.models.modeling_utils import ModelMixin -from diffusers.models.normalization import (AdaLayerNorm, - AdaLayerNormContinuous, - CogVideoXLayerNormZero, RMSNorm) +from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers) from diffusers.utils.torch_utils import maybe_allow_in_graph diff --git a/videox_fun/models/qwenimage_transformer2d_control.py b/videox_fun/models/qwenimage_transformer2d_control.py index 12063962..69d80df5 100644 --- a/videox_fun/models/qwenimage_transformer2d_control.py +++ b/videox_fun/models/qwenimage_transformer2d_control.py @@ -2,18 +2,21 @@ # -*- coding: utf-8 -*- # Copyright (c) Alibaba, Inc. and its affiliates. 
+from math import prod from typing import Any, Dict, List, Optional, Tuple import torch import torch.nn as nn from diffusers.configuration_utils import register_to_config from diffusers.models.modeling_outputs import Transformer2DModelOutput -from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, +from diffusers.utils import (USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers) +from ..utils import cfg_skip from .qwenimage_transformer2d import (QwenImageTransformer2DModel, QwenImageTransformerBlock) -from ..utils import cfg_skip + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name class QwenImageControlTransformerBlock(QwenImageTransformerBlock): diff --git a/videox_fun/models/qwenimage_transformer2d_instantx.py b/videox_fun/models/qwenimage_transformer2d_instantx.py index b9ee8ace..cf7a5d79 100644 --- a/videox_fun/models/qwenimage_transformer2d_instantx.py +++ b/videox_fun/models/qwenimage_transformer2d_instantx.py @@ -28,7 +28,6 @@ register_to_config, scale_lora_layers, unscale_lora_layers) - logger = logging.get_logger(__name__) # pylint: disable=invalid-name diff --git a/videox_fun/models/wan_vae.py b/videox_fun/models/wan_vae.py index 7815e7b8..4d3fad12 100755 --- a/videox_fun/models/wan_vae.py +++ b/videox_fun/models/wan_vae.py @@ -733,6 +733,13 @@ def __init__( 4 ], dropout = 0.0, + num_res_blocks = 2, + temperal_downsample = [ + False, + True, + True + ], + z_dim = 16, latents_mean = [ -0.7571, -0.7089, @@ -769,13 +776,8 @@ def __init__( 2.8251, 1.916 ], - num_res_blocks = 2, - temperal_downsample = [ - False, - True, - True - ], - z_dim = 16 + temporal_compression_ratio=4, + spatial_compression_ratio=8 ): super().__init__() cfg = dict( @@ -797,6 +799,8 @@ def __init__( self.attn_scales = attn_scales self.temperal_downsample = temperal_downsample self.temperal_upsample = temperal_downsample[::-1] + self.temporal_compression_ratio = temporal_compression_ratio + self.spatial_compression_ratio = spatial_compression_ratio def _encode(self, x: torch.Tensor) -> torch.Tensor: x = [ diff --git a/videox_fun/pipeline/pipeline_qwenimage_control.py b/videox_fun/pipeline/pipeline_qwenimage_control.py index 94704a70..3d58a15c 100644 --- a/videox_fun/pipeline/pipeline_qwenimage_control.py +++ b/videox_fun/pipeline/pipeline_qwenimage_control.py @@ -490,7 +490,8 @@ def __call__( callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 512, - control_context_scale: float = 1.0 + control_context_scale: float = 1.0, + comfyui_progressbar: bool = False, ): r""" Function invoked when calling the pipeline for generation. @@ -603,6 +604,9 @@ def __call__( device = self._execution_device weight_dtype = self.text_encoder.dtype + if comfyui_progressbar: + from comfy.utils import ProgressBar + pbar = ProgressBar(num_inference_steps + 2) has_neg_prompt = negative_prompt is not None or ( negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None @@ -625,6 +629,9 @@ def __call__( num_images_per_prompt=num_images_per_prompt, max_sequence_length=max_sequence_length, ) + if comfyui_progressbar: + pbar.update(1) + # 4. 
Prepare latent variables num_channels_latents = self.transformer.config.in_channels // 4 latents = self.prepare_latents( @@ -709,6 +716,8 @@ def __call__( negative_txt_seq_lens = ( negative_prompt_embeds_mask.sum(dim=1).tolist() if negative_prompt_embeds_mask is not None else None ) + if comfyui_progressbar: + pbar.update(1) # 6. Denoising loop self.scheduler.set_begin_index(0) @@ -794,6 +803,9 @@ def __call__( if XLA_AVAILABLE: xm.mark_step() + if comfyui_progressbar: + pbar.update(1) + self._current_timestep = None if output_type == "latent": image = latents diff --git a/videox_fun/pipeline/pipeline_qwenimage_instantx.py b/videox_fun/pipeline/pipeline_qwenimage_instantx.py index 1a5edfe7..d2e97a5a 100644 --- a/videox_fun/pipeline/pipeline_qwenimage_instantx.py +++ b/videox_fun/pipeline/pipeline_qwenimage_instantx.py @@ -22,26 +22,18 @@ import PIL.Image import torch import torch.nn.functional as F -import torchvision.transforms.functional as TF from diffusers import FlowMatchEulerDiscreteScheduler -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback from diffusers.image_processor import PipelineImageInput, VaeImageProcessor -from diffusers.loaders import QwenImageLoraLoaderMixin -from diffusers.models.embeddings import get_1d_rotary_pos_embed from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.utils import (BaseOutput, deprecate, is_torch_xla_available, logging, replace_example_docstring) from diffusers.utils.torch_utils import randn_tensor -from diffusers.video_processor import VideoProcessor -from einops import rearrange -from PIL import Image from ..models import (AutoencoderKLQwenImage, Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, - QwenImageInstantXControlNetModel, QwenImageTransformer2DModel, - T5Tokenizer) - + QwenImageInstantXControlNetModel, + QwenImageTransformer2DModel, T5Tokenizer) if is_torch_xla_available(): import torch_xla.core.xla_model as xm @@ -162,7 +154,7 @@ class QwenImagePipelineOutput(BaseOutput): images: Union[List[PIL.Image.Image], np.ndarray] -class QwenImageControlNetPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin): +class QwenImageControlNetPipeline(DiffusionPipeline): r""" The QwenImage pipeline for text-to-image generation. diff --git a/videox_fun/utils/__init__.py b/videox_fun/utils/__init__.py index 009df372..7628f0c3 100755 --- a/videox_fun/utils/__init__.py +++ b/videox_fun/utils/__init__.py @@ -1,17 +1,20 @@ import importlib.util +from .cfg_optimization import cfg_skip +from .discrete_sampler import DiscreteSampling from .fm_solvers import FlowDPMSolverMultistepScheduler from .fm_solvers_unipc import FlowUniPCMultistepScheduler from .fp8_optimization import (autocast_model_forward, convert_model_weight_to_float8, convert_weight_dtype_wrapper, replace_parameters_by_name) +from .group_offload import (register_auto_device_hook, + safe_enable_group_offload, + safe_remove_group_offloading) from .lora_utils import merge_lora, unmerge_lora -from .utils import (filter_kwargs, get_image_latent, get_image_to_video_latent, get_autocast_dtype, - get_video_to_video_latent, save_videos_grid) -from .cfg_optimization import cfg_skip -from .discrete_sampler import DiscreteSampling - +from .utils import (filter_kwargs, get_autocast_dtype, get_image_latent, + get_image_to_video_latent, get_video_to_video_latent, + save_videos_grid) # The pai_fuser is an internally developed acceleration package, which can be used on PAI. 
if importlib.util.find_spec("paifuser") is not None: @@ -19,7 +22,8 @@ # FP8 Linear Kernel # --------------------------------------------------------------- # from paifuser.ops import (convert_model_weight_to_float8, - convert_weight_dtype_wrapper) + convert_weight_dtype_wrapper) + from . import fp8_optimization fp8_optimization.convert_model_weight_to_float8 = convert_model_weight_to_float8 fp8_optimization.convert_weight_dtype_wrapper = convert_weight_dtype_wrapper diff --git a/videox_fun/utils/group_offload.py b/videox_fun/utils/group_offload.py new file mode 100644 index 00000000..cfc00b1b --- /dev/null +++ b/videox_fun/utils/group_offload.py @@ -0,0 +1,1440 @@ +# Modified from https://github.com/huggingface/diffusers/blob/v0.36.0/src/diffusers/hooks/group_offloading.py +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import hashlib +import os +import types +from contextlib import contextmanager, nullcontext +from dataclasses import dataclass, replace +from enum import Enum +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +import safetensors.torch +import torch +from diffusers.utils import get_logger, is_accelerate_available + +if is_accelerate_available(): + from accelerate.hooks import AlignDevicesHook, CpuOffload + from accelerate.utils import send_to_device + +logger = get_logger(__name__) # pylint: disable=invalid-name + + +# fmt: off +_GROUP_OFFLOADING = "group_offloading" +_LAYER_EXECUTION_TRACKER = "layer_execution_tracker" +_LAZY_PREFETCH_GROUP_OFFLOADING = "lazy_prefetch_group_offloading" +_GROUP_ID_LAZY_LEAF = "lazy_leafs" +# fmt: on + +_GO_LC_SUPPORTED_PYTORCH_LAYERS = ( + torch.nn.Conv1d, + torch.nn.Conv2d, + torch.nn.Conv3d, + torch.nn.ConvTranspose1d, + torch.nn.ConvTranspose2d, + torch.nn.ConvTranspose3d, + torch.nn.Linear, + # TODO(aryan): look into torch.nn.LayerNorm, torch.nn.GroupNorm later, seems to be causing some issues with CogVideoX + # because of double invocation of the same norm layer in CogVideoXLayerNorm +) + + +class ModelHook: + r""" + A hook that contains callbacks to be executed just before and after the forward method of a model. + """ + + _is_stateful = False + + def __init__(self): + self.fn_ref: "HookFunctionReference" = None + + def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module: + r""" + Hook that is executed when a model is initialized. + + Args: + module (`torch.nn.Module`): + The module attached to this hook. + """ + return module + + def deinitalize_hook(self, module: torch.nn.Module) -> torch.nn.Module: + r""" + Hook that is executed when a model is deinitalized. + + Args: + module (`torch.nn.Module`): + The module attached to this hook. + """ + return module + + def pre_forward(self, module: torch.nn.Module, *args, **kwargs) -> Tuple[Tuple[Any], Dict[str, Any]]: + r""" + Hook that is executed just before the forward method of the model. 
+ + Args: + module (`torch.nn.Module`): + The module whose forward pass will be executed just after this event. + args (`Tuple[Any]`): + The positional arguments passed to the module. + kwargs (`Dict[Str, Any]`): + The keyword arguments passed to the module. + Returns: + `Tuple[Tuple[Any], Dict[Str, Any]]`: + A tuple with the treated `args` and `kwargs`. + """ + return args, kwargs + + def post_forward(self, module: torch.nn.Module, output: Any) -> Any: + r""" + Hook that is executed just after the forward method of the model. + + Args: + module (`torch.nn.Module`): + The module whose forward pass been executed just before this event. + output (`Any`): + The output of the module. + Returns: + `Any`: The processed `output`. + """ + return output + + def detach_hook(self, module: torch.nn.Module) -> torch.nn.Module: + r""" + Hook that is executed when the hook is detached from a module. + + Args: + module (`torch.nn.Module`): + The module detached from this hook. + """ + return module + + def reset_state(self, module: torch.nn.Module): + if self._is_stateful: + raise NotImplementedError("This hook is stateful and needs to implement the `reset_state` method.") + return module + + +class HookFunctionReference: + def __init__(self) -> None: + """A container class that maintains mutable references to forward pass functions in a hook chain. + + Its mutable nature allows the hook system to modify the execution chain dynamically without rebuilding the + entire forward pass structure. + + Attributes: + pre_forward: A callable that processes inputs before the main forward pass. + post_forward: A callable that processes outputs after the main forward pass. + forward: The current forward function in the hook chain. + original_forward: The original forward function, stored when a hook provides a custom new_forward. + + The class enables hook removal by allowing updates to the forward chain through reference modification rather + than requiring reconstruction of the entire chain. When a hook is removed, only the relevant references need to + be updated, preserving the execution order of the remaining hooks. + """ + self.pre_forward = None + self.post_forward = None + self.forward = None + self.original_forward = None + + +class HookRegistry: + def __init__(self, module_ref: torch.nn.Module) -> None: + super().__init__() + + self.hooks: Dict[str, ModelHook] = {} + + self._module_ref = module_ref + self._hook_order = [] + self._fn_refs = [] + + def register_hook(self, hook: ModelHook, name: str) -> None: + if name in self.hooks.keys(): + raise ValueError( + f"Hook with name {name} already exists in the registry. Please use a different name or " + f"first remove the existing hook and then add a new one." 
+ ) + + self._module_ref = hook.initialize_hook(self._module_ref) + + def create_new_forward(function_reference: HookFunctionReference): + def new_forward(module, *args, **kwargs): + args, kwargs = function_reference.pre_forward(module, *args, **kwargs) + output = function_reference.forward(*args, **kwargs) + return function_reference.post_forward(module, output) + + return new_forward + + forward = self._module_ref.forward + + fn_ref = HookFunctionReference() + fn_ref.pre_forward = hook.pre_forward + fn_ref.post_forward = hook.post_forward + fn_ref.forward = forward + + if hasattr(hook, "new_forward"): + fn_ref.original_forward = forward + fn_ref.forward = functools.update_wrapper( + functools.partial(hook.new_forward, self._module_ref), hook.new_forward + ) + + rewritten_forward = create_new_forward(fn_ref) + self._module_ref.forward = functools.update_wrapper( + functools.partial(rewritten_forward, self._module_ref), rewritten_forward + ) + + hook.fn_ref = fn_ref + self.hooks[name] = hook + self._hook_order.append(name) + self._fn_refs.append(fn_ref) + + def get_hook(self, name: str) -> Optional[ModelHook]: + return self.hooks.get(name, None) + + def remove_hook(self, name: str, recurse: bool = True) -> None: + if name in self.hooks.keys(): + num_hooks = len(self._hook_order) + hook = self.hooks[name] + index = self._hook_order.index(name) + fn_ref = self._fn_refs[index] + + old_forward = fn_ref.forward + if fn_ref.original_forward is not None: + old_forward = fn_ref.original_forward + + if index == num_hooks - 1: + self._module_ref.forward = old_forward + else: + self._fn_refs[index + 1].forward = old_forward + + self._module_ref = hook.deinitalize_hook(self._module_ref) + del self.hooks[name] + self._hook_order.pop(index) + self._fn_refs.pop(index) + + if recurse: + for module_name, module in self._module_ref.named_modules(): + if module_name == "": + continue + if hasattr(module, "_diffusers_hook"): + module._diffusers_hook.remove_hook(name, recurse=False) + + def reset_stateful_hooks(self, recurse: bool = True) -> None: + for hook_name in reversed(self._hook_order): + hook = self.hooks[hook_name] + if hook._is_stateful: + hook.reset_state(self._module_ref) + + if recurse: + for module_name, module in self._module_ref.named_modules(): + if module_name == "": + continue + if hasattr(module, "_diffusers_hook"): + module._diffusers_hook.reset_stateful_hooks(recurse=False) + + @classmethod + def check_if_exists_or_initialize(cls, module: torch.nn.Module) -> "HookRegistry": + if not hasattr(module, "_diffusers_hook"): + module._diffusers_hook = cls(module) + return module._diffusers_hook + + def __repr__(self) -> str: + registry_repr = "" + for i, hook_name in enumerate(self._hook_order): + if self.hooks[hook_name].__class__.__repr__ is not object.__repr__: + hook_repr = self.hooks[hook_name].__repr__() + else: + hook_repr = self.hooks[hook_name].__class__.__name__ + registry_repr += f" ({i}) {hook_name} - {hook_repr}" + if i < len(self._hook_order) - 1: + registry_repr += "\n" + return f"HookRegistry(\n{registry_repr}\n)" + + +class GroupOffloadingType(str, Enum): + BLOCK_LEVEL = "block_level" + LEAF_LEVEL = "leaf_level" + + +@dataclass +class GroupOffloadingConfig: + onload_device: torch.device + offload_device: torch.device + offload_type: GroupOffloadingType + non_blocking: bool + record_stream: bool + low_cpu_mem_usage: bool + num_blocks_per_group: Optional[int] = None + offload_to_disk_path: Optional[str] = None + stream: Optional[Union[torch.cuda.Stream, torch.Stream]] = None + 
block_modules: Optional[List[str]] = None + exclude_kwargs: Optional[List[str]] = None + module_prefix: Optional[str] = "" + + +class ModuleGroup: + def __init__( + self, + modules: List[torch.nn.Module], + offload_device: torch.device, + onload_device: torch.device, + offload_leader: torch.nn.Module, + onload_leader: Optional[torch.nn.Module] = None, + parameters: Optional[List[torch.nn.Parameter]] = None, + buffers: Optional[List[torch.Tensor]] = None, + non_blocking: bool = False, + stream: Union[torch.cuda.Stream, torch.Stream, None] = None, + record_stream: Optional[bool] = False, + low_cpu_mem_usage: bool = False, + onload_self: bool = True, + offload_to_disk_path: Optional[str] = None, + group_id: Optional[Union[int, str]] = None, + ) -> None: + self.modules = modules + self.offload_device = offload_device + self.onload_device = onload_device + self.offload_leader = offload_leader + self.onload_leader = onload_leader + self.parameters = parameters or [] + self.buffers = buffers or [] + self.non_blocking = non_blocking or stream is not None + self.stream = stream + self.record_stream = record_stream + self.onload_self = onload_self + self.low_cpu_mem_usage = low_cpu_mem_usage + + self.offload_to_disk_path = offload_to_disk_path + self._is_offloaded_to_disk = False + + if self.offload_to_disk_path is not None: + # Instead of `group_id or str(id(self))` we do this because `group_id` can be "" as well. + self.group_id = group_id if group_id is not None else str(id(self)) + short_hash = _compute_group_hash(self.group_id) + self.safetensors_file_path = os.path.join(self.offload_to_disk_path, f"group_{short_hash}.safetensors") + + all_tensors = [] + for module in self.modules: + all_tensors.extend(list(module.parameters())) + all_tensors.extend(list(module.buffers())) + all_tensors.extend(self.parameters) + all_tensors.extend(self.buffers) + all_tensors = list(dict.fromkeys(all_tensors)) # Remove duplicates + + self.tensor_to_key = {tensor: f"tensor_{i}" for i, tensor in enumerate(all_tensors)} + self.key_to_tensor = {v: k for k, v in self.tensor_to_key.items()} + self.cpu_param_dict = {} + else: + self.cpu_param_dict = self._init_cpu_param_dict() + + self._torch_accelerator_module = ( + getattr(torch, torch.accelerator.current_accelerator().type) + if hasattr(torch, "accelerator") + else torch.cuda + ) + + def _init_cpu_param_dict(self): + cpu_param_dict = {} + if self.stream is None: + return cpu_param_dict + + for module in self.modules: + for param in module.parameters(): + cpu_param_dict[param] = param.data.cpu() if self.low_cpu_mem_usage else param.data.cpu().pin_memory() + for buffer in module.buffers(): + cpu_param_dict[buffer] = ( + buffer.data.cpu() if self.low_cpu_mem_usage else buffer.data.cpu().pin_memory() + ) + + for param in self.parameters: + cpu_param_dict[param] = param.data.cpu() if self.low_cpu_mem_usage else param.data.cpu().pin_memory() + + for buffer in self.buffers: + cpu_param_dict[buffer] = buffer.data.cpu() if self.low_cpu_mem_usage else buffer.data.cpu().pin_memory() + + return cpu_param_dict + + @contextmanager + def _pinned_memory_tensors(self): + try: + pinned_dict = { + param: tensor.pin_memory() if not tensor.is_pinned() else tensor + for param, tensor in self.cpu_param_dict.items() + } + yield pinned_dict + finally: + pinned_dict = None + + def _transfer_tensor_to_device(self, tensor, source_tensor, default_stream): + tensor.data = source_tensor.to(self.onload_device, non_blocking=self.non_blocking) + if self.record_stream: + 
tensor.data.record_stream(default_stream) + + def _process_tensors_from_modules(self, pinned_memory=None, default_stream=None): + for group_module in self.modules: + for param in group_module.parameters(): + source = pinned_memory[param] if pinned_memory else param.data + self._transfer_tensor_to_device(param, source, default_stream) + for buffer in group_module.buffers(): + source = pinned_memory[buffer] if pinned_memory else buffer.data + self._transfer_tensor_to_device(buffer, source, default_stream) + + for param in self.parameters: + source = pinned_memory[param] if pinned_memory else param.data + self._transfer_tensor_to_device(param, source, default_stream) + + for buffer in self.buffers: + source = pinned_memory[buffer] if pinned_memory else buffer.data + self._transfer_tensor_to_device(buffer, source, default_stream) + + def _onload_from_disk(self): + if self.stream is not None: + # Wait for previous Host->Device transfer to complete + self.stream.synchronize() + + context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream) + current_stream = self._torch_accelerator_module.current_stream() if self.record_stream else None + + with context: + # Load to CPU (if using streams) or directly to target device, pin, and async copy to device + device = str(self.onload_device) if self.stream is None else "cpu" + loaded_tensors = safetensors.torch.load_file(self.safetensors_file_path, device=device) + + if self.stream is not None: + for key, tensor_obj in self.key_to_tensor.items(): + pinned_tensor = loaded_tensors[key].pin_memory() + tensor_obj.data = pinned_tensor.to(self.onload_device, non_blocking=self.non_blocking) + if self.record_stream: + tensor_obj.data.record_stream(current_stream) + else: + onload_device = ( + self.onload_device.type if isinstance(self.onload_device, torch.device) else self.onload_device + ) + loaded_tensors = safetensors.torch.load_file(self.safetensors_file_path, device=onload_device) + for key, tensor_obj in self.key_to_tensor.items(): + tensor_obj.data = loaded_tensors[key] + + def _onload_from_memory(self): + if self.stream is not None: + # Wait for previous Host->Device transfer to complete + self.stream.synchronize() + + context = nullcontext() if self.stream is None else self._torch_accelerator_module.stream(self.stream) + default_stream = self._torch_accelerator_module.current_stream() if self.stream is not None else None + + with context: + if self.stream is not None: + with self._pinned_memory_tensors() as pinned_memory: + self._process_tensors_from_modules(pinned_memory, default_stream=default_stream) + else: + self._process_tensors_from_modules(None) + + def _offload_to_disk(self): + # TODO: we can potentially optimize this code path by checking if the _all_ the desired + # safetensor files exist on the disk and if so, skip this step entirely, reducing IO + # overhead. Currently, we just check if the given `safetensors_file_path` exists and if not + # we perform a write. + # Check if the file has been saved in this session or if it already exists on disk. + if not self._is_offloaded_to_disk and not os.path.exists(self.safetensors_file_path): + os.makedirs(os.path.dirname(self.safetensors_file_path), exist_ok=True) + tensors_to_save = {key: tensor.data.to(self.offload_device) for tensor, key in self.tensor_to_key.items()} + safetensors.torch.save_file(tensors_to_save, self.safetensors_file_path) + + # The group is now considered offloaded to disk for the rest of the session. 
+ self._is_offloaded_to_disk = True + + # We do this to free up the RAM which is still holding the up tensor data. + for tensor_obj in self.tensor_to_key.keys(): + tensor_obj.data = torch.empty_like(tensor_obj.data, device=self.offload_device) + + def _offload_to_memory(self): + if self.stream is not None: + if not self.record_stream: + self._torch_accelerator_module.current_stream().synchronize() + + for group_module in self.modules: + for param in group_module.parameters(): + param.data = self.cpu_param_dict[param] + for param in self.parameters: + param.data = self.cpu_param_dict[param] + for buffer in self.buffers: + buffer.data = self.cpu_param_dict[buffer] + else: + for group_module in self.modules: + group_module.to(self.offload_device, non_blocking=False) + for param in self.parameters: + param.data = param.data.to(self.offload_device, non_blocking=False) + for buffer in self.buffers: + buffer.data = buffer.data.to(self.offload_device, non_blocking=False) + + @torch.compiler.disable() + def onload_(self): + r"""Onloads the group of parameters to the onload_device.""" + if self.offload_to_disk_path is not None: + self._onload_from_disk() + else: + self._onload_from_memory() + + @torch.compiler.disable() + def offload_(self): + r"""Offloads the group of parameters to the offload_device.""" + if self.offload_to_disk_path: + self._offload_to_disk() + else: + self._offload_to_memory() + + +class GroupOffloadingHook(ModelHook): + r""" + A hook that offloads groups of torch.nn.Module to the CPU for storage and onloads to accelerator device for + computation. Each group has one "onload leader" module that is responsible for onloading, and an "offload leader" + module that is responsible for offloading. If prefetching is enabled, the onload leader of the previous module + group is responsible for onloading the current module group. + """ + + _is_stateful = False + + def __init__(self, group: ModuleGroup, *, config: GroupOffloadingConfig) -> None: + self.group = group + self.next_group: Optional[ModuleGroup] = None + self.config = config + + def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module: + if self.group.offload_leader == module: + self.group.offload_() + return module + + def pre_forward(self, module: torch.nn.Module, *args, **kwargs): + # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward + # method is the onload_leader of the group. + if self.group.onload_leader is None: + self.group.onload_leader = module + + # If the current module is the onload_leader of the group, we onload the group if it is supposed + # to onload itself. In the case of using prefetching with streams, we onload the next group if + # it is not supposed to onload itself. + if self.group.onload_leader == module: + if self.group.onload_self: + self.group.onload_() + + should_onload_next_group = self.next_group is not None and not self.next_group.onload_self + if should_onload_next_group: + self.next_group.onload_() + + should_synchronize = ( + not self.group.onload_self and self.group.stream is not None and not should_onload_next_group + ) + if should_synchronize: + # If this group didn't onload itself, it means it was asynchronously onloaded by the + # previous group. We need to synchronize the side stream to ensure parameters + # are completely loaded to proceed with forward pass. 
Without this, uninitialized + # weights will be used in the computation, leading to incorrect results + # Also, we should only do this synchronization if we don't already do it from the sync call in + # self.next_group.onload_, hence the `not should_onload_next_group` check. + self.group.stream.synchronize() + + args = send_to_device(args, self.group.onload_device, non_blocking=self.group.non_blocking) + + # Some Autoencoder models use a feature cache that is passed through submodules + # and modified in place. The `send_to_device` call returns a copy of this feature cache object + # which breaks the inplace updates. Use `exclude_kwargs` to mark these cache features + exclude_kwargs = self.config.exclude_kwargs or [] + if exclude_kwargs: + moved_kwargs = send_to_device( + {k: v for k, v in kwargs.items() if k not in exclude_kwargs}, + self.group.onload_device, + non_blocking=self.group.non_blocking, + ) + kwargs.update(moved_kwargs) + else: + kwargs = send_to_device(kwargs, self.group.onload_device, non_blocking=self.group.non_blocking) + + return args, kwargs + + def post_forward(self, module: torch.nn.Module, output): + if self.group.offload_leader == module: + self.group.offload_() + return output + + +class LazyPrefetchGroupOffloadingHook(ModelHook): + r""" + A hook, used in conjunction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module. + This hook is used to determine the order in which the layers are executed during the forward pass. Once the layer + invocation order is known, assignments of the next_group attribute for prefetching can be made, which allows + prefetching groups in the correct order. + """ + + _is_stateful = False + + def __init__(self): + self.execution_order: List[Tuple[str, torch.nn.Module]] = [] + self._layer_execution_tracker_module_names = set() + + def initialize_hook(self, module): + def make_execution_order_update_callback(current_name, current_submodule): + def callback(): + if not torch.compiler.is_compiling(): + logger.debug(f"Adding {current_name} to the execution order") + self.execution_order.append((current_name, current_submodule)) + + return callback + + # To every submodule that contains a group offloading hook (at this point, no prefetching is enabled for any + # of the groups), we add a layer execution tracker hook that will be used to determine the order in which the + # layers are executed during the forward pass. + for name, submodule in module.named_modules(): + if name == "" or not hasattr(submodule, "_diffusers_hook"): + continue + + registry = HookRegistry.check_if_exists_or_initialize(submodule) + group_offloading_hook = registry.get_hook(_GROUP_OFFLOADING) + + if group_offloading_hook is not None: + # For the first forward pass, we have to load in a blocking manner + group_offloading_hook.group.non_blocking = False + layer_tracker_hook = LayerExecutionTrackerHook(make_execution_order_update_callback(name, submodule)) + registry.register_hook(layer_tracker_hook, _LAYER_EXECUTION_TRACKER) + self._layer_execution_tracker_module_names.add(name) + + return module + + def post_forward(self, module, output): + # At this point, for the current modules' submodules, we know the execution order of the layers. We can now + # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each + # group offloading hook. 
+ num_executed = len(self.execution_order) + execution_order_module_names = {name for name, _ in self.execution_order} + + # It may be possible that some layers were not executed during the forward pass. This can happen if the layer + # is not used in the forward pass, or if the layer is not executed due to some other reason. In such cases, we + # may not be able to apply prefetching in the correct order, which can lead to device-mismatch related errors + # if the missing layers end up being executed in the future. + if execution_order_module_names != self._layer_execution_tracker_module_names: + unexecuted_layers = list(self._layer_execution_tracker_module_names - execution_order_module_names) + if not torch.compiler.is_compiling(): + logger.warning( + "It seems like some layers were not executed during the forward pass. This may lead to problems when " + "applying lazy prefetching with automatic tracing and lead to device-mismatch related errors. Please " + "make sure that all layers are executed during the forward pass. The following layers were not executed:\n" + f"{unexecuted_layers=}" + ) + + # Remove the layer execution tracker hooks from the submodules + base_module_registry = module._diffusers_hook + registries = [submodule._diffusers_hook for _, submodule in self.execution_order] + group_offloading_hooks = [registry.get_hook(_GROUP_OFFLOADING) for registry in registries] + + for i in range(num_executed): + registries[i].remove_hook(_LAYER_EXECUTION_TRACKER, recurse=False) + + # Remove the current lazy prefetch group offloading hook so that it doesn't interfere with the next forward pass + base_module_registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=False) + + # LazyPrefetchGroupOffloadingHook is only used with streams, so we know that non_blocking should be True. + # We disable non_blocking for the first forward pass, but need to enable it for the subsequent passes to + # see the benefits of prefetching. + for hook in group_offloading_hooks: + hook.group.non_blocking = True + + # Set required attributes for prefetching + if num_executed > 0: + base_module_group_offloading_hook = base_module_registry.get_hook(_GROUP_OFFLOADING) + base_module_group_offloading_hook.next_group = group_offloading_hooks[0].group + base_module_group_offloading_hook.next_group.onload_self = False + + for i in range(num_executed - 1): + name1, _ = self.execution_order[i] + name2, _ = self.execution_order[i + 1] + if not torch.compiler.is_compiling(): + logger.debug(f"Applying lazy prefetch group offloading from {name1} to {name2}") + group_offloading_hooks[i].next_group = group_offloading_hooks[i + 1].group + group_offloading_hooks[i].next_group.onload_self = False + + return output + + +class LayerExecutionTrackerHook(ModelHook): + r""" + A hook that tracks the order in which the layers are executed during the forward pass by calling back to the + LazyPrefetchGroupOffloadingHook to update the execution order. 
+ """ + + _is_stateful = False + + def __init__(self, execution_order_update_callback): + self.execution_order_update_callback = execution_order_update_callback + + def pre_forward(self, module, *args, **kwargs): + self.execution_order_update_callback() + return args, kwargs + + +def apply_group_offloading( + module: torch.nn.Module, + onload_device: Union[str, torch.device], + offload_device: Union[str, torch.device] = torch.device("cpu"), + offload_type: Union[str, GroupOffloadingType] = "block_level", + num_blocks_per_group: Optional[int] = None, + non_blocking: bool = False, + use_stream: bool = False, + record_stream: bool = False, + low_cpu_mem_usage: bool = False, + offload_to_disk_path: Optional[str] = None, + block_modules: Optional[List[str]] = None, + exclude_kwargs: Optional[List[str]] = None, +) -> None: + r""" + Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, and + where it is beneficial, we need to first provide some context on how other supported offloading methods work. + + Typically, offloading is done at two levels: + - Module-level: In Diffusers, this can be enabled using the `ModelMixin::enable_model_cpu_offload()` method. It + works by offloading each component of a pipeline to the CPU for storage, and onloading to the accelerator device + when needed for computation. This method is more memory-efficient than keeping all components on the accelerator, + but the memory requirements are still quite high. For this method to work, one needs memory equivalent to size of + the model in runtime dtype + size of largest intermediate activation tensors to be able to complete the forward + pass. + - Leaf-level: In Diffusers, this can be enabled using the `ModelMixin::enable_sequential_cpu_offload()` method. It + works by offloading the lowest leaf-level parameters of the computation graph to the CPU for storage, and + onloading only the leafs to the accelerator device for computation. This uses the lowest amount of accelerator + memory, but can be slower due to the excessive number of device synchronizations. + + Group offloading is a middle ground between the two methods. It works by offloading groups of internal layers, + (either `torch.nn.ModuleList` or `torch.nn.Sequential`). This method uses lower memory than module-level + offloading. It is also faster than leaf-level/sequential offloading, as the number of device synchronizations is + reduced. + + Another supported feature (for CUDA devices with support for asynchronous data transfer streams) is the ability to + overlap data transfer and computation to reduce the overall execution time compared to sequential offloading. This + is enabled using layer prefetching with streams, i.e., the layer that is to be executed next starts onloading to + the accelerator device while the current layer is being executed - this increases the memory requirements slightly. + Note that this implementation also supports leaf-level offloading but can be made much faster when using streams. + + Args: + module (`torch.nn.Module`): + The module to which group offloading is applied. + onload_device (`torch.device`): + The device to which the group of modules are onloaded. + offload_device (`torch.device`, defaults to `torch.device("cpu")`): + The device to which the group of modules are offloaded. This should typically be the CPU. Default is CPU. + offload_type (`str` or `GroupOffloadingType`, defaults to "block_level"): + The type of offloading to be applied. 
Can be one of "block_level" or "leaf_level". Default is + "block_level". + offload_to_disk_path (`str`, *optional*, defaults to `None`): + The path to the directory where parameters will be offloaded. Setting this option can be useful in limited + RAM environment settings where a reasonable speed-memory trade-off is desired. + num_blocks_per_group (`int`, *optional*): + The number of blocks per group when using offload_type="block_level". This is required when using + offload_type="block_level". + non_blocking (`bool`, defaults to `False`): + If True, offloading and onloading is done with non-blocking data transfer. + use_stream (`bool`, defaults to `False`): + If True, offloading and onloading is done asynchronously using a CUDA stream. This can be useful for + overlapping computation and data transfer. + record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor + as having been used by this stream. It is faster at the expense of slightly more memory usage. Refer to the + [PyTorch official docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) more + details. + low_cpu_mem_usage (`bool`, defaults to `False`): + If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them. This + option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be useful when + the CPU memory is a bottleneck but may counteract the benefits of using streams. + block_modules (`List[str]`, *optional*): + List of module names that should be treated as blocks for offloading. If provided, only these modules will + be considered for block-level offloading. If not provided, the default block detection logic will be used. + exclude_kwargs (`List[str]`, *optional*): + List of kwarg keys that should not be processed by send_to_device. This is useful for mutable state like + caching lists that need to maintain their object identity across forward passes. If not provided, will be + inferred from the module's `_skip_keys` attribute if it exists. + + Example: + ```python + >>> from diffusers import CogVideoXTransformer3DModel + >>> from diffusers.hooks import apply_group_offloading + + >>> transformer = CogVideoXTransformer3DModel.from_pretrained( + ... "THUDM/CogVideoX-5b", subfolder="transformer", torch_dtype=torch.bfloat16 + ... ) + + >>> apply_group_offloading( + ... transformer, + ... onload_device=torch.device("cuda"), + ... offload_device=torch.device("cpu"), + ... offload_type="block_level", + ... num_blocks_per_group=2, + ... use_stream=True, + ... 
) + ``` + """ + + onload_device = torch.device(onload_device) if isinstance(onload_device, str) else onload_device + offload_device = torch.device(offload_device) if isinstance(offload_device, str) else offload_device + offload_type = GroupOffloadingType(offload_type) + + stream = None + if use_stream: + if torch.cuda.is_available(): + stream = torch.cuda.Stream() + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + stream = torch.Stream() + else: + raise ValueError("Using streams for data transfer requires a CUDA device, or an Intel XPU device.") + + if not use_stream and record_stream: + raise ValueError("`record_stream` cannot be True when `use_stream=False`.") + if offload_type == GroupOffloadingType.BLOCK_LEVEL and num_blocks_per_group is None: + raise ValueError("`num_blocks_per_group` must be provided when using `offload_type='block_level'.") + + _raise_error_if_accelerate_model_or_sequential_hook_present(module) + + if block_modules is None: + block_modules = getattr(module, "_group_offload_block_modules", None) + + if exclude_kwargs is None: + exclude_kwargs = getattr(module, "_skip_keys", None) + + config = GroupOffloadingConfig( + onload_device=onload_device, + offload_device=offload_device, + offload_type=offload_type, + num_blocks_per_group=num_blocks_per_group, + non_blocking=non_blocking, + stream=stream, + record_stream=record_stream, + low_cpu_mem_usage=low_cpu_mem_usage, + offload_to_disk_path=offload_to_disk_path, + block_modules=block_modules, + exclude_kwargs=exclude_kwargs, + ) + _apply_group_offloading(module, config) + + +def _apply_group_offloading(module: torch.nn.Module, config: GroupOffloadingConfig) -> None: + if config.offload_type == GroupOffloadingType.BLOCK_LEVEL: + _apply_group_offloading_block_level(module, config) + elif config.offload_type == GroupOffloadingType.LEAF_LEVEL: + _apply_group_offloading_leaf_level(module, config) + else: + assert False + + +def _apply_group_offloading_block_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None: + r""" + This function applies offloading to groups of torch.nn.ModuleList or torch.nn.Sequential blocks, and explicitly + defined block modules. In comparison to the "leaf_level" offloading, which is more fine-grained, this offloading is + done at the top-level blocks and modules specified in block_modules. + + When block_modules is provided, only those modules will be treated as blocks for offloading. For each specified + module, recursively apply block offloading to it. + """ + if config.stream is not None and config.num_blocks_per_group != 1: + logger.warning( + f"Using streams is only supported for num_blocks_per_group=1. Got {config.num_blocks_per_group=}. Setting it to 1." + ) + config.num_blocks_per_group = 1 + + block_modules = set(config.block_modules) if config.block_modules is not None else set() + + # Create module groups for ModuleList and Sequential blocks, and explicitly defined block modules + modules_with_group_offloading = set() + unmatched_modules = [] + matched_module_groups = [] + + for name, submodule in module.named_children(): + # Check if this is an explicitly defined block module + if name in block_modules: + # Track submodule using a prefix to avoid filename collisions during disk offload. + # Without this, submodules sharing the same model class would be assigned identical + # filenames (derived from the class name). + prefix = f"{config.module_prefix}{name}." if config.module_prefix else f"{name}." 
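+            # Derive a per-child config for the recursive call below; `replace` is assumed to behave like
+            # `dataclasses.replace`, i.e. it copies the config with only `module_prefix` changed (for example,
+            # a child named "blocks" under an empty prefix yields module_prefix="blocks.").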
+ submodule_config = replace(config, module_prefix=prefix) + + _apply_group_offloading_block_level(submodule, submodule_config) + modules_with_group_offloading.add(name) + + elif isinstance(submodule, (torch.nn.ModuleList, torch.nn.Sequential)): + # Handle ModuleList and Sequential blocks as before + for i in range(0, len(submodule), config.num_blocks_per_group): + current_modules = list(submodule[i : i + config.num_blocks_per_group]) + if len(current_modules) == 0: + continue + + group_id = f"{config.module_prefix}{name}_{i}_{i + len(current_modules) - 1}" + group = ModuleGroup( + modules=current_modules, + offload_device=config.offload_device, + onload_device=config.onload_device, + offload_to_disk_path=config.offload_to_disk_path, + offload_leader=current_modules[-1], + onload_leader=current_modules[0], + non_blocking=config.non_blocking, + stream=config.stream, + record_stream=config.record_stream, + low_cpu_mem_usage=config.low_cpu_mem_usage, + onload_self=True, + group_id=group_id, + ) + matched_module_groups.append(group) + for j in range(i, i + len(current_modules)): + modules_with_group_offloading.add(f"{name}.{j}") + else: + # This is an unmatched module + unmatched_modules.append((name, submodule)) + + # Apply group offloading hooks to the module groups + for i, group in enumerate(matched_module_groups): + for group_module in group.modules: + _apply_group_offloading_hook(group_module, group, config=config) + + # Parameters and Buffers of the top-level module need to be offloaded/onloaded separately + # when the forward pass of this module is called. This is because the top-level module is not + # part of any group (as doing so would lead to no VRAM savings). + parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading) + buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading) + parameters = [param for _, param in parameters] + buffers = [buffer for _, buffer in buffers] + + # Create a group for the remaining unmatched submodules of the top-level + # module so that they are on the correct device when the forward pass is called. + unmatched_modules = [unmatched_module for _, unmatched_module in unmatched_modules] + if len(unmatched_modules) > 0 or len(parameters) > 0 or len(buffers) > 0: + unmatched_group = ModuleGroup( + modules=unmatched_modules, + offload_device=config.offload_device, + onload_device=config.onload_device, + offload_to_disk_path=config.offload_to_disk_path, + offload_leader=module, + onload_leader=module, + parameters=parameters, + buffers=buffers, + non_blocking=False, + stream=None, + record_stream=False, + onload_self=True, + group_id=f"{config.module_prefix}{module.__class__.__name__}_unmatched_group", + ) + if config.stream is None: + _apply_group_offloading_hook(module, unmatched_group, config=config) + else: + _apply_lazy_group_offloading_hook(module, unmatched_group, config=config) + + +def _apply_group_offloading_leaf_level(module: torch.nn.Module, config: GroupOffloadingConfig) -> None: + r""" + This function applies offloading to groups of leaf modules in a torch.nn.Module. This method has minimal memory + requirements. However, it can be slower compared to other offloading methods due to the excessive number of device + synchronizations. When using devices that support streams to overlap data transfer and computation, this method can + reduce memory usage without any performance degradation. 
+ """ + # Create module groups for leaf modules and apply group offloading hooks + modules_with_group_offloading = set() + for name, submodule in module.named_modules(): + if not isinstance(submodule, _GO_LC_SUPPORTED_PYTORCH_LAYERS): + continue + group = ModuleGroup( + modules=[submodule], + offload_device=config.offload_device, + onload_device=config.onload_device, + offload_to_disk_path=config.offload_to_disk_path, + offload_leader=submodule, + onload_leader=submodule, + non_blocking=config.non_blocking, + stream=config.stream, + record_stream=config.record_stream, + low_cpu_mem_usage=config.low_cpu_mem_usage, + onload_self=True, + group_id=name, + ) + _apply_group_offloading_hook(submodule, group, config=config) + modules_with_group_offloading.add(name) + + # Parameters and Buffers at all non-leaf levels need to be offloaded/onloaded separately when the forward pass + # of the module is called + module_dict = dict(module.named_modules()) + parameters = _gather_parameters_with_no_group_offloading_parent(module, modules_with_group_offloading) + buffers = _gather_buffers_with_no_group_offloading_parent(module, modules_with_group_offloading) + + # Find closest module parent for each parameter and buffer, and attach group hooks + parent_to_parameters = {} + for name, param in parameters: + parent_name = _find_parent_module_in_module_dict(name, module_dict) + if parent_name in parent_to_parameters: + parent_to_parameters[parent_name].append(param) + else: + parent_to_parameters[parent_name] = [param] + + parent_to_buffers = {} + for name, buffer in buffers: + parent_name = _find_parent_module_in_module_dict(name, module_dict) + if parent_name in parent_to_buffers: + parent_to_buffers[parent_name].append(buffer) + else: + parent_to_buffers[parent_name] = [buffer] + + parent_names = set(parent_to_parameters.keys()) | set(parent_to_buffers.keys()) + for name in parent_names: + parameters = parent_to_parameters.get(name, []) + buffers = parent_to_buffers.get(name, []) + parent_module = module_dict[name] + group = ModuleGroup( + modules=[], + offload_device=config.offload_device, + onload_device=config.onload_device, + offload_leader=parent_module, + onload_leader=parent_module, + offload_to_disk_path=config.offload_to_disk_path, + parameters=parameters, + buffers=buffers, + non_blocking=config.non_blocking, + stream=config.stream, + record_stream=config.record_stream, + low_cpu_mem_usage=config.low_cpu_mem_usage, + onload_self=True, + group_id=name, + ) + _apply_group_offloading_hook(parent_module, group, config=config) + + if config.stream is not None: + # When using streams, we need to know the layer execution order for applying prefetching (to overlap data transfer + # and computation). Since we don't know the order beforehand, we apply a lazy prefetching hook that will find the + # execution order and apply prefetching in the correct order. 
+ unmatched_group = ModuleGroup( + modules=[], + offload_device=config.offload_device, + onload_device=config.onload_device, + offload_to_disk_path=config.offload_to_disk_path, + offload_leader=module, + onload_leader=module, + parameters=None, + buffers=None, + non_blocking=False, + stream=None, + record_stream=False, + low_cpu_mem_usage=config.low_cpu_mem_usage, + onload_self=True, + group_id=_GROUP_ID_LAZY_LEAF, + ) + _apply_lazy_group_offloading_hook(module, unmatched_group, config=config) + + +def _apply_group_offloading_hook( + module: torch.nn.Module, + group: ModuleGroup, + *, + config: GroupOffloadingConfig, +) -> None: + registry = HookRegistry.check_if_exists_or_initialize(module) + + # We may have already registered a group offloading hook if the module had a torch.nn.Parameter whose parent + # is the current module. In such cases, we don't want to overwrite the existing group offloading hook. + if registry.get_hook(_GROUP_OFFLOADING) is None: + hook = GroupOffloadingHook(group, config=config) + registry.register_hook(hook, _GROUP_OFFLOADING) + + +def _apply_lazy_group_offloading_hook( + module: torch.nn.Module, + group: ModuleGroup, + *, + config: GroupOffloadingConfig, +) -> None: + registry = HookRegistry.check_if_exists_or_initialize(module) + + # We may have already registered a group offloading hook if the module had a torch.nn.Parameter whose parent + # is the current module. In such cases, we don't want to overwrite the existing group offloading hook. + if registry.get_hook(_GROUP_OFFLOADING) is None: + hook = GroupOffloadingHook(group, config=config) + registry.register_hook(hook, _GROUP_OFFLOADING) + + lazy_prefetch_hook = LazyPrefetchGroupOffloadingHook() + registry.register_hook(lazy_prefetch_hook, _LAZY_PREFETCH_GROUP_OFFLOADING) + + +def _gather_parameters_with_no_group_offloading_parent( + module: torch.nn.Module, modules_with_group_offloading: Set[str] +) -> List[torch.nn.Parameter]: + parameters = [] + for name, parameter in module.named_parameters(): + has_parent_with_group_offloading = False + atoms = name.split(".") + while len(atoms) > 0: + parent_name = ".".join(atoms) + if parent_name in modules_with_group_offloading: + has_parent_with_group_offloading = True + break + atoms.pop() + if not has_parent_with_group_offloading: + parameters.append((name, parameter)) + return parameters + + +def _gather_buffers_with_no_group_offloading_parent( + module: torch.nn.Module, modules_with_group_offloading: Set[str] +) -> List[torch.Tensor]: + buffers = [] + for name, buffer in module.named_buffers(): + has_parent_with_group_offloading = False + atoms = name.split(".") + while len(atoms) > 0: + parent_name = ".".join(atoms) + if parent_name in modules_with_group_offloading: + has_parent_with_group_offloading = True + break + atoms.pop() + if not has_parent_with_group_offloading: + buffers.append((name, buffer)) + return buffers + + +def _find_parent_module_in_module_dict(name: str, module_dict: Dict[str, torch.nn.Module]) -> str: + atoms = name.split(".") + while len(atoms) > 0: + parent_name = ".".join(atoms) + if parent_name in module_dict: + return parent_name + atoms.pop() + return "" + + +def _raise_error_if_accelerate_model_or_sequential_hook_present(module: torch.nn.Module) -> None: + if not is_accelerate_available(): + return + for name, submodule in module.named_modules(): + if not hasattr(submodule, "_hf_hook"): + continue + if isinstance(submodule._hf_hook, (AlignDevicesHook, CpuOffload)): + raise ValueError( + f"Cannot apply group offloading to a module 
that is already applying an alternative " + f"offloading strategy from Accelerate. If you want to apply group offloading, please " + f"disable the existing offloading strategy first. Offending module: {name} ({type(submodule)})" + ) + + +def _get_top_level_group_offload_hook(module: torch.nn.Module) -> Optional[GroupOffloadingHook]: + for submodule in module.modules(): + if hasattr(submodule, "_diffusers_hook"): + group_offloading_hook = submodule._diffusers_hook.get_hook(_GROUP_OFFLOADING) + if group_offloading_hook is not None: + return group_offloading_hook + return None + + +def _is_group_offload_enabled(module: torch.nn.Module) -> bool: + top_level_group_offload_hook = _get_top_level_group_offload_hook(module) + return top_level_group_offload_hook is not None + + +def _get_group_onload_device(module: torch.nn.Module) -> torch.device: + top_level_group_offload_hook = _get_top_level_group_offload_hook(module) + if top_level_group_offload_hook is not None: + return top_level_group_offload_hook.config.onload_device + raise ValueError("Group offloading is not enabled for the provided module.") + + +def _compute_group_hash(group_id): + hashed_id = hashlib.sha256(group_id.encode("utf-8")).hexdigest() + # first 16 characters for a reasonably short but unique name + return hashed_id[:16] + + +def _maybe_remove_and_reapply_group_offloading(module: torch.nn.Module) -> None: + r""" + Removes the group offloading hook from the module and re-applies it. This is useful when the module has been + modified in-place and the group offloading hook references-to-tensors needs to be updated. The in-place + modification can happen in a number of ways, for example, fusing QKV or unloading/loading LoRAs on-the-fly. + + In this implementation, we make an assumption that group offloading has only been applied at the top-level module, + and therefore all submodules have the same onload and offload devices. If this assumption is not true, say in the + case where user has applied group offloading at multiple levels, this function will not work as expected. + + There is some performance penalty associated with doing this when non-default streams are used, because we need to + retrace the execution order of the layers with `LazyPrefetchGroupOffloadingHook`. + """ + top_level_group_offload_hook = _get_top_level_group_offload_hook(module) + + if top_level_group_offload_hook is None: + return + + registry = HookRegistry.check_if_exists_or_initialize(module) + registry.remove_hook(_GROUP_OFFLOADING, recurse=True) + registry.remove_hook(_LAYER_EXECUTION_TRACKER, recurse=True) + registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=True) + + _apply_group_offloading(module, top_level_group_offload_hook.config) + + +def remove_group_offloading( + module: torch.nn.Module, + exclude_modules: Optional[Union[str, List[str]]] = None, +) -> None: + """ + Removes group offloading hooks from a module and its submodules. + + Args: + module (`torch.nn.Module`): + The module from which to remove group offloading hooks. + exclude_modules (`Union[str, List[str]]`, *optional*, defaults to `None`): + List of modules to exclude from hook removal. + """ + if isinstance(exclude_modules, str): + exclude_modules = [exclude_modules] + elif exclude_modules is None: + exclude_modules = [] + + # Check if this is a pipeline with components + if hasattr(module, 'components'): + unknown = set(exclude_modules) - module.components.keys() + if unknown: + logger.info( + f"The following modules are not present in pipeline: {', '.join(unknown)}. 
Ignore if this is expected." + ) + + # Remove hooks from each component + for name, component in module.components.items(): + if name not in exclude_modules and isinstance(component, torch.nn.Module): + registry = HookRegistry.check_if_exists_or_initialize(component) + registry.remove_hook(_GROUP_OFFLOADING, recurse=True) + registry.remove_hook(_LAYER_EXECUTION_TRACKER, recurse=True) + registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=True) + else: + # Original behavior for single modules + registry = HookRegistry.check_if_exists_or_initialize(module) + registry.remove_hook(_GROUP_OFFLOADING, recurse=True) + registry.remove_hook(_LAYER_EXECUTION_TRACKER, recurse=True) + registry.remove_hook(_LAZY_PREFETCH_GROUP_OFFLOADING, recurse=True) + + +def safe_remove_group_offloading(obj, *args, **kwargs): + """Safely call remove_group_offloading""" + return remove_group_offloading(obj, *args, **kwargs) + + +def enable_group_offload( + self, + onload_device: torch.device, + offload_device: torch.device = torch.device("cpu"), + offload_type: str = "block_level", + num_blocks_per_group: Optional[int] = None, + non_blocking: bool = False, + use_stream: bool = False, + record_stream: bool = False, + low_cpu_mem_usage=False, + offload_to_disk_path: Optional[str] = None, + exclude_modules: Optional[Union[str, List[str]]] = None, +) -> None: + r""" + Applies group offloading to the internal layers of a torch.nn.Module. To understand what group offloading is, + and where it is beneficial, we need to first provide some context on how other supported offloading methods + work. + + Typically, offloading is done at two levels: + - Module-level: In Diffusers, this can be enabled using the `ModelMixin::enable_model_cpu_offload()` method. It + works by offloading each component of a pipeline to the CPU for storage, and onloading to the accelerator + device when needed for computation. This method is more memory-efficient than keeping all components on the + accelerator, but the memory requirements are still quite high. For this method to work, one needs memory + equivalent to size of the model in runtime dtype + size of largest intermediate activation tensors to be able + to complete the forward pass. + - Leaf-level: In Diffusers, this can be enabled using the `ModelMixin::enable_sequential_cpu_offload()` method. + It + works by offloading the lowest leaf-level parameters of the computation graph to the CPU for storage, and + onloading only the leafs to the accelerator device for computation. This uses the lowest amount of accelerator + memory, but can be slower due to the excessive number of device synchronizations. + + Group offloading is a middle ground between the two methods. It works by offloading groups of internal layers, + (either `torch.nn.ModuleList` or `torch.nn.Sequential`). This method uses lower memory than module-level + offloading. It is also faster than leaf-level/sequential offloading, as the number of device synchronizations + is reduced. + + Another supported feature (for CUDA devices with support for asynchronous data transfer streams) is the ability + to overlap data transfer and computation to reduce the overall execution time compared to sequential + offloading. This is enabled using layer prefetching with streams, i.e., the layer that is to be executed next + starts onloading to the accelerator device while the current layer is being executed - this increases the + memory requirements slightly. 
Note that this implementation also supports leaf-level offloading but can be made
+        much faster when using streams.
+
+    Args:
+        onload_device (`torch.device`):
+            The device to which the group of modules are onloaded.
+        offload_device (`torch.device`, defaults to `torch.device("cpu")`):
+            The device to which the group of modules are offloaded. This should typically be the CPU. Default is
+            CPU.
+        offload_type (`str` or `GroupOffloadingType`, defaults to "block_level"):
+            The type of offloading to be applied. Can be one of "block_level" or "leaf_level". Default is
+            "block_level".
+        offload_to_disk_path (`str`, *optional*, defaults to `None`):
+            The path to the directory where parameters will be offloaded. Setting this option can be useful in
+            limited RAM environment settings where a reasonable speed-memory trade-off is desired.
+        num_blocks_per_group (`int`, *optional*):
+            The number of blocks per group when using offload_type="block_level". This is required when using
+            offload_type="block_level".
+        non_blocking (`bool`, defaults to `False`):
+            If True, offloading and onloading is done with non-blocking data transfer.
+        use_stream (`bool`, defaults to `False`):
+            If True, offloading and onloading is done asynchronously using a CUDA stream. This can be useful for
+            overlapping computation and data transfer.
+        record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
+            as having been used by this stream. It is faster at the expense of slightly more memory usage. Refer to
+            the [PyTorch official docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html)
+            for more details.
+        low_cpu_mem_usage (`bool`, defaults to `False`):
+            If True, the CPU memory usage is minimized by pinning tensors on-the-fly instead of pre-pinning them.
+            This option only matters when using streamed CPU offloading (i.e. `use_stream=True`). This can be
+            useful when the CPU memory is a bottleneck but may counteract the benefits of using streams.
+        exclude_modules (`Union[str, List[str]]`, defaults to `None`): List of modules to exclude from offloading.
+
+    Example:
+        ```python
+        >>> from diffusers import DiffusionPipeline
+        >>> import torch
+
+        >>> pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
+
+        >>> pipe.enable_group_offload(
+        ...     onload_device=torch.device("cuda"),
+        ...     offload_device=torch.device("cpu"),
+        ...     offload_type="leaf_level",
+        ...     use_stream=True,
+        ... )
+        >>> image = pipe("a beautiful sunset").images[0]
+        ```
+    """
+    if isinstance(exclude_modules, str):
+        exclude_modules = [exclude_modules]
+    elif exclude_modules is None:
+        exclude_modules = []
+
+    unknown = set(exclude_modules) - self.components.keys()
+    if unknown:
+        logger.info(
+            f"The following modules are not present in pipeline: {', '.join(unknown)}. Ignore if this is expected."
+ ) + + group_offload_kwargs = { + "onload_device": onload_device, + "offload_device": offload_device, + "offload_type": offload_type, + "num_blocks_per_group": num_blocks_per_group, + "non_blocking": non_blocking, + "use_stream": use_stream, + "record_stream": record_stream, + "low_cpu_mem_usage": low_cpu_mem_usage, + "offload_to_disk_path": offload_to_disk_path, + } + for name, component in self.components.items(): + if name not in exclude_modules and isinstance(component, torch.nn.Module): + apply_group_offloading(module=component, **group_offload_kwargs) + + if exclude_modules: + for module_name in exclude_modules: + module = getattr(self, module_name, None) + if module is not None and isinstance(module, torch.nn.Module): + module.to(onload_device) + logger.debug(f"Placed `{module_name}` on {onload_device} device as it was in `exclude_modules`.") + + +def safe_enable_group_offload(obj, *args, **kwargs): + """Safely call enable_group_offload, register default implementation if not exists""" + + if not hasattr(obj, 'enable_group_offload'): + obj.enable_group_offload = types.MethodType(enable_group_offload, obj) + + return obj.enable_group_offload(*args, **kwargs) + + +def register_auto_device_hook(model): + """ + Register forward pre-hooks for all modules to automatically transfer device + + Args: + model: The model to process + + Returns: + model: The model with registered hooks + """ + + def auto_device_hook(module, input: Tuple[Any, ...]): + """ + Forward pre-hook function to automatically transfer device before forward + + Args: + module: Current module + input: Forward input arguments (in tuple form) + """ + # Get the device of input tensor + input_device = None + + # Traverse input tuple to find the first tensor + for item in input: + if isinstance(item, torch.Tensor): + input_device = item.device + break + # Handle nested cases (like list, tuple, etc.) + elif isinstance(item, (list, tuple)): + for sub_item in item: + if isinstance(sub_item, torch.Tensor): + input_device = sub_item.device + break + if input_device is not None: + break + + # If no tensor input found, return directly + if input_device is None: + return + + # Get current device of the module + module_device = None + try: + # Try to get device from parameters + module_device = next(module.parameters()).device + except StopIteration: + # If no parameters, try to get from buffers + try: + module_device = next(module.buffers()).device + except StopIteration: + # No parameters or buffers, no need to transfer + return + + # Check if device transfer is needed + # Condition: module_device is not 'meta' and different from input_device + if module_device.type != 'meta' and module_device != input_device: + # print(f"Moving {module.__class__.__name__} from {module_device} to {input_device}") + module.to(input_device) + + # Register hooks for all submodules + hooks = [] + for module in model.modules(): + hook = module.register_forward_pre_hook(auto_device_hook) + hooks.append(hook) + + # Save hooks to model for later removal + model._auto_device_hooks = hooks + + return model + + +def remove_auto_device_hook(model): + """ + Remove previously registered auto device hooks + + Args: + model: The model to process + """ + if hasattr(model, '_auto_device_hooks'): + for hook in model._auto_device_hooks: + hook.remove() + delattr(model, '_auto_device_hooks') + print("Auto device hooks removed") \ No newline at end of file