3 changes: 3 additions & 0 deletions docs/diffusers/imgs/README.md
@@ -0,0 +1,3 @@
### Image Credits

The images in this folder are taken from the [Hugging Face Diffusers repository](https://github.com/huggingface/diffusers/tree/main/docs/source/en/imgs) and are subject to the Apache 2.0 license of the Diffusers project.
Binary file added docs/diffusers/imgs/access_request.png
Binary file added docs/diffusers/imgs/diffusers_library.jpg
7 changes: 6 additions & 1 deletion examples/diffusers/cogvideox_factory/README.md
@@ -5,7 +5,8 @@
> Our development and validation are based on Ascend Atlas 800T A2 hardware, with the following environment:
> | mindspore | ascend driver | firmware | cann toolkit/kernel |
> |:----------:|:--------------:|:-----------:|:------------------:|
> | 2.5 | 24.1.RC2 | 7.5.0.1.129 | 8.0.0.beta1 |
> | 2.6.0 | 24.1.RC2 | 7.3.0.1.231 | 8.1.RC1 |
> | 2.7.0 | 24.1.RC2 | 7.3.0.1.231 | 8.2.RC1 |

<table align="center">
<tr>
@@ -409,3 +410,7 @@ NODE_RANK="0"
The current training scripts do not fully support all of the training arguments in the original repository; see `check_args()` in [`args.py`](./scripts/args.py) for details.

One major limitation comes from the fact that the [3D Causal VAE in CogVideoX does not support graph mode](https://gist.github.com/townwish4git/b6cd0d213b396eaedfb69b3abcd742da), which means **VAE training is not supported in graph mode**; in graph mode, data preprocessing must therefore be done in advance to obtain the VAE-latents/text-encoder-embeddings cache (see the caching sketch after this section).


### Note
If `Exception ignored: OSError [Errno 9] Bad file descriptor` appears after training finishes, it is only a message printed during Python shutdown and does not affect the training results; with Python 3.11 this message no longer appears.
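
To make the caching requirement above concrete, here is a minimal sketch of precomputing VAE latents in PyNative mode before a graph-mode training run. It is not the example's own preprocessing script: `AutoencoderKLCogVideoX`, the `(C, F, H, W)` video layout, and the output paths are assumptions that mirror the upstream diffusers API.

```python
# Minimal caching sketch (assumptions: AutoencoderKLCogVideoX is exposed by
# mindone.diffusers as in upstream diffusers; videos arrive as float arrays of
# shape (C, F, H, W) scaled to [-1, 1]; output paths are illustrative).
import numpy as np
import mindspore as ms
from mindone.diffusers import AutoencoderKLCogVideoX

# The 3D causal VAE only runs in PyNative mode, so caching is done eagerly.
ms.set_context(mode=ms.PYNATIVE_MODE)

vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX1.5-5B", subfolder="vae")


def cache_video_latents(video: np.ndarray, out_path: str) -> None:
    """Encode one video and save a sample of its latent distribution to disk."""
    x = ms.Tensor(video[None], dtype=ms.float32)  # add batch dim -> (1, C, F, H, W)
    latent_dist = vae.encode(x)[0]                # diagonal Gaussian over latents
    latents = latent_dist.sample()                # apply the training script's own scaling later
    np.save(out_path, latents.asnumpy())


# Usage: run once over the dataset, then point the graph-mode training script
# at the cached files instead of raw videos, e.g.
# cache_video_latents(video_array, "preprocessed-dataset/latents/000001.npy")
```
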
@@ -31,7 +31,6 @@
from mindone.diffusers.models.layers_compat import pad
from mindone.diffusers.models.modeling_outputs import AutoencoderKLOutput
from mindone.diffusers.models.modeling_utils import ModelMixin
from mindone.diffusers.models.normalization import GroupNorm
from mindone.diffusers.models.upsampling import CogVideoXUpsample3D
from mindone.diffusers.utils import logging

@@ -40,7 +39,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name


class GroupNorm_SP(GroupNorm):
class GroupNorm_SP(mint.nn.GroupNorm):
    def set_frame_group_size(self, frame_group_size):
        self.frame_group_size = frame_group_size

@@ -32,7 +32,7 @@ AMP_LEVEL=O2
DATA_ROOT="preprocessed-dataset"
CAPTION_COLUMN="prompts.txt"
VIDEO_COLUMN="videos.txt"
MODEL_NAME_OR_PATH="THUDM/CogVideoX1.5-5b"
MODEL_NAME_OR_PATH="THUDM/CogVideoX1.5-5B"
H=768
W=1360
F=77
@@ -40,7 +40,7 @@ DEEPSPEED_ZERO_STAGE=3
DATA_ROOT="preprocessed-dataset"
CAPTION_COLUMN="prompts.txt"
VIDEO_COLUMN="videos.txt"
MODEL_NAME_OR_PATH="THUDM/CogVideoX1.5-5b"
MODEL_NAME_OR_PATH="THUDM/CogVideoX1.5-5B"
H=768
W=1360
F=77
2 changes: 1 addition & 1 deletion examples/diffusers/cogview/README.md
@@ -29,7 +29,7 @@ cd mindone
pip install -e .
# NOTE: transformers requires >=4.46.0

cd examples/cogview
cd examples/diffusers/cogview
```


146 changes: 146 additions & 0 deletions examples/diffusers/controlnet/test_controlnet.py
@@ -0,0 +1,146 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys
import tempfile

sys.path.append("..")
from examples.diffusers.test_examples_utils import ExamplesTests, run_command # noqa: E402

ExamplesTests._launch_args = ["python"]

logging.basicConfig(level=logging.DEBUG)

logger = logging.getLogger()
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)


class ControlNet(ExamplesTests):
    def test_controlnet_checkpointing_checkpoints_total_limit(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
                examples/diffusers/controlnet/train_controlnet.py
                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
                --revision refs/pr/4
                --dataset_name=hf-internal-testing/fill10
                --output_dir={tmpdir}
                --resolution=64
                --train_batch_size=1
                --gradient_accumulation_steps=1
                --max_train_steps=6
                --checkpoints_total_limit=2
                --checkpointing_steps=2
                --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
                """.split()

            run_command(self._launch_args + test_args)

            self.assertEqual(
                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
                {"checkpoint-4", "checkpoint-6"},
            )

    def test_controlnet_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
                examples/diffusers/controlnet/train_controlnet.py
                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
                --revision refs/pr/4
                --dataset_name=hf-internal-testing/fill10
                --output_dir={tmpdir}
                --resolution=64
                --train_batch_size=1
                --gradient_accumulation_steps=1
                --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
                --max_train_steps=6
                --checkpointing_steps=2
                """.split()

            run_command(self._launch_args + test_args)

            self.assertEqual(
                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
                {"checkpoint-2", "checkpoint-4", "checkpoint-6"},
            )

            resume_run_args = f"""
                examples/diffusers/controlnet/train_controlnet.py
                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
                --revision refs/pr/4
                --dataset_name=hf-internal-testing/fill10
                --output_dir={tmpdir}
                --resolution=64
                --train_batch_size=1
                --gradient_accumulation_steps=1
                --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
                --max_train_steps=8
                --checkpointing_steps=2
                --resume_from_checkpoint=checkpoint-6
                --checkpoints_total_limit=2
                """.split()

            run_command(self._launch_args + resume_run_args)

            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})


class ControlNetSDXL(ExamplesTests):
    def test_controlnet_sdxl(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
                examples/diffusers/controlnet/train_controlnet_sdxl.py
                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-xl-pipe
                --revision refs/pr/2
                --dataset_name=hf-internal-testing/fill10
                --output_dir={tmpdir}
                --resolution=64
                --train_batch_size=1
                --gradient_accumulation_steps=1
                --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet-sdxl
                --max_train_steps=4
                --checkpointing_steps=2
                """.split()

            run_command(self._launch_args + test_args)

            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))


class ControlNetflux(ExamplesTests):
    def test_controlnet_flux(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
                examples/diffusers/controlnet/train_controlnet_flux.py
                --pretrained_model_name_or_path=hf-internal-testing/tiny-flux-pipe
                --output_dir={tmpdir}
                --dataset_name=hf-internal-testing/fill10
                --conditioning_image_column=conditioning_image
                --image_column=image
                --caption_column=text
                --resolution=64
                --train_batch_size=1
                --gradient_accumulation_steps=1
                --max_train_steps=4
                --checkpointing_steps=2
                --num_double_layers=1
                --num_single_layers=1
                """.split()

            run_command(self._launch_args + test_args)

            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
7 changes: 3 additions & 4 deletions examples/diffusers/controlnet/train_controlnet.py
@@ -879,8 +879,8 @@ def __len__(self):
if is_master(args):
logger.info(f"Resuming from checkpoint {path}")
# TODO: load optimizer & grad scaler etc. like accelerator.load_state
input_model_file = os.path.join(args.output_dir, path, "pytorch_model.ckpt")
ms.load_param_into_net(unet, ms.load_checkpoint(input_model_file), strict_load=True)
input_model_file = os.path.join(args.output_dir, path, "unet/diffusion_pytorch_model.safetensors")
ms.load_param_into_net(unet, ms.load_checkpoint(input_model_file, format="safetensors"), strict_load=True)
global_step = int(path.split("-")[1])

initial_global_step = global_step
@@ -939,8 +939,7 @@ def __len__(self):
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
# TODO: save optimizer & grad scaler etc. like accelerator.save_state
os.makedirs(save_path, exist_ok=True)
output_model_file = os.path.join(save_path, "pytorch_model.ckpt")
ms.save_checkpoint(unet, output_model_file)
unet.save_pretrained(os.path.join(save_path, "unet"))
logger.info(f"Saved state to {save_path}")

if args.validation_prompt is not None and global_step % args.validation_steps == 0:
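
For reference, the hunks above replace the single `pytorch_model.ckpt` checkpoint with the diffusers-style `checkpoint-*/unet/diffusion_pytorch_model.safetensors` layout written by `save_pretrained`. A minimal sketch of reading such a checkpoint back is shown below; `UNet2DConditionModel` and the `output/checkpoint-1000` path are illustrative assumptions, while the safetensors loading calls come from the diff itself.

```python
# Minimal sketch of reading back a checkpoint saved as above; the class name
# and paths are assumptions, only the safetensors loading calls come from the diff.
import os

import mindspore as ms
from mindone.diffusers import UNet2DConditionModel

save_path = "output/checkpoint-1000"  # hypothetical --output_dir / checkpoint folder

# Option 1: rebuild the module from the config + weights written by save_pretrained().
unet = UNet2DConditionModel.from_pretrained(os.path.join(save_path, "unet"))

# Option 2: load only the weights into an existing network, mirroring the resume
# logic above (ms.load_checkpoint can read safetensors files directly).
state = ms.load_checkpoint(
    os.path.join(save_path, "unet", "diffusion_pytorch_model.safetensors"),
    format="safetensors",
)
ms.load_param_into_net(unet, state, strict_load=True)
```
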
2 changes: 1 addition & 1 deletion examples/diffusers/controlnet/train_controlnet_flux.py
@@ -35,7 +35,7 @@
from mindspore.dataset import GeneratorDataset, transforms, vision

from mindone.diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler, FluxTransformer2DModel
from mindone.diffusers.models.controlnet_flux import FluxControlNetModel
from mindone.diffusers.models.controlnets.controlnet_flux import FluxControlNetModel
from mindone.diffusers.models.layers_compat import set_amp_strategy
from mindone.diffusers.optimization import get_scheduler
from mindone.diffusers.pipelines.flux.pipeline_flux_controlnet import FluxControlNetPipeline
7 changes: 3 additions & 4 deletions examples/diffusers/controlnet/train_controlnet_sdxl.py
@@ -990,8 +990,8 @@ def __len__(self):
if is_master(args):
logger.info(f"Resuming from checkpoint {path}")
# TODO: load optimizer & grad scaler etc. like accelerator.load_state
input_model_file = os.path.join(args.output_dir, path, "pytorch_model.ckpt")
ms.load_param_into_net(unet, ms.load_checkpoint(input_model_file), strict_load=True)
input_model_file = os.path.join(args.output_dir, path, "unet/diffusion_pytorch_model.safetensors")
ms.load_param_into_net(unet, ms.load_checkpoint(input_model_file, format="safetensors"), strict_load=True)
global_step = int(path.split("-")[1])

initial_global_step = global_step
@@ -1050,8 +1050,7 @@ def __len__(self):
save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
# TODO: save optimizer & grad scaler etc. like accelerator.save_state
os.makedirs(save_path, exist_ok=True)
output_model_file = os.path.join(save_path, "pytorch_model.ckpt")
ms.save_checkpoint(unet, output_model_file)
unet.save_pretrained(os.path.join(save_path, "unet"))
logger.info(f"Saved state to {save_path}")

if args.validation_prompt is not None and global_step % args.validation_steps == 0: