diff --git a/examples/moviegen/README.md b/examples/moviegen/README.md
index c4befe5145..ffa1490100 100644
--- a/examples/moviegen/README.md
+++ b/examples/moviegen/README.md
@@ -63,9 +63,9 @@ this project!
-| MindSpore | Ascend Driver | Firmware | CANN toolkit/kernel |
-|:---------:|:-------------:|:-----------:|:-------------------:|
-| 2.5.0 | 24.0.0 | 7.5.0.3.220 | 8.0.0.beta1 |
+| MindSpore | Ascend Driver | Firmware | CANN toolkit/kernel |
+|:----------------:|:-------------:|:-----------:|:-------------------:|
+| 2.5.0 and higher | 24.0.0 | 7.5.0.3.220 | 8.0.0.beta1 |
@@ -256,7 +256,7 @@ For more details, check `scripts/tae/encode_tae.sh` or run `scripts/inference_ta
### Performance
-Experiments were conducted on Ascend Atlas 800T A2 machines using MindSpore 2.3.1 in Graph mode.
+Experiments were conducted on Ascend Atlas 800T A2 machines using MindSpore 2.7.0 in Graph mode.
> [!NOTE]
> We trained all the models using BF16 precision and JIT level `O1`.
@@ -264,15 +264,15 @@ Experiments were conducted on Ascend Atlas 800T A2 machines using MindSpore 2.3.
| Model | Cards | Stage | Batch size | Resolution | Compile time | Recompute | Gradient Acc | ZeRO | Sequence Parallel | TAE Cache | Time (s/step) | Config |
|:-----:|:-----:|:---------:|:---------------------:|:--------------------:|:------------:|:------------------------:|:------------:|:----:|:-----------------:|:---------:|:-------------:|:--------------------------------------------------------------:|
-| 30B | 8 | 1 (T2I) | 10 | 256x455 | 4m 40s | ON | 1 | 3 | No | Yes | 3.37 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) |
-| 30B | 8 | 2 (T2V) | Video: 1 | 256x256x455 | 7m 40s | ON | 1 | 3 | 8 shards | Yes | 2.58 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) |
-| 30B | 8 | 3 (T2V) | Video: 1 | 256x576x1024 | 7m 40s | ON | 1 | 3 | 8 shards | Yes | 31.9 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) |
-| 5B | 8 | 1 (T2I) | 10 | 256px | 2m 30s | OFF | 1 | 3 | No | Yes | 0.64 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) |
-| 5B    | 8     | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 2m 40s       | OFF                      | 5            | 2    | No                | Yes       | 1.54          | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) |
-| 5B    | 8     | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 4m           | ON                       | 5            | 2    | No                | Yes       | 82.8          | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) |
-| 1B | 8 | 1 (T2I) | 10 | 256px | 1m 40s | OFF | 1 | No | No | Yes | 0.34 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) |
-| 1B    | 8     | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 1m 40s       | OFF                      | 5            | No   | No                | Yes       | 0.55          | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) |
-| 1B    | 8     | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 2m           | ON<br>(No FA recompute)  | 5            | No   | No                | Yes       | 22.1          | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) |
+| 30B | 8 | 1 (T2I) | 10 | 256x455 | 3m 10s | ON | 1 | 3 | No | Yes | 3.62 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) |
+| 30B | 8 | 2 (T2V) | Video: 1 | 256x256x455 | 4m 10s | ON | 1 | 3 | 8 shards | Yes | 2.57 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) |
+| 30B | 8 | 3 (T2V) | Video: 1 | 256x576x1024 | 4m 50s | ON | 1 | 3 | 8 shards | Yes | 29.7 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) |
+| 5B | 8 | 1 (T2I) | 10 | 256px | 2m 30s | OFF | 1 | 3 | No | Yes | 0.65 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) |
+| 5B    | 8     | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 1m 30s       | ON<br>(No FA recompute)  | 5            | 2    | No                | Yes       | 1.76          | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) |
+| 5B    | 8     | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 2m 30s       | ON                       | 5            | 2    | No                | Yes       | 66.2          | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) |
+| 1B | 8 | 1 (T2I) | 10 | 256px | 1m 15s | OFF | 1 | No | No | Yes | 0.33 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) |
+| 1B    | 8     | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 1m 20s       | OFF                      | 5            | No   | No                | Yes       | 0.56          | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) |
+| 1B    | 8     | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 1m 40s       | ON<br>(No FA recompute)  | 5            | No   | No                | Yes       | 22.1          | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) |
### Validation During Training
diff --git a/examples/moviegen/scripts/inference_tae.py b/examples/moviegen/scripts/inference_tae.py
index 9e5b679e2b..cdd4615853 100644
--- a/examples/moviegen/scripts/inference_tae.py
+++ b/examples/moviegen/scripts/inference_tae.py
@@ -67,7 +67,7 @@ def encode(args, tae: TemporalAutoencoder, save_dir: Path, rank_id: int, device_
mean = np.transpose(mean, (0, 2, 1, 3, 4))
std = np.transpose(std, (0, 2, 1, 3, 4))
- for m, s, path in zip(mean, std, samples[1].tolist()):
+ for m, s, path in zip(mean, std, samples[1].asnumpy().tolist()):
out_path = save_dir / path
out_path.parent.mkdir(parents=True, exist_ok=True)
np.savez(out_path.with_suffix(".npz"), latent_mean=m, latent_std=s)
diff --git a/examples/moviegen/scripts/moviegen/train_perf_bench.sh b/examples/moviegen/scripts/moviegen/train_perf_bench.sh
new file mode 100644
index 0000000000..f814aca02b
--- /dev/null
+++ b/examples/moviegen/scripts/moviegen/train_perf_bench.sh
@@ -0,0 +1,260 @@
+export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+# log level
+export GLOG_v=2
+
+MODE=0
+
+# ---------------------------- 1B ----------------------------
+
+# Stage 1
+output_dir=output/test/stage1_t2i_256px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8200 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage1_t2i_256px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --model.name llama-1B \
+ --model.recompute_every_nth_block "" \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents_images \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob 0 \
+ --dataset.deterministic_sample=True \
+ --dataloader.batch_size=10 \
+ --valid.dataset "" \
+ --train.ema "" \
+ --train.save.ckpt_save_policy=latest_k \
+ --train.output_path "$output_dir" \
+ --train.steps=500
+
+echo "Completed 1B stage 1: $output_dir"
+rm -rf "$output_dir"/ckpt
+
+# Stage 2
+output_dir=output/test/stage2_t2iv_256px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8220 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage2_t2iv_256px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --model.name llama-1B \
+ --model.recompute_every_nth_block "" \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob=0 \
+ --dataset.deterministic_sample=True \
+ --train.ema "" \
+ --train.settings.gradient_accumulation_steps=5 \
+ --train.output_path "$output_dir" \
+ --train.steps=300
+
+echo "Completed 1B stage 2: $output_dir"
+rm -rf "$output_dir"/ckpt
+
+# Stage 3
+output_dir=output/test/stage3_t2iv_768px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8230 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage3_t2iv_768px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --model.name llama-1B \
+ --model.not_recompute_fa True \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/high_tae_latents \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob=0 \
+ --dataset.deterministic_sample=True \
+ --train.ema "" \
+ --train.settings.gradient_accumulation_steps=5 \
+ --train.output_path "$output_dir" \
+ --train.steps=30
+
+echo "Completed 1B stage 3: $output_dir"
+rm -rf "$output_dir"/ckpt
+
+
+# ---------------------------- 5B ----------------------------
+
+# Stage 1
+output_dir=output/test/stage1_t2i_256px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8210 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage1_t2i_256px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --train.settings.zero_stage 3 \
+ --model.recompute_every_nth_block "" \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents_images \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob 0 \
+ --dataset.deterministic_sample=True \
+ --dataloader.batch_size=10 \
+ --valid.dataset "" \
+ --train.ema "" \
+ --train.save.ckpt_save_policy=latest_k \
+ --train.output_path "$output_dir" \
+ --train.steps=300
+
+echo "Completed 5B stage 1: $output_dir"
+find "$output_dir" -name '*.ckpt' -type f -delete
+
+# Stage 2
+output_dir=output/test/stage2_t2iv_256px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8220 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage2_t2iv_256px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --train.settings.zero_stage 2 \
+ --model.not_recompute_fa True \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob=0 \
+ --dataset.deterministic_sample=True \
+ --train.ema "" \
+ --train.settings.gradient_accumulation_steps=5 \
+ --train.output_path "$output_dir" \
+ --train.steps=200
+
+echo "Completed 5B stage 2: $output_dir"
+rm -rf "$output_dir"/ckpt
+
+# Stage 3
+output_dir=output/test/stage3_t2iv_768px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8230 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage3_t2iv_768px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --train.settings.zero_stage 2 \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/high_tae_latents \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob=0 \
+ --dataset.deterministic_sample=True \
+ --train.ema "" \
+ --train.settings.gradient_accumulation_steps=5 \
+ --train.output_path "$output_dir" \
+ --train.steps=10
+
+echo "Completed 5B stage 3: $output_dir"
+rm -rf "$output_dir"/ckpt
+
+
+# ---------------------------- 30B ----------------------------
+
+# Stage 1
+output_dir=output/test/stage1_t2i_256px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8210 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage1_t2i_256px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --model.name=llama-30B \
+ --train.settings.zero_stage 3 \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents_images \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob 0 \
+ --dataset.deterministic_sample=True \
+ --dataloader.batch_size=10 \
+ --valid.dataset "" \
+ --train.optimizer.name adamw_re \
+ --train.ema "" \
+ --train.save.ckpt_save_policy=latest_k \
+ --train.output_path "$output_dir" \
+ --train.steps=100
+
+echo "Completed 30B stage 1: $output_dir"
+find "$output_dir" -name '*.ckpt' -type f -delete
+
+# Stage 2
+output_dir=output/test/stage2_t2iv_256px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8220 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage2_t2iv_256px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --model.name=llama-30B \
+ --train.settings.zero_stage 3 \
+ --train.sequence_parallel.shards 8 \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob 0 \
+ --dataset.sample_n_frames 32 \
+ --dataset.deterministic_sample=True \
+ --dataloader.batch_size 1 \
+ --train.optimizer.name adamw_re \
+ --train.ema "" \
+ --train.output_path "$output_dir" \
+ --train.steps=100
+
+echo "Completed 30B stage 2: $output_dir"
+find "$output_dir" -name '*.ckpt' -type f -delete
+
+# Stage 3
+output_dir=output/test/stage3_t2iv_768px/$(date +"%Y.%m.%d-%H.%M.%S")
+
+msrun --bind_core=True --master_port=8230 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \
+python scripts/train.py \
+ --config configs/train/stage3_t2iv_768px.yaml \
+ --env.mode $MODE \
+ --env.max_device_memory 59GB \
+ --env.distributed True \
+ --model.name=llama-30B \
+ --train.settings.zero_stage 3 \
+ --train.sequence_parallel.shards 8 \
+ --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \
+ --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \
+ --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/high_tae_latents \
+ --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \
+ --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \
+ --dataset.text_drop_prob=0 \
+ --dataset.sample_n_frames 32 \
+ --dataset.deterministic_sample=True \
+ --dataloader.batch_size 1 \
+ --train.optimizer.name adamw_re \
+ --train.ema "" \
+ --train.output_path "$output_dir" \
+ --train.steps=10
+
+echo "Completed 30B stage 3: $output_dir"
+find "$output_dir" -name '*.ckpt' -type f -delete