diff --git a/examples/moviegen/README.md b/examples/moviegen/README.md index c4befe5145..ffa1490100 100644 --- a/examples/moviegen/README.md +++ b/examples/moviegen/README.md @@ -63,9 +63,9 @@ this project!
-| MindSpore | Ascend Driver | Firmware | CANN toolkit/kernel | -|:---------:|:-------------:|:-----------:|:-------------------:| -| 2.5.0 | 24.0.0 | 7.5.0.3.220 | 8.0.0.beta1 | +| MindSpore | Ascend Driver | Firmware | CANN toolkit/kernel | +|:----------------:|:-------------:|:-----------:|:-------------------:| +| 2.5.0 and higher | 24.0.0 | 7.5.0.3.220 | 8.0.0.beta1 |
@@ -256,7 +256,7 @@ For more details, check `scripts/tae/encode_tae.sh` or run `scripts/inference_ta ### Performance -Experiments were conducted on Ascend Atlas 800T A2 machines using MindSpore 2.3.1 in Graph mode. +Experiments were conducted on Ascend Atlas 800T A2 machines using MindSpore 2.7.0 in Graph mode. > [!NOTE] > We trained all the models using BF16 precision and JIT level `O1`. @@ -264,15 +264,15 @@ Experiments were conducted on Ascend Atlas 800T A2 machines using MindSpore 2.3. | Model | Cards | Stage | Batch size | Resolution | Compile time | Recompute | Gradient Acc | ZeRO | Sequence Parallel | TAE Cache | Time (s/step) | Config | |:-----:|:-----:|:---------:|:---------------------:|:--------------------:|:------------:|:------------------------:|:------------:|:----:|:-----------------:|:---------:|:-------------:|:--------------------------------------------------------------:| -| 30B | 8 | 1 (T2I) | 10 | 256x455 | 4m 40s | ON | 1 | 3 | No | Yes | 3.37 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) | -| 30B | 8 | 2 (T2V) | Video: 1 | 256x256x455 | 7m 40s | ON | 1 | 3 | 8 shards | Yes | 2.58 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) | -| 30B | 8 | 3 (T2V) | Video: 1 | 256x576x1024 | 7m 40s | ON | 1 | 3 | 8 shards | Yes | 31.9 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) | -| 5B | 8 | 1 (T2I) | 10 | 256px | 2m 30s | OFF | 1 | 3 | No | Yes | 0.64 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) | -| 5B | 8 | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 2m 40s | OFF | 5 | 2 | No | Yes | 1.54 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) | -| 5B | 8 | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 4m | ON | 5 | 2 | No | Yes | 82.8 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) | -| 1B | 8 | 1 (T2I) | 10 | 256px | 1m 40s | OFF | 1 | No | No | Yes | 0.34 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) | -| 1B | 8 | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 1m 40s | OFF | 5 | No | No | Yes | 0.55 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) | -| 1B | 8 | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 2m | ON<br>(No FA recompute) | 5 | No | No | Yes | 22.1 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) | +| 30B | 8 | 1 (T2I) | 10 | 256x455 | 3m 10s | ON | 1 | 3 | No | Yes | 3.62 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) | +| 30B | 8 | 2 (T2V) | Video: 1 | 256x256x455 | 4m 10s | ON | 1 | 3 | 8 shards | Yes | 2.57 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) | +| 30B | 8 | 3 (T2V) | Video: 1 | 256x576x1024 | 4m 50s | ON | 1 | 3 | 8 shards | Yes | 29.7 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) | +| 5B | 8 | 1 (T2I) | 10 | 256px | 2m 30s | OFF | 1 | 3 | No | Yes | 0.65 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) | +| 5B | 8 | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 1m 30s | ON<br>(No FA recompute) | 5 | 2 | No | Yes | 1.76 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) | +| 5B | 8 | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 2m 30s | ON | 5 | 2 | No | Yes | 66.2 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) | +| 1B | 8 | 1 (T2I) | 10 | 256px | 1m 15s | OFF | 1 | No | No | Yes | 0.33 | [stage1_t2i_256px.yaml](configs/train/stage1_t2i_256px.yaml) | +| 1B | 8 | 2 (T2I/V) | Image: 1<br>Video: 1 | 256px<br>256 frames | 1m 20s | OFF | 5 | No | No | Yes | 0.56 | [stage2_t2iv_256px.yaml](configs/train/stage2_t2iv_256px.yaml) | +| 1B | 8 | 3 (T2I/V) | Image: 1<br>Video: 1 | 768px<br>256 frames | 1m 40s | ON<br>
(No FA recompute) | 5 | No | No | Yes | 22.1 | [stage3_t2iv_768px.yaml](configs/train/stage3_t2iv_768px.yaml) | ### Validation During Training diff --git a/examples/moviegen/scripts/inference_tae.py b/examples/moviegen/scripts/inference_tae.py index 9e5b679e2b..cdd4615853 100644 --- a/examples/moviegen/scripts/inference_tae.py +++ b/examples/moviegen/scripts/inference_tae.py @@ -67,7 +67,7 @@ def encode(args, tae: TemporalAutoencoder, save_dir: Path, rank_id: int, device_ mean = np.transpose(mean, (0, 2, 1, 3, 4)) std = np.transpose(std, (0, 2, 1, 3, 4)) - for m, s, path in zip(mean, std, samples[1].tolist()): + for m, s, path in zip(mean, std, samples[1].asnumpy().tolist()): out_path = save_dir / path out_path.parent.mkdir(parents=True, exist_ok=True) np.savez(out_path.with_suffix(".npz"), latent_mean=m, latent_std=s) diff --git a/examples/moviegen/scripts/moviegen/train_perf_bench.sh b/examples/moviegen/scripts/moviegen/train_perf_bench.sh new file mode 100644 index 0000000000..f814aca02b --- /dev/null +++ b/examples/moviegen/scripts/moviegen/train_perf_bench.sh @@ -0,0 +1,260 @@ +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +# log level +export GLOG_v=2 + +MODE=0 + +# ---------------------------- 1B ---------------------------- + +# Stage 1 +output_dir=output/test/stage1_t2i_256px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8200 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage1_t2i_256px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --model.name llama-1B \ + --model.recompute_every_nth_block "" \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents_images \ + --dataset.text_emb_folder.ul2 
../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob 0 \ + --dataset.deterministic_sample=True \ + --dataloader.batch_size=10 \ + --valid.dataset "" \ + --train.ema "" \ + --train.save.ckpt_save_policy=latest_k \ + --train.output_path "$output_dir" \ + --train.steps=500 + +echo "Completed 1B stage 1: $output_dir" +rm -rf "$output_dir"/ckpt + +# Stage 2 +output_dir=output/test/stage2_t2iv_256px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8220 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage2_t2iv_256px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --model.name llama-1B \ + --model.recompute_every_nth_block "" \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob=0 \ + --dataset.deterministic_sample=True \ + --train.ema "" \ + --train.settings.gradient_accumulation_steps=5 \ + --train.output_path "$output_dir" \ + --train.steps=300 + +echo "Completed 1B stage 2: $output_dir" +rm -rf "$output_dir"/ckpt + +# Stage 3 +output_dir=output/test/stage3_t2iv_768px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8230 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage3_t2iv_768px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --model.name llama-1B \ + --model.not_recompute_fa True \ + --dataset.csv_path 
../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/high_tae_latents \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob=0 \ + --dataset.deterministic_sample=True \ + --train.ema "" \ + --train.settings.gradient_accumulation_steps=5 \ + --train.output_path "$output_dir" \ + --train.steps=30 + +echo "Completed 1B stage 3: $output_dir" +rm -rf "$output_dir"/ckpt + + +# ---------------------------- 5B ---------------------------- + +# Stage 1 +output_dir=output/test/stage1_t2i_256px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8210 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage1_t2i_256px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --train.settings.zero_stage 3 \ + --model.recompute_every_nth_block "" \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents_images \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob 0 \ + --dataset.deterministic_sample=True \ + --dataloader.batch_size=10 \ + --valid.dataset "" \ + --train.ema "" \ + --train.save.ckpt_save_policy=latest_k \ + --train.output_path "$output_dir" \ + --train.steps=300 + +echo "Completed 5B stage 1: $output_dir" +find "$output_dir" -name '*.ckpt' -type f -delete + +# Stage 2 +output_dir=output/test/stage2_t2iv_256px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun 
--bind_core=True --master_port=8220 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage2_t2iv_256px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --train.settings.zero_stage 2 \ + --model.not_recompute_fa True \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob=0 \ + --dataset.deterministic_sample=True \ + --train.ema "" \ + --train.settings.gradient_accumulation_steps=5 \ + --train.output_path "$output_dir" \ + --train.steps=200 + +echo "Completed 5B stage 2: $output_dir" +rm -rf "$output_dir"/ckpt + +# Stage 3 +output_dir=output/test/stage3_t2iv_768px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8230 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage3_t2iv_768px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --train.settings.zero_stage 2 \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/high_tae_latents \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob=0 \ + --dataset.deterministic_sample=True \ + --train.ema "" \ + --train.settings.gradient_accumulation_steps=5 \ + --train.output_path "$output_dir" \ + 
--train.steps=10 + +echo "Completed 5B stage 3: $output_dir" +rm -rf "$output_dir"/ckpt + + +# ---------------------------- 30B ---------------------------- + +# Stage 1 +output_dir=output/test/stage1_t2i_256px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8210 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage1_t2i_256px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --model.name=llama-30B \ + --train.settings.zero_stage 3 \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents_images \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob 0 \ + --dataset.deterministic_sample=True \ + --dataloader.batch_size=10 \ + --valid.dataset "" \ + --train.optimizer.name adamw_re \ + --train.ema "" \ + --train.save.ckpt_save_policy=latest_k \ + --train.output_path "$output_dir" \ + --train.steps=100 + +echo "Completed 30B stage 1: $output_dir" +find "$output_dir" -name '*.ckpt' -type f -delete + +# Stage 2 +output_dir=output/test/stage2_t2iv_256px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8220 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage2_t2iv_256px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --model.name=llama-30B \ + --train.settings.zero_stage 3 \ + --train.sequence_parallel.shards 8 \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + 
--dataset.tae_latent_folder ../../../datasets/mixkit-100videos/tae_latents \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob 0 \ + --dataset.sample_n_frames 32 \ + --dataset.deterministic_sample=True \ + --dataloader.batch_size 1 \ + --train.optimizer.name adamw_re \ + --train.ema "" \ + --train.output_path "$output_dir" \ + --train.steps=100 + +echo "Completed 30B stage 2: $output_dir" +find "$output_dir" -name '*.ckpt' -type f -delete + +# Stage 3 +output_dir=output/test/stage3_t2iv_768px/$(date +"%Y.%m.%d-%H.%M.%S") + +msrun --bind_core=True --master_port=8230 --worker_num=8 --local_worker_num=8 --log_dir="$output_dir" --join=True \ +python scripts/train.py \ + --config configs/train/stage3_t2iv_768px.yaml \ + --env.mode $MODE \ + --env.max_device_memory 59GB \ + --env.distributed True \ + --model.name=llama-30B \ + --train.settings.zero_stage 3 \ + --train.sequence_parallel.shards 8 \ + --dataset.csv_path ../../../datasets/mixkit-100videos/video_caption_train_updated.csv \ + --dataset.video_folder ../../../datasets/mixkit-100videos/mixkit \ + --dataset.tae_latent_folder ../../../datasets/mixkit-100videos/high_tae_latents \ + --dataset.text_emb_folder.ul2 ../../../datasets/mixkit-100videos/ul2_emb_300 \ + --dataset.text_emb_folder.byt5 ../../../datasets/mixkit-100videos/byt5_emb_100 \ + --dataset.text_drop_prob=0 \ + --dataset.sample_n_frames 32 \ + --dataset.deterministic_sample=True \ + --dataloader.batch_size 1 \ + --train.optimizer.name adamw_re \ + --train.ema "" \ + --train.output_path "$output_dir" \ + --train.steps=10 + +echo "Completed 30B stage 3: $output_dir" +find "$output_dir" -name '*.ckpt' -type f -delete