Under construction:
- RoPE
- Time Embedding
- Attention Mask
accelerate launch --mixed_precision fp16 train.py --model Transfusion-XL --data_path /path/to/ImageNet/train
@comment{Transfusion paper. Typed as misc with arXiv eprint fields because no proceedings booktitle is recorded for it here.}
@misc{Zhou2024TransfusionPT,
    author        = {Zhou, Chunting and Yu, Lili and Babu, Arun and Tirumala, Kushal and Yasunaga, Michihiro and Shamis, Leonid and Kahn, Jacob and Ma, Xuezhe and Zettlemoyer, Luke and Levy, Omer},
    title         = {{Transfusion}: Predict the Next Token and Diffuse Images with One Multi-Modal Model},
    year          = {2024},
    eprint        = {2408.11039},
    archivePrefix = {arXiv},
    primaryClass  = {cs.AI},
    url           = {https://api.semanticscholar.org/CorpusID:271909855},
}