diff --git a/VAE_model.py b/VAE_model.py
new file mode 100644
index 000000000000..16f8834f8023
--- /dev/null
+++ b/VAE_model.py
@@ -0,0 +1,353 @@
+# video_vae_modular_final.py
+
+# ==============================================================================
+# 1. IMPORTS & CONFIGURATION
+# ==============================================================================
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModel, AutoProcessor, AutoModelForCausalLM
+from typing import List, Dict
+from dataclasses import dataclass
+
+@dataclass
+class VideoVAEConfig:
+    in_channels: int = 3
+    base_ch: int = 64
+    num_blocks: int = 4
+    quant_emb_dim: int = 16
+    alignment_dim: int = 256
+    quant_align_loss_weight: float = 0.1
+    likelihood_loss_weight: float = 0.2
+    dino_loss_weight: float = 0.25
+
+# ==============================================================================
+# 2. PERCEPTUAL & TEXT MODULES
+# ==============================================================================
+
+class DINOv2Extractor(nn.Module):
+    """
+    A frozen DINOv2 model to extract perceptual features from video frames.
+    """
+    def __init__(self, device="cuda"):
+        super().__init__()
+        self.device = device
+        model_name = "facebook/dinov2-base"
+        print("Loading DINOv2 model and processor...")
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name).to(self.device).eval()
+        for param in self.model.parameters():
+            param.requires_grad = False
+        print("DINOv2 loaded and frozen successfully. 🦖")
+
+    def forward(self, video_tensor: torch.Tensor) -> torch.Tensor:
+        b, c, t, h, w = video_tensor.shape
+        video_tensor = video_tensor.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+        inputs = self.processor(images=video_tensor, return_tensors="pt", do_rescale=False).to(self.device)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        # Return the features of the [CLS] token
+        return outputs.last_hidden_state[:, 0].view(b, t, -1)
+
+class QwenVLTextEncoder(nn.Module):
+    """A frozen Qwen-VL model to extract text embeddings."""
+    def __init__(self, device="cuda"):
+        super().__init__()
+        model_id = "Qwen/Qwen2.5-VL-Instruct"
+        self.device = device
+        print("Loading Qwen-VL model and processor...")
+        self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+        self.model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto", trust_remote_code=True).eval()
+        for param in self.model.parameters(): param.requires_grad = False
+        print("Qwen-VL loaded and frozen successfully. 🥶")
+
+    def forward(self, text_prompts: list[str]):
+        messages = [[{"role": "user", "content": [{"type": "text", "text": prompt}]}] for prompt in text_prompts]
+        text_inputs = self.processor(conversations=messages, return_tensors="pt", padding=True).to(self.model.device)
+        with torch.no_grad():
+            outputs = self.model(**text_inputs, output_hidden_states=True)
+        return outputs.hidden_states[-1].to(self.device)
+
+class TextVideoCrossAttention(nn.Module):
+    """Performs cross-attention between video features (Q) and text features (K,V)."""
+    def __init__(self, video_channels, text_embed_dim):
+        super().__init__()
+        self.q_proj, self.k_proj, self.v_proj = nn.Linear(video_channels, video_channels), nn.Linear(text_embed_dim, video_channels), nn.Linear(text_embed_dim, video_channels)
+        self.out_proj = nn.Linear(video_channels, video_channels)
+
+    def forward(self, video_feat, text_embedding):
+        B, C, T, H, W = video_feat.shape
+        video_seq = video_feat.permute(0, 2, 3, 4, 1).reshape(B, T * H * W, C)
+        q, k, v = self.q_proj(video_seq), self.k_proj(text_embedding), self.v_proj(text_embedding)
+        attn_output = F.scaled_dot_product_attention(q.unsqueeze(1), k, v).squeeze(1)
+        return self.out_proj(attn_output).reshape(B, T, H, W, C).permute(0, 4, 1, 2, 3)
+
+# ==============================================================================
+# 3. CORE ARCHITECTURAL BLOCKS
+# ==============================================================================
+
+class ProjectedLFQ(nn.Module):
+    """Projects features and quantizes them, returning an entropy loss."""
+    def __init__(self, in_channels, quant_channels, entropy_loss_weight=0.1):
+        super().__init__()
+        self.project = nn.Conv3d(in_channels, quant_channels, 1)
+        self.entropy_loss_weight = entropy_loss_weight
+
+    def forward(self, x):
+        x_proj = self.project(x)
+        quantized_x_hard = torch.where(x_proj > 0, 1.0, -1.0)
+        quantized_x = x_proj + (quantized_x_hard - x_proj).detach()
+        indices = (quantized_x > 0).long()
+        probs = indices.float().mean(dim=(0, 2, 3, 4))
+        entropy = - (probs * torch.log(probs.clamp(min=1e-8)) + (1 - probs) * torch.log((1 - probs).clamp(min=1e-8)))
+        entropy_loss = -entropy.mean() * self.entropy_loss_weight
+        return quantized_x, indices, entropy_loss
+
+class VideoVAEEncoderBlock(nn.Module):
+    """Standard VAE encoder block for downsampling."""
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        self.conv1 = nn.Conv3d(in_ch, out_ch, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv3d(out_ch, out_ch, kernel_size=3, padding=1)
+        self.pool = nn.MaxPool3d(kernel_size=2, stride=2)
+        self.norm = nn.BatchNorm3d(out_ch)
+        self.act = nn.GELU()
+
+    def forward(self, x):
+        h = self.act(self.norm(self.conv1(x)))
+        h = self.act(self.norm(self.conv2(h)))
+        return self.pool(h)
+
+class PyramidalLFQBlock(nn.Module):
+    """A block in the pyramidal upsampler: upsample -> fuse -> text-attend -> quantize."""
+    def __init__(self, in_ch, skip_ch, out_ch, text_embed_dim, quant_emb_dim):
+        super().__init__()
+        self.upsample = nn.ConvTranspose3d(in_ch, out_ch, kernel_size=4, stride=2, padding=1)
+        self.conv = nn.Conv3d(out_ch + skip_ch, out_ch, kernel_size=3, padding=1)
+        self.text_cross_attn = TextVideoCrossAttention(out_ch, text_embed_dim)
+        self.lfq = ProjectedLFQ(out_ch, quant_channels=quant_emb_dim)
+        self.norm = nn.BatchNorm3d(out_ch)
+        self.act = nn.GELU()
+
+    def forward(self, x, skip, text_embedding):
+        x_up = self.upsample(x)
+        x_fused = self.act(self.norm(self.conv(torch.cat([x_up, skip], dim=1))))
+        h_attn = x_fused + self.text_cross_attn(x_fused, text_embedding)
+        q, indices, entropy_loss = self.lfq(h_attn)
+        return h_attn, q, indices, entropy_loss
+
+class VideoVAEDecoderBlock(nn.Module):
+    """Standard VAE decoder block for upsampling."""
+    def __init__(self, in_ch, out_ch):
+        super().__init__()
+        self.upsample = nn.ConvTranspose3d(in_ch, out_ch, kernel_size=4, stride=2, padding=1)
+        self.conv = nn.Conv3d(out_ch, out_ch, kernel_size=3, padding=1)
+        self.norm = nn.BatchNorm3d(out_ch)
+        self.act = nn.GELU()
+
+    def forward(self, x):
+        h = self.act(self.norm(self.upsample(x)))
+        return self.act(self.norm(self.conv(h)))
+
+# ==============================================================================
+# 4. PRIMARY VideoVAE MODEL
+# ==============================================================================
+
+class VideoVAE(nn.Module):
+    """
+    A modular, text-conditioned Video VAE with a Pyramidal LFQ structure
+    and multiple perception-based losses for high-quality synthesis.
+    """
+    def __init__(self, cfg: VideoVAEConfig, device="cuda"):
+        super().__init__()
+        self.cfg = cfg
+        self.device = device
+
+        # --- Sub-models (Text, Perception) ---
+        self.text_encoder = QwenVLTextEncoder(device=device)
+        text_embed_dim = self.text_encoder.model.config.hidden_size
+        if self.training: # Only load DINOv2 if we are in training mode
+            self.dino_extractor = DINOv2Extractor(device=device)
+
+        # --- VAE Encoder ---
+        self.enc_blocks = nn.ModuleList()
+        chs = [cfg.base_ch * (2**i) for i in range(cfg.num_blocks)]
+        current_ch = cfg.in_channels
+        for ch in chs:
+            self.enc_blocks.append(VideoVAEEncoderBlock(current_ch, ch))
+            current_ch = ch
+
+        # --- Pyramidal LFQ Upsampler ---
+        rev_channels = list(reversed(chs))
+        self.pyramid_blocks = nn.ModuleList()
+        for i in range(2): # 2 stages for 4x total upscaling
+            self.pyramid_blocks.append(
+                PyramidalLFQBlock(rev_channels[i], rev_channels[i+1], rev_channels[i+1], text_embed_dim, cfg.quant_emb_dim)
+            )
+
+        # --- VAE Decoder ---
+        self.dec_blocks = nn.ModuleList()
+        decoder_channels = [chs[1], chs[0]]
+        for i in range(len(decoder_channels)):
+            in_ch = decoder_channels[i]
+            out_ch = decoder_channels[i+1] if i + 1 < len(decoder_channels) else cfg.base_ch
+            self.dec_blocks.append(VideoVAEDecoderBlock(in_ch, out_ch))
+        self.out_conv = nn.Conv3d(cfg.base_ch, cfg.in_channels, 1)
+
+        # --- Loss-specific Modules ---
+        codebook_size = 2**cfg.quant_emb_dim
+        self.quant_embedding = nn.Embedding(codebook_size, text_embed_dim)
+        self.to_quant_logits = nn.Linear(text_embed_dim, codebook_size)
+        quant_pooled_dim = chs[2] + chs[1]
+        self.quant_proj = nn.Linear(quant_pooled_dim, cfg.alignment_dim)
+        self.text_proj_for_quant = nn.Linear(text_embed_dim, cfg.alignment_dim)
+
+    def forward(self, x: torch.Tensor, text_prompts: List[str]) -> Dict[str, torch.Tensor]:
+        """
+        Core inference path. Encodes, quantizes via pyramid, and decodes.
+        Returns all intermediate products needed for loss calculation.
+        """
+        text_embedding = self.text_encoder(text_prompts)
+        
+        encoder_features = []
+        h = x
+        for block in self.enc_blocks:
+            h = block(h)
+            encoder_features.append(h)
+
+        rev_features = list(reversed(encoder_features))
+        h = rev_features[0]
+        pyramid_outputs = {'q': [], 'indices': [], 'entropies': []}
+        for i, block in enumerate(self.pyramid_blocks):
+            h, q, indices, entropy = block(h, rev_features[i + 1], text_embedding)
+            pyramid_outputs['q'].append(q)
+            pyramid_outputs['indices'].append(indices)
+            pyramid_outputs['entropies'].append(entropy)
+        
+        dec_in = h
+        for block in self.dec_blocks:
+            dec_in = block(dec_in)
+        reconstruction = torch.tanh(self.out_conv(dec_in))
+
+        return {
+            "reconstruction": reconstruction,
+            "text_embedding": text_embedding,
+            "pyramid_outputs": pyramid_outputs
+        }
+
+    def calculate_losses(self, original_video: torch.Tensor, forward_outputs: Dict) -> Dict:
+        """
+        Calculates all training-specific losses. This method should only be
+        called during the training loop.
+        """
+        if not self.training:
+            raise RuntimeError("calculate_losses() should only be called in training mode.")
+            
+        # Unpack forward pass results
+        recon = forward_outputs["reconstruction"]
+        text_emb = forward_outputs["text_embedding"]
+        pyramid_out = forward_outputs["pyramid_outputs"]
+        all_q, all_indices, all_entropies = pyramid_out['q'], pyramid_out['indices'], pyramid_out['entropies']
+
+        # 1. Reconstruction Loss
+        recon_loss = F.mse_loss(recon, original_video)
+
+        # 2. Entropy Loss
+        entropy_loss = sum(all_entropies)
+        
+        # 3. P(Q|text) Likelihood Loss
+        B = text_emb.size(0)
+        seqs = [idx.view(B, self.cfg.quant_emb_dim, -1) for idx in all_indices]
+        full_seq_bits = torch.cat(seqs, dim=2).permute(0, 2, 1)
+        powers_of_2 = (2**torch.arange(self.cfg.quant_emb_dim, device=self.device)).float()
+        quant_token_ids = (full_seq_bits * powers_of_2).sum(dim=2).long()
+        quant_embeds = self.quant_embedding(quant_token_ids)
+        combined_embeds = torch.cat([text_emb, quant_embeds], dim=1)
+        with torch.no_grad():
+            qwen_outputs = self.text_encoder.model(inputs_embeds=combined_embeds, output_hidden_states=True)
+        last_hidden = qwen_outputs.hidden_states[-1][:, text_emb.shape[1] - 1:-1, :]
+        pred_logits = self.to_quant_logits(last_hidden)
+        likelihood_loss = F.cross_entropy(pred_logits.reshape(-1, pred_logits.size(-1)), quant_token_ids.reshape(-1))
+        
+        # 4. Quantized Vector-Text Alignment Loss
+        q_pooled = [F.adaptive_avg_pool3d(q, 1).view(B, -1) for q in all_q]
+        q_pooled_cat = torch.cat(q_pooled, dim=1)
+        text_pooled = text_emb.mean(dim=1)
+        q_aligned = self.quant_proj(q_pooled_cat)
+        text_aligned = self.text_proj_for_quant(text_pooled)
+        quant_align_loss = F.cosine_embedding_loss(q_aligned, text_aligned, torch.ones(B, device=self.device))
+        
+        # 5. DINOv2 Perceptual Loss (KL Divergence)
+        orig_dino_feats = self.dino_extractor(original_video)
+        recon_dino_feats = self.dino_extractor(recon)
+        p = F.softmax(orig_dino_feats, dim=-1)
+        q = F.log_softmax(recon_dino_feats, dim=-1)
+        dino_loss = F.kl_div(q, p, reduction='batchmean')
+
+        # --- Final Weighted Sum ---
+        total_loss = (recon_loss + entropy_loss +
+                      self.cfg.likelihood_loss_weight * likelihood_loss +
+                      self.cfg.quant_align_loss_weight * quant_align_loss +
+                      self.cfg.dino_loss_weight * dino_loss)
+
+        return {
+            "total_loss": total_loss, "reconstruction_loss": recon_loss,
+            "entropy_loss": entropy_loss, "likelihood_loss": likelihood_loss,
+            "quant_alignment_loss": quant_align_loss, "dino_perceptual_loss": dino_loss
+        }
+
+# ==============================================================================
+# 5. EXAMPLE USAGE
+# ==============================================================================
+if __name__ == '__main__':
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    if device == "cpu": print("WARNING: Running on CPU. This will be extremely slow.")
+
+    try:
+        config = VideoVAEConfig(quant_emb_dim=16) # Set LFQ size to 16
+        model = VideoVAE(config, device=device).to(device)
+
+        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+        print("-" * 40)
+        print(f"Trainable model parameters: {trainable_params:,}")
+        print("(This should NOT include frozen DINOv2 or Qwen-VL models)")
+        print("-" * 40)
+
+        # --- SIMULATED TRAINING STEP ---
+        print("\n--- 1. Simulating Training Step ---")
+        model.train() # Set model to training mode
+        batch_size = 2
+        video_input = torch.randn(batch_size, 3, 16, 64, 64).to(device)
+        prompts = ["A stunning sunrise over a calm ocean.", "A busy city street at night with neon lights."]
+        
+        # In a real training loop, this would be inside the loop
+        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
+        optimizer.zero_grad()
+        
+        forward_outputs = model(video_input, text_prompts=prompts)
+        losses = model.calculate_losses(video_input, forward_outputs)
+        
+        # Backpropagation
+        losses["total_loss"].backward()
+        optimizer.step()
+        
+        print("Training step successful. Losses calculated:")
+        for name, value in losses.items(): print(f"  - {name:<25}: {value.item():.4f}")
+
+        # --- SIMULATED INFERENCE STEP ---
+        print("\n--- 2. Simulating Inference Step ---")
+        model.eval() # Set model to evaluation mode
+        with torch.no_grad():
+            # Notice we only call the forward pass and don't need the loss function
+            inference_outputs = model(video_input, text_prompts=prompts)
+            reconstructed_video = inference_outputs["reconstruction"]
+
+        print("Inference step successful.")
+        print("Input Video Shape:         ", video_input.shape)
+        print("Reconstructed Video Shape: ", reconstructed_video.shape)
+
+    except Exception as e:
+        print(f"\n--- ❌ An Error Occurred ---")
+        print(f"Error: {e}")
+        if "out of memory" in str(e).lower():
+            print("\n💡 Suggestion: CUDA Out-of-Memory. Try reducing `base_ch`, `num_blocks`, or input resolution.")
diff --git a/run_wan_infer.py b/run_wan_infer.py
new file mode 100644
index 000000000000..58487fd7d05b
--- /dev/null
+++ b/run_wan_infer.py
@@ -0,0 +1,40 @@
+import torch
+import numpy as np
+from diffusers import WanPipeline, AutoencoderKLWan, WanTransformer3DModel, UniPCMultistepScheduler
+from diffusers.utils import export_to_video, load_image
+
+import pandas as pd
+
+
+dtype = torch.bfloat16
+device = "cuda"
+
+model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
+vae = AutoencoderKLWan.from_pretrained("onkarsus13/qwen-wan-MM-3B", subfolder="vae", torch_dtype=torch.float32, cache_dir="/data2/onkar/video_diffusion_weights")
+pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=dtype, cache_dir="/data2/onkar/video_diffusion_weights")
+pipe.to(device)
+
+# pipe.save_pretrained("/data2/onkar/video_diffusion_weights/video_model")
+
+height = 720
+width = 1280
+num_frames = 121
+num_inference_steps = 50
+guidance_scale = 12
+
+
+df = pd.read_csv("")
+
+for i in df['Detailed Description']:
+    prompt = i
+    output = pipe(
+        prompt=prompt,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        max_sequence_length = 512,
+    ).frames[0]
+
+    export_to_video(output, "5bit2v_output.mp4", fps=24)
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index d96acc3818d8..9ed101e9c497 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -191,6 +191,7 @@
             "AutoencoderKLQwenImage",
             "AutoencoderKLTemporalDecoder",
             "AutoencoderKLWan",
+            "AutoencoderKLMMQuant",
             "AutoencoderOobleck",
             "AutoencoderTiny",
             "AutoModel",
@@ -875,6 +876,7 @@
             AutoencoderKLQwenImage,
             AutoencoderKLTemporalDecoder,
             AutoencoderKLWan,
+            AutoencoderKLMMQuant,
             AutoencoderOobleck,
             AutoencoderTiny,
             AutoModel,
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index 49ac2a1c56fd..02ee124fb1f5 100755
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -41,6 +41,7 @@
     _import_structure["autoencoders.autoencoder_kl_qwenimage"] = ["AutoencoderKLQwenImage"]
     _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
     _import_structure["autoencoders.autoencoder_kl_wan"] = ["AutoencoderKLWan"]
+    _import_structure["autoencoders.autoencoder_kl_MMQuant"] = ["AutoencoderKLMMQuant"]
     _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
     _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
     _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
@@ -136,6 +137,7 @@
             AutoencoderKLQwenImage,
             AutoencoderKLTemporalDecoder,
             AutoencoderKLWan,
+            AutoencoderKLMMQuant,
             AutoencoderOobleck,
             AutoencoderTiny,
             ConsistencyDecoderVAE,
diff --git a/src/diffusers/models/autoencoders/__init__.py b/src/diffusers/models/autoencoders/__init__.py
index c008a45298e8..67ac6984e072 100644
--- a/src/diffusers/models/autoencoders/__init__.py
+++ b/src/diffusers/models/autoencoders/__init__.py
@@ -11,6 +11,7 @@
 from .autoencoder_kl_qwenimage import AutoencoderKLQwenImage
 from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
 from .autoencoder_kl_wan import AutoencoderKLWan
+from .autoencoder_kl_MMQuant import AutoencoderKLMMQuant
 from .autoencoder_oobleck import AutoencoderOobleck
 from .autoencoder_tiny import AutoencoderTiny
 from .consistency_decoder_vae import ConsistencyDecoderVAE
diff --git a/src/diffusers/models/autoencoders/autoencoder_kl_MMQuant.py b/src/diffusers/models/autoencoders/autoencoder_kl_MMQuant.py
new file mode 100644
index 000000000000..74269ca439ab
--- /dev/null
+++ b/src/diffusers/models/autoencoders/autoencoder_kl_MMQuant.py
@@ -0,0 +1,1405 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin
+from ...utils import logging
+from ...utils.accelerate_utils import apply_forward_hook
+from ..activations import get_activation
+from ..modeling_outputs import AutoencoderKLOutput
+from ..modeling_utils import ModelMixin
+from .vae import DecoderOutput, DiagonalGaussianDistribution
+
+
+logger = logging.get_logger(__name__)
+
+CACHE_T = 2
+
+
+class AvgDown3D(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        factor_t,
+        factor_s=1,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.factor_t = factor_t
+        self.factor_s = factor_s
+        self.factor = self.factor_t * self.factor_s * self.factor_s
+
+        assert in_channels * self.factor % out_channels == 0
+        self.group_size = in_channels * self.factor // out_channels
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
+        pad = (0, 0, 0, 0, pad_t, 0)
+        x = F.pad(x, pad)
+        B, C, T, H, W = x.shape
+        x = x.view(
+            B,
+            C,
+            T // self.factor_t,
+            self.factor_t,
+            H // self.factor_s,
+            self.factor_s,
+            W // self.factor_s,
+            self.factor_s,
+        )
+        x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
+        x = x.view(
+            B,
+            C * self.factor,
+            T // self.factor_t,
+            H // self.factor_s,
+            W // self.factor_s,
+        )
+        x = x.view(
+            B,
+            self.out_channels,
+            self.group_size,
+            T // self.factor_t,
+            H // self.factor_s,
+            W // self.factor_s,
+        )
+        x = x.mean(dim=2)
+        return x
+
+
+class DupUp3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        factor_t,
+        factor_s=1,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.factor_t = factor_t
+        self.factor_s = factor_s
+        self.factor = self.factor_t * self.factor_s * self.factor_s
+
+        assert out_channels * self.factor % in_channels == 0
+        self.repeats = out_channels * self.factor // in_channels
+
+    def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
+        x = x.repeat_interleave(self.repeats, dim=1)
+        x = x.view(
+            x.size(0),
+            self.out_channels,
+            self.factor_t,
+            self.factor_s,
+            self.factor_s,
+            x.size(2),
+            x.size(3),
+            x.size(4),
+        )
+        x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
+        x = x.view(
+            x.size(0),
+            self.out_channels,
+            x.size(2) * self.factor_t,
+            x.size(4) * self.factor_s,
+            x.size(6) * self.factor_s,
+        )
+        if first_chunk:
+            x = x[:, :, self.factor_t - 1 :, :, :]
+        return x
+
+
+class MMQuantCausalConv3d(nn.Conv3d):
+    r"""
+    A custom 3D causal convolution layer with feature caching support.
+
+    This layer extends the standard Conv3D layer by ensuring causality in the time dimension and handling feature
+    caching for efficient inference.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to all three sides of the input. Default: 0
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int, int]],
+        stride: Union[int, Tuple[int, int, int]] = 1,
+        padding: Union[int, Tuple[int, int, int]] = 0,
+    ) -> None:
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+        )
+
+        # Set up causal padding
+        self._padding = (self.padding[2], self.padding[2], self.padding[1], self.padding[1], 2 * self.padding[0], 0)
+        self.padding = (0, 0, 0)
+
+    def forward(self, x, cache_x=None):
+        padding = list(self._padding)
+        if cache_x is not None and self._padding[4] > 0:
+            cache_x = cache_x.to(x.device)
+            x = torch.cat([cache_x, x], dim=2)
+            padding[4] -= cache_x.shape[2]
+        x = F.pad(x, padding)
+        return super().forward(x)
+
+
+class MMQuantRMS_norm(nn.Module):
+    r"""
+    A custom RMS normalization layer.
+
+    Args:
+        dim (int): The number of dimensions to normalize over.
+        channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
+            Default is True.
+        images (bool, optional): Whether the input represents image data. Default is True.
+        bias (bool, optional): Whether to include a learnable bias term. Default is False.
+    """
+
+    def __init__(self, dim: int, channel_first: bool = True, images: bool = True, bias: bool = False) -> None:
+        super().__init__()
+        broadcastable_dims = (1, 1, 1) if not images else (1, 1)
+        shape = (dim, *broadcastable_dims) if channel_first else (dim,)
+
+        self.channel_first = channel_first
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.ones(shape))
+        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
+
+    def forward(self, x):
+        return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
+
+
+class MMQuantUpsample(nn.Upsample):
+    r"""
+    Perform upsampling while ensuring the output tensor has the same data type as the input.
+
+    Args:
+        x (torch.Tensor): Input tensor to be upsampled.
+
+    Returns:
+        torch.Tensor: Upsampled tensor with the same data type as the input.
+    """
+
+    def forward(self, x):
+        return super().forward(x.float()).type_as(x)
+
+
+class MMQuantResample(nn.Module):
+    r"""
+    A custom resampling module for 2D and 3D data.
+
+    Args:
+        dim (int): The number of input/output channels.
+        mode (str): The resampling mode. Must be one of:
+            - 'none': No resampling (identity operation).
+            - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
+            - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
+            - 'downsample2d': 2D downsampling with zero-padding and convolution.
+            - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
+    """
+
+    def __init__(self, dim: int, mode: str, upsample_out_dim: int = None) -> None:
+        super().__init__()
+        self.dim = dim
+        self.mode = mode
+
+        # default to dim //2
+        if upsample_out_dim is None:
+            upsample_out_dim = dim // 2
+
+        # layers
+        if mode == "upsample2d":
+            self.resample = nn.Sequential(
+                MMQuantUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                nn.Conv2d(dim, upsample_out_dim, 3, padding=1),
+            )
+        elif mode == "upsample3d":
+            self.resample = nn.Sequential(
+                MMQuantUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                nn.Conv2d(dim, upsample_out_dim, 3, padding=1),
+            )
+            self.time_conv = MMQuantCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
+
+        elif mode == "downsample2d":
+            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+        elif mode == "downsample3d":
+            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+            self.time_conv = MMQuantCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
+
+        else:
+            self.resample = nn.Identity()
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        b, c, t, h, w = x.size()
+        if self.mode == "upsample3d":
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = "Rep"
+                    feat_idx[0] += 1
+                else:
+                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
+                        # cache last frame of last two chunk
+                        cache_x = torch.cat(
+                            [feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2
+                        )
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
+                        cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
+                    if feat_cache[idx] == "Rep":
+                        x = self.time_conv(x)
+                    else:
+                        x = self.time_conv(x, feat_cache[idx])
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+
+                    x = x.reshape(b, 2, c, t, h, w)
+                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
+                    x = x.reshape(b, c, t * 2, h, w)
+        t = x.shape[2]
+        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+        x = self.resample(x)
+        x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)
+
+        if self.mode == "downsample3d":
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = x.clone()
+                    feat_idx[0] += 1
+                else:
+                    cache_x = x[:, :, -1:, :, :].clone()
+                    x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+        return x
+
+
+class MMQuantResidualBlock(nn.Module):
+    r"""
+    A custom residual block module.
+
+    Args:
+        in_dim (int): Number of input channels.
+        out_dim (int): Number of output channels.
+        dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
+        non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        dropout: float = 0.0,
+        non_linearity: str = "silu",
+    ) -> None:
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.nonlinearity = get_activation(non_linearity)
+
+        # layers
+        self.norm1 = MMQuantRMS_norm(in_dim, images=False)
+        self.conv1 = MMQuantCausalConv3d(in_dim, out_dim, 3, padding=1)
+        self.norm2 = MMQuantRMS_norm(out_dim, images=False)
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = MMQuantCausalConv3d(out_dim, out_dim, 3, padding=1)
+        self.conv_shortcut = MMQuantCausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        # Apply shortcut connection
+        h = self.conv_shortcut(x)
+
+        # First normalization and activation
+        x = self.norm1(x)
+        x = self.nonlinearity(x)
+
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+
+            x = self.conv1(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        # Second normalization and activation
+        x = self.norm2(x)
+        x = self.nonlinearity(x)
+
+        # Dropout
+        x = self.dropout(x)
+
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+
+            x = self.conv2(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv2(x)
+
+        # Add residual connection
+        return x + h
+
+
+class MMQuantAttentionBlock(nn.Module):
+    r"""
+    Causal self-attention with a single head.
+
+    Args:
+        dim (int): The number of channels in the input tensor.
+    """
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+        # layers
+        self.norm = MMQuantRMS_norm(dim)
+        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+        self.proj = nn.Conv2d(dim, dim, 1)
+
+    def forward(self, x):
+        identity = x
+        batch_size, channels, time, height, width = x.size()
+
+        x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * time, channels, height, width)
+        x = self.norm(x)
+
+        # compute query, key, value
+        qkv = self.to_qkv(x)
+        qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
+        qkv = qkv.permute(0, 1, 3, 2).contiguous()
+        q, k, v = qkv.chunk(3, dim=-1)
+
+        # apply attention
+        x = F.scaled_dot_product_attention(q, k, v)
+
+        x = x.squeeze(1).permute(0, 2, 1).reshape(batch_size * time, channels, height, width)
+
+        # output projection
+        x = self.proj(x)
+
+        # Reshape back: [(b*t), c, h, w] -> [b, c, t, h, w]
+        x = x.view(batch_size, time, channels, height, width)
+        x = x.permute(0, 2, 1, 3, 4)
+
+        return x + identity
+
+
+class MMQuantMidBlock(nn.Module):
+    """
+    Middle block for MMQuantVAE encoder and decoder.
+
+    Args:
+        dim (int): Number of input/output channels.
+        dropout (float): Dropout rate.
+        non_linearity (str): Type of non-linearity to use.
+    """
+
+    def __init__(self, dim: int, dropout: float = 0.0, non_linearity: str = "silu", num_layers: int = 1):
+        super().__init__()
+        self.dim = dim
+
+        # Create the components
+        resnets = [MMQuantResidualBlock(dim, dim, dropout, non_linearity)]
+        attentions = []
+        for _ in range(num_layers):
+            attentions.append(MMQuantAttentionBlock(dim))
+            resnets.append(MMQuantResidualBlock(dim, dim, dropout, non_linearity))
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+
+        self.gradient_checkpointing = False
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        # First residual block
+        x = self.resnets[0](x, feat_cache, feat_idx)
+
+        # Process through attention and residual blocks
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            if attn is not None:
+                x = attn(x)
+
+            x = resnet(x, feat_cache, feat_idx)
+
+        return x
+
+
+class MMQuantResidualDownBlock(nn.Module):
+    def __init__(self, in_dim, out_dim, dropout, num_res_blocks, temperal_downsample=False, down_flag=False):
+        super().__init__()
+
+        # Shortcut path with downsample
+        self.avg_shortcut = AvgDown3D(
+            in_dim,
+            out_dim,
+            factor_t=2 if temperal_downsample else 1,
+            factor_s=2 if down_flag else 1,
+        )
+
+        # Main path with residual blocks and downsample
+        resnets = []
+        for _ in range(num_res_blocks):
+            resnets.append(MMQuantResidualBlock(in_dim, out_dim, dropout))
+            in_dim = out_dim
+        self.resnets = nn.ModuleList(resnets)
+
+        # Add the final downsample block
+        if down_flag:
+            mode = "downsample3d" if temperal_downsample else "downsample2d"
+            self.downsampler = MMQuantResample(out_dim, mode=mode)
+        else:
+            self.downsampler = None
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        x_copy = x.clone()
+        for resnet in self.resnets:
+            x = resnet(x, feat_cache, feat_idx)
+        if self.downsampler is not None:
+            x = self.downsampler(x, feat_cache, feat_idx)
+
+        return x + self.avg_shortcut(x_copy)
+
+
+class MMQuantEncoder3d(nn.Module):
+    r"""
+    A 3D encoder module.
+
+    Args:
+        dim (int): The base number of channels in the first layer.
+        z_dim (int): The dimensionality of the latent space.
+        dim_mult (list of int): Multipliers for the number of channels in each block.
+        num_res_blocks (int): Number of residual blocks in each block.
+        attn_scales (list of float): Scales at which to apply attention mechanisms.
+        temperal_downsample (list of bool): Whether to downsample temporally in each block.
+        dropout (float): Dropout rate for the dropout layers.
+        non_linearity (str): Type of non-linearity to use.
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 3,
+        dim=128,
+        z_dim=4,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_downsample=[True, True, False],
+        dropout=0.0,
+        non_linearity: str = "silu",
+        is_residual: bool = False,  # MMQuant 2.2 vae use a residual downblock
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+        self.nonlinearity = get_activation(non_linearity)
+
+        # dimensions
+        dims = [dim * u for u in [1] + dim_mult]
+        scale = 1.0
+
+        # init block
+        self.conv_in = MMQuantCausalConv3d(in_channels, dims[0], 3, padding=1)
+
+        # downsample blocks
+        self.down_blocks = nn.ModuleList([])
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            # residual (+attention) blocks
+            if is_residual:
+                self.down_blocks.append(
+                    MMQuantResidualDownBlock(
+                        in_dim,
+                        out_dim,
+                        dropout,
+                        num_res_blocks,
+                        temperal_downsample=temperal_downsample[i] if i != len(dim_mult) - 1 else False,
+                        down_flag=i != len(dim_mult) - 1,
+                    )
+                )
+            else:
+                for _ in range(num_res_blocks):
+                    self.down_blocks.append(MMQuantResidualBlock(in_dim, out_dim, dropout))
+                    if scale in attn_scales:
+                        self.down_blocks.append(MMQuantAttentionBlock(out_dim))
+                    in_dim = out_dim
+
+                # downsample block
+                if i != len(dim_mult) - 1:
+                    mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
+                    self.down_blocks.append(MMQuantResample(out_dim, mode=mode))
+                    scale /= 2.0
+
+        # middle blocks
+        self.mid_block = MMQuantMidBlock(out_dim, dropout, non_linearity, num_layers=1)
+
+        # output blocks
+        self.norm_out = MMQuantRMS_norm(out_dim, images=False)
+        self.conv_out = MMQuantCausalConv3d(out_dim, z_dim, 3, padding=1)
+
+        self.gradient_checkpointing = False
+
+    def forward(self, x, feat_cache=None, feat_idx=[0]):
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_in(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_in(x)
+
+        ## downsamples
+        for layer in self.down_blocks:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## middle
+        x = self.mid_block(x, feat_cache, feat_idx)
+
+        ## head
+        x = self.norm_out(x)
+        x = self.nonlinearity(x)
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_out(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_out(x)
+        return x
+
+
+class MMQuantResidualUpBlock(nn.Module):
+    """
+    A block that handles upsampling for the MMQuantVAE decoder.
+
+    Args:
+        in_dim (int): Input dimension
+        out_dim (int): Output dimension
+        num_res_blocks (int): Number of residual blocks
+        dropout (float): Dropout rate
+        temperal_upsample (bool): Whether to upsample on temporal dimension
+        up_flag (bool): Whether to upsample or not
+        non_linearity (str): Type of non-linearity to use
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        num_res_blocks: int,
+        dropout: float = 0.0,
+        temperal_upsample: bool = False,
+        up_flag: bool = False,
+        non_linearity: str = "silu",
+    ):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+
+        if up_flag:
+            self.avg_shortcut = DupUp3D(
+                in_dim,
+                out_dim,
+                factor_t=2 if temperal_upsample else 1,
+                factor_s=2,
+            )
+        else:
+            self.avg_shortcut = None
+
+        # create residual blocks
+        resnets = []
+        current_dim = in_dim
+        for _ in range(num_res_blocks + 1):
+            resnets.append(MMQuantResidualBlock(current_dim, out_dim, dropout, non_linearity))
+            current_dim = out_dim
+
+        self.resnets = nn.ModuleList(resnets)
+
+        # Add upsampling layer if needed
+        if up_flag:
+            upsample_mode = "upsample3d" if temperal_upsample else "upsample2d"
+            self.upsampler = MMQuantResample(out_dim, mode=upsample_mode, upsample_out_dim=out_dim)
+        else:
+            self.upsampler = None
+
+        self.gradient_checkpointing = False
+
+    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
+        """
+        Forward pass through the upsampling block.
+
+        Args:
+            x (torch.Tensor): Input tensor
+            feat_cache (list, optional): Feature cache for causal convolutions
+            feat_idx (list, optional): Feature index for cache management
+
+        Returns:
+            torch.Tensor: Output tensor
+        """
+        x_copy = x.clone()
+
+        for resnet in self.resnets:
+            if feat_cache is not None:
+                x = resnet(x, feat_cache, feat_idx)
+            else:
+                x = resnet(x)
+
+        if self.upsampler is not None:
+            if feat_cache is not None:
+                x = self.upsampler(x, feat_cache, feat_idx)
+            else:
+                x = self.upsampler(x)
+
+        if self.avg_shortcut is not None:
+            x = x + self.avg_shortcut(x_copy, first_chunk=first_chunk)
+
+        return x
+
+
+class MMQuantUpBlock(nn.Module):
+    """
+    A block that handles upsampling for the MMQuantVAE decoder.
+
+    Args:
+        in_dim (int): Input dimension
+        out_dim (int): Output dimension
+        num_res_blocks (int): Number of residual blocks
+        dropout (float): Dropout rate
+        upsample_mode (str, optional): Mode for upsampling ('upsample2d' or 'upsample3d')
+        non_linearity (str): Type of non-linearity to use
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        num_res_blocks: int,
+        dropout: float = 0.0,
+        upsample_mode: Optional[str] = None,
+        non_linearity: str = "silu",
+    ):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+
+        # Create layers list
+        resnets = []
+        # Add residual blocks and attention if needed
+        current_dim = in_dim
+        for _ in range(num_res_blocks + 1):
+            resnets.append(MMQuantResidualBlock(current_dim, out_dim, dropout, non_linearity))
+            current_dim = out_dim
+
+        self.resnets = nn.ModuleList(resnets)
+
+        # Add upsampling layer if needed
+        self.upsamplers = None
+        if upsample_mode is not None:
+            self.upsamplers = nn.ModuleList([MMQuantResample(out_dim, mode=upsample_mode)])
+
+        self.gradient_checkpointing = False
+
+    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=None):
+        """
+        Forward pass through the upsampling block.
+
+        Args:
+            x (torch.Tensor): Input tensor
+            feat_cache (list, optional): Feature cache for causal convolutions
+            feat_idx (list, optional): Feature index for cache management
+
+        Returns:
+            torch.Tensor: Output tensor
+        """
+        for resnet in self.resnets:
+            if feat_cache is not None:
+                x = resnet(x, feat_cache, feat_idx)
+            else:
+                x = resnet(x)
+
+        if self.upsamplers is not None:
+            if feat_cache is not None:
+                x = self.upsamplers[0](x, feat_cache, feat_idx)
+            else:
+                x = self.upsamplers[0](x)
+        return x
+
+
+class MMQuantDecoder3d(nn.Module):
+    r"""
+    A 3D decoder module.
+
+    Args:
+        dim (int): The base number of channels in the first layer.
+        z_dim (int): The dimensionality of the latent space.
+        dim_mult (list of int): Multipliers for the number of channels in each block.
+        num_res_blocks (int): Number of residual blocks in each block.
+        attn_scales (list of float): Scales at which to apply attention mechanisms.
+        temperal_upsample (list of bool): Whether to upsample temporally in each block.
+        dropout (float): Dropout rate for the dropout layers.
+        non_linearity (str): Type of non-linearity to use.
+    """
+
+    def __init__(
+        self,
+        dim=128,
+        z_dim=4,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_upsample=[False, True, True],
+        dropout=0.0,
+        non_linearity: str = "silu",
+        out_channels: int = 3,
+        is_residual: bool = False,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_upsample = temperal_upsample
+
+        self.nonlinearity = get_activation(non_linearity)
+
+        # dimensions
+        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+
+        # init block
+        self.conv_in = MMQuantCausalConv3d(z_dim, dims[0], 3, padding=1)
+
+        # middle blocks
+        self.mid_block = MMQuantMidBlock(dims[0], dropout, non_linearity, num_layers=1)
+
+        # upsample blocks
+        self.up_blocks = nn.ModuleList([])
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            # residual (+attention) blocks
+            if i > 0 and not is_residual:
+                # MMQuant vae 2.1
+                in_dim = in_dim // 2
+
+            # determine if we need upsampling
+            up_flag = i != len(dim_mult) - 1
+            # determine upsampling mode, if not upsampling, set to None
+            upsample_mode = None
+            if up_flag and temperal_upsample[i]:
+                upsample_mode = "upsample3d"
+            elif up_flag:
+                upsample_mode = "upsample2d"
+            # Create and add the upsampling block
+            if is_residual:
+                up_block = MMQuantResidualUpBlock(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    num_res_blocks=num_res_blocks,
+                    dropout=dropout,
+                    temperal_upsample=temperal_upsample[i] if up_flag else False,
+                    up_flag=up_flag,
+                    non_linearity=non_linearity,
+                )
+            else:
+                up_block = MMQuantUpBlock(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    num_res_blocks=num_res_blocks,
+                    dropout=dropout,
+                    upsample_mode=upsample_mode,
+                    non_linearity=non_linearity,
+                )
+            self.up_blocks.append(up_block)
+
+        # output blocks
+        self.norm_out = MMQuantRMS_norm(out_dim, images=False)
+        self.conv_out = MMQuantCausalConv3d(out_dim, out_channels, 3, padding=1)
+
+        self.gradient_checkpointing = False
+
+    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
+        ## conv1
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_in(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_in(x)
+
+        ## middle
+        x = self.mid_block(x, feat_cache, feat_idx)
+
+        ## upsamples
+        for up_block in self.up_blocks:
+            x = up_block(x, feat_cache, feat_idx, first_chunk=first_chunk)
+
+        ## head
+        x = self.norm_out(x)
+        x = self.nonlinearity(x)
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                # cache last frame of last two chunk
+                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
+            x = self.conv_out(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv_out(x)
+        return x
+
+
+def patchify(x, patch_size):
+    if patch_size == 1:
+        return x
+
+    if x.dim() != 5:
+        raise ValueError(f"Invalid input shape: {x.shape}")
+    # x shape: [batch_size, channels, frames, height, width]
+    batch_size, channels, frames, height, width = x.shape
+
+    # Ensure height and width are divisible by patch_size
+    if height % patch_size != 0 or width % patch_size != 0:
+        raise ValueError(f"Height ({height}) and width ({width}) must be divisible by patch_size ({patch_size})")
+
+    # Reshape to [batch_size, channels, frames, height//patch_size, patch_size, width//patch_size, patch_size]
+    x = x.view(batch_size, channels, frames, height // patch_size, patch_size, width // patch_size, patch_size)
+
+    # Rearrange to [batch_size, channels * patch_size * patch_size, frames, height//patch_size, width//patch_size]
+    x = x.permute(0, 1, 6, 4, 2, 3, 5).contiguous()
+    x = x.view(batch_size, channels * patch_size * patch_size, frames, height // patch_size, width // patch_size)
+
+    return x
+
+
+def unpatchify(x, patch_size):
+    if patch_size == 1:
+        return x
+
+    if x.dim() != 5:
+        raise ValueError(f"Invalid input shape: {x.shape}")
+    # x shape: [batch_size, (channels * patch_size * patch_size), frame, height, width]
+    batch_size, c_patches, frames, height, width = x.shape
+    channels = c_patches // (patch_size * patch_size)
+
+    # Reshape to [b, c, patch_size, patch_size, f, h, w]
+    x = x.view(batch_size, channels, patch_size, patch_size, frames, height, width)
+
+    # Rearrange to [b, c, f, h * patch_size, w * patch_size]
+    x = x.permute(0, 1, 4, 5, 3, 6, 2).contiguous()
+    x = x.view(batch_size, channels, frames, height * patch_size, width * patch_size)
+
+    return x
+
+
+class AutoencoderKLMMQuant(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+    r"""
+    A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
+    Introduced in [MMQuant 2.1].
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
+    for all models (such as downloading or saving).
+    """
+
+    _supports_gradient_checkpointing = False
+
+    @register_to_config
+    def __init__(
+        self,
+        base_dim: int = 96,
+        decoder_base_dim: Optional[int] = None,
+        z_dim: int = 16,
+        dim_mult: Tuple[int] = [1, 2, 4, 4],
+        num_res_blocks: int = 2,
+        attn_scales: List[float] = [],
+        temperal_downsample: List[bool] = [False, True, True],
+        dropout: float = 0.0,
+        latents_mean: List[float] = [
+            -0.7571,
+            -0.7089,
+            -0.9113,
+            0.1075,
+            -0.1745,
+            0.9653,
+            -0.1517,
+            1.5508,
+            0.4134,
+            -0.0715,
+            0.5517,
+            -0.3632,
+            -0.1922,
+            -0.9497,
+            0.2503,
+            -0.2921,
+        ],
+        latents_std: List[float] = [
+            2.8184,
+            1.4541,
+            2.3275,
+            2.6558,
+            1.2196,
+            1.7708,
+            2.6052,
+            2.0743,
+            3.2687,
+            2.1526,
+            2.8652,
+            1.5579,
+            1.6382,
+            1.1253,
+            2.8251,
+            1.9160,
+        ],
+        is_residual: bool = False,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        patch_size: Optional[int] = None,
+        scale_factor_temporal: Optional[int] = 4,
+        scale_factor_spatial: Optional[int] = 8,
+    ) -> None:
+        super().__init__()
+
+        self.z_dim = z_dim
+        self.temperal_downsample = temperal_downsample
+        self.temperal_upsample = temperal_downsample[::-1]
+
+        if decoder_base_dim is None:
+            decoder_base_dim = base_dim
+
+        self.encoder = MMQuantEncoder3d(
+            in_channels=in_channels,
+            dim=base_dim,
+            z_dim=z_dim * 2,
+            dim_mult=dim_mult,
+            num_res_blocks=num_res_blocks,
+            attn_scales=attn_scales,
+            temperal_downsample=temperal_downsample,
+            dropout=dropout,
+            is_residual=is_residual,
+        )
+        self.quant_conv = MMQuantCausalConv3d(z_dim * 2, z_dim * 2, 1)
+        self.post_quant_conv = MMQuantCausalConv3d(z_dim, z_dim, 1)
+
+        self.decoder = MMQuantDecoder3d(
+            dim=decoder_base_dim,
+            z_dim=z_dim,
+            dim_mult=dim_mult,
+            num_res_blocks=num_res_blocks,
+            attn_scales=attn_scales,
+            temperal_upsample=self.temperal_upsample,
+            dropout=dropout,
+            out_channels=out_channels,
+            is_residual=is_residual,
+        )
+
+        self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
+
+        # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
+        # to perform decoding of a single video latent at a time.
+        self.use_slicing = False
+
+        # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
+        # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
+        # intermediate tiles together, the memory requirement can be lowered.
+        self.use_tiling = False
+
+        # The minimal tile height and width for spatial tiling to be used
+        self.tile_sample_min_height = 256
+        self.tile_sample_min_width = 256
+
+        # The minimal distance between two spatial tiles
+        self.tile_sample_stride_height = 192
+        self.tile_sample_stride_width = 192
+
+        # Precompute and cache conv counts for encoder and decoder for clear_cache speedup
+        self._cached_conv_counts = {
+            "decoder": sum(isinstance(m, MMQuantCausalConv3d) for m in self.decoder.modules())
+            if self.decoder is not None
+            else 0,
+            "encoder": sum(isinstance(m, MMQuantCausalConv3d) for m in self.encoder.modules())
+            if self.encoder is not None
+            else 0,
+        }
+
+    def enable_tiling(
+        self,
+        tile_sample_min_height: Optional[int] = None,
+        tile_sample_min_width: Optional[int] = None,
+        tile_sample_stride_height: Optional[float] = None,
+        tile_sample_stride_width: Optional[float] = None,
+    ) -> None:
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+
+        Args:
+            tile_sample_min_height (`int`, *optional*):
+                The minimum height required for a sample to be separated into tiles across the height dimension.
+            tile_sample_min_width (`int`, *optional*):
+                The minimum width required for a sample to be separated into tiles across the width dimension.
+            tile_sample_stride_height (`int`, *optional*):
+                The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
+                no tiling artifacts produced across the height dimension.
+            tile_sample_stride_width (`int`, *optional*):
+                The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
+                artifacts produced across the width dimension.
+        """
+        self.use_tiling = True
+        self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
+        self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+        self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
+        self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
+
+    def disable_tiling(self) -> None:
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_tiling = False
+
+    def enable_slicing(self) -> None:
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self) -> None:
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
+    def clear_cache(self):
+        # Use cached conv counts for decoder and encoder to avoid re-iterating modules each call
+        self._conv_num = self._cached_conv_counts["decoder"]
+        self._conv_idx = [0]
+        self._feat_map = [None] * self._conv_num
+        # cache encode
+        self._enc_conv_num = self._cached_conv_counts["encoder"]
+        self._enc_conv_idx = [0]
+        self._enc_feat_map = [None] * self._enc_conv_num
+
+    def _encode(self, x: torch.Tensor):
+        _, _, num_frame, height, width = x.shape
+
+        if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
+            return self.tiled_encode(x)
+
+        self.clear_cache()
+        if self.config.patch_size is not None:
+            x = patchify(x, patch_size=self.config.patch_size)
+        iter_ = 1 + (num_frame - 1) // 4
+        for i in range(iter_):
+            self._enc_conv_idx = [0]
+            if i == 0:
+                out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
+            else:
+                out_ = self.encoder(
+                    x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
+                    feat_cache=self._enc_feat_map,
+                    feat_idx=self._enc_conv_idx,
+                )
+                out = torch.cat([out, out_], 2)
+
+        enc = self.quant_conv(out)
+        self.clear_cache()
+        return enc
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.Tensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        r"""
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.Tensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+                The latent representations of the encoded videos. If `return_dict` is True, a
+                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self._encode(x)
+        posterior = DiagonalGaussianDistribution(h)
+
+        if not return_dict:
+            return (posterior,)
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _decode(self, z: torch.Tensor, return_dict: bool = True):
+        _, _, num_frame, height, width = z.shape
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+
+        if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
+            return self.tiled_decode(z, return_dict=return_dict)
+
+        self.clear_cache()
+        x = self.post_quant_conv(z)
+        for i in range(num_frame):
+            self._conv_idx = [0]
+            if i == 0:
+                out = self.decoder(
+                    x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx, first_chunk=True
+                )
+            else:
+                out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
+                out = torch.cat([out, out_], 2)
+
+        if self.config.patch_size is not None:
+            out = unpatchify(out, patch_size=self.config.patch_size)
+
+        out = torch.clamp(out, min=-1.0, max=1.0)
+
+        self.clear_cache()
+        if not return_dict:
+            return (out,)
+
+        return DecoderOutput(sample=out)
+
+    @apply_forward_hook
+    def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+
+        if not return_dict:
+            return (decoded,)
+        return DecoderOutput(sample=decoded)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                y / blend_extent
+            )
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                x / blend_extent
+            )
+        return b
+
+    def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
+        r"""Encode a batch of images using a tiled encoder.
+
+        Args:
+            x (`torch.Tensor`): Input batch of videos.
+
+        Returns:
+            `torch.Tensor`:
+                The latent representation of the encoded videos.
+        """
+        _, _, num_frames, height, width = x.shape
+        latent_height = height // self.spatial_compression_ratio
+        latent_width = width // self.spatial_compression_ratio
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+        tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+
+        blend_height = tile_latent_min_height - tile_latent_stride_height
+        blend_width = tile_latent_min_width - tile_latent_stride_width
+
+        # Split x into overlapping tiles and encode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, height, self.tile_sample_stride_height):
+            row = []
+            for j in range(0, width, self.tile_sample_stride_width):
+                self.clear_cache()
+                time = []
+                frame_range = 1 + (num_frames - 1) // 4
+                for k in range(frame_range):
+                    self._enc_conv_idx = [0]
+                    if k == 0:
+                        tile = x[:, :, :1, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
+                    else:
+                        tile = x[
+                            :,
+                            :,
+                            1 + 4 * (k - 1) : 1 + 4 * k,
+                            i : i + self.tile_sample_min_height,
+                            j : j + self.tile_sample_min_width,
+                        ]
+                    tile = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
+                    tile = self.quant_conv(tile)
+                    time.append(tile)
+                row.append(torch.cat(time, dim=2))
+            rows.append(row)
+        self.clear_cache()
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_width)
+                result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
+        return enc
+
+    def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.Tensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        _, _, num_frames, height, width = z.shape
+        sample_height = height * self.spatial_compression_ratio
+        sample_width = width * self.spatial_compression_ratio
+
+        tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+        tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+        tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+        tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+
+        blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
+        blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
+
+        # Split z into overlapping tiles and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, height, tile_latent_stride_height):
+            row = []
+            for j in range(0, width, tile_latent_stride_width):
+                self.clear_cache()
+                time = []
+                for k in range(num_frames):
+                    self._conv_idx = [0]
+                    tile = z[:, :, k : k + 1, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
+                    tile = self.post_quant_conv(tile)
+                    decoded = self.decoder(tile, feat_cache=self._feat_map, feat_idx=self._conv_idx)
+                    time.append(decoded)
+                row.append(torch.cat(time, dim=2))
+            rows.append(row)
+        self.clear_cache()
+
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_width)
+                result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
+            result_rows.append(torch.cat(result_row, dim=-1))
+
+        dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
+
+        if not return_dict:
+            return (dec,)
+        return DecoderOutput(sample=dec)
+
+    def forward(
+        self,
+        sample: torch.Tensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, torch.Tensor]:
+        """
+        Args:
+            sample (`torch.Tensor`): Input sample.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z, return_dict=return_dict)
+        return dec
diff --git a/test.py b/test.py
new file mode 100644
index 000000000000..399af99787e1
--- /dev/null
+++ b/test.py
@@ -0,0 +1,5 @@
+from diffusers import AutoencoderKLMMQuant
+import torch
+
+model = AutoencoderKLMMQuant.from_pretrained("onkarsus13/MMVQVae", subfolder="vae", torch_dtype=torch.float32)
+