From f481dca4df9dbddc456bfec5365f71df80136c07 Mon Sep 17 00:00:00 2001 From: koshe Date: Fri, 9 Jan 2026 08:28:09 +0100 Subject: [PATCH 01/38] Run on 8gb vram laptop --- .../text_encoders/gemma/encoders/base_encoder.py | 10 +++++++++- .../src/ltx_pipelines/ti2vid_two_stages.py | 6 ++++++ .../ltx-pipelines/src/ltx_pipelines/utils/helpers.py | 9 +++++++-- .../src/ltx_pipelines/utils/model_ledger.py | 6 +++++- 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py b/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py index e689c1af..deba4f45 100644 --- a/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py +++ b/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py @@ -244,8 +244,16 @@ def module_ops_from_gemma_root(gemma_root: str) -> tuple[ModuleOps, ...]: tokenizer_path = _find_matching_dir(gemma_root, "tokenizer.model") def load_gemma(module: GemmaTextEncoderModelBase) -> GemmaTextEncoderModelBase: + # Reserve 2GB VRAM for context window and activations + # Limit Gemma to 6GB, forcing more layers to CPU RAM + max_memory = {0: "6GiB", "cpu": "32GiB"} # GPU 0: 6GB, CPU: 32GB + module.model = Gemma3ForConditionalGeneration.from_pretrained( - gemma_path, local_files_only=True, torch_dtype=torch.bfloat16 + gemma_path, + local_files_only=True, + torch_dtype=torch.bfloat16, + device_map="auto", # Enable sequential offloading + max_memory=max_memory # Reserve 2GB VRAM for inference ) module._gemma_root = module._gemma_root or gemma_root return module diff --git a/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py b/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py index b835bfe5..0ae75ba5 100644 --- a/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py +++ b/packages/ltx-pipelines/src/ltx_pipelines/ti2vid_two_stages.py @@ -111,6 +111,12 @@ def __call__( # noqa: PLR0913 v_context_n, a_context_n = context_n torch.cuda.synchronize() + # For device-mapped models, need to explicitly remove hooks before deletion + if hasattr(text_encoder, 'model') and hasattr(text_encoder.model, 'hf_device_map'): + # Remove all hooks to fully release memory + from accelerate.hooks import remove_hook_from_module + remove_hook_from_module(text_encoder.model, recurse=True) + text_encoder.model = None del text_encoder cleanup_memory() diff --git a/packages/ltx-pipelines/src/ltx_pipelines/utils/helpers.py b/packages/ltx-pipelines/src/ltx_pipelines/utils/helpers.py index 867db18f..31d27643 100644 --- a/packages/ltx-pipelines/src/ltx_pipelines/utils/helpers.py +++ b/packages/ltx-pipelines/src/ltx_pipelines/utils/helpers.py @@ -33,9 +33,14 @@ def get_device() -> torch.device: def cleanup_memory() -> None: + """Clean up GPU and system memory, including device-mapped models.""" gc.collect() - torch.cuda.empty_cache() - torch.cuda.synchronize() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + # Second pass to ensure device-mapped tensors are released + gc.collect() + torch.cuda.empty_cache() def image_conditionings_by_replacing_latent( diff --git a/packages/ltx-pipelines/src/ltx_pipelines/utils/model_ledger.py b/packages/ltx-pipelines/src/ltx_pipelines/utils/model_ledger.py index c507ff4c..edec562f 100644 --- a/packages/ltx-pipelines/src/ltx_pipelines/utils/model_ledger.py +++ b/packages/ltx-pipelines/src/ltx_pipelines/utils/model_ledger.py @@ -218,7 +218,11 @@ def text_encoder(self) -> 
AVGemmaTextEncoderModel: "ModelLedger constructor." ) - return self.text_encoder_builder.build(device=self._target_device(), dtype=self.dtype).to(self.device).eval() + model = self.text_encoder_builder.build(device=self._target_device(), dtype=self.dtype) + # If the model has device mapping (from device_map="auto"), don't call .to() as it's already distributed + if hasattr(model, 'model') and hasattr(model.model, 'hf_device_map') and model.model.hf_device_map: + return model.eval() + return model.to(self.device).eval() def audio_decoder(self) -> AudioDecoder: if not hasattr(self, "audio_decoder_builder"): From 5ae02a4f9734bb14f0e2450da37c70ab21e70cbc Mon Sep 17 00:00:00 2001 From: koshe Date: Wed, 14 Jan 2026 03:34:17 +0100 Subject: [PATCH 02/38] Optimize for 8Gb vram --- .../src/ltx_core/loader/fuse_loras.py | 33 ++- .../ltx-core/src/ltx_core/loader/kernels.py | 139 +++++----- .../loader/single_gpu_model_builder.py | 98 +++++-- .../src/ltx_core/model/transformer/model.py | 24 +- .../ltx_core/model/transformer/transformer.py | 15 +- .../gemma/encoders/base_encoder.py | 24 +- .../prompts/gemma_t2v_system_prompt.txt | 59 ++--- .../text_encoders/gemma/feature_extractor.py | 3 +- .../src/ltx_pipelines/distilled.py | 106 ++++++-- .../src/ltx_pipelines/ti2vid_two_stages.py | 240 +++++++++++++++++- .../src/ltx_pipelines/utils/helpers.py | 43 ++-- .../src/ltx_pipelines/utils/model_ledger.py | 22 +- 12 files changed, 589 insertions(+), 217 deletions(-) diff --git a/packages/ltx-core/src/ltx_core/loader/fuse_loras.py b/packages/ltx-core/src/ltx_core/loader/fuse_loras.py index 66269dd7..0da89375 100644 --- a/packages/ltx-core/src/ltx_core/loader/fuse_loras.py +++ b/packages/ltx-core/src/ltx_core/loader/fuse_loras.py @@ -1,13 +1,41 @@ import torch -import triton +# import triton from ltx_core.loader.kernels import fused_add_round_kernel from ltx_core.loader.primitives import LoraStateDictWithStrength, StateDict BLOCK_SIZE = 1024 +from line_profiler import profile +@profile def fused_add_round_launch(target_weight: torch.Tensor, original_weight: torch.Tensor, seed: int) -> torch.Tensor: + """ + Native PyTorch implementation of fused_add_round_launch. + + Note: + 1. Requires PyTorch 2.1 or newer for torch.float8 support. + 2. The 'seed' argument is accepted to maintain API compatibility but is ignored + because native PyTorch addition uses deterministic Round-To-Nearest-Even (RTNE) + rather than stochastic rounding. + """ + # Validation logic from original function + if original_weight.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: + raise ValueError("Unsupported dtype") + + if target_weight.dtype != torch.bfloat16: + raise ValueError("target_weight dtype must be bfloat16") + + # Implementation: + # 1. Cast original_weight (fp8) to target_weight dtype (bf16). + # Since bf16 has higher dynamic range and precision than fp8, this upcast is exact. + # 2. Add in-place. 
+ target_weight.add_(original_weight.to(target_weight.dtype)) + + return target_weight + + +def fused_add_round_launch__(target_weight: torch.Tensor, original_weight: torch.Tensor, seed: int) -> torch.Tensor: if original_weight.dtype == torch.float8_e4m3fn: exponent_bits, mantissa_bits, exponent_bias = 4, 3, 7 elif original_weight.dtype == torch.float8_e5m2: @@ -20,7 +48,8 @@ def fused_add_round_launch(target_weight: torch.Tensor, original_weight: torch.T # Calculate grid and block sizes n_elements = original_weight.numel() - grid = (triton.cdiv(n_elements, BLOCK_SIZE),) + #grid = (triton.cdiv(n_elements, BLOCK_SIZE),) + grid = 0 # Launch kernel fused_add_round_kernel[grid]( diff --git a/packages/ltx-core/src/ltx_core/loader/kernels.py b/packages/ltx-core/src/ltx_core/loader/kernels.py index ee4cefbe..765f367e 100644 --- a/packages/ltx-core/src/ltx_core/loader/kernels.py +++ b/packages/ltx-core/src/ltx_core/loader/kernels.py @@ -1,72 +1,79 @@ -# ruff: noqa: ANN001, ANN201, ERA001, N803, N806 -import triton -import triton.language as tl +import torch +from line_profiler import profile -@triton.jit +@profile def fused_add_round_kernel( - x_ptr, - output_ptr, # contents will be added to the output - seed, - n_elements, - EXPONENT_BIAS, - MANTISSA_BITS, - BLOCK_SIZE: tl.constexpr, + x: torch.Tensor, + output: torch.Tensor, + seed: int, + n_elements: int, # Kept for signature compatibility, but unused + EXPONENT_BIAS: int, + MANTISSA_BITS: int, + BLOCK_SIZE: int = None, # Kept for signature compatibility, but unused ): """ - A kernel to upcast 8bit quantized weights to bfloat16 with stochastic rounding - and add them to bfloat16 output weights. Might be used to upcast original model weights - and to further add them to precalculated deltas coming from LoRAs. + Native PyTorch implementation of the fused_add_round_kernel. + + This performs: + 1. Upcast 8-bit weights (x) to match output precision. + 2. Add output weights (deltas) to x. + 3. Calculate the epsilon (quantization noise step) based on the target + Float8 parameters (EXPONENT_BIAS, MANTISSA_BITS). + 4. Apply stochastic rounding (add noise proportional to epsilon). + 5. Store back to output. """ - # Get program ID and compute offsets - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - - # Load data - x = tl.load(x_ptr + offsets, mask=mask) - rand_vals = tl.rand(seed, offsets) - 0.5 - - x = tl.cast(x, tl.float16) - delta = tl.load(output_ptr + offsets, mask=mask) - delta = tl.cast(delta, tl.float16) - x = x + delta - - x_bits = tl.cast(x, tl.int16, bitcast=True) - - # Calculate the exponent. Unbiased fp16 exponent is ((x_bits & 0x7C00) >> 10) - 15 for - # normal numbers and -14 for subnormals. - fp16_exponent_bits = (x_bits & 0x7C00) >> 10 - fp16_normals = fp16_exponent_bits > 0 - fp16_exponent = tl.where(fp16_normals, fp16_exponent_bits - 15, -14) - - # Add the target dtype's exponent bias and clamp to the target dtype's exponent range. 
- exponent = fp16_exponent + EXPONENT_BIAS - MAX_EXPONENT = 2 * EXPONENT_BIAS + 1 - exponent = tl.where(exponent > MAX_EXPONENT, MAX_EXPONENT, exponent) - exponent = tl.where(exponent < 0, 0, exponent) - - # Normal ULP exponent, expressed as an fp16 exponent field: - # (exponent - EXPONENT_BIAS - MANTISSA_BITS) + 15 - # Simplifies to: fp16_exponent - MANTISSA_BITS + 15 - # See https://en.wikipedia.org/wiki/Unit_in_the_last_place - eps_exp = tl.maximum(0, tl.minimum(31, exponent - EXPONENT_BIAS - MANTISSA_BITS + 15)) - - # Calculate epsilon in the target dtype - eps_normal = tl.cast(tl.cast(eps_exp << 10, tl.int16), tl.float16, bitcast=True) - - # Subnormal ULP: 2^(1 - EXPONENT_BIAS - MANTISSA_BITS) -> - # fp16 exponent bits: (1 - EXPONENT_BIAS - MANTISSA_BITS) + 15 = - # 16 - EXPONENT_BIAS - MANTISSA_BITS - eps_subnormal = tl.cast(tl.cast((16 - EXPONENT_BIAS - MANTISSA_BITS) << 10, tl.int16), tl.float16, bitcast=True) - eps = tl.where(exponent > 0, eps_normal, eps_subnormal) - - # Apply zero mask to epsilon - eps = tl.where(x == 0, 0.0, eps) - - # Apply stochastic rounding - output = tl.cast(x + rand_vals * eps, tl.bfloat16) - - # Store the result - tl.store(output_ptr + offsets, output, mask=mask) + + # 1. Setup Generators for stochastic rounding + # We use a specific generator to respect the seed argument + gen = torch.Generator(device=output.device).manual_seed(seed) + + # 2. Load and Cast to calculation precision (Float32 for safety, or Float16) + # Using Float32 ensures high precision during the intermediate math + val_x = x.to(torch.float32) + val_delta = output.to(torch.float32) + + # x = x + delta + val = val_x + val_delta + + # 3. Calculate Epsilon (The Stochastic Rounding Step) + # The Triton kernel calculates epsilon based on the magnitude of 'val' + # mapped onto the specific Float8 exponent grid. + + # Extract exponent: val = mantissa * 2^exp. + # torch.frexp returns exp such that 0.5 <= |mantissa| < 1.0. + # IEEE 754 log2(x) is (exp - 1). + _, exp_obj = torch.frexp(val) + unbiased_exp = exp_obj - 1 + + # Map to target Float8 exponent space + target_exp = unbiased_exp + EXPONENT_BIAS + + # Clamp exponent to target dtype range. + # Max is standard formulation (2*Bias + 1). + # Min is 1. Why 1? In the original Triton kernel, subnormals (exp <= 0) + # utilize a constant epsilon calculated based on exponent=1 (the smallest normal). + max_exponent = 2 * EXPONENT_BIAS + 1 + target_exp_clamped = torch.clamp(target_exp, min=1, max=max_exponent) + + # Calculate ULP exponent: E_target - BIAS - Mantissa_Bits + eps_exponent = target_exp_clamped - EXPONENT_BIAS - MANTISSA_BITS + + # Convert exponent to actual epsilon value: 2^eps_exponent + eps = torch.pow(2.0, eps_exponent.to(torch.float32)) + + # Mask epsilon where value is exactly 0 (matches `tl.where(x == 0, 0.0, eps)`) + eps = torch.where(val == 0, 0.0, eps) + + # 4. Generate Random Noise [-0.5, 0.5] + rand_vals = torch.rand(val.shape, generator=gen, device=val.device) - 0.5 + + # 5. Apply Stochastic Rounding + # output = x + (noise * epsilon) + result = val + (rand_vals * eps) + + # 6. 
Store Result + # In-place update of the output tensor, cast to bfloat16 + output.copy_(result.to(torch.bfloat16)) + + # No return value needed as operation is in-place on output_ptr/output \ No newline at end of file diff --git a/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py b/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py index 9e8853a4..55119036 100644 --- a/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py +++ b/packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py @@ -21,11 +21,14 @@ logger: logging.Logger = logging.getLogger(__name__) +from loguru import logger +from accelerate import dispatch_model, infer_auto_device_map + @dataclass(frozen=True) class SingleGPUModelBuilder(Generic[ModelType], ModelBuilderProtocol[ModelType], LoRAAdaptableProtocol): """ - Builder for PyTorch models residing on a single GPU. + Builder for PyTorch models residing on a single GPU or offloaded via Accelerate. """ model_class_configurator: type[ModelConfigurator[ModelType]] @@ -69,33 +72,78 @@ def _return_model(self, meta_model: ModelType, device: torch.device) -> ModelTyp retval = meta_model.to(device) return retval - def build(self, device: torch.device | None = None, dtype: torch.dtype | None = None) -> ModelType: - device = torch.device("cuda") if device is None else device + def build( + self, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + max_memory: dict[int | str, str] | None = None + ) -> ModelType: + target_device = torch.device("cuda") if device is None else device + + # 1. Get Config and Meta Model config = self.model_config() meta_model = self.meta_model(config, self.module_ops) + + # 2. Load Base State Dict model_paths = self.model_path if isinstance(self.model_path, tuple) else [self.model_path] - model_state_dict = self.load_sd(model_paths, sd_ops=self.model_sd_ops, registry=self.registry, device=device) + load_device = target_device if max_memory is None else torch.device("cpu") + model_state_dict = self.load_sd(model_paths, sd_ops=self.model_sd_ops, registry=self.registry, + device=load_device) + # 3. 
Handle LoRAs lora_strengths = [lora.strength for lora in self.loras] + final_sd_map = {} + if not lora_strengths or (min(lora_strengths) == 0 and max(lora_strengths) == 0): - sd = model_state_dict.sd - if dtype is not None: - sd = {key: value.to(dtype=dtype) for key, value in model_state_dict.sd.items()} - meta_model.load_state_dict(sd, strict=False, assign=True) - return self._return_model(meta_model, device) - - lora_state_dicts = [ - self.load_sd([lora.path], sd_ops=lora.sd_ops, registry=self.registry, device=device) for lora in self.loras - ] - lora_sd_and_strengths = [ - LoraStateDictWithStrength(sd, strength) - for sd, strength in zip(lora_state_dicts, lora_strengths, strict=True) - ] - final_sd = apply_loras( - model_sd=model_state_dict, - lora_sd_and_strengths=lora_sd_and_strengths, - dtype=dtype, - destination_sd=model_state_dict if isinstance(self.registry, DummyRegistry) else None, - ) - meta_model.load_state_dict(final_sd.sd, strict=False, assign=True) - return self._return_model(meta_model, device) + final_sd_map = model_state_dict.sd + else: + # Convert LoRAs to float32 on CPU to prevent slow BF16 emulation + lora_state_dicts = [] + for lora in self.loras: + lsd = self.load_sd([lora.path], sd_ops=lora.sd_ops, registry=self.registry, device=load_device) + + if load_device.type == "cpu": + # In-place conversion of LoRA tensors to float32 for speed + # This speeds up the matmul in apply_loras significantly + for k, v in lsd.sd.items(): + if v.dtype in [torch.bfloat16, torch.float16]: + lsd.sd[k] = v.to(dtype=torch.float32) + + lora_state_dicts.append(lsd) + + lora_sd_and_strengths = [ + LoraStateDictWithStrength(sd, strength) + for sd, strength in zip(lora_state_dicts, lora_strengths, strict=True) + ] + + dest_sd = model_state_dict if isinstance(self.registry, DummyRegistry) else None + + final_sd_obj = apply_loras( + model_sd=model_state_dict, + lora_sd_and_strengths=lora_sd_and_strengths, + dtype=dtype, + destination_sd=dest_sd, + ) + final_sd_map = final_sd_obj.sd + + # 4. Cast Dtypes if requested + if dtype is not None: + final_sd_map = {k: v.to(dtype=dtype) for k, v in final_sd_map.items()} + + # 5. Load State Dict into Model + meta_model.load_state_dict(final_sd_map, strict=False, assign=True) + + # 6. Return based on Offloading strategy + if max_memory is not None: + logger.info(f"Dispatching model with max_memory constraints: {max_memory}") + no_split_modules = getattr(self.model_class_configurator, "no_split_modules", None) + device_map = infer_auto_device_map( + meta_model, + max_memory=max_memory, + no_split_module_classes=no_split_modules, + dtype=dtype + ) + model = dispatch_model(meta_model, device_map=device_map) + return model + + return self._return_model(meta_model, target_device) diff --git a/packages/ltx-core/src/ltx_core/model/transformer/model.py b/packages/ltx-core/src/ltx_core/model/transformer/model.py index 411e3b42..dc4d662f 100644 --- a/packages/ltx-core/src/ltx_core/model/transformer/model.py +++ b/packages/ltx-core/src/ltx_core/model/transformer/model.py @@ -16,6 +16,8 @@ ) from ltx_core.utils import to_denoised +#from line_profiler import profile + class LTXModelType(Enum): AudioVideo = "ltx av model" @@ -35,6 +37,7 @@ class LTXModel(torch.nn.Module): This class implements the transformer blocks for the LTX model. 
""" + #@profile 1.37738 s def __init__( # noqa: PLR0913 self, *, @@ -105,7 +108,7 @@ def __init__( # noqa: PLR0913 self._init_preprocessors(cross_pe_max_pos) # Initialize transformer blocks - self._init_transformer_blocks( + self._init_transformer_blocks( # 98.2% num_layers=num_layers, attention_head_dim=attention_head_dim if model_type.is_video_enabled() else 0, cross_attention_dim=cross_attention_dim, @@ -115,6 +118,7 @@ def __init__( # noqa: PLR0913 attention_type=attention_type, ) + #@profile 0.0069204 s def _init_video( self, in_channels: int, @@ -139,6 +143,7 @@ def _init_video( self.norm_out = torch.nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=norm_eps) self.proj_out = torch.nn.Linear(self.inner_dim, out_channels) + #@profile 0.0063044 s def _init_audio( self, in_channels: int, @@ -166,6 +171,7 @@ def _init_audio( self.audio_norm_out = torch.nn.LayerNorm(self.audio_inner_dim, elementwise_affine=False, eps=norm_eps) self.audio_proj_out = torch.nn.Linear(self.audio_inner_dim, out_channels) + #@profile 0.0111731 s def _init_audio_video( self, num_scale_shift_values: int, @@ -191,6 +197,7 @@ def _init_audio_video( embedding_coefficient=1, ) + #@profile 0.0002355 s def _init_preprocessors( self, cross_pe_max_pos: int | None = None, @@ -263,6 +270,7 @@ def _init_preprocessors( rope_type=self.rope_type, ) + #@profile 1.3519 s def _init_transformer_blocks( self, num_layers: int, @@ -296,7 +304,7 @@ def _init_transformer_blocks( ) self.transformer_blocks = torch.nn.ModuleList( [ - BasicAVTransformerBlock( + BasicAVTransformerBlock( # 99.9% idx=idx, video=video_config, audio=audio_config, @@ -308,6 +316,7 @@ def _init_transformer_blocks( ] ) + #@profile unused def set_gradient_checkpointing(self, enable: bool) -> None: """Enable or disable gradient checkpointing for transformer blocks. 
Gradient checkpointing trades compute for memory by recomputing activations @@ -318,6 +327,7 @@ def set_gradient_checkpointing(self, enable: bool) -> None: """ self._enable_gradient_checkpointing = enable + #@profile 498.557 s def _process_transformer_blocks( self, video: TransformerArgs | None, @@ -340,7 +350,7 @@ def _process_transformer_blocks( use_reentrant=False, ) else: - video, audio = block( + video, audio = block( # 100% video=video, audio=audio, perturbations=perturbations, @@ -348,6 +358,7 @@ def _process_transformer_blocks( return video, audio + #@profile 0.0648487 s def _process_output( self, scale_shift_table: torch.Tensor, @@ -368,6 +379,7 @@ def _process_output( x = proj_out(x) return x + #@profile 502.847 s def forward( self, video: Modality | None, audio: Modality | None, perturbations: BatchedPerturbationConfig ) -> tuple[torch.Tensor, torch.Tensor]: @@ -384,7 +396,7 @@ def forward( video_args = self.video_args_preprocessor.prepare(video) if video is not None else None audio_args = self.audio_args_preprocessor.prepare(audio) if audio is not None else None # Process transformer blocks - video_out, audio_out = self._process_transformer_blocks( + video_out, audio_out = self._process_transformer_blocks( # 99.1% video=video_args, audio=audio_args, perturbations=perturbations, @@ -450,7 +462,7 @@ class X0Model(torch.nn.Module): def __init__(self, velocity_model: LTXModel): super().__init__() self.velocity_model = velocity_model - + #@profile 502.854 s def forward( self, video: Modality | None, @@ -462,7 +474,7 @@ def forward( Returns: Denoised video and audio """ - vx, ax = self.velocity_model(video, audio, perturbations) + vx, ax = self.velocity_model(video, audio, perturbations) # 100% denoised_video = to_denoised(video.latent, vx, video.timesteps) if vx is not None else None denoised_audio = to_denoised(audio.latent, ax, audio.timesteps) if ax is not None else None return denoised_video, denoised_audio diff --git a/packages/ltx-core/src/ltx_core/model/transformer/transformer.py b/packages/ltx-core/src/ltx_core/model/transformer/transformer.py index 047faaab..78173ddd 100644 --- a/packages/ltx-core/src/ltx_core/model/transformer/transformer.py +++ b/packages/ltx-core/src/ltx_core/model/transformer/transformer.py @@ -9,6 +9,8 @@ from ltx_core.model.transformer.transformer_args import TransformerArgs from ltx_core.utils import rms_norm +#from line_profiler import profile + @dataclass class TransformerConfig: @@ -103,17 +105,19 @@ def __init__( self.norm_eps = norm_eps + #@profile 1.26368 s def get_ada_values( self, scale_shift_table: torch.Tensor, batch_size: int, timestep: torch.Tensor, indices: slice ) -> tuple[torch.Tensor, ...]: num_ada_params = scale_shift_table.shape[0] ada_values = ( - scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(device=timestep.device, dtype=timestep.dtype) + scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(device=timestep.device, dtype=timestep.dtype) # 89.6% + timestep.reshape(batch_size, timestep.shape[1], num_ada_params, -1)[:, :, indices, :] ).unbind(dim=2) return ada_values + #@profile 0.925723 s def get_av_ca_ada_values( self, scale_shift_table: torch.Tensor, @@ -122,7 +126,7 @@ def get_av_ca_ada_values( gate_timestep: torch.Tensor, num_scale_shift_values: int = 4, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - scale_shift_ada_values = self.get_ada_values( + scale_shift_ada_values = self.get_ada_values( # 86% scale_shift_table[:num_scale_shift_values, :], batch_size, scale_shift_timestep, 
slice(None, None) ) gate_ada_values = self.get_ada_values( @@ -134,6 +138,7 @@ def get_av_ca_ada_values( return (*scale_shift_chunks, *gate_ada_values) + #@profile 859.862 s def forward( # noqa: PLR0915 self, video: TransformerArgs | None, @@ -160,9 +165,9 @@ def forward( # noqa: PLR0915 if not perturbations.all_in_batch(PerturbationType.SKIP_VIDEO_SELF_ATTN, self.idx): norm_vx = rms_norm(vx, eps=self.norm_eps) * (1 + vscale_msa) + vshift_msa v_mask = perturbations.mask_like(PerturbationType.SKIP_VIDEO_SELF_ATTN, self.idx, vx) - vx = vx + self.attn1(norm_vx, pe=video.positional_embeddings) * vgate_msa * v_mask + vx = vx + self.attn1(norm_vx, pe=video.positional_embeddings) * vgate_msa * v_mask # 24% - vx = vx + self.attn2(rms_norm(vx, eps=self.norm_eps), context=video.context, mask=video.context_mask) + vx = vx + self.attn2(rms_norm(vx, eps=self.norm_eps), context=video.context, mask=video.context_mask) # 14% del vshift_msa, vscale_msa, vgate_msa @@ -258,7 +263,7 @@ def forward( # noqa: PLR0915 self.scale_shift_table, vx.shape[0], video.timesteps, slice(3, None) ) vx_scaled = rms_norm(vx, eps=self.norm_eps) * (1 + vscale_mlp) + vshift_mlp - vx = vx + self.ff(vx_scaled) * vgate_mlp + vx = vx + self.ff(vx_scaled) * vgate_mlp # 33% del vshift_mlp, vscale_mlp, vgate_mlp diff --git a/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py b/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py index deba4f45..4c14ed48 100644 --- a/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py +++ b/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/base_encoder.py @@ -3,7 +3,7 @@ import torch from einops import rearrange -from transformers import AutoImageProcessor, Gemma3ForConditionalGeneration, Gemma3Processor +from transformers import AutoImageProcessor, Gemma3ForConditionalGeneration, Gemma3Processor, BitsAndBytesConfig from ltx_core.loader.module_ops import ModuleOps from ltx_core.text_encoders.gemma.feature_extractor import GemmaFeaturesExtractorProjLinear @@ -44,12 +44,16 @@ def _run_feature_extractor( encoded_text_features = torch.stack(hidden_states, dim=-1) encoded_text_features_dtype = encoded_text_features.dtype + print(encoded_text_features_dtype) + sequence_lengths = attention_mask.sum(dim=-1) normed_concated_encoded_text_features = _norm_and_concat_padded_batch( encoded_text_features, sequence_lengths, padding_side=padding_side ) + print("normed_concated_encoded_text_features") + print(normed_concated_encoded_text_features.dtype) - return self.feature_extractor_linear(normed_concated_encoded_text_features.to(encoded_text_features_dtype)) + return self.feature_extractor_linear(normed_concated_encoded_text_features.to(torch.bfloat16)) def _convert_to_additive_mask(self, attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: return (attention_mask - 1).to(dtype).reshape( @@ -244,13 +248,21 @@ def module_ops_from_gemma_root(gemma_root: str) -> tuple[ModuleOps, ...]: tokenizer_path = _find_matching_dir(gemma_root, "tokenizer.model") def load_gemma(module: GemmaTextEncoderModelBase) -> GemmaTextEncoderModelBase: + #max_memory = {0: "8GiB", "cpu": "32GiB"} + # 2. 
Load the model + #module.model = Gemma3ForConditionalGeneration.from_pretrained( + # gemma_path, + # local_files_only=True, + # device_map="auto", + # max_memory=max_memory + #) + # Reserve 2GB VRAM for context window and activations # Limit Gemma to 6GB, forcing more layers to CPU RAM - max_memory = {0: "6GiB", "cpu": "32GiB"} # GPU 0: 6GB, CPU: 32GB - + max_memory = {0: "3GiB", "cpu": "32GiB"} # GPU 0: 3GiB, CPU: 32GiB module.model = Gemma3ForConditionalGeneration.from_pretrained( - gemma_path, - local_files_only=True, + gemma_path, + local_files_only=True, torch_dtype=torch.bfloat16, device_map="auto", # Enable sequential offloading max_memory=max_memory # Reserve 2GB VRAM for inference diff --git a/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/prompts/gemma_t2v_system_prompt.txt b/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/prompts/gemma_t2v_system_prompt.txt index e8642019..f16acd88 100644 --- a/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/prompts/gemma_t2v_system_prompt.txt +++ b/packages/ltx-core/src/ltx_core/text_encoders/gemma/encoders/prompts/gemma_t2v_system_prompt.txt @@ -1,40 +1,23 @@ -You are a Creative Assistant. Given a user's raw input prompt describing a scene or concept, expand it into a detailed video generation prompt with specific visuals and integrated audio to guide a text-to-video model. - -#### Guidelines -- Strictly follow all aspects of the user's raw input: include every element requested (style, visuals, motions, actions, camera movement, audio). - - If the input is vague, invent concrete details: lighting, textures, materials, scene settings, etc. - - For characters: describe gender, clothing, hair, expressions. DO NOT invent unrequested characters. -- Use active language: present-progressive verbs ("is walking," "speaking"). If no action specified, describe natural movements. -- Maintain chronological flow: use temporal connectors ("as," "then," "while"). -- Audio layer: Describe complete soundscape (background audio, ambient sounds, SFX, speech/music when requested). Integrate sounds chronologically alongside actions. Be specific (e.g., "soft footsteps on tile"), not vague (e.g., "ambient sound is present"). -- Speech (only when requested): - - For ANY speech-related input (talking, conversation, singing, etc.), ALWAYS include exact words in quotes with voice characteristics (e.g., "The man says in an excited voice: 'You won't believe what I just saw!'"). - - Specify language if not English and accent if relevant. -- Style: Include visual style at the beginning: "Style:
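
Taken together, the patches above converge on a single load-and-teardown pattern for running the Gemma text encoder within an 8 GB VRAM budget: cap its GPU allocation with max_memory, let accelerate spread the remaining layers to CPU RAM via device_map="auto", and strip the accelerate hooks before dropping the model so the offloaded weights are actually released. The sketch below restates that pattern in isolation; it is a minimal sketch assuming recent transformers and accelerate releases, the checkpoint path is hypothetical, and the memory caps are illustrative rather than the values the patches settle on.

import gc

import torch
from accelerate.hooks import remove_hook_from_module
from transformers import Gemma3ForConditionalGeneration

# Cap GPU 0 and spill the remaining layers to CPU RAM (illustrative caps for an 8 GB card).
max_memory = {0: "6GiB", "cpu": "32GiB"}

model = Gemma3ForConditionalGeneration.from_pretrained(
    "/path/to/gemma",            # hypothetical local checkpoint
    local_files_only=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",           # accelerate places layers across GPU and CPU
    max_memory=max_memory,
)

# ... encode prompts here ...

# Device-mapped models keep accelerate hooks attached; remove them before dropping
# the reference, otherwise the offloaded weights are not fully freed.
remove_hook_from_module(model, recurse=True)
del model
gc.collect()
torch.cuda.empty_cache()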