Update app.py
app.py
CHANGED
@@ -46,38 +46,47 @@ import spaces
 import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
-from safetensors.torch import load_file
-from
-import
-
+from safetensors.torch import load_file
+from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
+from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
+
+try:
+    from ltx_core.loader.fuse_loras import apply_loras
+except ImportError:
+    from ltx_core.loader.fuse_loras import fuse_lora_weights
+
+    def apply_loras(model_sd, loras, dtype=None):
+        # fuse_lora_weights is the lower-level helper the repo uses internally;
+        # this wrapper turns its output into a regular state_dict.
+        return {
+            k: v
+            for k, v in fuse_lora_weights(
+                model_sd,
+                loras,
+                dtype=dtype,
+                preserve_input_device=False,
+            )
+        }

 from ltx_core.components.diffusion_steps import EulerDiffusionStep
 from ltx_core.components.noisers import GaussianNoiser
-from ltx_core.components.protocols import DiffusionStepProtocol
-from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
 from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
 from ltx_core.model.upsampler import upsample_video
 from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number, decode_video as vae_decode_video
 from ltx_core.quantization import QuantizationPolicy
-from ltx_core.types import Audio,
+from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
 from ltx_pipelines.distilled import DistilledPipeline
-from ltx_pipelines.utils import
+from ltx_pipelines.utils import euler_denoising_loop
 from ltx_pipelines.utils.args import ImageConditioningInput
 from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
 from ltx_pipelines.utils.helpers import (
     cleanup_memory,
     combined_image_conditionings,
     denoise_video_only,
-    denoise_audio_video,
-    get_device,
     encode_prompts,
     simple_denoising_func,
 )
 from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
-from ltx_core.loader.primitives import LoraPathStrengthAndSDOps
-from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
-
-from ltx_pipelines.utils.types import PipelineComponents

 # Force-patch xformers attention into the LTX attention module.
 from ltx_core.model.transformer import attention as _attn_mod
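Note: the try/except above keeps the app working across ltx_core builds: some export apply_loras directly, others only the lower-level fuse_lora_weights, which the wrapper adapts to the same state_dict-in, state_dict-out shape. For intuition, fusing one LoRA into a state_dict amounts to W' = W + strength * (B @ A). A minimal sketch, assuming a hypothetical ".lora_A"/".lora_B" key layout (the real key mapping is what LTXV_LORA_COMFY_RENAMING_MAP handles):

import torch

def fuse_one_lora(base: dict, lora: dict, strength: float) -> dict:
    # Hypothetical layout: "<name>.lora_A" / "<name>.lora_B" pairs
    # targeting "<name>.weight" in the base state_dict.
    fused = dict(base)
    for key in lora:
        if not key.endswith(".lora_A"):
            continue
        name = key[: -len(".lora_A")]
        A, B = lora[key], lora[name + ".lora_B"]
        # Fold the low-rank update into the dense weight: W' = W + s * (B @ A).
        fused[name + ".weight"] = base[name + ".weight"] + strength * (B @ A)
    return fused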
@@ -107,35 +116,9 @@ RESOLUTIONS = {
 }


-class LTX23DistilledA2VPipeline:
+class LTX23DistilledA2VPipeline(DistilledPipeline):
     """DistilledPipeline with optional audio conditioning."""

-    def __init__(
-        self,
-        distilled_checkpoint_path: str,
-        gemma_root: str,
-        spatial_upsampler_path: str,
-        loras: tuple,
-        quantization: QuantizationPolicy | None = None,
-    ):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.dtype = torch.bfloat16
-
-        self.model_ledger = ModelLedger(
-            dtype=self.dtype,
-            device=self.device,
-            checkpoint_path=distilled_checkpoint_path,
-            spatial_upsampler_path=spatial_upsampler_path,
-            gemma_root_path=gemma_root,
-            loras=loras,
-            quantization=quantization,
-        )
-
-        self.pipeline_components = PipelineComponents(
-            dtype=self.dtype,
-            device=self.device,
-        )
-
     def __call__(
         self,
         prompt: str,
@@ -145,9 +128,24 @@ class LTX23DistilledA2VPipeline:
         num_frames: int,
         frame_rate: float,
         images: list[ImageConditioningInput],
+        audio_path: str | None = None,
         tiling_config: TilingConfig | None = None,
         enhance_prompt: bool = False,
     ):
+        # Standard path when no audio input is provided.
+        print(prompt)
+        if audio_path is None:
+            return super().__call__(
+                prompt=prompt,
+                seed=seed,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                frame_rate=frame_rate,
+                images=images,
+                tiling_config=tiling_config,
+                enhance_prompt=enhance_prompt,
+            )

         generator = torch.Generator(device=self.device).manual_seed(seed)
         noiser = GaussianNoiser(generator=generator)
@@ -158,18 +156,38 @@ class LTX23DistilledA2VPipeline:
             [prompt],
             self.model_ledger,
             enhance_first_prompt=enhance_prompt,
-            enhance_prompt_image=images[0]
+            enhance_prompt_image=images[0].path if len(images) > 0 else None,
         )
         video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding

-
+        video_duration = num_frames / frame_rate
+        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
+        if decoded_audio is None:
+            raise ValueError(f"Could not extract audio stream from {audio_path}")
+
+        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
+        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
+        expected_frames = audio_shape.frames
+        actual_frames = encoded_audio_latent.shape[2]
+
+        if actual_frames > expected_frames:
+            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
+        elif actual_frames < expected_frames:
+            pad = torch.zeros(
+                encoded_audio_latent.shape[0],
+                encoded_audio_latent.shape[1],
+                expected_frames - actual_frames,
+                encoded_audio_latent.shape[3],
+                device=encoded_audio_latent.device,
+                dtype=encoded_audio_latent.dtype,
+            )
+            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
+
         video_encoder = self.model_ledger.video_encoder()
         transformer = self.model_ledger.transformer()
-        stage_1_sigmas = torch.
+        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)

-        def denoising_loop(
-            sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
-        ) -> tuple[LatentState, LatentState]:
+        def denoising_loop(sigmas, video_state, audio_state, stepper):
             return euler_denoising_loop(
                 sigmas=sigmas,
                 video_state=video_state,
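Note: the trim-or-pad block normalizes the encoded audio latent to exactly the frame count AudioLatentShape.from_duration predicts, so the denoiser never sees a time axis that disagrees with the video. The same logic in isolation, a minimal sketch assuming the (batch, channels, frames, mel_bins) layout used above:

import torch

def fit_frames(latent: torch.Tensor, expected_frames: int) -> torch.Tensor:
    # latent: (batch, channels, frames, mel_bins); dim 2 is time.
    actual = latent.shape[2]
    if actual > expected_frames:
        return latent[:, :, :expected_frames, :]   # trim the tail
    if actual < expected_frames:
        pad = torch.zeros(
            latent.shape[0], latent.shape[1],
            expected_frames - actual, latent.shape[3],
            device=latent.device, dtype=latent.dtype,
        )
        return torch.cat([latent, pad], dim=2)     # zero-pad the tail
    return latent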
@@ -178,15 +196,15 @@ class LTX23DistilledA2VPipeline:
                 denoise_fn=simple_denoising_func(
                     video_context=video_context,
                     audio_context=audio_context,
-                    transformer=transformer,
+                    transformer=transformer,
                 ),
             )

         stage_1_output_shape = VideoPixelShape(
             batch=1,
             frames=num_frames,
-            width=width,
-            height=height,
+            width=width // 2,
+            height=height // 2,
             fps=frame_rate,
         )
         stage_1_conditionings = combined_image_conditionings(
@@ -197,8 +215,7 @@ class LTX23DistilledA2VPipeline:
             dtype=dtype,
             device=self.device,
         )
-
-        video_state, audio_state = denoise_audio_video(
+        video_state = denoise_video_only(
             output_shape=stage_1_output_shape,
             conditionings=stage_1_conditionings,
             noiser=noiser,
@@ -208,6 +225,40 @@ class LTX23DistilledA2VPipeline:
             components=self.pipeline_components,
             dtype=dtype,
             device=self.device,
+            initial_audio_latent=encoded_audio_latent,
+        )
+
+        torch.cuda.synchronize()
+        cleanup_memory()
+
+        upscaled_video_latent = upsample_video(
+            latent=video_state.latent[:1],
+            video_encoder=video_encoder,
+            upsampler=self.model_ledger.spatial_upsampler(),
+        )
+        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
+        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
+        stage_2_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_2_output_shape.height,
+            width=stage_2_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        video_state = denoise_video_only(
+            output_shape=stage_2_output_shape,
+            conditionings=stage_2_conditionings,
+            noiser=noiser,
+            sigmas=stage_2_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            noise_scale=stage_2_sigmas[0],
+            initial_video_latent=upscaled_video_latent,
+            initial_audio_latent=encoded_audio_latent,
         )

         torch.cuda.synchronize()
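Note: this hunk replaces the old single joint audio+video denoise with an explicit two-stage pass: stage 1 runs the distilled sigmas at half resolution, the result is spatially upsampled, and stage 2 re-denoises at full resolution starting from that upscaled latent (noise_scale=stage_2_sigmas[0]), with the encoded input audio pinned as initial_audio_latent in both passes. A toy pixel-space analogue of the coarse-to-fine control flow, assuming bilinear upsampling and a stand-in denoiser (the real code works on VAE latents and uses the model's learned spatial upsampler):

import torch
import torch.nn.functional as F

def two_stage(height: int, width: int, denoise) -> torch.Tensor:
    # Stage 1: fix global structure cheaply at half resolution.
    x = denoise(torch.randn(1, 3, height // 2, width // 2))
    # Upsample the stage-1 result to the target resolution.
    x = F.interpolate(x, size=(height, width), mode="bilinear")
    # Stage 2: re-noise to the first stage-2 sigma, then refine at full size.
    x = x + 0.5 * torch.randn_like(x)
    return denoise(x)

out = two_stage(64, 64, lambda t: 0.9 * t)  # lambda stands in for the denoising loop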
@@ -216,12 +267,16 @@ class LTX23DistilledA2VPipeline:
         cleanup_memory()

         decoded_video = vae_decode_video(
-            video_state.latent,
+            video_state.latent,
+            self.model_ledger.video_decoder(),
+            tiling_config,
+            generator,
         )
-
-
+        original_audio = Audio(
+            waveform=decoded_audio.waveform.squeeze(0),
+            sampling_rate=decoded_audio.sampling_rate,
         )
-        return decoded_video,
+        return decoded_video, original_audio


 # Model repos
@@ -233,20 +288,11 @@ print("=" * 80)
 print("Downloading LTX-2.3 distilled model + Gemma...")
 print("=" * 80)

-# LoRA cache directory and currently-applied key
-LORA_CACHE_DIR = Path("lora_cache")
-LORA_CACHE_DIR.mkdir(exist_ok=True)
-current_lora_key: str | None = None
-
-PENDING_LORA_KEY: str | None = None
-PENDING_LORA_STATE: dict[str, torch.Tensor] | None = None
-PENDING_LORA_STATUS: str = "No LoRA state prepared yet."
-
 weights_dir = Path("weights")
 weights_dir.mkdir(exist_ok=True)
 checkpoint_path = hf_hub_download(
-    repo_id="
-    filename="
+    repo_id="SulphurAI/Sulphur-2-base",
+    filename="sulphur_distil_bf16.safetensors",
     local_dir=str(weights_dir),
     local_dir_use_symlinks=False,
 )
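Note: passing local_dir with local_dir_use_symlinks=False makes hf_hub_download materialize the checkpoint as a regular file under weights/ rather than a symlink into the shared HF cache, so downstream loaders get a plain filesystem path. Usage in isolation (repo_id and filename below are placeholders, not the values elided from the removed lines of this diff):

from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="some-org/some-model",     # placeholder
    filename="model.safetensors",      # placeholder
    local_dir="weights",               # copy into ./weights instead of the cache
    local_dir_use_symlinks=False,      # a real file, not a cache symlink
)
print(path)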
@@ -264,7 +310,7 @@ print("=" * 80)
 pose_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX2_3_NSFW_furry_concat_v2.safetensors")
 general_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX2.3_reasoning_I2V_V3.safetensors")
 motion_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="motion_helper.safetensors")
-dreamlay_lora_path = hf_hub_download(repo_id=
+dreamlay_lora_path = hf_hub_download(repo_id="lynaNSFW/DR34ML4Y_AIO_NSFW_LTX23", filename="DR34ML4Y_LTXXX_V1.safetensors") # m15510n4ry, bl0wj0b, d0ubl3_bj, d0gg1e, c0wg1rl
 mself_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="Furry Hyper Masturbation - LTX-2 I2V v1.safetensors") # Hyperfap
 dramatic_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX-2.3 - Orgasm.safetensors") # "[He | She] is having am orgasm." (am or an?)
 fluid_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX2.3_CREAMPIE_ANIMATION-V0.1.safetensors") # cum
@@ -274,7 +320,8 @@ voice_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="hentai_voice_ltx2
 realism_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="FurryenhancerLTX2.3V1.215.safetensors")
 transition_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX-2_takerpov_lora_v1.2.safetensors") # takerpov1, taker pov
 physics_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX2.3_Better_Physics_PhysLTX.safetensors")
-reasoning_lora_path = hf_hub_download(repo_id="
+reasoning_lora_path = hf_hub_download(repo_id="LiconStudio/Ltx2.3-VBVR-lora-I2V", filename="Ltx2.3-Licon-VBVR-I2V-390K-R32.safetensors")
+twostep_lora_path = hf_hub_download(repo_id=LORA_REPO, filename="LTX2.3_Multi_step_video_reasoning_V0.1.safetensors")

 print(f"Pose LoRA: {pose_lora_path}")
 print(f"General LoRA: {general_lora_path}")
@@ -290,6 +337,7 @@ print(f"Realism LoRA: {realism_lora_path}")
 print(f"Transition LoRA: {transition_lora_path}")
 print(f"Physics LoRA: {physics_lora_path}")
 print(f"Reasoning LoRA: {reasoning_lora_path}")
+print(f"Twostep LoRA: {twostep_lora_path}")
 # ----------------------------------------------------------------

 print(f"Checkpoint: {checkpoint_path}")
@@ -307,7 +355,7 @@ pipeline = LTX23DistilledA2VPipeline(
 )
 # ----------------------------------------------------------------

-def _make_lora_key(pose_strength: float, general_strength: float, motion_strength: float, dreamlay_strength: float, mself_strength: float, dramatic_strength: float, fluid_strength: float, liquid_strength: float, demopose_strength: float, voice_strength: float, realism_strength: float, transition_strength: float, physics_strength: float, reasoning_strength: float) -> tuple[str, str]:
+def _make_lora_key(pose_strength: float, general_strength: float, motion_strength: float, dreamlay_strength: float, mself_strength: float, dramatic_strength: float, fluid_strength: float, liquid_strength: float, demopose_strength: float, voice_strength: float, realism_strength: float, transition_strength: float, physics_strength: float, reasoning_strength: float, twostep_strength: float) -> tuple[str, str]:
     rp = round(float(pose_strength), 2)
     rg = round(float(general_strength), 2)
     rm = round(float(motion_strength), 2)
@@ -322,12 +370,12 @@ def _make_lora_key(pose_strength: float, general_strength: float, motion_strengt
     rt = round(float(transition_strength), 2)
     ry = round(float(physics_strength), 2)
     ri = round(float(reasoning_strength), 2)
-
+    rw = round(float(twostep_strength), 2)
+    key_str = f"{pose_lora_path}:{rp}|{general_lora_path}:{rg}|{motion_lora_path}:{rm}|{dreamlay_lora_path}:{rd}|{mself_lora_path}:{rs}|{dramatic_lora_path}:{rr}|{fluid_lora_path}:{rf}|{liquid_lora_path}:{rl}|{demopose_lora_path}:{ro}|{voice_lora_path}:{rv}|{realism_lora_path}:{re}|{transition_lora_path}:{rt}|{physics_lora_path}:{ry}|{reasoning_lora_path}:{ri}|{twostep_lora_path}:{rw}"
     key = hashlib.sha256(key_str.encode("utf-8")).hexdigest()
     return key, key_str

-
-def prepare_lora_cache(
+def _collect_lora_specs(
     pose_strength: float,
     general_strength: float,
     motion_strength: float,
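Note: _make_lora_key serializes every (path, rounded strength) pair in a fixed order and SHA-256-hashes the result, so any change to the mix yields a new cache key while UI float jitter does not. The scheme in isolation:

import hashlib

def make_key(entries: list[tuple[str, float]]) -> str:
    # Fixed ordering plus rounding keeps the key stable across slider noise.
    key_str = "|".join(f"{path}:{round(s, 2)}" for path, s in entries)
    return hashlib.sha256(key_str.encode("utf-8")).hexdigest()

key = make_key([("a.safetensors", 0.8), ("b.safetensors", 0.0)])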
@@ -342,34 +390,10 @@ def prepare_lora_cache(
     transition_strength: float,
     physics_strength: float,
     reasoning_strength: float,
-
+    twostep_strength: float,
 ):
-
-
-    - checks cache
-    - loads cached fused transformer state_dict, or
-    - builds fused transformer on CPU and saves it
-    The resulting state_dict is stored in memory and can be applied later.
-    """
-    global PENDING_LORA_KEY, PENDING_LORA_STATE, PENDING_LORA_STATUS
-
-    ledger = pipeline.model_ledger
-    key, _ = _make_lora_key(pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength)
-    cache_path = LORA_CACHE_DIR / f"{key}.safetensors"
-
-    progress(0.05, desc="Preparing LoRA state")
-    if cache_path.exists():
-        try:
-            progress(0.20, desc="Loading cached fused state")
-            state = load_file(str(cache_path))
-            PENDING_LORA_KEY = key
-            PENDING_LORA_STATE = state
-            PENDING_LORA_STATUS = f"Loaded cached LoRA state: {cache_path.name}"
-            return PENDING_LORA_STATUS
-        except Exception as e:
-            print(f"[LoRA] Cache load failed: {type(e).__name__}: {e}")
-
-    entries = [
+    # Keep all 15 adapters in the active list; zero strength means no effect.
+    return [
         (pose_lora_path, round(float(pose_strength), 2)),
         (general_lora_path, round(float(general_strength), 2)),
         (motion_lora_path, round(float(motion_strength), 2)),
@@ -384,91 +408,68 @@ def prepare_lora_cache(
         (transition_lora_path, round(float(transition_strength), 2)),
         (physics_lora_path, round(float(physics_strength), 2)),
         (reasoning_lora_path, round(float(reasoning_strength), 2)),
-
-    loras_for_builder = [
-        LoraPathStrengthAndSDOps(path, strength, LTXV_LORA_COMFY_RENAMING_MAP)
-        for path, strength in entries
-        if path is not None and float(strength) != 0.0
+        (twostep_lora_path, round(float(twostep_strength), 2)),
     ]

-    if not loras_for_builder:
-        PENDING_LORA_KEY = None
-        PENDING_LORA_STATE = None
-        PENDING_LORA_STATUS = "No non-zero LoRA strengths selected; nothing to prepare."
-        return PENDING_LORA_STATUS

-
-
-
-
-
-
-
-
-
-
-
-
-
+def apply_current_loras_to_transformer(
+    pose_strength: float,
+    general_strength: float,
+    motion_strength: float,
+    dreamlay_strength: float,
+    mself_strength: float,
+    dramatic_strength: float,
+    fluid_strength: float,
+    liquid_strength: float,
+    demopose_strength: float,
+    voice_strength: float,
+    realism_strength: float,
+    transition_strength: float,
+    physics_strength: float,
+    reasoning_strength: float,
+    twostep_strength: float,
+):
+    global ACTIVE_LORA_KEY
+
+    key, _ = _make_lora_key(
+        pose_strength, general_strength, motion_strength, dreamlay_strength,
+        mself_strength, dramatic_strength, fluid_strength, liquid_strength,
+        demopose_strength, voice_strength, realism_strength, transition_strength,
+        physics_strength, reasoning_strength, twostep_strength
+    )
+
+    if key == ACTIVE_LORA_KEY:
+        return "LoRAs already active."
+
+    if key in LORA_STATE_CACHE:
+        fused_state = LORA_STATE_CACHE[key]
+    else:
+        loras = [
+            LoraPathStrengthAndSDOps(path, strength, LTXV_LORA_COMFY_RENAMING_MAP)
+            for path, strength in _collect_lora_specs(
+                pose_strength, general_strength, motion_strength, dreamlay_strength,
+                mself_strength, dramatic_strength, fluid_strength, liquid_strength,
+                demopose_strength, voice_strength, realism_strength, transition_strength,
+                physics_strength, reasoning_strength, twostep_strength,
+            )
+        ]
+
+        fused_state = apply_loras(
+            BASE_TRANSFORMER_STATE,
+            loras,
+            dtype=pipeline.model_ledger.dtype,
+        )
+        LORA_STATE_CACHE[key] = fused_state

-
-
-
-
-
-
-
-
-
-
-
-
-    except Exception as e:
-        import traceback
-        print(f"[LoRA] Prepare failed: {type(e).__name__}: {e}")
-        print(traceback.format_exc())
-        PENDING_LORA_KEY = None
-        PENDING_LORA_STATE = None
-        PENDING_LORA_STATUS = f"LoRA prepare failed: {type(e).__name__}: {e}"
-        return PENDING_LORA_STATUS
-
-    finally:
-        try:
-            del new_transformer_cpu
-        except Exception:
-            pass
-        try:
-            del tmp_ledger
-        except Exception:
-            pass
-        gc.collect()
-
-
-def apply_prepared_lora_state_to_pipeline():
-    """
-    Fast step: copy the already prepared CPU state into the live transformer.
-    This is the only part that should remain near generation time.
-    """
-    global current_lora_key, PENDING_LORA_KEY, PENDING_LORA_STATE
-
-    if PENDING_LORA_STATE is None or PENDING_LORA_KEY is None:
-        print("[LoRA] No prepared LoRA state available; skipping.")
-        return False
-
-    if current_lora_key == PENDING_LORA_KEY:
-        print("[LoRA] Prepared LoRA state already active; skipping.")
-        return True
-
-    existing_transformer = _transformer
     with torch.no_grad():
-        missing, unexpected =
+        missing, unexpected = _transformer.load_state_dict(fused_state, strict=False)
         if missing or unexpected:
-            print(
+            print(
+                f"[LoRA] state_dict mismatch: missing={len(missing)}, unexpected={len(unexpected)}"
+            )

-
-
-    return True
+    ACTIVE_LORA_KEY = key
+    return f"Applied LoRAs: {key[:12]}"

 # ---- REPLACE PRELOAD BLOCK START ----
 # Preload all models for ZeroGPU tensor packing.
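Note: the apply step swaps fused weights into the live transformer with load_state_dict(strict=False) instead of rebuilding the module, which is what makes re-applying a cached mix cheap enough to run before every generation. The pattern, reduced to a toy module:

import torch
import torch.nn as nn

model = nn.Linear(4, 4)
# Frozen CPU snapshot, analogous to BASE_TRANSFORMER_STATE in the next hunk.
base_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

fused = {k: v + 0.01 for k, v in base_state.items()}  # stand-in for a LoRA fuse
with torch.no_grad():
    missing, unexpected = model.load_state_dict(fused, strict=False)
print(missing, unexpected)  # both empty when the key sets line up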
@@ -489,6 +490,13 @@ _orig_gemma_embeddings_factory = ledger.gemma_embeddings_processor

 # Call the original factories once to create the cached instances we will serve by default.
 _transformer = _orig_transformer_factory()
+BASE_TRANSFORMER_STATE = {
+    k: v.detach().cpu().contiguous()
+    for k, v in _transformer.state_dict().items()
+}
+
+ACTIVE_LORA_KEY: str | None = None
+LORA_STATE_CACHE: dict[str, dict[str, torch.Tensor]] = {}
 _video_encoder = _orig_video_encoder_factory()
 _video_decoder = _orig_video_decoder_factory()
 _audio_encoder = _orig_audio_encoder_factory()
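Note: BASE_TRANSFORMER_STATE is snapshotted to CPU immediately after the factory call, before any LoRA is applied. Every later fuse starts from this pristine copy, so strengths never stack across requests and switching mixes never needs an "unfuse" step. A small sketch of the invariant (illustrative tensors only):

import torch

base = {"w": torch.ones(2, 2)}            # pristine copy, never mutated
delta = torch.full((2, 2), 0.1)           # stand-in for a fused LoRA update

# Each request fuses against the same base, so strengths never accumulate:
state_a = {"w": base["w"] + 0.8 * delta}
state_b = {"w": base["w"] + 0.3 * delta}  # independent of state_a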
@@ -559,6 +567,7 @@ def on_highres_toggle(first_image, last_image, high_res):
 def get_gpu_duration(
     first_image,
     last_image,
+    input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
@@ -581,6 +590,7 @@ def get_gpu_duration(
     transition_strength: float = 0.0,
     physics_strength: float = 0.0,
     reasoning_strength: float = 0.0,
+    twostep_strength: float = 0.0,
     progress=None,
 ):
     return int(gpu_duration)
@@ -590,6 +600,7 @@ def get_gpu_duration(
 def generate_video(
     first_image,
     last_image,
+    input_audio,
     prompt: str,
     duration: float,
     gpu_duration: float,
@@ -612,6 +623,7 @@ def generate_video(
     transition_strength: float = 0.0,
     physics_strength: float = 0.0,
     reasoning_strength: float = 0.0,
+    twostep_strength: float = 0.0,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
@@ -651,8 +663,13 @@ def generate_video(

         log_memory("before pipeline call")

-
-
+        apply_current_loras_to_transformer(
+            pose_strength, general_strength, motion_strength, dreamlay_strength,
+            mself_strength, dramatic_strength, fluid_strength, liquid_strength,
+            demopose_strength, voice_strength, realism_strength, transition_strength,
+            physics_strength, reasoning_strength, twostep_strength,
+        )
+
         video, audio = pipeline(
             prompt=prompt,
             seed=current_seed,
@@ -661,6 +678,7 @@ def generate_video(
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
+            audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
@@ -695,6 +713,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
         with gr.Row():
             first_image = gr.Image(label="First Frame (Optional)", type="pil")
             last_image = gr.Image(label="Last Frame (Optional)", type="pil")
+        input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
         prompt = gr.Textbox(
             label="Prompt",
             info="for best results - make it as elaborate as possible",
@@ -771,15 +790,13 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
                 minimum=0.0, maximum=2.0, value=0.0, step=0.01
             )
             reasoning_strength = gr.Slider(
-                label="
+                label="Official Reasoning strength",
+                minimum=0.0, maximum=2.0, value=0.0, step=0.01
+            )
+            twostep_strength = gr.Slider(
+                label="Two Step Reasoning strength",
                 minimum=0.0, maximum=2.0, value=0.0, step=0.01
             )
-            prepare_lora_btn = gr.Button("Prepare / Load LoRA Cache", variant="secondary")
-            lora_status = gr.Textbox(
-                label="LoRA Cache Status",
-                value="No LoRA state prepared yet.",
-                interactive=False,
-            )

         with gr.Column():
             output_video = gr.Video(label="Generated Video", autoplay=False)
@@ -796,6 +813,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             [
                 None,
                 "pinkknit.jpg",
+                None,
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
@@ -823,12 +841,13 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
                 0.0,
                 0.0,
                 0.0,
+                0.0,
             ],
         ],
         inputs=[
-            first_image, last_image, prompt, duration, gpu_duration,
+            first_image, last_image, input_audio, prompt, duration, gpu_duration,
             enhance_prompt, seed, randomize_seed, height, width,
-            pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength,
+            pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength, twostep_strength,
         ],
     )

@@ -850,18 +869,13 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
         outputs=[width, height],
     )

-    prepare_lora_btn.click(
-        fn=prepare_lora_cache,
-        inputs=[pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength],
-        outputs=[lora_status],
-    )

     generate_btn.click(
         fn=generate_video,
         inputs=[
-            first_image, last_image, prompt, duration, gpu_duration, enhance_prompt,
+            first_image, last_image, input_audio, prompt, duration, gpu_duration, enhance_prompt,
             seed, randomize_seed, height, width,
-            pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength,
+            pose_strength, general_strength, motion_strength, dreamlay_strength, mself_strength, dramatic_strength, fluid_strength, liquid_strength, demopose_strength, voice_strength, realism_strength, transition_strength, physics_strength, reasoning_strength, twostep_strength,
         ],
         outputs=[output_video, seed],
     )
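Note: one wiring detail these UI hunks rely on is that Gradio passes inputs positionally, so the component order in inputs=[...] must match the parameter order of generate_video exactly; that is why input_audio and the trailing twostep_strength are inserted at the same positions in both lists. A minimal self-contained example of the same pattern:

import gradio as gr

def run(image, audio, strength=0.0):
    # Parameters arrive in the same order as the inputs list below.
    return f"audio={audio}, strength={strength}"

with gr.Blocks() as demo:
    image = gr.Image(type="pil")
    audio = gr.Audio(type="filepath")
    strength = gr.Slider(0.0, 2.0, value=0.0, step=0.01)
    out = gr.Textbox()
    gr.Button("Run").click(fn=run, inputs=[image, audio, strength], outputs=[out])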
@@ -872,4 +886,4 @@ css = """
 """

 if __name__ == "__main__":
-    demo.launch(theme=gr.themes.Citrus(), css=css)
+    demo.launch(theme=gr.themes.Citrus(), css=css)