Spaces:

Rafii
/

videovoice-dramabox

Running on Zero

App Files Files Community

github-actions[bot] committed 7 days ago

Commit

2c3df98

1 Parent(s): 4230483

deploy: switch to dramabox requirements @ b5b35d7

Browse files

Files changed (8) hide show

dramabox_src/audio_conditioning.py +115 -0
dramabox_src/audio_conditioning.py.training_helpers +115 -0
dramabox_src/inference.py +678 -0
dramabox_src/inference_server.py +380 -0
dramabox_src/model_downloader.py +105 -0
dramabox_src/preprocess.py +351 -0
dramabox_src/train.py +882 -0
dramabox_src/validate.py +363 -0

dramabox_src/audio_conditioning.py ADDED Viewed

	@@ -0,0 +1,115 @@

+"""Audio reference conditioning item for IC-LoRA voice cloning."""
+import torch
+from ltx_core.components.patchifiers import AudioPatchifier
+from ltx_core.conditioning.item import ConditioningItem
+from ltx_core.tools import AudioLatentTools
+from ltx_core.types import AudioLatentShape, LatentState
+class AudioConditionByReferenceLatent(ConditioningItem):
+    """Conditions audio generation on a reference audio latent for voice cloning.
+    Mirrors VideoConditionByReferenceLatent but for audio:
+    - Patchifies reference latent [B, C, T, F] -> [B, ref_T, 128]
+    - Computes 1D temporal positions via AudioPatchifier
+    - Sets denoise_mask = 1.0 - strength (strength=1.0 -> mask=0 -> frozen)
+    - Builds ASYMMETRIC attention mask: target->ref=1 (attend), ref->target=0 (read-only)
+    - APPENDS ref tokens to END of latent sequence (IC-LoRA pattern)
+    - Uses OVERLAPPING positions (same coordinate space) so RoPE doesn't
+      decay target->ref attention. The asymmetric mask provides the structural
+      signal that ref tokens are conditioning, not reconstruction targets.
+    Args:
+        latent: Reference audio latent [B, C, T, F] (pre-VAE-encoded).
+        strength: Conditioning strength. 1.0 = full (ref kept clean),
+            0.0 = none (ref fully denoised). Default 1.0.
+    """
+    def __init__(self, latent: torch.Tensor, strength: float = 1.0):
+        self.latent = latent
+        self.strength = strength
+    def apply_to(
+        self,
+        latent_state: LatentState,
+        latent_tools: AudioLatentTools,
+    ) -> LatentState:
+        """Append reference audio tokens with positions and attention mask."""
+        tokens = latent_tools.patchifier.patchify(self.latent)
+        # Compute positions for the reference audio — small offset (0.5s) from
+        # target start to avoid exact t=0 overlap (which causes ref content to
+        # bleed into target start), while keeping RoPE decay minimal.
+        # 0.5s / max_pos(20s) = 0.025 fractional — negligible RoPE decay.
+        ref_shape = AudioLatentShape(
+            batch=self.latent.shape[0],
+            channels=self.latent.shape[1],
+            frames=self.latent.shape[2],
+            mel_bins=self.latent.shape[3],
+        )
+        positions = latent_tools.patchifier.get_patch_grid_bounds(
+            output_shape=ref_shape,
+            device=self.latent.device,
+        )
+        # Small offset to prevent t=0 position collision between target and ref
+        positions = positions + 0.5
+        # Denoise mask: 0 for frozen (strength=1.0), 1 for fully denoised (strength=0.0)
+        denoise_mask = torch.full(
+            size=(*tokens.shape[:2], 1),
+            fill_value=1.0 - self.strength,
+            device=self.latent.device,
+            dtype=torch.float32,
+        )
+        # Build ASYMMETRIC attention mask manually.
+        # Structure:
+        #              target (N)    ref (M)
+        #            ┌────────────┬──────────┐
+        #   target   │    1.0     │   1.0    │  target attends to everything
+        #    (N)     │            │          │
+        #            ├────────────┼──────────┤
+        #    ref     │    0.0     │   1.0    │  ref only attends to itself
+        #    (M)     │            │          │
+        #            └────────────┴──────────┘
+        #
+        # This makes reference tokens "read-only conditioning":
+        # - Target tokens freely attend to ref (voice cloning signal)
+        # - Ref tokens don't attend to noisy target (stays clean/stable)
+        batch_size = tokens.shape[0]
+        num_target = latent_state.latent.shape[1]
+        num_ref = tokens.shape[1]
+        total = num_target + num_ref
+        # Use float32 for the [0,1] mask — _prepare_self_attention_mask converts
+        # to log-space bias in the model's compute dtype before it reaches attention.
+        mask = torch.zeros(
+            (batch_size, total, total),
+            device=self.latent.device,
+            dtype=torch.float32,
+        )
+        # Incorporate existing mask if present, otherwise full attention for target
+        if latent_state.attention_mask is not None:
+            mask[:, :num_target, :num_target] = latent_state.attention_mask
+        else:
+            mask[:, :num_target, :num_target] = 1.0
+        # Target -> ref: FULL attention (target can read reference voice)
+        mask[:, :num_target, num_target:] = 1.0
+        # Ref -> target: BLOCKED (ref is read-only, doesn't see noisy target)
+        # mask[:, num_target:, :num_target] remains 0.0
+        # Ref -> ref: full self-attention within reference
+        mask[:, num_target:, num_target:] = 1.0
+        return LatentState(
+            latent=torch.cat([latent_state.latent, tokens], dim=1),
+            denoise_mask=torch.cat([latent_state.denoise_mask, denoise_mask], dim=1),
+            positions=torch.cat([latent_state.positions, positions], dim=2),
+            clean_latent=torch.cat([latent_state.clean_latent, tokens], dim=1),
+            attention_mask=mask,
+        )

dramabox_src/audio_conditioning.py.training_helpers ADDED Viewed

	@@ -0,0 +1,115 @@

+"""Audio reference conditioning item for IC-LoRA voice cloning."""
+import torch
+from ltx_core.components.patchifiers import AudioPatchifier
+from ltx_core.conditioning.item import ConditioningItem
+from ltx_core.tools import AudioLatentTools
+from ltx_core.types import AudioLatentShape, LatentState
+class AudioConditionByReferenceLatent(ConditioningItem):
+    """Conditions audio generation on a reference audio latent for voice cloning.
+    Mirrors VideoConditionByReferenceLatent but for audio:
+    - Patchifies reference latent [B, C, T, F] -> [B, ref_T, 128]
+    - Computes 1D temporal positions via AudioPatchifier
+    - Sets denoise_mask = 1.0 - strength (strength=1.0 -> mask=0 -> frozen)
+    - Builds ASYMMETRIC attention mask: target->ref=1 (attend), ref->target=0 (read-only)
+    - APPENDS ref tokens to END of latent sequence (IC-LoRA pattern)
+    - Uses OVERLAPPING positions (same coordinate space) so RoPE doesn't
+      decay target->ref attention. The asymmetric mask provides the structural
+      signal that ref tokens are conditioning, not reconstruction targets.
+    Args:
+        latent: Reference audio latent [B, C, T, F] (pre-VAE-encoded).
+        strength: Conditioning strength. 1.0 = full (ref kept clean),
+            0.0 = none (ref fully denoised). Default 1.0.
+    """
+    def __init__(self, latent: torch.Tensor, strength: float = 1.0):
+        self.latent = latent
+        self.strength = strength
+    def apply_to(
+        self,
+        latent_state: LatentState,
+        latent_tools: AudioLatentTools,
+    ) -> LatentState:
+        """Append reference audio tokens with positions and attention mask."""
+        tokens = latent_tools.patchifier.patchify(self.latent)
+        # Compute positions for the reference audio — small offset (0.5s) from
+        # target start to avoid exact t=0 overlap (which causes ref content to
+        # bleed into target start), while keeping RoPE decay minimal.
+        # 0.5s / max_pos(20s) = 0.025 fractional — negligible RoPE decay.
+        ref_shape = AudioLatentShape(
+            batch=self.latent.shape[0],
+            channels=self.latent.shape[1],
+            frames=self.latent.shape[2],
+            mel_bins=self.latent.shape[3],
+        )
+        positions = latent_tools.patchifier.get_patch_grid_bounds(
+            output_shape=ref_shape,
+            device=self.latent.device,
+        )
+        # Small offset to prevent t=0 position collision between target and ref
+        positions = positions + 0.5
+        # Denoise mask: 0 for frozen (strength=1.0), 1 for fully denoised (strength=0.0)
+        denoise_mask = torch.full(
+            size=(*tokens.shape[:2], 1),
+            fill_value=1.0 - self.strength,
+            device=self.latent.device,
+            dtype=torch.float32,
+        )
+        # Build ASYMMETRIC attention mask manually.
+        # Structure:
+        #              target (N)    ref (M)
+        #            ┌────────────┬──────────┐
+        #   target   │    1.0     │   1.0    │  target attends to everything
+        #    (N)     │            │          │
+        #            ├────────────┼──────────┤
+        #    ref     │    0.0     │   1.0    │  ref only attends to itself
+        #    (M)     │            │          │
+        #            └────────────┴──────────┘
+        #
+        # This makes reference tokens "read-only conditioning":
+        # - Target tokens freely attend to ref (voice cloning signal)
+        # - Ref tokens don't attend to noisy target (stays clean/stable)
+        batch_size = tokens.shape[0]
+        num_target = latent_state.latent.shape[1]
+        num_ref = tokens.shape[1]
+        total = num_target + num_ref
+        # Use float32 for the [0,1] mask — _prepare_self_attention_mask converts
+        # to log-space bias in the model's compute dtype before it reaches attention.
+        mask = torch.zeros(
+            (batch_size, total, total),
+            device=self.latent.device,
+            dtype=torch.float32,
+        )
+        # Incorporate existing mask if present, otherwise full attention for target
+        if latent_state.attention_mask is not None:
+            mask[:, :num_target, :num_target] = latent_state.attention_mask
+        else:
+            mask[:, :num_target, :num_target] = 1.0
+        # Target -> ref: FULL attention (target can read reference voice)
+        mask[:, :num_target, num_target:] = 1.0
+        # Ref -> target: BLOCKED (ref is read-only, doesn't see noisy target)
+        # mask[:, num_target:, :num_target] remains 0.0
+        # Ref -> ref: full self-attention within reference
+        mask[:, num_target:, num_target:] = 1.0
+        return LatentState(
+            latent=torch.cat([latent_state.latent, tokens], dim=1),
+            denoise_mask=torch.cat([latent_state.denoise_mask, denoise_mask], dim=1),
+            positions=torch.cat([latent_state.positions, positions], dim=2),
+            clean_latent=torch.cat([latent_state.clean_latent, tokens], dim=1),
+            attention_mask=mask,
+        )

dramabox_src/inference.py ADDED Viewed

	@@ -0,0 +1,678 @@

+#!/usr/bin/env python3
+"""
+LTX-2.3 TTS with IC-LoRA voice cloning.
+Uses AudioConditionByReferenceLatent to append reference audio tokens to the
+end of the target sequence.  Auto-detects distilled vs dev checkpoint and
+selects the appropriate denoiser (SimpleDenoiser / GuidedDenoiser) and sigma
+schedule.  Leverages the official euler_denoising_loop, AudioLatentTools,
+GaussianNoiser, and X0Model wrapper throughout.
+Usage (distilled):
+    python tts_iclora.py \
+        --voice-sample reference.wav \
+        --prompt "A woman speaks clearly: The weather today will be sunny." \
+        --output tts_output.wav
+Usage (dev):
+    python tts_iclora.py \
+        --voice-sample reference.wav \
+        --prompt "A woman speaks clearly: The weather today will be sunny." \
+        --checkpoint ltx-2.3-22b-dev-audio-only.safetensors \
+        --full-checkpoint ltx-2.3-22b-dev.safetensors \
+        --output tts_output.wav
+"""
+import argparse
+import json
+import logging
+import os
+import re
+import struct
+import sys
+import time
+from pathlib import Path
+import torch
+import torchaudio
+REPO_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ltx2"))
+# ltx-pipelines already on path via ltx2/
+# Also add the local directory so audio_conditioning.py is importable
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+MODEL_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "models")
+GEMMA_DIR = os.environ.get("GEMMA_DIR", "gemma-3-12b-it-qat-q4_0-unquantized")
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def detect_model_type(checkpoint_path: str) -> str:
+    """Detect if checkpoint is distilled or dev by checking filename and metadata."""
+    path_lower = checkpoint_path.lower()
+    if "distilled" in path_lower:
+        return "distilled"
+    if "dev" in path_lower:
+        return "dev"
+    # Fallback: try to read safetensors metadata
+    try:
+        with open(checkpoint_path, "rb") as f:
+            header_size = struct.unpack("<Q", f.read(8))[0]
+            header = json.loads(f.read(header_size).decode())
+        metadata = header.get("__metadata__", {})
+        version = metadata.get("model_version", "")
+        if "distilled" in version.lower():
+            return "distilled"
+    except Exception:
+        pass
+    # Default to distilled (most common for audio-only)
+    return "distilled"
+# Maps a laugh-verb regex to the base number of seconds budgeted per match.
+# _contextual_laugh_duration iterates this table, applies each pattern
+# case-insensitively, and scales the base value by the modifier after the verb.
+_LAUGH_VERBS = {
+    # base seconds per occurrence; gets scaled by the modifier found nearby.
+    # Verb regex covers inflections: laugh/laughs/laughed/laughing.
+    r"\blaugh(?:s|ed|ing)?\b": 1.5,
+    r"\bcackl(?:e|es|ed|ing)\b": 1.5,
+    r"\bchuckl(?:e|es|ed|ing)\b": 1.0,
+    r"\bgiggl(?:e|es|ed|ing)\b": 1.0,
+    r"\bsnicker(?:s|ed|ing)?\b": 0.8,
+    # NOTE(review): "cruel laugh" is also matched by the generic laugh
+    # pattern above, so that phrase is counted by both entries -- confirm
+    # the extra budget is intended.
+    r"\bcru?el laugh\b": 1.5,
+}
+def _contextual_laugh_duration(text: str) -> float:
+    """Context-aware laugh budget.
+    For each laugh verb in the prompt, look at the adjective/adverb that
+    modifies it and scale the base duration:
+      - short modifiers  (briefly, softly, once)     -> 0.4x base
+      - long modifiers   (maniacally, heartily, ...) -> 1.2x base
+      - default (no mod / neutral)                   -> 1.0x base
+    Also reward phonetic repetition inside quotes -- 'Hahahahahaha' buys more
+    time than 'Haha' -- at ~0.2s per extra repeated syllable.
+    """
+    # "softly" / "quietly" describe volume not length, so keep at default 1.0x.
+    short_mod = re.compile(
+        r"^\s*(?:[a-z]+ly )?(?:briefly|shortly|once|quickly)",
+        re.IGNORECASE)
+    long_mod = re.compile(
+        r"^\s*(?:[a-z]+ly )?(?:maniacally|heartily|uproariously|uncontrollably|"
+        r"hysterically|darkly|wickedly|evilly|loudly|long)"
+        r"|^\s*between phrases", re.IGNORECASE)
+    total = 0.0
+    for pat, base_dur in _LAUGH_VERBS.items():
+        for m in re.finditer(pat, text, re.IGNORECASE):
+            ctx = text[m.end(): m.end() + 40]
+            if short_mod.match(ctx):
+                total += base_dur * 0.4
+            elif long_mod.match(ctx):
+                total += base_dur * 1.2
+            else:
+                total += base_dur
+    # Phonetic laugh repetition inside quotes:
+    #   'Haha' = 2 syllables (base, no bonus)
+    #   'Hahahaha' = 4 syllables (+0.4s)
+    #   'Hehehehahahahahahahaha' ~ 10 syllables (+1.6s)
+    for q in re.findall(r'"([^"]+)"', text) + re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text):
+        for run in re.findall(r"(?:h[ae]){3,}|(?:h[ae][ \-]?){3,}", q, re.IGNORECASE):
+            syls = len(re.findall(r"h[ae]", run, re.IGNORECASE))
+            total += 0.2 * max(syls - 2, 0)
+    return total
+def _estimate_nonverbal_duration(text: str) -> float:
+    """Estimate extra duration for non-verbal sounds and actions in the prompt.
+    Laugh-verb handling lives in ``_contextual_laugh_duration`` so cackle /
+    chuckle / laugh budgets scale with the adjective ("maniacally" vs
+    "briefly") and with the repetition length of 'Ha'/'He' tokens inside
+    quotes.
+    """
+    PATTERNS = {
+        # Breathing / sighs
+        r'\bsighs?\b': 0.8, r'\bshaky breath\b': 1.0, r'\bbreathing deeply\b': 1.0,
+        r'\bgasps?\b': 0.5, r'\bburps?\b': 0.5, r'\byawns?\b': 1.0,
+        r'\bpants?\b': 0.8, r'\bwheezes?\b': 0.8, r'\bcoughs?\b': 0.8,
+        r'\bsniffles?\b': 0.5, r'\bsnorts?\b': 0.3, r'\bgroans?\b': 0.8,
+        # Pauses (trimmed; earlier values over-budgeted silence)
+        r'\blong pause\b': 1.0, r'\bpauses? briefly\b': 0.3,
+        r'\bpauses?\b': 0.5, r'\bsilence\b': 1.0,
+        r'\blets? the .{1,20} hang\b': 1.0, r'\blets? .{1,20} sink in\b': 1.0,
+        # Physical actions that produce sound
+        r'\bslams?\b': 0.5, r'\bclaps?\b': 0.3,
+        r'\bdraws? (?:his|her|a) sword\b': 0.5,
+        r'\btakes? a (?:drag|swig|sip|drink)\b': 0.5,
+        r'\bwhistles?\b': 1.0, r'\bhums?\b': 0.8,
+        # Vocal actions (not in quotes but take time)
+        r'\bmutters?\b': 1.5, r'\bmumbles?\b': 1.0, r'\bwhispers?\b': 0.0,
+        r'\bclears? (?:his|her) throat\b': 0.5, r'\bgulps?\b': 0.5,
+        r'\bswallows?\b': 0.5,
+        # (laugh / chuckle / cackle / giggle / snicker handled by
+        # _contextual_laugh_duration below -- modifier-aware, not flat.)
+        # Emotional transitions
+        r'\bvoice (?:breaks?|cracks?|trembles?|drops?|rises?)\b': 0.5,
+        r'\bsteadies? (?:him|her)self\b': 1.0,
+        r'\bcatches? (?:his|her) breath\b': 1.0,
+        r'\bcomposes? (?:him|her)self\b': 0.8,
+        # Scene transitions that imply time
+        r'\bdemeanor shifts?\b': 0.5, r'\bsettles? in\b': 0.5,
+        r'\bleans? in\b': 0.3, r'\bwipes? (?:his|her) eyes\b': 0.5,
+    }
+    extra = 0.0
+    for pattern, dur in PATTERNS.items():
+        extra += dur * len(re.findall(pattern, text, re.IGNORECASE))
+    extra += _contextual_laugh_duration(text)
+    return extra
+def estimate_speech_duration(text: str, speed: float = 1.0) -> float:
+    """Estimate speech duration from spoken content + non-verbal actions.
+    Extracts spoken text by priority:
+    1. Quoted text ('...' or "...") -- official prompt guide format
+    2. Text after colon -- simple "Speaker: dialogue" format
+    3. Full text -- fallback
+    Also scans the full prompt for non-verbal cues (laughs, pauses, sighs,
+    gasps, etc.) and adds estimated duration for each.
+    """
+    # Try double quotes first (clean, no contraction issues)
+    quotes = re.findall(r'"([^"]+)"', text)
+    if not quotes:
+        # Single quotes: allow apostrophes in contractions (don't, can't, it's)
+        # Match ' to ' but apostrophes NOT followed by space/punctuation are kept inside
+        quotes = re.findall(r"'((?:[^']|'(?![\s.,!?)\]]))+)'", text)
+        # Filter out short fragments (scene directions like "He pauses")
+        quotes = [q for q in quotes if len(q.split()) > 3]
+    if quotes:
+        spoken = " ".join(quotes)
+    elif ":" in text:
+        spoken = text.split(":", 1)[1].strip()
+    else:
+        spoken = text
+    CHARS_PER_SEC = 14.0
+    text_len = len(spoken)
+    if text_len < 40:
+        chars_per_sec = CHARS_PER_SEC * 0.6
+    elif text_len < 80:
+        chars_per_sec = CHARS_PER_SEC * 0.8
+    else:
+        chars_per_sec = CHARS_PER_SEC
+    chars_per_sec *= speed
+    duration = text_len / chars_per_sec
+    sentence_count = spoken.count(".") + spoken.count("!") + spoken.count("?")
+    duration += sentence_count * 0.3
+    # Add time for non-verbal sounds/actions in the full prompt
+    duration += _estimate_nonverbal_duration(text)
+    return max(3.0, round(duration + 2.0, 1))
+def parse_args():
+    p = argparse.ArgumentParser(description="LTX-2.3 TTS with IC-LoRA voice cloning")
+    p.add_argument("--voice-sample", default=None, help="Voice reference WAV")
+    p.add_argument("--no-ref", action="store_true", help="Skip voice reference conditioning (raw base model)")
+    p.add_argument("--prompt", required=True, help="Text/scene description to synthesize")
+    p.add_argument("--output", default="tts_output.wav")
+    p.add_argument("--ref-duration", type=float, default=10.0, help="Seconds of voice reference to use")
+    p.add_argument("--gen-duration", type=float, default=0.0,
+                   help="Target output duration in seconds (0 = auto from prompt + multiplier). "
+                        "Set explicitly for long-form prompts (e.g. --gen-duration 30 for music). "
+                        "Outputs >20.5s automatically engage the end-of-clip silence-prior patch.")
+    p.add_argument("--pad-start", type=float, default=0.0,
+                   help="Prepend N seconds of silent padding, trimmed after decode (use 0 for clean starts)")
+    p.add_argument("--speed", type=float, default=1.0)
+    p.add_argument("--duration-multiplier", type=float, default=1.0,
+                   help="Multiply auto-estimated duration by this factor (e.g. 1.1 for 10%% more breathing room)")
+    p.add_argument("--checkpoint", default=os.path.join(MODEL_DIR, "ltx-2.3-audio-only.safetensors"))
+    p.add_argument("--full-checkpoint", default=os.path.join(MODEL_DIR, "ltx-2.3-22b-distilled.safetensors"))
+    p.add_argument("--gemma-root", default=GEMMA_DIR)
+    p.add_argument("--bnb-4bit", dest="bnb_4bit", action="store_true", default=True,
+                   help="Load Gemma text encoder via the bitsandbytes 4-bit path "
+                        "(required for the default unsloth/gemma-3-12b-it-bnb-4bit "
+                        "pre-quantized weights). Default: on.")
+    p.add_argument("--no-bnb-4bit", dest="bnb_4bit", action="store_false",
+                   help="Disable the bitsandbytes path (use only if --gemma-root "
+                        "points at an unquantized Gemma checkpoint).")
+    p.add_argument("--lora", default=None, help="Path to trained IC-LoRA .safetensors (audio-only)")
+    p.add_argument("--lora-rank", type=int, default=128, help="LoRA rank (must match training)")
+    p.add_argument("--id-guidance-scale", type=float, default=3.0, help="Identity guidance scale (0=disabled)")
+    p.add_argument("--seed", type=int, default=42)
+    # Auto-set based on model type but overridable
+    p.add_argument("--no-watermark", action="store_true",
+                   help="Skip Perth audio watermarking on the output (default: watermark on).")
+    p.add_argument("--sampler", choices=["euler", "heun"], default="euler",
+                   help="Denoising loop. 'heun' = jkass_quality 2nd-order predictor-corrector (~2x model calls, cleaner audio).")
+    p.add_argument("--cfg-scale", type=float, default=None, help="CFG scale (auto: 1.0 distilled, 7.0 dev)")
+    p.add_argument("--stg-scale", type=float, default=None, help="STG scale (auto: 0.0 distilled, 1.0 dev)")
+    p.add_argument("--stg-block", type=int, default=29, help="Block index for STG perturbation")
+    p.add_argument("--rescale-scale", type=float, default=None,
+                   help="Latent CFG std-rescale (default auto: cfg-aware schedule that prevents "
+                        "output clipping at high cfg; pass any float in [0,1] to override).")
+    p.add_argument("--modality-scale", type=float, default=None, help="Modality (auto: 1.0 distilled, 3.0 dev)")
+    p.add_argument("--cfg-clamp", type=float, default=0.0, help="Clamp guided pred std to N * cond std (0=disabled)")
+    p.add_argument("--steps", type=int, default=None, help="Override steps (auto: distilled sigmas / 30 dev)")
+    p.add_argument("--fps", type=float, default=None, help="FPS (auto: 24.0 distilled, 25.0 dev)")
+    p.add_argument(
+        "--negative-prompt",
+        default=(
+            "worst quality, inconsistent motion, blurry, jittery, distorted, "
+            "robotic voice, echo, background noise, off-sync audio, repetitive speech"
+        ),
+        help="Negative prompt for CFG (dev model)",
+    )
+    return p.parse_args()
+@torch.inference_mode()
+def main():
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+    args = parse_args()
+    t0 = time.time()
+    # ---- Imports (deferred to avoid startup cost when checking --help) ----
+    from audio_conditioning import AudioConditionByReferenceLatent
+    from ltx_core.batch_split import BatchSplitAdapter
+    from ltx_core.components.diffusion_steps import EulerDiffusionStep
+    from ltx_core.components.guiders import MultiModalGuider, MultiModalGuiderParams
+    from ltx_core.components.noisers import GaussianNoiser
+    from ltx_core.components.patchifiers import AudioPatchifier
+    from ltx_core.components.schedulers import LTX2Scheduler
+    from ltx_core.loader.registry import DummyRegistry
+    from ltx_core.loader.sd_ops import SDOps
+    from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder as Builder
+    from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+    from ltx_core.model.model_protocol import ModelConfigurator
+    from ltx_core.model.transformer.attention import AttentionFunction
+    from ltx_core.model.transformer.model import LTXModel, LTXModelType, X0Model
+    from ltx_core.model.transformer.rope import LTXRopeType
+    from ltx_core.tools import AudioLatentTools
+    from ltx_core.types import Audio, AudioLatentShape, LatentState, VideoPixelShape
+    from ltx_pipelines.utils.blocks import AudioConditioner, AudioDecoder, PromptEncoder
+    from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES
+    from ltx_pipelines.utils.denoisers import GuidedDenoiser, SimpleDenoiser
+    from ltx_pipelines.utils.gpu_model import gpu_model
+    from ltx_pipelines.utils.media_io import decode_audio_from_file
+    from ltx_pipelines.utils.samplers import euler_denoising_loop, heun_denoising_loop
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = torch.bfloat16
+    patchifier = AudioPatchifier(patch_size=1)
+    # ---- Detect model type and set defaults ----
+    model_type = detect_model_type(args.full_checkpoint)
+    logging.info(f"Detected model type: {model_type}")
+    is_distilled = model_type == "distilled"
+    if args.cfg_scale is None:
+        args.cfg_scale = 1.0 if is_distilled else 7.0
+    if args.stg_scale is None:
+        args.stg_scale = 0.0 if is_distilled else 1.0
+    if args.rescale_scale is None:
+        # Auto cfg-aware rescale: imported from inference_server to keep one source of truth.
+        from inference_server import auto_rescale_for_cfg
+        args.rescale_scale = 0.0 if is_distilled else auto_rescale_for_cfg(args.cfg_scale)
+    if args.modality_scale is None:
+        args.modality_scale = 1.0 if is_distilled else 3.0
+    if args.fps is None:
+        args.fps = 24.0 if is_distilled else 25.0
+    logging.info(
+        f"Params: cfg={args.cfg_scale}, stg={args.stg_scale}, rescale={args.rescale_scale}, "
+        f"modality={args.modality_scale}, fps={args.fps}"
+    )
+    # ---- Auto duration ----
+    if args.gen_duration <= 0:
+        args.gen_duration = estimate_speech_duration(args.prompt, args.speed)
+        if args.duration_multiplier != 1.0:
+            args.gen_duration = round(args.gen_duration * args.duration_multiplier, 1)
+        logging.info(f"Auto duration: {args.gen_duration}s for {len(args.prompt)} chars"
+                     f"{f' (x{args.duration_multiplier})' if args.duration_multiplier != 1.0 else ''}")
+    # ---- Compute target shape (include pad_start in duration) ----
+    padded_duration = args.gen_duration + args.pad_start
+    raw_frames = int(round(padded_duration * args.fps)) + 1
+    num_frames = ((raw_frames - 1 + 4) // 8) * 8 + 1
+    pixel_shape = VideoPixelShape(batch=1, frames=num_frames, height=64, width=64, fps=args.fps)
+    tgt_shape = AudioLatentShape.from_video_pixel_shape(pixel_shape)
+    logging.info(f"Target shape: {tgt_shape} ({args.gen_duration}s, {num_frames} frames)")
+    # ---- AudioLatentTools for target ----
+    audio_tools = AudioLatentTools(patchifier=patchifier, target_shape=tgt_shape)
+    # ---- Create initial state ----
+    state = audio_tools.create_initial_state(device, dtype)
+    logging.info(
+        f"Initial state: latent={state.latent.shape}, positions={state.positions.shape}, "
+        f"denoise_mask={state.denoise_mask.shape}"
+    )
+    if not args.no_ref and args.voice_sample:
+        # ---- Encode voice reference ----
+        logging.info(f"Loading voice reference: {args.voice_sample}")
+        voice = decode_audio_from_file(args.voice_sample, device, 0.0, args.ref_duration)
+        if voice is None:
+            raise ValueError(f"Could not load audio from {args.voice_sample}")
+        w = voice.waveform
+        if w.dim() == 2:
+            if w.shape[0] == 1:
+                w = w.repeat(2, 1)
+            w = w.unsqueeze(0)
+        elif w.dim() == 3 and w.shape[1] == 1:
+            w = w.repeat(1, 2, 1)
+        target_samples = int(args.ref_duration * voice.sampling_rate)
+        if w.shape[-1] < target_samples:
+            w = w.repeat(1, 1, (target_samples // w.shape[-1]) + 1)
+        w = w[..., :target_samples]
+        # Peak normalize reference
+        peak = w.abs().max()
+        if peak > 0:
+            target_peak = 10 ** (-4.0 / 20)  # -4dB
+            w = w * (target_peak / peak)
+            logging.info(f"Normalized reference: peak {peak:.4f} -> {target_peak:.4f}")
+        voice = Audio(waveform=w, sampling_rate=voice.sampling_rate)
+        logging.info("Encoding voice through Audio VAE...")
+        ac = AudioConditioner(checkpoint_path=args.full_checkpoint, dtype=dtype, device=device)
+        ref_latent = ac(lambda enc: vae_encode_audio(voice, enc, None))
+        del ac
+        torch.cuda.empty_cache()
+        logging.info(f"Reference latent: {ref_latent.shape}")
+        # ---- Apply conditioning: append ref tokens to END ----
+        conditioning = AudioConditionByReferenceLatent(latent=ref_latent.to(device, dtype), strength=1.0)
+        state = conditioning.apply_to(latent_state=state, latent_tools=audio_tools)
+        logging.info(
+            f"After conditioning: latent={state.latent.shape}, positions={state.positions.shape}, "
+            f"attention_mask={'None' if state.attention_mask is None else state.attention_mask.shape}"
+        )
+    else:
+        logging.info("No voice reference — running raw base model")
+    # ---- Apply noise ----
+    generator = torch.Generator(device=device).manual_seed(args.seed)
+    noiser = GaussianNoiser(generator=generator)
+    noised_state = noiser(state, noise_scale=1.0)
+    logging.info("Applied Gaussian noise to state")
+    # ---- Encode prompt ----
+    use_cfg = args.cfg_scale > 1.0
+    logging.info("Encoding prompt...")
+    pe = PromptEncoder(checkpoint_path=args.full_checkpoint, gemma_root=args.gemma_root, dtype=dtype, device=device,
+                       use_bnb_4bit=args.bnb_4bit, warm=True)
+    prompts_to_encode = [args.prompt]
+    if use_cfg:
+        prompts_to_encode.append(args.negative_prompt)
+    ctx = pe(prompts_to_encode, streaming_prefetch_count=None)
+    a_ctx = ctx[0].audio_encoding
+    a_ctx_neg = ctx[1].audio_encoding if use_cfg else None
+    del pe
+    torch.cuda.empty_cache()
+    logging.info(f"Prompt encoded: a_ctx={a_ctx.shape}" + (f", a_ctx_neg={a_ctx_neg.shape}" if a_ctx_neg is not None else ""))
+    # ---- Build audio-only model ----
+    logging.info("Building audio-only model...")
+    audio_only_sd_ops = SDOps("AO").with_matching(prefix="model.diffusion_model.").with_replacement(
+        "model.diffusion_model.", ""
+    )
+    class AudioOnlyConfigurator(ModelConfigurator[LTXModel]):
+        @classmethod
+        def from_config(cls, config):
+            t = config.get("transformer", {})
+            cp = None
+            if not t.get("caption_proj_before_connector", False):
+                from ltx_core.model.transformer.text_projection import create_caption_projection
+                with torch.device("meta"):
+                    cp = create_caption_projection(t, audio=True)
+            return LTXModel(
+                model_type=LTXModelType.AudioOnly,
+                audio_num_attention_heads=t.get("audio_num_attention_heads", 32),
+                audio_attention_head_dim=t.get("audio_attention_head_dim", 64),
+                audio_in_channels=t.get("audio_in_channels", 128),
+                audio_out_channels=t.get("audio_out_channels", 128),
+                num_layers=t.get("num_layers", 48),
+                audio_cross_attention_dim=t.get("audio_cross_attention_dim", 2048),
+                norm_eps=t.get("norm_eps", 1e-6),
+                attention_type=AttentionFunction(t.get("attention_type", "default")),
+                positional_embedding_theta=10000.0,
+                audio_positional_embedding_max_pos=[20.0],
+                timestep_scale_multiplier=t.get("timestep_scale_multiplier", 1000),
+                use_middle_indices_grid=t.get("use_middle_indices_grid", True),
+                rope_type=LTXRopeType(t.get("rope_type", "interleaved")),
+                double_precision_rope=t.get("frequencies_precision", False) == "float64",
+                apply_gated_attention=t.get("apply_gated_attention", False),
+                audio_caption_projection=cp,
+                cross_attention_adaln=t.get("cross_attention_adaln", False),
+            )
+    builder = Builder(
+        model_path=args.checkpoint,
+        model_class_configurator=AudioOnlyConfigurator,
+        model_sd_ops=audio_only_sd_ops,
+        registry=DummyRegistry(),
+    )
+    velocity_model = builder.build(device=device, dtype=dtype).to(device).eval()
+    # ---- Load LoRA weights (if provided) ----
+    if args.lora and os.path.exists(args.lora):
+        from peft import LoraConfig, get_peft_model
+        from safetensors.torch import load_file as st_load
+        logging.info(f"Loading LoRA: {args.lora}")
+        lora_sd = st_load(args.lora)
+        is_peft_format = any("base_model.model." in k for k in lora_sd.keys())
+        is_original_idlora = any("diffusion_model." in k for k in lora_sd.keys())
+        lora_config = LoraConfig(
+            r=args.lora_rank,
+            lora_alpha=args.lora_rank,
+            lora_dropout=0.0,
+            bias="none",
+            target_modules=[
+                "audio_attn1.to_k",
+                "audio_attn1.to_q",
+                "audio_attn1.to_v",
+                "audio_attn1.to_out.0",
+                "audio_attn2.to_k",
+                "audio_attn2.to_q",
+                "audio_attn2.to_v",
+                "audio_attn2.to_out.0",
+                "audio_ff.net.0.proj",
+                "audio_ff.net.2",
+            ],
+        )
+        velocity_model = get_peft_model(velocity_model, lora_config)
+        if is_peft_format:
+            mapped_sd = {}
+            for k, v in lora_sd.items():
+                new_key = k
+                if ".lora_A.weight" in k and ".lora_A.default.weight" not in k:
+                    new_key = k.replace(".lora_A.weight", ".lora_A.default.weight")
+                if ".lora_B.weight" in k and ".lora_B.default.weight" not in k:
+                    new_key = k.replace(".lora_B.weight", ".lora_B.default.weight")
+                mapped_sd[new_key] = v
+            missing, unexpected = velocity_model.load_state_dict(mapped_sd, strict=False)
+            loaded = len(mapped_sd) - len(unexpected)
+            logging.info(f"Loaded {loaded} LoRA weights (peft format)")
+        elif is_original_idlora:
+            audio_keys = {
+                k: v
+                for k, v in lora_sd.items()
+                if "audio_attn1" in k or "audio_attn2" in k or "audio_ff" in k
+            }
+            mapped_sd = {}
+            for k, v in audio_keys.items():
+                new_key = k.replace("diffusion_model.", "base_model.model.")
+                new_key = new_key.replace(".lora_A.weight", ".lora_A.default.weight")
+                new_key = new_key.replace(".lora_B.weight", ".lora_B.default.weight")
+                mapped_sd[new_key] = v
+            missing, unexpected = velocity_model.load_state_dict(mapped_sd, strict=False)
+            loaded = len(mapped_sd) - len(unexpected)
+            logging.info(f"Loaded {loaded} LoRA weights (original ID-LoRA)")
+        velocity_model = velocity_model.merge_and_unload()
+        logging.info("Merged LoRA into model")
+    logging.info(f"Model: {sum(p.numel() for p in velocity_model.parameters()) / 1e9:.1f}B params")
+    # ---- Wrap velocity model in X0Model ----
+    x0_model = X0Model(velocity_model)
+    # ---- Build denoiser and sigmas ----
+    stepper = EulerDiffusionStep()
+    # ---- Sigma schedule ----
+    if is_distilled:
+        if args.steps is not None and args.steps > 0:
+            sigmas = LTX2Scheduler().execute(steps=args.steps, latent=noised_state.latent).to(device)
+            logging.info(f"Distilled with custom {args.steps}-step schedule")
+        else:
+            sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, dtype=torch.float32, device=device)
+            logging.info(f"Distilled {len(DISTILLED_SIGMA_VALUES) - 1}-step schedule")
+    else:
+        steps = args.steps if args.steps is not None and args.steps > 0 else 30
+        sigmas = LTX2Scheduler().execute(steps=steps, latent=noised_state.latent).to(device)
+        logging.info(f"Dev {steps}-step schedule")
+    # ---- Denoiser: use GuidedDenoiser if any guidance is active, SimpleDenoiser otherwise ----
+    needs_guidance = args.cfg_scale > 1.0 or args.stg_scale > 0.0 or args.modality_scale > 1.0
+    if needs_guidance:
+        audio_guider = MultiModalGuider(
+            params=MultiModalGuiderParams(
+                cfg_scale=args.cfg_scale,
+                stg_scale=args.stg_scale,
+                stg_blocks=[args.stg_block] if args.stg_scale > 0 else [],
+                rescale_scale=args.rescale_scale,
+                modality_scale=args.modality_scale,
+                cfg_clamp_scale=args.cfg_clamp,
+            ),
+            negative_context=a_ctx_neg,
+        )
+        denoiser = GuidedDenoiser(
+            v_context=None,
+            a_context=a_ctx,
+            video_guider=None,
+            audio_guider=audio_guider,
+        )
+        logging.info(f"GuidedDenoiser: cfg={args.cfg_scale}, stg={args.stg_scale}, "
+                     f"rescale={args.rescale_scale}, modality={args.modality_scale}")
+    else:
+        denoiser = SimpleDenoiser(v_context=None, a_context=a_ctx)
+        logging.info("SimpleDenoiser (no guidance)")
+    logging.info(f"Sigmas: {sigmas.tolist()}")
+    # ---- Denoising loop ----
+    logging.info(f"Running denoising loop ({len(sigmas) - 1} steps)...")
+    with gpu_model(x0_model) as model:
+        batched_model = BatchSplitAdapter(model, max_batch_size=1)
+        denoise_fn = heun_denoising_loop if args.sampler == "heun" else euler_denoising_loop
+        _, audio_state = denoise_fn(
+            sigmas=sigmas,
+            video_state=None,
+            audio_state=noised_state,
+            stepper=stepper,
+            transformer=batched_model,
+            denoiser=denoiser,
+        )
+    del velocity_model, x0_model
+    torch.cuda.empty_cache()
+    # ---- Strip ref tokens and unpatchify ----
+    logging.info("Stripping conditioning and unpatchifying...")
+    audio_state = audio_tools.clear_conditioning(audio_state)
+    audio_state = audio_tools.unpatchify(audio_state)
+    logging.info(f"Final latent shape: {audio_state.latent.shape}")
+    # ---- End-of-clip silence-prior fix ----
+    # Base LTX-2.3 22B was trained on audio clips ≤ ~20 s and learned a strong
+    # "clip-end silence" prior at the next patchifier-aligned latent boundary
+    # (frame 513 = 8 × 64 + 1). For longer outputs that prior leaks through as
+    # a ~30 ms hard silence dip near 20.4 s. Linearly interpolating frames
+    # 512–513 between their neighbours (511 and 514) removes the dip cleanly.
+    latent_in = audio_state.latent
+    if latent_in.shape[2] > 513:
+        f0, f1 = 511, 514
+        n = f1 - f0
+        patched = latent_in.clone()
+        for f in (512, 513):
+            t = (f - f0) / n
+            patched[:, :, f, :] = (1.0 - t) * latent_in[:, :, f0, :] + t * latent_in[:, :, f1, :]
+        latent_in = patched
+    # ---- Decode audio ----
+    logging.info("Decoding audio...")
+    ad = AudioDecoder(checkpoint_path=args.full_checkpoint, dtype=dtype, device=device)
+    decoded = ad(latent_in)
+    del ad
+    torch.cuda.empty_cache()
+    wav = decoded.waveform
+    if wav.dim() == 1:
+        wav = wav.unsqueeze(0)
+    sr = decoded.sampling_rate
+    # Trim leading pad if --pad-start was used
+    if args.pad_start > 0:
+        trim_samples = int(args.pad_start * sr)
+        wav = wav[..., trim_samples:]
+        logging.info(f"Trimmed {args.pad_start}s ({trim_samples} samples) of start padding")
+    # Apply Perth (Perceptual Threshold) imperceptible neural watermark — see
+    # https://github.com/resemble-ai/perth. Mono waveform required; if stereo,
+    # we average to mono for the watermark and broadcast back. Skip on
+    # --no-watermark for debugging.
+    wav_cpu = wav.float().cpu()
+    if not getattr(args, "no_watermark", False):
+        try:
+            import perth
+            import numpy as np
+            wm = perth.PerthImplicitWatermarker()
+            mono = wav_cpu.mean(dim=0).numpy() if wav_cpu.shape[0] > 1 else wav_cpu[0].numpy()
+            mono_wm = wm.apply_watermark(mono, sample_rate=sr)
+            mono_wm_t = torch.from_numpy(np.asarray(mono_wm, dtype=np.float32)).unsqueeze(0)
+            wav_cpu = mono_wm_t if wav_cpu.shape[0] == 1 else mono_wm_t.repeat(wav_cpu.shape[0], 1)
+        except Exception as e:
+            logging.warning(f"Perth watermark skipped ({e})")
+    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+    torchaudio.save(args.output, wav_cpu, sr)
+    elapsed = time.time() - t0
+    logging.info(f"Output: {args.output} ({wav.shape[-1] / sr:.1f}s)")
+    logging.info(f"Total time: {elapsed:.1f}s")
+if __name__ == "__main__":
+    main()

dramabox_src/inference_server.py ADDED Viewed

	@@ -0,0 +1,380 @@

+#!/usr/bin/env python3
+"""
+Warm TTS server — loads models once, accepts requests via stdin or function call.
+The key insight: inference.py spends 11s on Gemma + 8s on model load every call.
+This server loads everything once and keeps it warm.
+We import and call the same code paths as inference.py but cache the heavy objects.
+"""
+import json
+import logging
+import os
+import re
+import sys
+import time
+from pathlib import Path
+import torch
+import torchaudio
+# Setup paths
+APP_DIR = Path(__file__).parent.parent
+sys.path.insert(0, str(APP_DIR / "ltx2"))
+sys.path.insert(0, str(APP_DIR / "src"))
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+from audio_conditioning import AudioConditionByReferenceLatent
+from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.components.patchifiers import AudioPatchifier
+from ltx_core.components.guiders import MultiModalGuider, MultiModalGuiderParams
+from ltx_core.components.schedulers import LTX2Scheduler
+from ltx_core.components.diffusion_steps import EulerDiffusionStep
+from ltx_core.loader import DummyRegistry
+from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder as Builder
+from ltx_core.loader.sd_ops import SDOps
+from ltx_core.model.transformer.model import LTXModel, LTXModelType, X0Model
+from ltx_core.model.transformer.rope import LTXRopeType
+from ltx_core.model.transformer.text_projection import create_caption_projection
+from ltx_core.model.transformer.attention import AttentionFunction
+from ltx_core.model.model_protocol import ModelConfigurator
+from ltx_core.tools import AudioLatentTools
+from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
+from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+from ltx_pipelines.utils.blocks import AudioConditioner, AudioDecoder, PromptEncoder
+from ltx_pipelines.utils.media_io import decode_audio_from_file
+from ltx_pipelines.utils.denoisers import GuidedDenoiser
+from ltx_pipelines.utils.samplers import euler_denoising_loop
+from safetensors import safe_open
+DEFAULT_NEG = "worst quality, inconsistent, robotic, distorted, noise, static, muffled, unclear, unnatural, monotone"
def estimate_duration(prompt, multiplier=1.1):
    """Estimate the target clip length in seconds for ``prompt``.

    Delegates to the CLI's ``inference.estimate_speech_duration`` so the warm
    server and the per-call CLI size their outputs from the same estimator.

    Args:
        prompt: Text prompt to be spoken.
        multiplier: Factor applied to the estimator's result.

    Returns:
        The scaled estimate rounded to one decimal place, never below 3.0.
    """
    # Imported lazily, at call time, exactly as the original did.
    from inference import estimate_speech_duration

    scaled = estimate_speech_duration(prompt) * multiplier
    return max(3.0, round(scaled, 1))
def auto_rescale_for_cfg(cfg: float) -> float:
    """Return the std-rescale strength to pair with a given CFG scale.

    The CFG formula ``pred = cond + (cfg-1)*(cond - uncond)`` makes
    ``pred.std()`` grow roughly linearly with cfg, which the audio VAE and
    vocoder render as progressively louder waveforms; by cfg≈3 the output
    starts hard-clipping at 0 dBFS, and clipped information cannot be
    recovered in post.

    The schedule is piecewise-linear through the knots
    (2, 0.0), (3, 0.6), (4, 0.8), (8, 0.8), (10, 1.0), chosen from an
    empirical sweep of the rescale needed for ≥1 dB of peak headroom
    (cfg=2.5 → 0.2, cfg=3 → 0.6, cfg=4 → 0.8, cfg=5–8 → 0.8, cfg=10 → 1.0):

    * cfg ≤ 2      → 0.0
    * 2 < cfg ≤ 3  → ramps 0.0 → 0.6
    * 3 < cfg ≤ 4  → ramps 0.6 → 0.8
    * 4 < cfg ≤ 8  → 0.8 plateau (keeps the "extra punch" of high CFG)
    * cfg > 8      → ramps towards 1.0, reached at cfg=10 and capped there
    """
    if cfg <= 2.0:
        rescale = 0.0
    elif cfg <= 3.0:
        rescale = 0.6 * (cfg - 2.0)
    elif cfg <= 4.0:
        rescale = 0.6 + 0.2 * (cfg - 3.0)
    elif cfg <= 8.0:
        rescale = 0.8
    else:
        rescale = min(1.0, 0.8 + 0.1 * (cfg - 8.0))
    return rescale
class TTSServer:
    """Warm text-to-speech server: loads every model once, reuses it per request.

    The constructor builds the prompt encoder, the audio conditioner, the
    audio-only transformer and the audio decoder and keeps them on
    ``self.device``. :meth:`generate` then runs one denoising pass per request
    without reloading any of them.
    """

    def __init__(self, checkpoint=None, full_checkpoint=None, gemma_root=None,
                 device="cuda", dtype="bf16", compile_model=True, bnb_4bit=True):
        """Resolve model locations, then load all models.

        Args:
            checkpoint: Path to the audio-only transformer checkpoint. Defaults
                to the merged v13 file under ``APP_DIR / "models"``.
            full_checkpoint: Checkpoint handed to the prompt encoder, audio
                conditioner and audio decoder. Defaults to the
                ``LTX_FULL_CHECKPOINT`` environment variable, then to a
                hard-coded path.
            gemma_root: Gemma directory. When omitted, ``GEMMA_DIR`` is used
                if set; otherwise ``model_downloader.get_gemma_path`` is
                called to obtain it.
            device: Torch device string.
            dtype: ``"fp16"`` selects ``torch.float16``; any other value
                selects ``torch.bfloat16``.
            compile_model: Wrap the transformer in ``torch.compile``.
            bnb_4bit: Forwarded to ``PromptEncoder`` as ``use_bnb_4bit``.
        """
        MODELS = APP_DIR / "models"
        self.checkpoint = checkpoint or str(MODELS / "ltx-2.3-22b-dev-audio-only-v13-merged.safetensors")
        self.full_checkpoint = full_checkpoint or os.environ.get(
            "LTX_FULL_CHECKPOINT", "/mnt/persistent0/manmay/models/ltx23/ltx-2.3-22b-dev.safetensors")
        if gemma_root is None and not os.environ.get("GEMMA_DIR"):
            from model_downloader import get_gemma_path
            gemma_root = get_gemma_path()
        self.gemma_root = gemma_root or os.environ["GEMMA_DIR"]
        self.device = torch.device(device)
        self.dtype = torch.float16 if dtype == "fp16" else torch.bfloat16
        self.compile_model = compile_model
        self.bnb_4bit = bnb_4bit
        self.patchifier = AudioPatchifier(patch_size=1)
        # Cached models
        self._prompt_encoder = None
        self._velocity_model = None
        self._audio_conditioner = None
        self._audio_decoder = None
        # Watermarker is created on first use in _apply_watermark.
        self._perth = None
        logging.info(f"TTSServer loading on {device}...")
        t0 = time.time()
        self._load_all()
        logging.info(f"All models loaded in {time.time()-t0:.1f}s — ready for requests")

    def _load_all(self):
        """Build the four cached components and log how long each took."""
        # 1. Prompt encoder (Gemma + embeddings processor kept warm)
        t0 = time.time()
        self._prompt_encoder = PromptEncoder(
            checkpoint_path=self.full_checkpoint,
            gemma_root=self.gemma_root,
            dtype=self.dtype, device=self.device,
            warm=True,
            use_bnb_4bit=self.bnb_4bit,
            audio_only=True,
        )
        logging.info(f"  PromptEncoder (warm): {time.time()-t0:.1f}s")
        # 2. Audio conditioner (VAE encoder kept warm)
        t0 = time.time()
        self._audio_conditioner = AudioConditioner(
            checkpoint_path=self.full_checkpoint,
            dtype=self.dtype, device=self.device,
            warm=True,
        )
        logging.info(f"  AudioConditioner (warm): {time.time()-t0:.1f}s")
        # 3. Transformer
        t0 = time.time()
        # Parse the embedded config up front so a checkpoint without config
        # metadata fails here; the parsed values are not used further in
        # this method.
        with safe_open(self.checkpoint, framework="pt") as f:
            json.loads(f.metadata()["config"])

        class AudioOnlyConfigurator(ModelConfigurator[LTXModel]):
            """Builds an audio-only ``LTXModel`` from a checkpoint config dict."""

            @classmethod
            def from_config(cls, cfg):
                t = cfg.get("transformer", {})
                cp = None
                # A caption projection is created (on the meta device) only
                # when the config does not place it before the connector.
                if not t.get("caption_proj_before_connector", False):
                    with torch.device("meta"):
                        cp = create_caption_projection(t, audio=True)
                return LTXModel(
                    model_type=LTXModelType.AudioOnly,
                    audio_num_attention_heads=t.get("audio_num_attention_heads", 32),
                    audio_attention_head_dim=t.get("audio_attention_head_dim", 64),
                    audio_in_channels=t.get("audio_in_channels", 128),
                    audio_out_channels=t.get("audio_out_channels", 128),
                    num_layers=t.get("num_layers", 48),
                    audio_cross_attention_dim=t.get("audio_cross_attention_dim", 2048),
                    norm_eps=t.get("norm_eps", 1e-6),
                    attention_type=AttentionFunction(t.get("attention_type", "default")),
                    positional_embedding_theta=10000.0,
                    audio_positional_embedding_max_pos=[20.0],
                    timestep_scale_multiplier=t.get("timestep_scale_multiplier", 1000),
                    use_middle_indices_grid=t.get("use_middle_indices_grid", True),
                    rope_type=LTXRopeType(t.get("rope_type", "interleaved")),
                    double_precision_rope=t.get("frequencies_precision", False) == "float64",
                    apply_gated_attention=t.get("apply_gated_attention", False),
                    audio_caption_projection=cp,
                    cross_attention_adaln=t.get("cross_attention_adaln", False),
                )

        # Keep only "model.diffusion_model.*" keys and strip that prefix.
        audio_sd_ops = SDOps("AO").with_matching(prefix="model.diffusion_model.").with_replacement(
            "model.diffusion_model.", "")
        builder = Builder(
            model_path=self.checkpoint,
            model_class_configurator=AudioOnlyConfigurator,
            model_sd_ops=audio_sd_ops,
            registry=DummyRegistry(),
        )
        self._velocity_model = builder.build(device=self.device, dtype=self.dtype).to(self.device).eval()
        n_params = sum(p.numel() for p in self._velocity_model.parameters()) / 1e9
        vram_gb = sum(p.numel() * p.element_size() for p in self._velocity_model.parameters()) / 1e9
        logging.info(f"  Transformer: {time.time()-t0:.1f}s ({n_params:.1f}B params, {vram_gb:.1f}GB VRAM, {self.dtype})")
        # torch.compile for faster denoising
        if self.compile_model:
            t0 = time.time()
            logging.info("  Compiling transformer with torch.compile (default mode)...")
            self._velocity_model = torch.compile(self._velocity_model, mode="default", dynamic=True)
            logging.info(f"  Compiled: {time.time()-t0:.1f}s (first call triggers actual compilation)")
        # 4. Audio decoder (VAE decoder + vocoder kept warm)
        t0 = time.time()
        self._audio_decoder = AudioDecoder(
            checkpoint_path=self.full_checkpoint,
            dtype=self.dtype, device=self.device,
            warm=True,
        )
        logging.info(f"  AudioDecoder (warm): {time.time()-t0:.1f}s")

    @staticmethod
    def _prepare_reference_waveform(w, sampling_rate, ref_duration):
        """Shape, loop/trim and peak-normalise a voice-reference waveform.

        A 2-D input gains a leading dimension (a single-row input is first
        duplicated to two rows); a 3-D input whose second dimension is 1 has
        that dimension duplicated. The last dimension is then tiled and cut
        to exactly ``int(ref_duration * sampling_rate)`` samples, and a
        non-silent signal is scaled so its peak sits at -4 dB relative to 1.0.

        Raises:
            ValueError: If the waveform has no samples (previously this
                surfaced as an opaque ZeroDivisionError).
        """
        if w.shape[-1] == 0:
            raise ValueError("Voice reference audio is empty")
        if w.dim() == 2:
            if w.shape[0] == 1:
                w = w.repeat(2, 1)
            w = w.unsqueeze(0)
        elif w.dim() == 3 and w.shape[1] == 1:
            w = w.repeat(1, 2, 1)
        target_samples = int(ref_duration * sampling_rate)
        if w.shape[-1] < target_samples:
            # Loop a short clip until it covers the requested duration.
            w = w.repeat(1, 1, (target_samples // w.shape[-1]) + 1)
        w = w[..., :target_samples]
        peak = w.abs().max()
        if peak > 0:
            w = w * (10 ** (-4.0 / 20) / peak)
        return w

    @staticmethod
    def _patch_clip_end_silence(latent):
        """Interpolate latent frames 512-513 from their neighbours 511 and 514.

        End-of-clip silence-prior fix. The base LTX-2.3 22B DiT was trained
        on audio clips ≤ ~20 s and learned a strong "clip-end silence" prior
        that lands on the next patchifier-aligned latent frame after 20 s —
        index 513 = 8*64+1. On longer outputs this leaks through as a
        high-norm latent burst at frame 513 (and adjacent 512), rendered as a
        ~30 ms hard silence dip near 20.4 s. Linear interpolation across the
        two affected frames removes the dip without retraining.

        The fix needs frame 514 as its right-hand neighbour, so it only runs
        when dimension 2 has more than 514 entries. (The earlier ``> 513``
        guard let a 514-frame latent through and then indexed out of range.)

        Returns:
            ``latent`` itself when it is too short, otherwise a patched clone;
            the input tensor is never modified.
        """
        f0, f1 = 511, 514          # neighbours used for interpolation
        if latent.shape[2] <= f1:
            return latent
        n = f1 - f0                # = 3
        patched = latent.clone()
        for f in (512, 513):
            t = (f - f0) / n
            patched[:, :, f, :] = (1.0 - t) * latent[:, :, f0, :] + t * latent[:, :, f1, :]
        return patched

    @staticmethod
    def _as_channels_first(waveform):
        """Return ``waveform`` as a float CPU tensor, promoting 1-D to ``[1, T]``.

        Mirrors the 1-D handling the CLI applies before saving.
        """
        wav_cpu = waveform.cpu().float()
        if wav_cpu.dim() == 1:
            wav_cpu = wav_cpu.unsqueeze(0)
        return wav_cpu

    def _apply_watermark(self, wav_cpu, sr):
        """Apply the Perth watermark, best effort.

        The watermarker receives a mono signal: a multi-row input is averaged
        over dimension 0 and the watermarked result is repeated back to the
        original row count. Any failure (including ``perth`` not being
        installed) is logged and the input is returned unchanged — the
        watermark step is deliberately non-fatal.
        """
        try:
            import numpy as np
            import perth
            if getattr(self, "_perth", None) is None:
                self._perth = perth.PerthImplicitWatermarker()
            mono = wav_cpu.mean(dim=0).numpy() if wav_cpu.shape[0] > 1 else wav_cpu[0].numpy()
            mono_wm = self._perth.apply_watermark(mono, sample_rate=sr)
            mono_wm_t = torch.from_numpy(np.asarray(mono_wm, dtype=np.float32)).unsqueeze(0)
            return mono_wm_t if wav_cpu.shape[0] == 1 else mono_wm_t.repeat(wav_cpu.shape[0], 1)
        except Exception as e:
            logging.warning(f"Perth watermark skipped ({e})")
            return wav_cpu

    @torch.inference_mode()
    def generate(self, prompt, voice_ref=None, cfg_scale=2.5, stg_scale=1.5,
                 duration_multiplier=1.1, seed=42, ref_duration=10.0,
                 rescale_scale="auto", gen_duration: float = 0.0):
        """Generate audio for ``prompt``.

        Args:
            prompt: Text prompt to synthesise.
            voice_ref: Optional path to a reference audio file used as voice
                conditioning. A path that does not exist is skipped with a
                warning.
            cfg_scale: Classifier-free guidance scale; the negative prompt is
                only encoded when this is > 1.0.
            stg_scale: STG scale passed to the guider.
            duration_multiplier: Factor applied to the estimated duration.
            seed: Seed for the noise generator.
            ref_duration: Seconds of reference audio to use (shorter clips
                are looped, longer ones trimmed).
            rescale_scale: latent-side CFG std-rescale that prevents clipping
                at high cfg. Set to "auto" (default) for the cfg-aware
                schedule, a float in [0, 1] for a fixed override, or 0 to
                disable.
            gen_duration: explicit target duration in seconds. 0 (default) →
                auto from prompt + duration_multiplier; >0 overrides
                everything else.

        Returns:
            ``(waveform, sampling_rate)`` as produced by the audio decoder.
        """
        t_total = time.time()
        # Duration + target shape — explicit gen_duration wins over the estimator.
        if gen_duration and gen_duration > 0:
            gen_dur = float(gen_duration)
        else:
            gen_dur = estimate_duration(prompt, duration_multiplier)
        fps = 25.0
        n_frames = int(round(gen_dur * fps)) + 1
        # Snap to the nearest frame count of the form 8*k + 1.
        n_frames = ((n_frames - 1 + 4) // 8) * 8 + 1
        pixel_shape = VideoPixelShape(batch=1, frames=n_frames, height=64, width=64, fps=fps)
        target_shape = AudioLatentShape.from_video_pixel_shape(pixel_shape)
        audio_tools = AudioLatentTools(patchifier=self.patchifier, target_shape=target_shape)
        # Initial state
        state = audio_tools.create_initial_state(device=self.device, dtype=self.dtype)
        # Voice ref conditioning
        if voice_ref and os.path.exists(voice_ref):
            t0 = time.time()
            voice = decode_audio_from_file(voice_ref, self.device, 0.0, ref_duration)
            w = self._prepare_reference_waveform(voice.waveform, voice.sampling_rate, ref_duration)
            voice = Audio(waveform=w, sampling_rate=voice.sampling_rate)
            ref_latent = self._audio_conditioner(lambda enc: vae_encode_audio(voice, enc, None))
            cond = AudioConditionByReferenceLatent(latent=ref_latent.to(self.device, self.dtype), strength=1.0)
            state = cond.apply_to(state, audio_tools)
            logging.info(f"Voice ref: {time.time()-t0:.2f}s")
        elif voice_ref:
            # Make the fallback visible instead of silently dropping the reference.
            logging.warning(f"Voice reference not found, generating without it: {voice_ref}")
        # Noise
        gen = torch.Generator(device=self.device).manual_seed(seed)
        noiser = GaussianNoiser(generator=gen)
        state = noiser(state, noise_scale=1.0)
        # Prompt encode
        t0 = time.time()
        use_cfg = cfg_scale > 1.0
        prompts = [prompt, DEFAULT_NEG] if use_cfg else [prompt]
        ctx = self._prompt_encoder(prompts, streaming_prefetch_count=None)
        a_ctx = ctx[0].audio_encoding
        a_ctx_neg = ctx[1].audio_encoding if use_cfg else None
        logging.info(f"Prompt: {time.time()-t0:.2f}s")
        # Denoiser
        if rescale_scale == "auto":
            resc = auto_rescale_for_cfg(cfg_scale)
            logging.info(f"Auto rescale_scale = {resc:.2f} for cfg={cfg_scale}")
        else:
            resc = float(rescale_scale)
        guider = MultiModalGuider(
            params=MultiModalGuiderParams(
                cfg_scale=cfg_scale, stg_scale=stg_scale,
                stg_blocks=[29], rescale_scale=resc, modality_scale=1.0,
            ),
            negative_context=a_ctx_neg,
        )
        denoiser = GuidedDenoiser(
            v_context=None, a_context=a_ctx,
            video_guider=None, audio_guider=guider,
        )
        # Sigmas
        sigmas = LTX2Scheduler().execute(steps=30, latent=state.latent).to(self.device)
        # Denoise
        t0 = time.time()
        x0 = X0Model(self._velocity_model)
        _, audio_state = euler_denoising_loop(
            sigmas=sigmas, video_state=None, audio_state=state,
            stepper=EulerDiffusionStep(), transformer=x0, denoiser=denoiser,
        )
        logging.info(f"Denoise (30 steps): {time.time()-t0:.2f}s")
        # Strip + unpatchify + decode
        audio_state = audio_tools.clear_conditioning(audio_state)
        audio_state = audio_tools.unpatchify(audio_state)
        latent = self._patch_clip_end_silence(audio_state.latent)
        t0 = time.time()
        decoded = self._audio_decoder(latent)
        logging.info(f"Decode: {time.time()-t0:.2f}s")
        total = time.time() - t_total
        dur = decoded.waveform.shape[-1] / decoded.sampling_rate
        logging.info(f"Total: {total:.2f}s for {dur:.1f}s audio")
        return decoded.waveform, decoded.sampling_rate

    def generate_to_file(self, prompt, output, watermark: bool = True, **kwargs):
        """Generate audio and save it to ``output``.

        Args:
            prompt: Text prompt, forwarded to :meth:`generate`.
            output: Destination passed to ``torchaudio.save``. When it is a
                filesystem path, its parent directory is created if missing.
            watermark: Apply the Perth watermark before saving (best effort).
            **kwargs: Forwarded to :meth:`generate`.

        Returns:
            ``output``, unchanged.
        """
        waveform, sr = self.generate(prompt, **kwargs)
        wav_cpu = self._as_channels_first(waveform)
        if watermark:
            wav_cpu = self._apply_watermark(wav_cpu, sr)
        # Only path-like destinations have a directory to create.
        if isinstance(output, (str, os.PathLike)):
            os.makedirs(os.path.dirname(output) or ".", exist_ok=True)
        torchaudio.save(output, wav_cpu, sr)
        logging.info(f"Saved: {output}")
        return output
if __name__ == "__main__":
    # Smoke test: build the server from CLI flags, then run two requests so
    # the second one exercises the already-warm models.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--device", default="cuda")
    parser.add_argument("--dtype", default="fp16", choices=["fp16", "bf16"])
    parser.add_argument("--no-compile", action="store_true")
    parser.add_argument(
        "--no-bnb-4bit",
        action="store_true",
        help="Disable bitsandbytes 4-bit path (default: on, since the default "
             "unsloth Gemma checkpoint is pre-quantized).",
    )
    cli_args = parser.parse_args()

    server = TTSServer(
        device=cli_args.device,
        dtype=cli_args.dtype,
        compile_model=not cli_args.no_compile,
        bnb_4bit=not cli_args.no_bnb_4bit,
    )

    # (log banner, prompt, output path, voice reference) for each request.
    smoke_requests = (
        (
            "=== First request ===",
            'A woman speaks clearly, "The weather today will be sunny."',
            "/tmp/warm_test1.wav",
            "/mnt/persistent0/manmay/expressive/female_radio_nikole/female_radio_nikole.wav",
        ),
        (
            "\n=== Second request (warm) ===",
            'A man speaks excitedly, "This is amazing, I cannot believe it!"',
            "/tmp/warm_test2.wav",
            "/mnt/persistent0/manmay/expressive/male_arnie/male_arnie.mp3",
        ),
    )
    for banner, request_prompt, wav_path, ref_path in smoke_requests:
        logging.info(banner)
        server.generate_to_file(
            prompt=request_prompt,
            output=wav_path,
            voice_ref=ref_path,
        )

dramabox_src/model_downloader.py ADDED Viewed

	@@ -0,0 +1,105 @@

+#!/usr/bin/env python3
+"""
+Download Dramabox models from HuggingFace.
+Models are cached locally after first download.
+Gemma text encoder is fetched separately from the pre-quantized unsloth repo (see GEMMA_REPO).
+"""
+import logging
+import os
+from pathlib import Path
+from huggingface_hub import hf_hub_download, snapshot_download
logger = logging.getLogger(__name__)

# Hugging Face repositories the downloader pulls from.
DRAMABOX_REPO = "ResembleAI/Dramabox"
GEMMA_REPO = "unsloth/gemma-3-12b-it-bnb-4bit"

# Cache location: $DRAMABOX_CACHE when set, otherwise ~/.cache/dramabox.
_HOME_CACHE = os.path.join(os.path.expanduser("~"), ".cache", "dramabox")
DEFAULT_CACHE = os.environ.get("DRAMABOX_CACHE", _HOME_CACHE)

# Logical model name -> file path inside DRAMABOX_REPO.
MODEL_FILES = dict(
    transformer="dramabox-dit-v1.safetensors",
    audio_components="dramabox-audio-components.safetensors",
    silence_latent="assets/silence_latent_frame.pt",
)
def get_model_path(name: str, cache_dir: str = None) -> str:
    """Fetch one Dramabox model file from the Hub and return where it landed.

    Args:
        name: Key of ``MODEL_FILES`` ('transformer', 'audio_components' or
            'silence_latent').
        cache_dir: Directory handed to ``hf_hub_download`` as its cache;
            falls back to ``DEFAULT_CACHE`` when empty or ``None``.

    Returns:
        The local path returned by ``hf_hub_download``.

    Raises:
        ValueError: If ``name`` is not a key of ``MODEL_FILES``.
    """
    cache_dir = cache_dir or DEFAULT_CACHE
    if name not in MODEL_FILES:
        known_names = list(MODEL_FILES.keys())
        raise ValueError(f"Unknown model: {name}. Choose from: {known_names}")

    filename = MODEL_FILES[name]
    logger.info(f"Fetching {name} from {DRAMABOX_REPO}/{filename}...")
    # HF_TOKEN is read at call time; None is passed when it is unset.
    request = {
        "repo_id": DRAMABOX_REPO,
        "filename": filename,
        "cache_dir": cache_dir,
        "token": os.environ.get("HF_TOKEN"),
    }
    downloaded = hf_hub_download(**request)
    logger.info(f"  -> {downloaded}")
    return downloaded
def get_gemma_path(cache_dir: str = None) -> str:
    """Download the Gemma snapshot from ``GEMMA_REPO`` and return its directory.

    ``GEMMA_REPO`` points at the pre-quantized bnb-4bit Gemma 3 12B IT variant
    published by unsloth; using it skips runtime bitsandbytes quantization and
    roughly halves the Gemma load time.

    Args:
        cache_dir: Directory handed to ``snapshot_download`` as its cache;
            falls back to ``DEFAULT_CACHE`` when empty or ``None``.

    Returns:
        The snapshot directory returned by ``snapshot_download``.
    """
    cache_dir = cache_dir or DEFAULT_CACHE
    logger.info(f"Fetching Gemma from {GEMMA_REPO}...")
    # HF_TOKEN is read at call time; None is passed when it is unset.
    request = {
        "repo_id": GEMMA_REPO,
        "cache_dir": cache_dir,
        "token": os.environ.get("HF_TOKEN"),
    }
    snapshot_dir = snapshot_download(**request)
    logger.info(f"  -> {snapshot_dir}")
    return snapshot_dir
def get_all_paths(cache_dir: str = None) -> dict:
    """Download every required model and collect the local paths.

    Each entry of ``MODEL_FILES`` is fetched in order, followed by Gemma.

    Args:
        cache_dir: Cache directory shared by all downloads; falls back to
            ``DEFAULT_CACHE`` when empty or ``None``.

    Returns:
        A dict with one key per ``MODEL_FILES`` entry plus ``'gemma_root'``::

            {
                'transformer': '/path/to/transformer.safetensors',
                'audio_components': '/path/to/audio-components.safetensors',
                'silence_latent': '/path/to/silence_latent_frame.pt',
                'gemma_root': '/path/to/unsloth/gemma-3-12b-it-bnb-4bit/',
            }
    """
    cache_dir = cache_dir or DEFAULT_CACHE
    resolved = {model: get_model_path(model, cache_dir) for model in MODEL_FILES}
    resolved["gemma_root"] = get_gemma_path(cache_dir)
    return resolved
if __name__ == "__main__":
    # CLI entry: download everything, then list each path with its size
    # (regular files) or a "directory" marker (anything else).
    logging.basicConfig(level=logging.INFO)
    downloaded = get_all_paths()
    print("\nAll models downloaded:")
    for label, location in downloaded.items():
        if os.path.isfile(location):
            gigabytes = os.path.getsize(location) / 1e9
            print(f"  {label}: {location} ({gigabytes:.2f}GB)")
        else:
            print(f"  {label}: {location} (directory)")

dramabox_src/preprocess.py ADDED Viewed

	@@ -0,0 +1,351 @@

+#!/usr/bin/env python3
+"""
+Preprocess TTS datasets for LTX-2.3 audio-only LoRA fine-tuning.
+Takes paired (audio, transcript) data and produces the format expected by
+the LTX trainer:
+    .precomputed/
+    ├── latents/sample_N.pt         # Dummy video latents (minimal)
+    ├── conditions/sample_N.pt      # Text embeddings from Gemma
+    └── audio_latents/sample_N.pt   # Audio VAE-encoded latents
+Supports multiple dataset formats:
+  - gemini_synthetic: index.txt with ~-separated fields (id~speaker~lang~sr~samples~dur~phonemes~text)
+  - libriheavy: index_ft.txt with ~-separated fields (id~speaker~lang~samples~dur~phonemes~text)
+  - manifest: JSON/JSONL with {"audio_filepath": ..., "text": ...}
+  - tsv: TSV file with audio_path<TAB>text columns
+Usage:
+    python preprocess.py \
+        --dataset-type gemini_synthetic \
+        --index /mnt/large-datasets/gemini_synthetic_dataset/conversational_dataset_pp/index.txt \
+        --audio-dir /mnt/large-datasets/gemini_synthetic_dataset/conversational_dataset_pp/wavs \
+        --output-dir /mnt/persistent0/manmay/tts_training_data \
+        --max-samples 10000 \
+        --max-duration 20.0 \
+        --min-duration 3.0
+"""
+import argparse
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+import torch
+import torchaudio
+REPO_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ltx2"))
+# ltx-pipelines on path via ltx2/
+MODEL_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+GEMMA_DIR = os.environ.get("GEMMA_DIR", "gemma-3-12b-it-qat-q4_0-unquantized")
+def parse_args():
+    p = argparse.ArgumentParser(description="Preprocess TTS data for LTX-2.3 fine-tuning")
+    p.add_argument("--dataset-type", required=True,
+                   choices=["gemini_synthetic", "libriheavy", "manifest", "tsv"],
+                   help="Dataset format type")
+    p.add_argument("--index", required=True, help="Path to index/manifest file")
+    p.add_argument("--audio-dir", default=None,
+                   help="Base directory for audio files (if paths in index are relative)")
+    p.add_argument("--output-dir", required=True, help="Output directory for preprocessed data")
+    p.add_argument("--checkpoint", default=os.path.join(MODEL_DIR, "ltx-2.3-22b-distilled.safetensors"))
+    p.add_argument("--gemma-root", default=GEMMA_DIR)
+    p.add_argument("--max-samples", type=int, default=0, help="Max samples to process (0=all)")
+    p.add_argument("--max-duration", type=float, default=20.0, help="Max audio duration in seconds")
+    p.add_argument("--min-duration", type=float, default=2.0, help="Min audio duration in seconds")
+    p.add_argument("--batch-size", type=int, default=8, help="Batch size for text encoding")
+    p.add_argument("--skip-existing", action="store_true", help="Skip already processed samples")
+    p.add_argument("--audio-only-ckpt", default=None,
+                   help="Audio-only checkpoint for VAE encoding (optional, uses full ckpt if not set)")
+    p.add_argument("--shard", type=int, default=0, help="Shard index (for parallel processing)")
+    p.add_argument("--num-shards", type=int, default=1, help="Total number of shards")
+    p.add_argument("--gpu", type=int, default=None, help="GPU device index to use")
+    return p.parse_args()
+def parse_gemini_synthetic(index_path: str, audio_dir: str | None) -> list[dict]:
+    """Parse gemini_synthetic format: id~speaker~lang~sr~samples~dur~phonemes~text"""
+    samples = []
+    with open(index_path) as f:
+        for line in f:
+            parts = line.strip().split("~")
+            if len(parts) < 7:
+                continue
+            file_id = parts[0]
+            text = parts[-1]  # Last field is always the text
+            sr = int(parts[3])
+            n_samples = int(parts[4])
+            duration = n_samples / sr
+            # Find audio file
+            if audio_dir:
+                # Try common extensions
+                for ext in [".flac", ".wav", ".mp3"]:
+                    audio_path = os.path.join(audio_dir, file_id + ext)
+                    if os.path.exists(audio_path):
+                        break
+                else:
+                    continue
+            else:
+                audio_path = file_id
+            samples.append({
+                "id": file_id,
+                "audio_path": audio_path,
+                "text": text,
+                "duration": duration,
+            })
+    return samples
+def parse_libriheavy(index_path: str, audio_dir: str | None) -> list[dict]:
+    """Parse libriheavy format: id~speaker~lang~samples~dur~phonemes~text"""
+    samples = []
+    with open(index_path) as f:
+        for line in f:
+            parts = line.strip().split("~")
+            if len(parts) < 7:
+                continue
+            file_id = parts[0]
+            text = parts[-1]
+            n_samples = int(parts[3])
+            duration = int(parts[4]) / 1000.0  # milliseconds to seconds
+            if audio_dir:
+                for ext in [".flac", ".wav", ".mp3"]:
+                    audio_path = os.path.join(audio_dir, file_id + ext)
+                    if os.path.exists(audio_path):
+                        break
+                else:
+                    continue
+            else:
+                audio_path = file_id
+            samples.append({
+                "id": file_id,
+                "audio_path": audio_path,
+                "text": text,
+                "duration": duration,
+            })
+    return samples
+def parse_manifest(index_path: str, audio_dir: str | None) -> list[dict]:
+    """Parse JSON/JSONL manifest with audio_filepath and text fields."""
+    samples = []
+    with open(index_path) as f:
+        for line in f:
+            entry = json.loads(line.strip())
+            audio_path = entry.get("audio_filepath", entry.get("audio_path", ""))
+            text = entry.get("text", entry.get("transcript", ""))
+            duration = entry.get("duration", 0.0)
+            if audio_dir and not os.path.isabs(audio_path):
+                audio_path = os.path.join(audio_dir, audio_path)
+            if os.path.exists(audio_path) and text:
+                samples.append({
+                    "id": Path(audio_path).stem,
+                    "audio_path": audio_path,
+                    "text": text,
+                    "duration": duration,
+                })
+    return samples
+def parse_tsv(index_path: str, audio_dir: str | None) -> list[dict]:
+    """Parse TSV file with audio_path<TAB>text."""
+    samples = []
+    with open(index_path) as f:
+        for line in f:
+            parts = line.strip().split("\t")
+            if len(parts) < 2:
+                continue
+            audio_path, text = parts[0], parts[1]
+            if audio_dir and not os.path.isabs(audio_path):
+                audio_path = os.path.join(audio_dir, audio_path)
+            if os.path.exists(audio_path):
+                samples.append({
+                    "id": Path(audio_path).stem,
+                    "audio_path": audio_path,
+                    "text": text,
+                    "duration": 0.0,
+                })
+    return samples
+PARSERS = {
+    "gemini_synthetic": parse_gemini_synthetic,
+    "libriheavy": parse_libriheavy,
+    "manifest": parse_manifest,
+    "tsv": parse_tsv,
+}
+@torch.inference_mode()
+def main():
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+    args = parse_args()
+    from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+    from ltx_core.types import Audio
+    from ltx_pipelines.utils.blocks import AudioConditioner
+    from ltx_pipelines.utils.media_io import decode_audio_from_file
+    from ltx_trainer.model_loader import load_text_encoder, load_embeddings_processor
+    if args.gpu is not None:
+        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    dtype = torch.bfloat16
+    # Create output directories
+    out = Path(args.output_dir)
+    (out / "latents").mkdir(parents=True, exist_ok=True)
+    (out / "conditions").mkdir(parents=True, exist_ok=True)
+    (out / "audio_latents").mkdir(parents=True, exist_ok=True)
+    # Parse dataset
+    logging.info(f"Parsing {args.dataset_type} dataset from {args.index}...")
+    samples = PARSERS[args.dataset_type](args.index, args.audio_dir)
+    logging.info(f"Found {len(samples)} samples")
+    # Filter by duration
+    before = len(samples)
+    samples = [s for s in samples if args.min_duration <= s["duration"] <= args.max_duration]
+    logging.info(f"After duration filter [{args.min_duration}s, {args.max_duration}s]: {len(samples)} (dropped {before - len(samples)})")
+    if args.max_samples > 0:
+        samples = samples[:args.max_samples]
+        logging.info(f"Limiting to {len(samples)} samples")
+    # Assign global indices before sharding
+    for i, s in enumerate(samples):
+        s["global_idx"] = i
+    # Shard the data for parallel processing
+    if args.num_shards > 1:
+        total = len(samples)
+        samples = samples[args.shard::args.num_shards]
+        logging.info(f"Shard {args.shard}/{args.num_shards}: {len(samples)} samples (of {total} total)")
+    # ── Step 1: Encode text with Gemma (Blocks 1+2 only) ──
+    # The trainer runs Block 3 (embeddings processor/connectors) during training,
+    # so we only precompute Blocks 1+2 here (Gemma LLM + feature extractor).
+    logging.info("Loading text encoder (Gemma + feature extractor)...")
+    text_encoder = load_text_encoder(args.gemma_root, device=device, dtype=dtype)
+    # Load feature extractor on CPU first to save GPU memory, then move to device
+    logging.info("Loading feature extractor (on CPU first to save GPU memory)...")
+    emb_proc = load_embeddings_processor(args.checkpoint, device="cpu", dtype=dtype)
+    text_encoder.feature_extractor = emb_proc.feature_extractor.to(device)
+    del emb_proc
+    torch.cuda.empty_cache()
+    logging.info("Encoding text prompts (Blocks 1+2: Gemma + feature extractor)...")
+    for i, sample in enumerate(samples):
+        gidx = sample["global_idx"]
+        cond_path = out / "conditions" / f"sample_{gidx:06d}.pt"
+        if args.skip_existing and cond_path.exists():
+            continue
+        text = sample["text"]
+        # Run Blocks 1+2: Gemma LLM → feature extractor
+        hidden_states, attention_mask = text_encoder.encode(text)
+        video_feats, audio_feats = text_encoder.feature_extractor(
+            hidden_states, attention_mask, "left"
+        )
+        torch.save({
+            "video_prompt_embeds": video_feats.squeeze(0).cpu(),
+            "audio_prompt_embeds": audio_feats.squeeze(0).cpu() if audio_feats is not None else video_feats.squeeze(0).cpu(),
+            "prompt_attention_mask": attention_mask.squeeze(0).bool().cpu(),
+        }, cond_path)
+        if i % 100 == 0:
+            logging.info(f"  Text encoding: {i}/{len(samples)}")
+    del text_encoder
+    torch.cuda.empty_cache()
+    # ── Step 2: Encode audio with Audio VAE ──
+    ckpt_for_vae = args.audio_only_ckpt or args.checkpoint
+    logging.info(f"Loading audio VAE from {ckpt_for_vae}...")
+    ac = AudioConditioner(checkpoint_path=ckpt_for_vae, dtype=dtype, device=device)
+    logging.info("Encoding audio samples...")
+    for idx, sample in enumerate(samples):
+        gidx = sample["global_idx"]
+        audio_path = out / "audio_latents" / f"sample_{gidx:06d}.pt"
+        if args.skip_existing and audio_path.exists():
+            continue
+        try:
+            # Load audio
+            voice = decode_audio_from_file(sample["audio_path"], device, 0.0, args.max_duration)
+            if voice is None:
+                logging.warning(f"  Skipping {sample['id']}: no audio")
+                continue
+            w = voice.waveform
+            if w.dim() == 2:
+                if w.shape[0] == 1:
+                    w = w.repeat(2, 1)
+                w = w.unsqueeze(0)
+            elif w.dim() == 3 and w.shape[1] == 1:
+                w = w.repeat(1, 2, 1)
+            voice = Audio(waveform=w, sampling_rate=voice.sampling_rate)
+            # Encode through Audio VAE
+            audio_latent = ac(lambda enc: vae_encode_audio(voice, enc, None))
+            # Save audio latent
+            torch.save({
+                "latents": audio_latent.squeeze(0).cpu(),  # [C=8, T, F=16]
+                "sample_rate": 16000,
+            }, audio_path)
+        except Exception as e:
+            logging.warning(f"  Skipping {sample['id']}: {e}")
+            continue
+        if idx % 100 == 0:
+            logging.info(f"  Audio encoding: {idx}/{len(samples)}")
+    del ac
+    torch.cuda.empty_cache()
+    # ── Step 3: Create dummy video latents ──
+    logging.info("Creating dummy video latents...")
+    # Minimal video: 1 frame, 64x64 = 2x2 in latent space
+    dummy_video = {
+        "latents": torch.zeros(128, 1, 2, 2),
+        "num_frames": 1,
+        "height": 2,
+        "width": 2,
+        "fps": 24.0,
+    }
+    for idx, sample in enumerate(samples):
+        gidx = sample["global_idx"]
+        latent_path = out / "latents" / f"sample_{gidx:06d}.pt"
+        if args.skip_existing and latent_path.exists():
+            continue
+        torch.save(dummy_video, latent_path)
+    # ── Summary ──
+    n_audio = len(list((out / "audio_latents").glob("*.pt")))
+    n_cond = len(list((out / "conditions").glob("*.pt")))
+    n_lat = len(list((out / "latents").glob("*.pt")))
+    logging.info(f"\nDone! Output: {args.output_dir}")
+    logging.info(f"  audio_latents: {n_audio} files")
+    logging.info(f"  conditions:    {n_cond} files")
+    logging.info(f"  latents:       {n_lat} files")
+if __name__ == "__main__":
+    main()

dramabox_src/train.py ADDED Viewed

	@@ -0,0 +1,882 @@

+#!/usr/bin/env python3
+"""
+Audio-Only IC-LoRA Training for Voice Cloning on LTX-2.3.
+Uses the IC-LoRA pattern: reference audio tokens are APPENDED to the end of
+the target sequence using AudioConditionByReferenceLatent.  Loss is computed
+only on target tokens; reference tokens remain clean (denoise_mask=0).
+This follows the official video-to-video IC-LoRA strategy closely, but adapted
+for the audio-only modality path.
+Usage (single GPU):
+    CUDA_VISIBLE_DEVICES=0 python train.py --data-dir ... --speaker-index ...
+Usage (multi-GPU with accelerate):
+    CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch --num_processes=4 train.py ...
+"""
+import argparse
+import logging
+import math
+import os
+import random
+import shutil
+import sys
+import time
+from collections import defaultdict
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, Dataset
+REPO_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ltx2"))
+# ltx-pipelines already on path via ltx2/
+MODEL_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# Import audio conditioning item from our module
+sys.path.insert(0, MODEL_DIR)
+from audio_conditioning import AudioConditionByReferenceLatent
+# ─── Timestep Sampling ───
+class DistilledTimestepSampler:
+    """Sample timesteps from the distilled sigma schedule.
+    The distilled model was trained to denoise at these specific sigma values.
+    We sample uniformly from the intervals between consecutive sigmas,
+    matching the distribution the model actually operates on.
+    """
+    # Distilled 8-step sigma values (boundaries of denoising intervals)
+    SIGMAS = [1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0]
+    def __init__(self, jitter: float = 0.02):
+        self.jitter = jitter
+    def sample(self, batch_size: int, seq_length: int = None, device: torch.device = None) -> torch.Tensor:
+        n_intervals = len(self.SIGMAS) - 1
+        interval_idx = torch.randint(0, n_intervals, (batch_size,), device=device)
+        t = torch.rand(batch_size, device=device)
+        sigma_high = torch.tensor([self.SIGMAS[i] for i in interval_idx], device=device)
+        sigma_low = torch.tensor([self.SIGMAS[i + 1] for i in interval_idx], device=device)
+        sigma = sigma_low + t * (sigma_high - sigma_low)
+        return sigma.clamp(0.01, 0.99)
+class ShiftedLogitNormalTimestepSampler:
+    """Shifted logit-normal distribution, shift depends on sequence length."""
+    def __init__(self, std: float = 1.0, eps: float = 1e-3, uniform_prob: float = 0.1):
+        self.std = std
+        self.eps = eps
+        self.uniform_prob = uniform_prob
+        self.normal_999_percentile = 3.0902 * std
+        self.normal_005_percentile = -2.5758 * std
+    def sample(self, batch_size: int, seq_length: int, device: torch.device = None) -> torch.Tensor:
+        mu = self._get_shift(seq_length)
+        normal = torch.randn(batch_size, device=device) * self.std + mu
+        logitnormal = torch.sigmoid(normal)
+        p999 = torch.sigmoid(torch.tensor(mu + self.normal_999_percentile, device=device))
+        p005 = torch.sigmoid(torch.tensor(mu + self.normal_005_percentile, device=device))
+        stretched = (logitnormal - p005) / (p999 - p005)
+        stretched = torch.where(stretched >= self.eps, stretched, 2 * self.eps - stretched)
+        stretched = stretched.clamp(0, 1)
+        uniform = (1 - self.eps) * torch.rand(batch_size, device=device) + self.eps
+        prob = torch.rand(batch_size, device=device)
+        return torch.where(prob > self.uniform_prob, stretched, uniform)
+    @staticmethod
+    def _get_shift(seq_length, min_tok=1024, max_tok=4096, min_s=0.95, max_s=2.05):
+        m = (max_s - min_s) / (max_tok - min_tok)
+        return m * seq_length + (min_s - m * min_tok)
+# ─── Dataset ───
+def build_speaker_map(index_paths, data_dirs):
+    """Map speaker → [(data_dir, sample_idx)] from index file(s).
+    The sample index comes from field 0 of the `~`-delimited row when it
+    parses as int (allows subset indexes that keep original sample numbers),
+    otherwise we fall back to the row's line number (legacy behaviour for
+    string-keyed indexes like tts_training_data_podcast).
+    """
+    speaker_to_samples = defaultdict(list)
+    for index_path, data_dir in zip(index_paths, data_dirs):
+        with open(index_path) as f:
+            for line_num, line in enumerate(f):
+                parts = line.strip().split("~")
+                if len(parts) < 7:
+                    continue
+                try:
+                    idx = int(parts[0])
+                except ValueError:
+                    idx = line_num
+                speaker_id = parts[1]
+                speaker_to_samples[speaker_id].append((data_dir, idx))
+    return {k: v for k, v in speaker_to_samples.items() if len(v) >= 2}
+class IDLoRADataset(Dataset):
+    # Silence-latent reference loaded once, used to detect and strip any
+    # leading silence frames baked into the preprocessed audio_latents. The
+    # training loop ALREADY prepends 0-25 random silence frames, so we don't
+    # want accidental silence in the source data compounding on top.
+    _silence_ref = None
+    @classmethod
+    def _load_silence_ref(cls):
+        if cls._silence_ref is None:
+            p = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+                             "assets", "silence_latent_frame.pt")
+            if os.path.exists(p):
+                cls._silence_ref = torch.load(p, weights_only=True).float().squeeze()  # [C, F]
+        return cls._silence_ref
+    def __init__(self, speaker_map):
+        self.samples = []
+        self.speaker_map = {}
+        for speaker, entries in speaker_map.items():
+            valid = []
+            for data_dir, idx in entries:
+                audio_path = Path(data_dir) / "audio_latents" / f"sample_{idx:06d}.pt"
+                cond_path = Path(data_dir) / "conditions" / f"sample_{idx:06d}.pt"
+                if audio_path.exists() and cond_path.exists():
+                    valid.append((data_dir, idx))
+            if len(valid) >= 2:
+                self.speaker_map[speaker] = valid
+        for speaker, entries in self.speaker_map.items():
+            for entry in entries:
+                self.samples.append((entry, speaker))
+        IDLoRADataset._load_silence_ref()
+    def __len__(self):
+        return len(self.samples)
+    def _load_sample(self, data_dir, idx):
+        base = Path(data_dir)
+        audio = torch.load(base / "audio_latents" / f"sample_{idx:06d}.pt", weights_only=False)
+        # Prefer prefix-stripped text embeddings if they exist (re-encoded with
+        # just the quoted dialogue, dropping the "A woman says, " / "A man
+        # speaks with X accent, " scene-description prefix).
+        stripped = base / "conditions_stripped" / f"sample_{idx:06d}.pt"
+        cond_path = stripped if stripped.exists() else base / "conditions" / f"sample_{idx:06d}.pt"
+        cond = torch.load(cond_path, weights_only=False)
+        if isinstance(audio, dict):
+            audio = audio.get("audio_latent", audio.get("latent", list(audio.values())[0]))
+        if audio.dim() == 2:
+            audio = audio.unsqueeze(0)
+        audio_feats = cond.get("audio_prompt_embeds", cond.get("prompt_embeds"))
+        attn_mask = cond.get("prompt_attention_mask")
+        # The audio_connector has num_learnable_registers=128 and asserts the
+        # input sequence length is divisible by 128. Our new preprocessing
+        # saved trimmed conditions (dropping left-padding to save disk), which
+        # produces short/irregular sequence lengths. Left-pad back to the next
+        # multiple of 128 with zeros (matching the tokenizer's left-padding
+        # convention) so this assertion holds.
+        REG = 128
+        L = audio_feats.shape[0]
+        target_L = ((L + REG - 1) // REG) * REG
+        if target_L != L:
+            pad_len = target_L - L
+            pad_emb = torch.zeros(pad_len, audio_feats.shape[1],
+                                  dtype=audio_feats.dtype)
+            pad_mask = torch.zeros(pad_len, dtype=attn_mask.dtype)
+            audio_feats = torch.cat([pad_emb, audio_feats], dim=0)
+            attn_mask = torch.cat([pad_mask, attn_mask], dim=0)
+        return audio, audio_feats, attn_mask
+    def __getitem__(self, idx):
+        (data_dir, tgt_idx), speaker = self.samples[idx]
+        tgt_latent, audio_feats, attn_mask = self._load_sample(data_dir, tgt_idx)
+        # Drop the reference entirely for non-voice-cloning categories:
+        #   - SFX samples (speaker starts with "sfx_"): descriptive sound events,
+        #     no speaker identity to clone.
+        #   - Song/music samples (suno dataset): prompts describe the music style,
+        #     reference audio doesn't transfer anything useful.
+        # Return a zero-length ref so the model trains target-only for these.
+        drop_ref = speaker.startswith("sfx_") or "preprocessed_ltx_suno" in str(data_dir)
+        if drop_ref:
+            C, F_dim = tgt_latent.shape[0], tgt_latent.shape[2]
+            ref_latent = torch.zeros(C, 0, F_dim, dtype=tgt_latent.dtype)
+        else:
+            entries = self.speaker_map[speaker]
+            ref_entry = random.choice([e for e in entries if e[1] != tgt_idx])
+            ref_latent, _, _ = self._load_sample(*ref_entry)
+        return {
+            "tgt_latent": tgt_latent,
+            "ref_latent": ref_latent,
+            "audio_features": audio_feats,
+            "attention_mask": attn_mask,
+        }
+# ─── Model building ───
+def build_audio_only_model(checkpoint_path, device, dtype):
+    from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder as Builder
+    from ltx_core.loader.registry import DummyRegistry
+    from ltx_core.loader.sd_ops import SDOps
+    from ltx_core.model.transformer.model import LTXModel, LTXModelType
+    from ltx_core.model.model_protocol import ModelConfigurator
+    from ltx_core.model.transformer.attention import AttentionFunction
+    from ltx_core.model.transformer.rope import LTXRopeType
+    sd_ops = SDOps("AO").with_matching(prefix="model.diffusion_model.").with_replacement("model.diffusion_model.", "")
+    class Cfg(ModelConfigurator[LTXModel]):
+        @classmethod
+        def from_config(cls, config):
+            t = config.get("transformer", {})
+            cp = None
+            if not t.get("caption_proj_before_connector", False):
+                from ltx_core.model.transformer.text_projection import create_caption_projection
+                with torch.device("meta"):
+                    cp = create_caption_projection(t, audio=True)
+            return LTXModel(
+                model_type=LTXModelType.AudioOnly,
+                audio_num_attention_heads=t.get("audio_num_attention_heads", 32),
+                audio_attention_head_dim=t.get("audio_attention_head_dim", 64),
+                audio_in_channels=t.get("audio_in_channels", 128),
+                audio_out_channels=t.get("audio_out_channels", 128),
+                num_layers=t.get("num_layers", 48),
+                audio_cross_attention_dim=t.get("audio_cross_attention_dim", 2048),
+                norm_eps=t.get("norm_eps", 1e-6),
+                attention_type=AttentionFunction(t.get("attention_type", "default")),
+                positional_embedding_theta=t.get("positional_embedding_theta", 10000.0),
+                audio_positional_embedding_max_pos=t.get("audio_positional_embedding_max_pos", [20]),
+                timestep_scale_multiplier=t.get("timestep_scale_multiplier", 1000),
+                use_middle_indices_grid=t.get("use_middle_indices_grid", True),
+                rope_type=LTXRopeType(t.get("rope_type", "interleaved")),
+                double_precision_rope=t.get("frequencies_precision", False) == "float64",
+                apply_gated_attention=t.get("apply_gated_attention", False),
+                audio_caption_projection=cp,
+                cross_attention_adaln=t.get("cross_attention_adaln", False),
+            )
+    builder = Builder(model_path=checkpoint_path, model_class_configurator=Cfg,
+                      model_sd_ops=sd_ops, registry=DummyRegistry())
+    return builder.build(device=device, dtype=dtype)
+def load_audio_connector(checkpoint_path, device, dtype):
+    # ltx-trainer already on path via ltx2/
+    from ltx_trainer.model_loader import load_embeddings_processor
+    emb_proc = load_embeddings_processor(checkpoint_path, device=device, dtype=dtype)
+    connector = emb_proc.audio_connector
+    del emb_proc
+    return connector
+def apply_lora(model, rank, alpha, dropout=0.0):
+    from peft import LoraConfig, get_peft_model
+    config = LoraConfig(
+        r=rank, lora_alpha=alpha, lora_dropout=dropout, bias="none",
+        target_modules=[
+            # Self-attention over audio tokens (voice-transfer pathway via ref).
+            "audio_attn1.to_k", "audio_attn1.to_q", "audio_attn1.to_v", "audio_attn1.to_out.0",
+            # Cross-attention (audio ↔ text context) NOT adapted — keep base
+            # model's prompt→audio behaviour intact and rely on dataset balance
+            # to drive expressiveness. (v15c tried this with adaLN unfreeze,
+            # that proved too destructive; v16 tries it adaLN-frozen.)
+            # FFN — non-linear capacity for style/phonetic adaptation.
+            "audio_ff.net.0.proj", "audio_ff.net.2",
+        ],
+    )
+    model = get_peft_model(model, config)
+    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    total = sum(p.numel() for p in model.parameters())
+    logging.info(f"LoRA: {trainable:,} trainable / {total:,} total ({100*trainable/total:.1f}%)")
+    return model
+@torch.no_grad()
+def prepare_audio_context(audio_connector, audio_features, attention_mask, device, dtype):
+    from ltx_core.text_encoders.gemma.embeddings_processor import convert_to_additive_mask
+    audio_features = audio_features.to(device=device, dtype=dtype)
+    attention_mask = attention_mask.to(device=device)
+    if audio_features.shape[0] > 1:
+        results = []
+        for i in range(audio_features.shape[0]):
+            feat_i = audio_features[i:i+1]
+            mask_i = attention_mask[i:i+1]
+            additive = convert_to_additive_mask(mask_i, feat_i.dtype)
+            enc_i, _ = audio_connector(feat_i, additive)
+            results.append(enc_i)
+        return torch.cat(results, dim=0)
+    additive_mask = convert_to_additive_mask(attention_mask, audio_features.dtype)
+    audio_encoded, _ = audio_connector(audio_features, additive_mask)
+    return audio_encoded
+# ─── Validation ───
+def _unwrap_model_safe(model):
+    """Strip DDP / peft wrappers without going through accelerate.unwrap_model,
+    which imports deepspeed — broken in our env (torch API drift)."""
+    while hasattr(model, "module"):
+        model = model.module
+    return model
+def run_validation(lora_path, val_config_path, output_dir, step, lora_rank=128):
+    """Call validate.py in a subprocess. It loads TTSServer (the same stack
+    the warm server / Gradio app uses), attaches our LoRA, then iterates every
+    entry in val_config with the same inference settings the user tests with.
+    Single subprocess amortises the model-load cost across all val entries.
+    Forces validation onto VAL_GPU (default "0") because training already
+    occupies the rest. Override via TRAIN_VAL_GPU env var.
+    """
+    import subprocess
+    val_dir = os.path.join(output_dir, "validation", f"step_{step:05d}")
+    os.makedirs(val_dir, exist_ok=True)
+    script = os.path.join(os.path.dirname(__file__), "validate.py")
+    cmd = [
+        sys.executable, script,
+        "--val-config", val_config_path,
+        "--output-dir", val_dir,
+        "--lora", lora_path,
+        "--lora-rank", str(lora_rank),
+        # Use raw estimator output (no +10% buffer) so we can hear
+        # whether the model needs more/less duration at current quality.
+        "--duration-multiplier", "1.0",
+    ]
+    log_path = os.path.join(val_dir, "validate.log")
+    env = os.environ.copy()
+    # Validation needs its OWN GPU (training fills the others).
+    env["CUDA_VISIBLE_DEVICES"] = os.environ.get("TRAIN_VAL_GPU", "0")
+    try:
+        with open(log_path, "w") as logf:
+            result = subprocess.run(
+                cmd, stdout=logf, stderr=subprocess.STDOUT, timeout=1800, env=env,
+            )
+        if result.returncode == 0:
+            logging.info(f"  Validation step {step}: OK → {val_dir}")
+        else:
+            logging.warning(f"  Validation step {step} FAILED (see {log_path})")
+    except subprocess.TimeoutExpired:
+        logging.warning(f"  Validation step {step} TIMEOUT (>30min)")
+# ─── Args ───
+def parse_args():
+    # First pass: pull out --config so its values can become argparse defaults.
+    cfg_parser = argparse.ArgumentParser(add_help=False)
+    cfg_parser.add_argument("--config", default=None,
+                            help="YAML file with default values for any of the flags below. "
+                                 "Explicit CLI flags still override the YAML.")
+    cfg_args, remaining = cfg_parser.parse_known_args()
+    yaml_defaults: dict = {}
+    if cfg_args.config:
+        import yaml as _yaml
+        with open(cfg_args.config) as f:
+            yaml_defaults = _yaml.safe_load(f) or {}
+        # YAML keys are dashes-or-underscores → normalize to argparse dest (underscore).
+        yaml_defaults = {k.replace("-", "_"): v for k, v in yaml_defaults.items()}
+    def _yaml(name, fallback):
+        return yaml_defaults.get(name, fallback)
+    p = argparse.ArgumentParser(
+        parents=[cfg_parser],
+        description="Audio-Only IC-LoRA Training for Voice Cloning",
+    )
+    p.add_argument("--data-dir", required="data_dir" not in yaml_defaults,
+                   nargs="+", default=_yaml("data_dir", None))
+    p.add_argument("--speaker-index", required="speaker_index" not in yaml_defaults,
+                   nargs="+", default=_yaml("speaker_index", None))
+    p.add_argument("--output-dir", default=_yaml("output_dir", os.path.join(MODEL_DIR, "tts_iclora_v1")))
+    p.add_argument("--checkpoint", default=_yaml("checkpoint", os.path.join(MODEL_DIR, "dramabox-dit-v1.safetensors")))
+    p.add_argument("--full-checkpoint", default=_yaml("full_checkpoint", os.path.join(MODEL_DIR, "dramabox-audio-components.safetensors")))
+    p.add_argument("--base-model", choices=["distilled", "dev"], default=_yaml("base_model", "dev"),
+                   help="Base model type: distilled uses DistilledTimestepSampler, dev uses ShiftedLogitNormal")
+    p.add_argument("--lora-rank", type=int, default=_yaml("lora_rank", 128))
+    p.add_argument("--lora-alpha", type=int, default=_yaml("lora_alpha", 128))
+    p.add_argument("--lora-dropout", type=float, default=_yaml("lora_dropout", 0.0),
+                   help="Dropout applied to LoRA A/B matrices during training. "
+                        "Recommended ~0.1 for small datasets to regularize.")
+    p.add_argument("--resume-lora", default=_yaml("resume_lora", None))
+    p.add_argument("--resume-step-offset", type=int, default=_yaml("resume_step_offset", None),
+                   help="Step to add when naming saved checkpoints. If None, inferred "
+                        "from --resume-lora filename (e.g. lora_step_10000.safetensors → 10000). "
+                        "Set to 0 to start numbering at 0 regardless.")
+    p.add_argument("--ref-ratio", type=float, default=_yaml("ref_ratio", 0.3),
+                   help="Fraction of target length to use as reference (default 0.3)")
+    p.add_argument("--max-ref-tokens", type=int, default=_yaml("max_ref_tokens", 200),
+                   help="Maximum reference tokens after patchification (default 200)")
+    p.add_argument("--text-dropout", type=float, default=_yaml("text_dropout", 0.0),
+                   help="Probability of dropping text conditioning (forces reliance on voice ref)")
+    p.add_argument("--steps", type=int, default=_yaml("steps", 30000))
+    p.add_argument("--lr", type=float, default=_yaml("lr", 3e-5))
+    p.add_argument("--lr-scheduler", choices=["cosine", "linear", "constant"], default=_yaml("lr_scheduler", "cosine"))
+    p.add_argument("--batch-size", type=int, default=_yaml("batch_size", 1))
+    p.add_argument("--grad-accum", type=int, default=_yaml("grad_accum", 4))
+    p.add_argument("--max-grad-norm", type=float, default=_yaml("max_grad_norm", 1.0))
+    p.add_argument("--save-every", type=int, default=_yaml("save_every", 1000))
+    p.add_argument("--log-every", type=int, default=_yaml("log_every", 50))
+    p.add_argument("--seed", type=int, default=_yaml("seed", 42))
+    p.add_argument("--warmup-steps", type=int, default=_yaml("warmup_steps", 100))
+    p.add_argument("--val-config", default=_yaml("val_config", None))
+    return p.parse_args(remaining)
+# ─── Main ───
+def main():
+    from accelerate import Accelerator
+    from accelerate.utils import set_seed
+    args = parse_args()
+    accelerator = Accelerator(
+        gradient_accumulation_steps=args.grad_accum,
+        mixed_precision="bf16",
+    )
+    is_main = accelerator.is_main_process
+    if is_main:
+        logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+    else:
+        logging.basicConfig(level=logging.WARNING)
+    set_seed(args.seed)
+    device = accelerator.device
+    dtype = torch.bfloat16
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Save training args
+    if is_main:
+        import yaml
+        args_dict = vars(args).copy()
+        args_dict["_meta"] = {
+            "world_size": accelerator.num_processes,
+            "dtype": str(dtype),
+            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "script": "train_audio_iclora.py",
+            "pattern": "IC-LoRA (ref appended to end)",
+        }
+        with open(os.path.join(args.output_dir, "training_args.yaml"), "w") as f:
+            yaml.dump(args_dict, f, default_flow_style=False, sort_keys=False)
+    from ltx_core.components.patchifiers import AudioPatchifier
+    from ltx_core.model.transformer.modality import Modality
+    from ltx_core.guidance.perturbations import BatchedPerturbationConfig
+    from ltx_core.tools import AudioLatentTools
+    from ltx_core.types import AudioLatentShape, LatentState
+    from ltx_pipelines.utils.helpers import modality_from_latent_state, timesteps_from_mask
+    # Build speaker map
+    if is_main:
+        logging.info("Building speaker map...")
+    speaker_map = build_speaker_map(args.speaker_index, args.data_dir)
+    if is_main:
+        logging.info(f"Speaker map: {len(speaker_map)} speakers, "
+                     f"{sum(len(v) for v in speaker_map.values())} samples")
+    # Load model
+    if is_main:
+        logging.info("Loading audio-only model...")
+    model = build_audio_only_model(args.checkpoint, device, dtype)
+    if is_main:
+        logging.info("Loading audio connector...")
+    audio_connector = load_audio_connector(args.full_checkpoint, device, dtype)
+    audio_connector.eval()
+    for p in audio_connector.parameters():
+        p.requires_grad = False
+    if is_main:
+        logging.info(f"Applying LoRA (rank={args.lora_rank}, alpha={args.lora_alpha})...")
+    model = apply_lora(model, args.lora_rank, args.lora_alpha, args.lora_dropout)
+    # Resume from checkpoint
+    if args.resume_lora:
+        from safetensors.torch import load_file as st_load
+        if is_main:
+            logging.info(f"Resuming from: {args.resume_lora}")
+        lora_sd = st_load(args.resume_lora)
+        mapped = {}
+        for k, v in lora_sd.items():
+            nk = k.replace(".lora_A.weight", ".lora_A.default.weight").replace(
+                ".lora_B.weight", ".lora_B.default.weight")
+            mapped[nk] = v
+        model.load_state_dict(mapped, strict=False)
+    # Determine step offset for save filenames. Without this, resuming a run
+    # restarts step numbering at 0 and would overwrite earlier phase-1
+    # checkpoints with the same save_every cadence.
+    if args.resume_step_offset is None:
+        resume_offset = 0
+        if args.resume_lora:
+            import re as _re
+            m = _re.search(r"lora_step_(\d+)", os.path.basename(args.resume_lora))
+            if m:
+                resume_offset = int(m.group(1))
+        args.resume_step_offset = resume_offset
+    if is_main and args.resume_step_offset:
+        logging.info(f"Save-step offset: +{args.resume_step_offset}")
+    model.train()
+    model.base_model.model.set_gradient_checkpointing(True)
+    # Dataset & DataLoader
+    dataset = IDLoRADataset(speaker_map)
+    if is_main:
+        logging.info(f"Dataset: {len(dataset)} samples, {len(dataset.speaker_map)} speakers")
+    def collate_fn(batch):
+        """Pad variable-length audio to max in batch, track real lengths for loss masking."""
+        max_tgt_T = max(b["tgt_latent"].shape[1] for b in batch)  # [C, T, F]
+        max_ref_T = max(b["ref_latent"].shape[1] for b in batch)
+        C = batch[0]["tgt_latent"].shape[0]
+        F_dim = batch[0]["tgt_latent"].shape[2]
+        tgt_list, ref_list, feat_list, mask_list = [], [], [], []
+        tgt_lengths, ref_lengths = [], []
+        for b in batch:
+            tgt = b["tgt_latent"]
+            ref = b["ref_latent"]
+            tgt_lengths.append(tgt.shape[1])
+            ref_lengths.append(ref.shape[1])
+            if tgt.shape[1] < max_tgt_T:
+                pad = torch.zeros(C, max_tgt_T - tgt.shape[1], F_dim, dtype=tgt.dtype)
+                tgt = torch.cat([tgt, pad], dim=1)
+            tgt_list.append(tgt)
+            if ref.shape[1] < max_ref_T:
+                pad = torch.zeros(C, max_ref_T - ref.shape[1], F_dim, dtype=ref.dtype)
+                ref = torch.cat([ref, pad], dim=1)
+            ref_list.append(ref)
+            feat_list.append(b["audio_features"])
+            mask_list.append(b["attention_mask"])
+        return {
+            "tgt_latent": torch.stack(tgt_list),
+            "ref_latent": torch.stack(ref_list),
+            "audio_features": torch.stack(feat_list),
+            "attention_mask": torch.stack(mask_list),
+            "tgt_lengths": torch.tensor(tgt_lengths),
+            "ref_lengths": torch.tensor(ref_lengths),
+        }
+    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=2,
+                            pin_memory=True, drop_last=True, collate_fn=collate_fn)
+    # Optimizer & Scheduler
+    optimizer = torch.optim.AdamW(
+        [p for p in model.parameters() if p.requires_grad],
+        lr=args.lr, betas=(0.9, 0.999), weight_decay=0.01,
+    )
+    from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR, ConstantLR
+    warmup = LinearLR(optimizer, start_factor=0.01, end_factor=1.0, total_iters=args.warmup_steps)
+    remaining = args.steps - args.warmup_steps
+    if args.lr_scheduler == "cosine":
+        # Warmup -> constant hold (20% of remaining) -> cosine decay
+        hold_steps = max(remaining // 5, 0)
+        decay_steps = max(remaining - hold_steps, 1)
+        hold_sched = ConstantLR(optimizer, factor=1.0, total_iters=hold_steps)
+        decay_sched = CosineAnnealingLR(optimizer, T_max=decay_steps, eta_min=1e-6)
+        scheduler = SequentialLR(
+            optimizer,
+            [warmup, hold_sched, decay_sched],
+            milestones=[args.warmup_steps, args.warmup_steps + hold_steps],
+        )
+    elif args.lr_scheduler == "linear":
+        main_sched = LinearLR(optimizer, start_factor=1.0, end_factor=0.01, total_iters=max(remaining, 1))
+        scheduler = SequentialLR(optimizer, [warmup, main_sched], milestones=[args.warmup_steps])
+    else:
+        main_sched = ConstantLR(optimizer, factor=1.0, total_iters=max(remaining, 1))
+        scheduler = SequentialLR(optimizer, [warmup, main_sched], milestones=[args.warmup_steps])
+    # Prepare with Accelerate — but NOT the scheduler. AcceleratedScheduler
+    # calls the underlying scheduler.step() `num_processes` times per sync,
+    # which silently scales down our warmup/cosine spans by that factor.
+    # We call scheduler.step() ourselves, gated on sync_gradients → exactly
+    # one advance per optimizer step, as the yaml spec intends.
+    model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
+    patchifier = AudioPatchifier(patch_size=1)
+    # Select timestep sampler based on base model type
+    if args.base_model == "distilled":
+        timestep_sampler = DistilledTimestepSampler()
+        if is_main:
+            logging.info("Using DistilledTimestepSampler (matching distilled model sigmas)")
+    else:
+        timestep_sampler = ShiftedLogitNormalTimestepSampler()
+        if is_main:
+            logging.info("Using ShiftedLogitNormalTimestepSampler (dev model)")
+    # Training loop
+    if is_main:
+        logging.info(f"Training: {args.steps} steps, lr={args.lr}, scheduler={args.lr_scheduler}, "
+                     f"batch={args.batch_size}, grad_accum={args.grad_accum}, "
+                     f"world_size={accelerator.num_processes}, "
+                     f"ref_ratio={args.ref_ratio}, max_ref_tokens={args.max_ref_tokens}")
+        logging.info("IC-LoRA pattern: ref tokens APPENDED to target, loss on target only")
+    data_iter = iter(dataloader)
+    step = 0
+    accum_loss = 0.0
+    best_loss = float("inf")
+    best_step = 0
+    t0 = time.time()
+    total_micro_steps = args.steps * args.grad_accum
+    for micro_step in range(total_micro_steps):
+        try:
+            batch = next(data_iter)
+        except StopIteration:
+            data_iter = iter(dataloader)
+            batch = next(data_iter)
+        is_opt_step = (micro_step + 1) % args.grad_accum == 0
+        if is_opt_step:
+            step += 1
+        with accelerator.accumulate(model):
+            tgt_latent = batch["tgt_latent"].to(dtype=dtype)  # [B, C, max_tgt_T, F]
+            ref_latent = batch["ref_latent"].to(dtype=dtype)  # [B, C, max_ref_T, F]
+            tgt_lengths = batch["tgt_lengths"].to(device=device)  # [B]
+            B = tgt_latent.shape[0]
+            # ── Random silence padding (0-1s) ── ltx_audio_tts baseline.
+            # User observed reference-audio leak at end of generations when this
+            # was reduced to 5 (v14) or 10 frames (v16/v17) — the model seemed
+            # to use the extra target budget to regurgitate ref content. Full
+            # 25 frames (0-1s avg 500ms) was apparently load-bearing for
+            # regularising the boundary and reducing hallucinations.
+            # Uses the real silence latent (not zeros) so the VAE decodes it as
+            # true silence instead of static noise.
+            max_pad_frames = 25  # ~1s at 25 latent frames/sec
+            pad_frames = random.randint(0, max_pad_frames)
+            if pad_frames > 0:
+                C, F_dim = tgt_latent.shape[1], tgt_latent.shape[3]
+                if not hasattr(args, '_silence_frame') or args._silence_frame is None:
+                    _sf_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "assets", "silence_latent_frame.pt")
+                    if os.path.exists(_sf_path):
+                        args._silence_frame = torch.load(_sf_path, weights_only=True)  # [C, 1, F]
+                        if is_main:
+                            logging.info(f"Loaded silence latent from {_sf_path}")
+                    else:
+                        args._silence_frame = False  # fallback to zeros
+                        if is_main:
+                            logging.warning(f"silence_latent_frame.pt not found, using zeros")
+                if args._silence_frame is not False:
+                    sf = args._silence_frame.to(dtype=dtype, device=device)  # [C, 1, F]
+                    silence_pad = sf.unsqueeze(0).expand(B, -1, pad_frames, -1)  # [B, C, pad, F]
+                else:
+                    silence_pad = torch.zeros(B, C, pad_frames, F_dim, dtype=dtype, device=device)
+                tgt_latent = torch.cat([silence_pad, tgt_latent], dim=2)
+            # Cap reference to max_ref_tokens (in latent frames, before patchification)
+            # After patchification, ref_T tokens = ref frames (patch_size=1)
+            ref_T_frames = min(ref_latent.shape[2], args.max_ref_tokens)
+            ref_latent = ref_latent[:, :, :ref_T_frames, :]
+            tgt_T_frames = tgt_latent.shape[2]  # max (padded) target frames
+            # ── Step 1: Create target AudioLatentShape and AudioLatentTools ──
+            tgt_shape = AudioLatentShape(
+                batch=B,
+                channels=tgt_latent.shape[1],  # 8
+                frames=tgt_T_frames,
+                mel_bins=tgt_latent.shape[3],   # 16
+            )
+            audio_tools = AudioLatentTools(
+                patchifier=patchifier,
+                target_shape=tgt_shape,
+            )
+            # ── Step 2: Create initial state from target latent ──
+            # create_initial_state patchifies: [B, C, T, F] -> [B, T, C*F]
+            # Also creates denoise_mask=1 (all target tokens will be denoised)
+            # and computes temporal positions
+            state = audio_tools.create_initial_state(
+                device=device,
+                dtype=dtype,
+                initial_latent=tgt_latent,
+            )
+            # state.latent: [B, tgt_T, 128], state.denoise_mask: [B, tgt_T, 1]
+            # state.positions: [B, 1, tgt_T, 2]
+            tgt_T = audio_tools.target_shape.token_count()  # = tgt_T_frames
+            # ── Step 3: Apply flow-matching noise to target BEFORE appending ref ──
+            # Sample sigma
+            total_tokens = tgt_T + ref_T_frames
+            sigma = timestep_sampler.sample(B, total_tokens, device=device)
+            sigma_exp = sigma.view(-1, 1, 1)  # [B, 1, 1]
+            noise = torch.randn_like(state.latent)  # [B, tgt_T, 128]
+            noisy_tgt = (1 - sigma_exp) * state.latent + sigma_exp * noise
+            # Replace the latent in state with the noisy version
+            # (clean_latent stays clean for post_process_latent pattern)
+            state = LatentState(
+                latent=noisy_tgt,
+                denoise_mask=state.denoise_mask,
+                positions=state.positions,
+                clean_latent=state.clean_latent,
+                attention_mask=state.attention_mask,
+            )
+            # ── Step 4: Append reference tokens using AudioConditionByReferenceLatent ──
+            # This appends ref tokens to the END with denoise_mask=0 (frozen/clean)
+            # Skip entirely when ref_T=0 (SFX / song samples): the model trains
+            # target-only for those categories since there's no voice to clone.
+            if ref_T_frames > 0:
+                ref_conditioning = AudioConditionByReferenceLatent(
+                    latent=ref_latent,
+                    strength=1.0,  # 1.0 = ref fully clean (denoise_mask=0)
+                )
+                state = ref_conditioning.apply_to(
+                    latent_state=state,
+                    latent_tools=audio_tools,
+                )
+            # state.latent: [B, tgt_T + ref_T, 128]
+            # state.denoise_mask: [B, tgt_T + ref_T, 1]
+            #   target tokens: 1.0 (denoise), ref tokens: 0.0 (frozen)
+            # state.positions: [B, 1, tgt_T + ref_T, 2]
+            # ── Step 5: Build loss mask for target tokens (excluding padding) ──
+            # loss_mask: 1 for real target tokens, 0 for padding and ref tokens
+            loss_mask = torch.zeros(B, tgt_T, device=device)
+            for b_idx in range(B):
+                real_len = min(tgt_lengths[b_idx].item(), tgt_T)
+                loss_mask[b_idx, :real_len] = 1.0
+            # ── Step 6: Prepare text context ──
+            # Text conditioning dropout: randomly zero out text context to force
+            # the model to rely on the voice reference for identity/style.
+            with torch.no_grad():
+                audio_context = prepare_audio_context(
+                    audio_connector, batch["audio_features"],
+                    batch["attention_mask"], device, dtype)
+                if args.text_dropout > 0 and random.random() < args.text_dropout:
+                    audio_context = torch.zeros_like(audio_context)
+            # ── Step 7: Build Modality using modality_from_latent_state ──
+            # timesteps = sigma * denoise_mask (ref gets 0, target gets sigma)
+            audio_mod = modality_from_latent_state(
+                state=state,
+                context=audio_context,
+                sigma=sigma,
+                enabled=True,
+            )
+            # ── Step 8: Forward pass ──
+            perturbations = BatchedPerturbationConfig.empty(B)
+            with torch.autocast(device_type="cuda", dtype=dtype):
+                _, velocity_pred = model(video=None, audio=audio_mod, perturbations=perturbations)
+            # ── Step 9: Compute loss (IC-LoRA pattern) ──
+            # Target is at the FRONT (indices 0..tgt_T), ref at the END
+            # velocity target = noise - clean
+            tgt_patchified = audio_tools.patchifier.patchify(tgt_latent)  # [B, tgt_T, 128]
+            target_velocity = noise - tgt_patchified
+            # Extract target portion of prediction
+            pred_tgt = velocity_pred[:, :tgt_T]  # [B, tgt_T, 128]
+            # MSE loss with mask: only on real target tokens (not padding or ref)
+            per_token_mse = (pred_tgt - target_velocity).pow(2).mean(dim=-1)  # [B, tgt_T]
+            loss = per_token_mse.mul(loss_mask).div(loss_mask.mean().clamp(min=1e-6)).mean()
+            accelerator.backward(loss)
+            if accelerator.sync_gradients and args.max_grad_norm > 0:
+                accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+            optimizer.step()
+            optimizer.zero_grad()
+            # Only advance the LR scheduler once per OPTIMIZER step (not per
+            # micro-step). Mirrors AcceleratedOptimizer.step() which is
+            # internally gated on sync_gradients.
+            if accelerator.sync_gradients:
+                scheduler.step()
+        accum_loss += loss.item()
+        # Logging & saving on optimization steps only
+        if is_opt_step and step % args.log_every == 0 and is_main:
+            avg_loss = accum_loss / (args.log_every * args.grad_accum)
+            lr = optimizer.param_groups[0]["lr"]
+            elapsed = time.time() - t0
+            sps = step / elapsed if elapsed > 0 else 0
+            eta = (args.steps - step) / sps if sps > 0 else 0
+            logging.info(
+                f"Step {step}/{args.steps} | loss={avg_loss:.4f} | lr={lr:.2e} | "
+                f"tgt_T={tgt_T} ref_T={ref_T_frames} total={tgt_T + ref_T_frames} | "
+                f"{sps:.1f} steps/s | ETA {eta/60:.0f}min"
+            )
+            # Save best whenever loss improves — no warmup gate, so we can
+            # observe best checkpoints during warmup too.
+            if avg_loss < best_loss:
+                best_loss = avg_loss
+                old_best = os.path.join(args.output_dir, f"best_step_{best_step:05d}.safetensors")
+                best_step = step + args.resume_step_offset
+                new_best = os.path.join(args.output_dir, f"best_step_{best_step:05d}.safetensors")
+                unwrapped = _unwrap_model_safe(model)
+                unwrapped.save_pretrained(args.output_dir)
+                adapter = os.path.join(args.output_dir, "adapter_model.safetensors")
+                if os.path.exists(adapter):
+                    shutil.copy(adapter, new_best)
+                if old_best != new_best and os.path.exists(old_best):
+                    os.remove(old_best)
+                logging.info(f"New best: loss={best_loss:.4f} at step {best_step}")
+            accum_loss = 0.0
+        if is_opt_step and step % args.save_every == 0 and is_main:
+            global_step = step + args.resume_step_offset
+            save_path = os.path.join(args.output_dir, f"lora_step_{global_step:05d}.safetensors")
+            logging.info(f"Saving: {save_path}")
+            unwrapped = _unwrap_model_safe(model)
+            unwrapped.save_pretrained(args.output_dir)
+            adapter = os.path.join(args.output_dir, "adapter_model.safetensors")
+            if os.path.exists(adapter):
+                shutil.copy(adapter, save_path)
+            if args.val_config:
+                logging.info(f"Running validation at step {global_step}...")
+                model.eval()
+                run_validation(save_path, args.val_config, args.output_dir, global_step,
+                               lora_rank=args.lora_rank)
+                model.train()
+    # Final save
+    if is_main:
+        unwrapped = _unwrap_model_safe(model)
+        unwrapped.save_pretrained(args.output_dir)
+        adapter = os.path.join(args.output_dir, "adapter_model.safetensors")
+        global_step = step + args.resume_step_offset
+        save_path = os.path.join(args.output_dir, f"lora_step_{global_step:05d}.safetensors")
+        if os.path.exists(adapter):
+            shutil.copy(adapter, save_path)
+        logging.info(f"Training complete! {step} steps in {time.time()-t0:.0f}s")
+        logging.info(f"Best loss: {best_loss:.4f} at step {best_step}")
+# Script entry point: start training only when this file is executed
+# directly, not when it is imported as a module.
+if __name__ == "__main__":
+    main()

dramabox_src/validate.py ADDED Viewed

	@@ -0,0 +1,363 @@

+#!/usr/bin/env python3
+"""Warm validation runner — loads base dev + LoRA + all aux models ONCE,
+then iterates every speaker in val_config generating each output.
+Matches the same generation path as inference.py but keeps Gemma / audio VAE
+/ velocity model / audio decoder resident across entries. Inference
+settings default to the Gradio warm-server values (cfg=2.5, stg=1.5,
+modality=1.0, rescale=0, 30 steps, fps=25) — use --inference-params to
+override.
+"""
+import argparse
+import logging
+import os
+import sys
+import time
+import traceback
+import torch
+import torchaudio
+REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+MODEL_DIR = REPO_DIR
+sys.path.insert(0, os.path.join(REPO_DIR, "ltx2"))
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+DEV_FULL_CKPT = os.environ.get(
+    "LTX_FULL_CHECKPOINT",
+    os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "ltx-2.3-22b-dev.safetensors"),
+)
+GEMMA_ROOT = os.environ.get(
+    "GEMMA_ROOT",
+    os.path.expanduser("~/.cache/dramabox/gemma-3-12b-it-bnb-4bit"),
+)
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--val-config", required=True)
+    p.add_argument("--output-dir", required=True)
+    p.add_argument("--lora", default=None)
+    p.add_argument("--lora-rank", type=int, default=128)
+    p.add_argument("--full-checkpoint", default=DEV_FULL_CKPT)
+    p.add_argument("--gemma-root", default=GEMMA_ROOT)
+    p.add_argument("--cfg-scale", type=float, default=2.5)
+    p.add_argument("--stg-scale", type=float, default=1.5)
+    p.add_argument("--rescale-scale", type=float, default=0.0)
+    p.add_argument("--modality-scale", type=float, default=1.0)
+    p.add_argument("--steps", type=int, default=30)
+    p.add_argument("--fps", type=float, default=25.0)
+    p.add_argument("--stg-block", type=int, default=29)
+    p.add_argument("--cfg-clamp", type=float, default=0.0)
+    p.add_argument("--seed", type=int, default=42)
+    p.add_argument("--duration-multiplier", type=float, default=1.1)
+    # Match Gradio / inference_server.py DEFAULT_NEG exactly
+    p.add_argument("--negative-prompt", default=(
+        "worst quality, inconsistent, robotic, distorted, noise, static, "
+        "muffled, unclear, unnatural, monotone"
+    ))
+    return p.parse_args()
+def estimate_speech_duration(prompt: str, speed: float = 1.0) -> float:
+    import re
+    quoted = re.findall(r'"([^"]*)"', prompt) or re.findall(r"'([^']*)'", prompt)
+    text = " ".join(quoted) if quoted else prompt
+    duration = len(text) * 0.065 / max(speed, 0.1) + 1.5
+    return max(3.0, round(duration, 1))
+class WarmValidator:
+    def __init__(self, full_checkpoint, gemma_root, lora_path=None, lora_rank=128,
+                 device="cuda", dtype=torch.bfloat16):
+        from audio_conditioning import AudioConditionByReferenceLatent  # noqa: F401 (imported by inference.py)
+        from ltx_core.components.patchifiers import AudioPatchifier
+        from ltx_pipelines.utils.blocks import PromptEncoder, AudioConditioner, AudioDecoder
+        self.device = torch.device(device)
+        self.dtype = dtype
+        self.full_checkpoint = full_checkpoint
+        self.gemma_root = gemma_root
+        self.patchifier = AudioPatchifier(patch_size=1)
+        logging.info("Loading PromptEncoder (Gemma + embeddings_processor)...")
+        t0 = time.time()
+        self.prompt_encoder = PromptEncoder(
+            checkpoint_path=full_checkpoint, gemma_root=gemma_root,
+            dtype=dtype, device=self.device, warm=True, audio_only=True,
+        )
+        logging.info(f"  PromptEncoder ready in {time.time()-t0:.1f}s")
+        logging.info("Loading AudioConditioner (audio VAE encoder)...")
+        t0 = time.time()
+        self.audio_conditioner = AudioConditioner(
+            checkpoint_path=full_checkpoint, dtype=dtype, device=self.device, warm=True,
+        )
+        logging.info(f"  AudioConditioner ready in {time.time()-t0:.1f}s")
+        logging.info("Loading AudioDecoder...")
+        t0 = time.time()
+        self.audio_decoder = AudioDecoder(
+            checkpoint_path=full_checkpoint, dtype=dtype, device=self.device, warm=True,
+        )
+        logging.info(f"  AudioDecoder ready in {time.time()-t0:.1f}s")
+        logging.info("Building velocity model (audio-only from base dev)...")
+        t0 = time.time()
+        self.velocity_model = self._build_velocity_model(full_checkpoint, lora_path, lora_rank)
+        logging.info(f"  Velocity model ready in {time.time()-t0:.1f}s "
+                     f"({sum(p.numel() for p in self.velocity_model.parameters()) / 1e9:.1f}B params)")
+    def _build_velocity_model(self, checkpoint_path, lora_path, lora_rank):
+        from ltx_core.loader.registry import DummyRegistry
+        from ltx_core.loader.sd_ops import SDOps
+        from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder as Builder
+        from ltx_core.model.model_protocol import ModelConfigurator
+        from ltx_core.model.transformer.attention import AttentionFunction
+        from ltx_core.model.transformer.model import LTXModel, LTXModelType
+        from ltx_core.model.transformer.rope import LTXRopeType
+        sd_ops = (
+            SDOps("AO")
+            .with_matching(prefix="model.diffusion_model.")
+            .with_replacement("model.diffusion_model.", "")
+        )
+        class Cfg(ModelConfigurator[LTXModel]):
+            @classmethod
+            def from_config(cls, config):
+                t = config.get("transformer", {})
+                cp = None
+                if not t.get("caption_proj_before_connector", False):
+                    from ltx_core.model.transformer.text_projection import create_caption_projection
+                    with torch.device("meta"):
+                        cp = create_caption_projection(t, audio=True)
+                return LTXModel(
+                    model_type=LTXModelType.AudioOnly,
+                    audio_num_attention_heads=t.get("audio_num_attention_heads", 32),
+                    audio_attention_head_dim=t.get("audio_attention_head_dim", 64),
+                    audio_in_channels=t.get("audio_in_channels", 128),
+                    audio_out_channels=t.get("audio_out_channels", 128),
+                    num_layers=t.get("num_layers", 48),
+                    audio_cross_attention_dim=t.get("audio_cross_attention_dim", 2048),
+                    norm_eps=t.get("norm_eps", 1e-6),
+                    attention_type=AttentionFunction(t.get("attention_type", "default")),
+                    positional_embedding_theta=10000.0,
+                    audio_positional_embedding_max_pos=[20.0],
+                    timestep_scale_multiplier=t.get("timestep_scale_multiplier", 1000),
+                    use_middle_indices_grid=t.get("use_middle_indices_grid", True),
+                    rope_type=LTXRopeType(t.get("rope_type", "interleaved")),
+                    double_precision_rope=t.get("frequencies_precision", False) == "float64",
+                    apply_gated_attention=t.get("apply_gated_attention", False),
+                    audio_caption_projection=cp,
+                    cross_attention_adaln=t.get("cross_attention_adaln", False),
+                )
+        builder = Builder(
+            model_path=checkpoint_path, model_class_configurator=Cfg,
+            model_sd_ops=sd_ops, registry=DummyRegistry(),
+        )
+        velocity = builder.build(device=self.device, dtype=self.dtype).to(self.device).eval()
+        if lora_path and os.path.exists(lora_path):
+            from peft import LoraConfig, get_peft_model
+            from safetensors.torch import load_file as st_load
+            logging.info(f"Attaching LoRA: {lora_path}")
+            lora_sd = st_load(lora_path)
+            is_peft = any("base_model.model." in k for k in lora_sd.keys())
+            is_iclora = any("diffusion_model." in k for k in lora_sd.keys())
+            cfg = LoraConfig(
+                r=lora_rank, lora_alpha=lora_rank, lora_dropout=0.0, bias="none",
+                target_modules=[
+                    "audio_attn1.to_k", "audio_attn1.to_q",
+                    "audio_attn1.to_v", "audio_attn1.to_out.0",
+                    "audio_attn2.to_k", "audio_attn2.to_q",
+                    "audio_attn2.to_v", "audio_attn2.to_out.0",
+                    "audio_ff.net.0.proj", "audio_ff.net.2",
+                ],
+            )
+            velocity = get_peft_model(velocity, cfg)
+            if is_peft:
+                mapped = {}
+                for k, v in lora_sd.items():
+                    nk = k
+                    if ".lora_A.weight" in k and ".lora_A.default.weight" not in k:
+                        nk = k.replace(".lora_A.weight", ".lora_A.default.weight")
+                    if ".lora_B.weight" in k and ".lora_B.default.weight" not in k:
+                        nk = k.replace(".lora_B.weight", ".lora_B.default.weight")
+                    mapped[nk] = v
+                _, unexpected = velocity.load_state_dict(mapped, strict=False)
+                logging.info(f"  Loaded {len(mapped) - len(unexpected)} LoRA weights (peft)")
+            elif is_iclora:
+                audio_keys = {k: v for k, v in lora_sd.items()
+                              if "audio_attn1" in k or "audio_attn2" in k or "audio_ff" in k}
+                mapped = {}
+                for k, v in audio_keys.items():
+                    nk = k.replace("diffusion_model.", "base_model.model.")
+                    nk = nk.replace(".lora_A.weight", ".lora_A.default.weight")
+                    nk = nk.replace(".lora_B.weight", ".lora_B.default.weight")
+                    mapped[nk] = v
+                _, unexpected = velocity.load_state_dict(mapped, strict=False)
+                logging.info(f"  Loaded {len(mapped) - len(unexpected)} LoRA weights (iclora)")
+            velocity = velocity.merge_and_unload()
+            logging.info("  Merged LoRA into base weights")
+        return velocity
+    @torch.inference_mode()
+    def generate(self, prompt, output_path, voice_ref=None, args=None):
+        from audio_conditioning import AudioConditionByReferenceLatent
+        from ltx_core.batch_split import BatchSplitAdapter
+        from ltx_core.components.diffusion_steps import EulerDiffusionStep
+        from ltx_core.components.guiders import MultiModalGuider, MultiModalGuiderParams
+        from ltx_core.components.noisers import GaussianNoiser
+        from ltx_core.components.schedulers import LTX2Scheduler
+        from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+        from ltx_core.model.transformer.model import X0Model
+        from ltx_core.tools import AudioLatentTools
+        from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
+        from ltx_pipelines.utils.denoisers import GuidedDenoiser, SimpleDenoiser
+        from ltx_pipelines.utils.gpu_model import gpu_model
+        from ltx_pipelines.utils.media_io import decode_audio_from_file
+        from ltx_pipelines.utils.samplers import euler_denoising_loop
+        t_total = time.time()
+        # ---- Duration + shape ----
+        gen_dur = estimate_speech_duration(prompt) * args.duration_multiplier
+        raw_frames = int(round(gen_dur * args.fps)) + 1
+        num_frames = ((raw_frames - 1 + 4) // 8) * 8 + 1
+        pixel_shape = VideoPixelShape(batch=1, frames=num_frames, height=64, width=64, fps=args.fps)
+        tgt_shape = AudioLatentShape.from_video_pixel_shape(pixel_shape)
+        audio_tools = AudioLatentTools(patchifier=self.patchifier, target_shape=tgt_shape)
+        state = audio_tools.create_initial_state(self.device, self.dtype)
+        # ---- Voice reference ----
+        if voice_ref and os.path.exists(voice_ref):
+            voice = decode_audio_from_file(voice_ref, self.device, 0.0, 10.0)
+            if voice is not None:
+                w = voice.waveform
+                if w.dim() == 2:
+                    if w.shape[0] == 1:
+                        w = w.repeat(2, 1)
+                    w = w.unsqueeze(0)
+                elif w.dim() == 3 and w.shape[1] == 1:
+                    w = w.repeat(1, 2, 1)
+                target_samples = int(10.0 * voice.sampling_rate)
+                if w.shape[-1] < target_samples:
+                    w = w.repeat(1, 1, (target_samples // w.shape[-1]) + 1)
+                w = w[..., :target_samples]
+                peak = w.abs().max()
+                if peak > 0:
+                    w = w * (10 ** (-4.0 / 20) / peak)
+                voice = Audio(waveform=w, sampling_rate=voice.sampling_rate)
+                ref_latent = self.audio_conditioner(lambda enc: vae_encode_audio(voice, enc, None))
+                cond = AudioConditionByReferenceLatent(
+                    latent=ref_latent.to(self.device, self.dtype), strength=1.0,
+                )
+                state = cond.apply_to(latent_state=state, latent_tools=audio_tools)
+        # ---- Noise ----
+        gen = torch.Generator(device=self.device).manual_seed(args.seed)
+        noiser = GaussianNoiser(generator=gen)
+        state = noiser(state, noise_scale=1.0)
+        # ---- Prompt encode ----
+        use_cfg = args.cfg_scale > 1.0
+        prompts = [prompt, args.negative_prompt] if use_cfg else [prompt]
+        ctx = self.prompt_encoder(prompts, streaming_prefetch_count=None)
+        a_ctx = ctx[0].audio_encoding
+        a_ctx_neg = ctx[1].audio_encoding if use_cfg else None
+        # ---- Denoiser ----
+        needs_guidance = args.cfg_scale > 1.0 or args.stg_scale > 0.0 or args.modality_scale > 1.0
+        if needs_guidance:
+            guider = MultiModalGuider(
+                params=MultiModalGuiderParams(
+                    cfg_scale=args.cfg_scale, stg_scale=args.stg_scale,
+                    stg_blocks=[args.stg_block] if args.stg_scale > 0 else [],
+                    rescale_scale=args.rescale_scale,
+                    modality_scale=args.modality_scale,
+                    cfg_clamp_scale=args.cfg_clamp,
+                ),
+                negative_context=a_ctx_neg,
+            )
+            denoiser = GuidedDenoiser(
+                v_context=None, a_context=a_ctx,
+                video_guider=None, audio_guider=guider,
+            )
+        else:
+            denoiser = SimpleDenoiser(v_context=None, a_context=a_ctx)
+        sigmas = LTX2Scheduler().execute(steps=args.steps, latent=state.latent).to(self.device)
+        # ---- Denoise ----
+        # NOTE: don't wrap in gpu_model() — that context manager moves the
+        # model back off GPU on exit, which breaks subsequent iterations of
+        # our warm validator. We keep the velocity model resident.
+        x0 = X0Model(self.velocity_model)
+        batched = BatchSplitAdapter(x0, max_batch_size=1)
+        _, audio_state = euler_denoising_loop(
+            sigmas=sigmas, video_state=None, audio_state=state,
+            stepper=EulerDiffusionStep(), transformer=batched, denoiser=denoiser,
+        )
+        audio_state = audio_tools.clear_conditioning(audio_state)
+        audio_state = audio_tools.unpatchify(audio_state)
+        decoded = self.audio_decoder(audio_state.latent)
+        wav = decoded.waveform
+        if wav.dim() == 1:
+            wav = wav.unsqueeze(0)
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+        torchaudio.save(output_path, wav.float().cpu(), decoded.sampling_rate)
+        logging.info(f"  -> {output_path} ({wav.shape[-1]/decoded.sampling_rate:.1f}s, "
+                     f"{time.time()-t_total:.1f}s)")
+def main():
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+    args = parse_args()
+    import yaml
+    with open(args.val_config) as f:
+        val_cfg = yaml.safe_load(f)
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Build validator once (models warm for all entries).
+    validator = WarmValidator(
+        full_checkpoint=args.full_checkpoint,
+        gemma_root=args.gemma_root,
+        lora_path=args.lora,
+        lora_rank=args.lora_rank,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+        dtype=torch.bfloat16,
+    )
+    n_ok = n_fail = 0
+    t0 = time.time()
+    for entry in val_cfg.get("speakers", []):
+        name = entry["name"]
+        out_path = os.path.join(args.output_dir, f"{name}.wav")
+        try:
+            validator.generate(
+                prompt=entry["prompt"],
+                output_path=out_path,
+                voice_ref=entry.get("reference"),
+                args=args,
+            )
+            n_ok += 1
+            logging.info(f"  [{name}] OK")
+        except Exception as e:
+            n_fail += 1
+            logging.warning(f"  [{name}] FAILED: {e}")
+            traceback.print_exc()
+    logging.info(f"Validation done: ok={n_ok} fail={n_fail} in {(time.time()-t0)/60:.1f}min "
+                 f"at {args.output_dir}")
+# Run the validation sweep only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()