scenema-audio

Runtime error

App Files Files Community

multimodalart commited on 12 days ago

Commit

cdc4405

1 Parent(s): c327e46

Initial Gradio ZeroGPU app for Scenema Audio

Browse files

Files changed (21) hide show

README.md +28 -3
app.py +385 -0
requirements.txt +30 -0
src/audio_core/__init__.py +7 -0
src/audio_core/audio_utils.py +266 -0
src/audio_core/chunker.py +334 -0
src/audio_core/compiler.py +305 -0
src/audio_core/engine.py +911 -0
src/audio_core/enhancer.py +121 -0
src/audio_core/inference.py +183 -0
src/audio_core/main.py +42 -0
src/audio_core/processor.py +484 -0
src/audio_core/seedvc.py +194 -0
src/audio_core/validate_and_patch.py +402 -0
src/audio_core/validator.py +105 -0
src/audio_core/vocal_separator.py +244 -0
src/audio_core/whisper_aligner.py +139 -0
src/common/__init__.py +0 -0
src/common/handlers/__init__.py +0 -0
src/common/handlers/base.py +40 -0
src/server.py +188 -0

README.md CHANGED Viewed

@@ -1,13 +1,38 @@
 ---
 title: Scenema Audio
-emoji: 🚀
 colorFrom: pink
 colorTo: red
 sdk: gradio
 sdk_version: 6.14.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Scenema Audio
+emoji: 🎙️
 colorFrom: pink
 colorTo: red
 sdk: gradio
 sdk_version: 6.14.0
+python_version: '3.12'
 app_file: app.py
 pinned: false
+hardware: zero-a10g
+short_description: Zero-shot expressive voice cloning and speech generation
+suggested_storage: large
 ---
+# Scenema Audio (ZeroGPU)
+Gradio wrapper around [ScenemaAI/scenema-audio](https://github.com/ScenemaAI/scenema-audio).
+Zero-shot expressive voice cloning and speech generation with emotion, pacing,
+and breath control, built on an audio diffusion transformer extracted from
+[LTX 2.3](https://github.com/Lightricks/LTX-2).
+## Cold start
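+The first startup downloads ~38 GB of model weights (this happens when `app.py` is imported, before the UI is served):
+- `scenema-audio-transformer-int8.safetensors` (~4.9 GB)
+- `scenema-audio-pipeline.safetensors` (~6.7 GB)
+- `scenema-audio-vae-encoder.safetensors` (~42 MB)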
+- `google/gemma-3-12b-it` (~24 GB, **gated** — requires `HF_TOKEN` secret)
+- SeedVC + BigVGAN + Whisper checkpoints (~3 GB)
+- MelBandRoFormer (~436 MB)
+Set `HF_TOKEN` in the Space secrets with access to `google/gemma-3-12b-it`.
+## License
+- **Model weights:** LTX-2 Community License Agreement
+- **Code:** MIT

app.py ADDED Viewed

	@@ -0,0 +1,385 @@

+"""Scenema Audio - ZeroGPU Gradio Space.
+Wraps the ScenemaAI/scenema-audio AudioProcessor in a Gradio UI.
+Heavy model weights (~38 GB) are downloaded on first cold-start and
+cached on persistent storage; generation runs under @spaces.GPU.
+"""
+import asyncio
+import base64
+import logging
+import os
+import sys
+import tempfile
+import uuid
+from pathlib import Path
+# Environment bootstrap. Everything below mutates os.environ and must run
+# before the audio_core imports further down, which read these variables.
+# setdefault() is used throughout so a value supplied by the Space settings wins.
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+# Model cache location: an explicit MODEL_DIR env var always wins. Otherwise use
+# /data/models when a /data directory exists (Spaces persistent storage, so the
+# weights survive restarts) and fall back to ./models next to the app.
+MODEL_DIR = Path(os.environ.get("MODEL_DIR", "/data/models")) \
+    if Path("/data").exists() else Path(os.environ.get("MODEL_DIR", "./models"))
+MODEL_DIR.mkdir(parents=True, exist_ok=True)
+# Re-export the resolved path so code reading MODEL_DIR sees the same value.
+os.environ["MODEL_DIR"] = str(MODEL_DIR)
+# Default model paths (must be set before AudioProcessor is imported)
+os.environ.setdefault(
+    "AUDIO_CKPT", str(MODEL_DIR / "scenema-audio-transformer-int8.safetensors")
+)
+os.environ.setdefault(
+    "PIPELINE_CKPT", str(MODEL_DIR / "scenema-audio-pipeline.safetensors")
+)
+os.environ.setdefault(
+    "VAE_ENCODER_CKPT", str(MODEL_DIR / "scenema-audio-vae-encoder.safetensors")
+)
+os.environ.setdefault("GEMMA_ROOT", str(MODEL_DIR / "gemma-3-12b-it"))
+os.environ.setdefault(
+    "MELBAND_MODEL_PATH", str(MODEL_DIR / "MelBandRoformer_fp16.safetensors")
+)
+# Third-party source checkouts live next to the app (current working directory),
+# not in MODEL_DIR; they are cloned by _ensure_seedvc_repo() below.
+os.environ.setdefault("SEEDVC_PATH", str(Path.cwd() / "seed-vc"))
+os.environ.setdefault("MELBAND_NODE_PATH", str(Path.cwd() / "melband_roformer_node"))
+os.environ.setdefault("HF_HUB_CACHE", str(MODEL_DIR / "hf_cache"))
+# NOTE(review): presumably selects 4-bit NF4 quantisation for the Gemma text
+# encoder; the consumer of GEMMA_QUANTIZE is not visible here — confirm.
+os.environ.setdefault("GEMMA_QUANTIZE", "nf4")
+# Make repo source importable
+sys.path.insert(0, str(Path(__file__).parent / "src"))
+import gradio as gr
+import spaces
+from huggingface_hub import hf_hub_download, snapshot_download
+# Root logging configuration for the whole Space process (INFO and above).
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s"
+)
+logger = logging.getLogger("scenema-space")
+# ── Model download (CPU phase, runs at import) ────────────────────────────
+# Hugging Face Hub repositories the weights are pulled from by _download_all().
+HF_REPO = "ScenemaAI/scenema-audio"
+GEMMA_REPO = "google/gemma-3-12b-it"
+SEEDVC_REPO = "Plachta/Seed-VC"
+BIGVGAN_REPO = "nvidia/bigvgan_v2_22khz_80band_256x"
+WHISPER_REPO = "openai/whisper-small"
+def _download_all():
+    """Download every model artifact that is not already on disk.
+    Target locations come from the environment variables configured at the
+    top of this module. Each artifact is skipped when it is already present,
+    so warm restarts with a populated cache do no network work. The optional
+    HF_TOKEN environment variable is forwarded to the Hub calls that fetch
+    the Scenema, MelBandRoFormer, Gemma and SeedVC files.
+    """
+    token = os.environ.get("HF_TOKEN")
+    # Single-file checkpoints: (env var with target path, repo, filename, log line).
+    single_files = (
+        (
+            "AUDIO_CKPT",
+            HF_REPO,
+            "scenema-audio-transformer-int8.safetensors",
+            "Downloading audio transformer INT8 (~4.9 GB)...",
+        ),
+        (
+            "PIPELINE_CKPT",
+            HF_REPO,
+            "scenema-audio-pipeline.safetensors",
+            "Downloading pipeline checkpoint (~6.7 GB)...",
+        ),
+        (
+            "VAE_ENCODER_CKPT",
+            HF_REPO,
+            "scenema-audio-vae-encoder.safetensors",
+            "Downloading VAE encoder (~42 MB)...",
+        ),
+        (
+            "MELBAND_MODEL_PATH",
+            "Kijai/MelBandRoFormer_comfy",
+            "MelBandRoformer_fp16.safetensors",
+            "Downloading MelBandRoFormer (~436 MB)...",
+        ),
+    )
+    for env_key, repo_id, filename, message in single_files:
+        target = Path(os.environ[env_key])
+        if target.exists():
+            continue
+        logger.info(message)
+        hf_hub_download(
+            repo_id,
+            filename,
+            local_dir=str(target.parent),
+            token=token,
+        )
+    # Gemma is a full repository snapshot; it counts as present only when the
+    # directory exists and holds at least one *.safetensors shard.
+    gemma_dir = Path(os.environ["GEMMA_ROOT"])
+    gemma_ready = gemma_dir.exists() and any(gemma_dir.glob("*.safetensors"))
+    if not gemma_ready:
+        logger.info("Downloading Gemma 3 12B IT (~24 GB, gated)...")
+        snapshot_download(
+            GEMMA_REPO,
+            local_dir=str(gemma_dir),
+            ignore_patterns=["*.gguf"],
+            token=token,
+        )
+    # SeedVC checkpoints plus the BigVGAN / Whisper snapshots it relies on.
+    ckpt_dir = Path(os.environ["SEEDVC_PATH"]) / "checkpoints"
+    seedvc_ready = ckpt_dir.exists() and any(ckpt_dir.glob("*.pth"))
+    if seedvc_ready:
+        return
+    logger.info("Downloading SeedVC checkpoints (~1.6 GB)...")
+    ckpt_dir.mkdir(parents=True, exist_ok=True)
+    local_cache = ckpt_dir / "hf_cache"
+    local_cache.mkdir(parents=True, exist_ok=True)
+    # Overrides the process-wide HF_HUB_CACHE set at module import.
+    os.environ["HF_HUB_CACHE"] = str(local_cache)
+    for seedvc_file in (
+        "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
+        "config_dit_mel_seed_uvit_whisper_small_wavenet.yml",
+    ):
+        hf_hub_download(
+            SEEDVC_REPO,
+            seedvc_file,
+            local_dir=str(ckpt_dir),
+            token=token,
+        )
+    snapshot_download(BIGVGAN_REPO, local_dir=str(local_cache / "bigvgan"))
+    snapshot_download(WHISPER_REPO, local_dir=str(local_cache / "whisper-small"))
+def _ensure_seedvc_repo():
+    """Clone the third-party source trees (architecture code) if missing.
+    Two shallow checkouts are made when absent:
+      * seed-vc into SEEDVC_PATH (detected via its ``modules`` sub-directory)
+      * ComfyUI-MelBandRoFormer into MELBAND_NODE_PATH
+    git is invoked with an argument list (no shell), so a path containing
+    spaces or shell metacharacters cannot alter the command. A failed clone
+    is logged instead of being silently ignored; it does not raise, so the
+    app can still start and report the problem.
+    """
+    import subprocess  # local import: only needed for this one-off bootstrap
+    # (destination, marker sub-path proving the checkout exists, URL, log line)
+    checkouts = (
+        (
+            Path(os.environ["SEEDVC_PATH"]),
+            "modules",
+            "https://github.com/Plachtaa/seed-vc.git",
+            "Cloning seed-vc source...",
+        ),
+        (
+            Path(os.environ["MELBAND_NODE_PATH"]),
+            None,
+            "https://github.com/kijai/ComfyUI-MelBandRoFormer",
+            "Cloning ComfyUI-MelBandRoFormer source...",
+        ),
+    )
+    for dest, marker, url, message in checkouts:
+        present = (dest / marker).exists() if marker else dest.exists()
+        if present:
+            continue
+        logger.info(message)
+        try:
+            proc = subprocess.run(
+                ["git", "clone", "--depth", "1", url, str(dest)],
+                check=False,
+            )
+        except OSError as exc:
+            # git binary missing or not executable.
+            logger.error("Could not run git to clone %s: %s", url, exc)
+            continue
+        if proc.returncode != 0:
+            logger.error(
+                "git clone of %s into %s failed with exit code %d",
+                url,
+                dest,
+                proc.returncode,
+            )
+_ensure_seedvc_repo()
+_download_all()
+# Import processor only after model paths/env are set
+from audio_core.processor import AudioProcessor  # noqa: E402
+from common.handlers.base import ProcessJob  # noqa: E402
+# Process-wide AudioProcessor singleton, created lazily on the first request.
+_processor: AudioProcessor | None = None
+def _get_processor() -> AudioProcessor:
+    """Return the shared AudioProcessor, creating and starting it on first use.
+    The instance is cached only after ``startup()`` has returned, so a failed
+    startup is retried on the next call instead of leaving a half-initialised
+    processor cached for the rest of the process lifetime.
+    """
+    global _processor
+    if _processor is not None:
+        return _processor
+    candidate = AudioProcessor()
+    candidate.startup()
+    _processor = candidate
+    return _processor
+# ── Generation ────────────────────────────────────────────────────────────
+def _build_prompt(text, voice, gender, scene, language, shot, action, sound_before):
+    """Assemble the ``<speak>`` XML prompt from the individual form fields.
+    Args:
+        text: Speech text; appended verbatim as the element body.
+        voice: Voice description (always emitted as the ``voice`` attribute).
+        gender: Speaker gender (always emitted as the ``gender`` attribute).
+        scene: Optional scene description; omitted when empty.
+        language: Language code; omitted when empty or ``"en"``.
+        shot: Optional shot type; omitted when empty.
+        action: Optional performance direction, wrapped in ``<action>``.
+        sound_before: Optional sound event, wrapped in ``<sound>`` and placed first.
+    Returns:
+        The prompt as a single ``<speak ...>...</speak>`` string.
+    Attribute values are XML-escaped (``&``, ``<``, ``>`` and ``"``) so that
+    free-form descriptions containing quotes or ampersands still yield
+    well-formed XML. Element content is passed through unchanged.
+    """
+    from xml.sax.saxutils import escape  # local import keeps the block self-contained
+    def _attr(name, value):
+        # Escape the double quote too, since it delimits the attribute value.
+        return f'{name}="{escape(str(value), {chr(34): "&quot;"})}"'
+    attrs = [_attr("voice", voice), _attr("gender", gender)]
+    if scene:
+        attrs.append(_attr("scene", scene))
+    if language and language != "en":
+        attrs.append(_attr("language", language))
+    if shot:
+        attrs.append(_attr("shot", shot))
+    inner = ""
+    if sound_before:
+        inner += f"<sound>{sound_before}</sound>"
+    if action:
+        inner += f"<action>{action}</action>"
+    inner += text
+    return f"<speak {' '.join(attrs)}>{inner}</speak>"
+@spaces.GPU(duration=300)
+def generate(
+    text,
+    voice,
+    gender,
+    scene,
+    language,
+    shot,
+    action,
+    sound_before,
+    reference_audio,
+    mode,
+    seed,
+    background_sfx,
+    skip_vc,
+    raw_xml,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """Gradio handler: build the prompt, run the AudioProcessor, return a WAV.
+    Args:
+        text, voice, gender, scene, language, shot, action, sound_before:
+            Form fields forwarded to ``_build_prompt`` (ignored when
+            ``raw_xml`` is non-blank).
+        reference_audio: Local file path of an uploaded reference voice, or
+            a falsy value when no cloning reference was supplied.
+        mode: Value forwarded as the job's ``mode``.
+        seed: Seed; ``None`` is sent as -1.
+        background_sfx: Forwarded as a bool in the job body.
+        skip_vc: Forwarded as a bool in the job body.
+        raw_xml: Complete ``<speak>`` XML that overrides the form fields.
+        progress: Gradio progress tracker.
+    Returns:
+        Tuple of (path to the generated WAV file, human-readable info line).
+    Raises:
+        gr.Error: If no speech text was given or the processor reports failure.
+    """
+    progress(0, desc="Loading models (cold start can take a few minutes)")
+    processor = _get_processor()
+    if raw_xml and raw_xml.strip():
+        prompt = raw_xml.strip()
+    else:
+        # The text field may arrive as None (e.g. via the API); treat as empty.
+        if not (text or "").strip():
+            raise gr.Error("Speech text is required.")
+        prompt = _build_prompt(text, voice, gender, scene, language, shot, action, sound_before)
+    body = {
+        "prompt": prompt,
+        "mode": mode,
+        "seed": int(seed) if seed is not None else -1,
+        "background_sfx": bool(background_sfx),
+        "skip_vc": bool(skip_vc),
+        "validate": True,
+    }
+    # The processor fetches the reference voice through `_download_reference`
+    # from a `reference_voice_url`. Gradio hands us a local file path, so it is
+    # passed as a file:// URL and `_download_reference` is temporarily wrapped
+    # (in _run below) to map that URL straight back to the local path.
+    if reference_audio:
+        body["reference_voice_url"] = f"file://{reference_audio}"
+    async def _run():
+        # NOTE(review): this swaps a method on the shared processor instance,
+        # so it is only safe while requests are handled one at a time.
+        original = processor._download_reference
+        async def patched(url):
+            if url.startswith("file://"):
+                return url[len("file://"):]
+            return await original(url)
+        processor._download_reference = patched
+        try:
+            job = ProcessJob(job_id=str(uuid.uuid4()), input=body)
+            return await processor.process(job)
+        finally:
+            # Always restore the real downloader, even if generation failed.
+            processor._download_reference = original
+    progress(0.1, desc="Generating audio")
+    result = asyncio.run(_run())
+    if not result.success:
+        raise gr.Error(result.error or "Generation failed")
+    # Write to temp wav and return path
+    out_path = Path(tempfile.gettempdir()) / f"scenema_{uuid.uuid4().hex}.wav"
+    out_path.write_bytes(result.output.data)
+    meta = result.output.metadata or {}
+    info = (
+        f"Duration: {meta.get('duration_s', 0)}s · "
+        f"Seed: {meta.get('seed')} · "
+        f"GPU: {meta.get('gpu', 'N/A')} · "
+        f"Time: {meta.get('processing_ms', 0)} ms"
+    )
+    return str(out_path), info
+# ── UI ────────────────────────────────────────────────────────────────────
+# Example rows for gr.Examples. Each row supplies values in the same order as
+# the `inputs` list used below: text, voice, gender, scene, language, shot,
+# action, sound_before, reference_audio, mode, seed, background_sfx, skip_vc,
+# raw_xml.
+EXAMPLES = [
+    [
+        "The old lighthouse had stood on the cliff for over a century, its beam cutting through the fog like a blade of light.",
+        "A warm, clear male voice with a slight British accent. Measured, thoughtful pacing.",
+        "male", "", "en", "closeup", "", "",
+        None, "generate", 42, False, False, "",
+    ],
+    [
+        "The city never really sleeps. It just closes its eyes and pretends for a while.",
+        "A young woman with a smoky, low register voice. Intimate, confessional tone.",
+        "female", "", "en", "closeup", "", "",
+        None, "voice_design", 7, False, False, "",
+    ],
+    [
+        "Get the lines! She is pulling loose! Move! I said move!",
+        "Male, mid 40s. Weathered. Urgent, projecting over wind.",
+        "male", "Open dock in a thunderstorm, heavy rain", "en", "scene",
+        "He shouts over the storm", "Heavy rain and wind howling",
+        None, "generate", 11, True, False, "",
+    ],
+]
+# Gradio UI definition: inputs in the wide left column, generated audio and
+# run metadata in the right column.
+with gr.Blocks(title="Scenema Audio") as demo:
+    gr.Markdown(
+        """
+        # Scenema Audio · Zero-shot Expressive TTS
+        Generate expressive speech with emotion, scene, and voice cloning.
+        Built on [ScenemaAI/scenema-audio](https://github.com/ScenemaAI/scenema-audio).
+        **Note:** First request triggers a ~38 GB cold start. Subsequent requests are fast.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=3):
+            text = gr.Textbox(
+                label="Speech text",
+                lines=4,
+                placeholder="What the voice should say...",
+            )
+            voice = gr.Textbox(
+                label="Voice description",
+                lines=2,
+                placeholder='e.g. "A warm male voice with a slight British accent..."',
+            )
+            with gr.Row():
+                gender = gr.Radio(["male", "female"], value="male", label="Gender")
+                language = gr.Dropdown(
+                    ["en", "es", "fr", "de", "it", "pt", "ja", "zh", "ko"],
+                    value="en", label="Language",
+                )
+                shot = gr.Radio(
+                    ["closeup", "wide", "scene"], value="closeup", label="Shot"
+                )
+            with gr.Accordion("Scene & direction (optional)", open=False):
+                scene = gr.Textbox(label="Scene", placeholder="e.g. busy cafe at midday")
+                action = gr.Textbox(label="Performance direction (<action>)")
+                sound_before = gr.Textbox(label="Sound event before speech (<sound>)")
+            # When non-blank, this XML replaces the prompt built from the fields above.
+            with gr.Accordion("Raw XML override (optional)", open=False):
+                raw_xml = gr.Textbox(
+                    label="<speak> XML (overrides fields above when set)",
+                    lines=4,
+                )
+            with gr.Accordion("Voice cloning (optional)", open=False):
+                # type="filepath": generate() receives a local path to the upload.
+                reference_audio = gr.Audio(
+                    label="Reference voice (10-20s)",
+                    type="filepath",
+                )
+            with gr.Row():
+                mode = gr.Radio(
+                    ["generate", "voice_design"], value="generate", label="Mode"
+                )
+                seed = gr.Number(value=42, precision=0, label="Seed (-1 = random)")
+            with gr.Row():
+                background_sfx = gr.Checkbox(value=False, label="Keep background SFX")
+                skip_vc = gr.Checkbox(value=False, label="Skip SeedVC post-processing")
+            run_btn = gr.Button("Generate", variant="primary")
+        with gr.Column(scale=2):
+            out_audio = gr.Audio(label="Output", type="filepath")
+            info = gr.Textbox(label="Info", interactive=False)
+    # The input order here must match both the EXAMPLES rows and the positional
+    # parameters of generate().
+    gr.Examples(
+        examples=EXAMPLES,
+        inputs=[
+            text, voice, gender, scene, language, shot, action, sound_before,
+            reference_audio, mode, seed, background_sfx, skip_vc, raw_xml,
+        ],
+    )
+    run_btn.click(
+        generate,
+        inputs=[
+            text, voice, gender, scene, language, shot, action, sound_before,
+            reference_audio, mode, seed, background_sfx, skip_vc, raw_xml,
+        ],
+        outputs=[out_audio, info],
+    )
+# Script entry point: enable the request queue, then start the server.
+if __name__ == "__main__":
+    demo.queue().launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,30 @@

+numpy==2.2.6
+transformers==4.57.6
+accelerate==1.13.0
+safetensors==0.7.0
+sentencepiece==0.2.1
+ltx-core @ git+https://github.com/Lightricks/LTX-2.git@41d924371612b692c0fd1e4d9d94c3dfb3c02cb3#subdirectory=packages/ltx-core
+ltx-pipelines @ git+https://github.com/Lightricks/LTX-2.git@41d924371612b692c0fd1e4d9d94c3dfb3c02cb3#subdirectory=packages/ltx-pipelines
+scipy==1.13.1
+librosa==0.10.2
+huggingface-hub==0.36.2
+munch==4.0.0
+einops==0.8.0
+descript-audio-codec==1.0.0
+pydub==0.25.1
+soundfile==0.12.1
+hydra-core==1.3.2
+pyyaml==6.0.3
+python-dotenv==1.2.2
+diffusers==0.37.1
+onnxruntime==1.25.0
+funasr==1.3.1
+rotary-embedding-torch==0.8.9
+beartype==0.22.9
+fastapi==0.136.1
+httpx==0.28.1
+psutil==7.2.2
+bitsandbytes==0.49.2
+kokoro==0.9.4
+faster-whisper==1.2.1
+ctranslate2==4.7.1

src/audio_core/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Scenema Audio: Expressive audio generation via LTX 2.3 audio diffusion."""
+__version__ = "1.0.0"

src/audio_core/audio_utils.py ADDED Viewed

	@@ -0,0 +1,266 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Audio utility functions for Scenema Audio.
+Silence trimming, volume normalization, wav I/O, format conversion.
+"""
+import logging
+import math
+import numpy as np
+import soundfile as sf
+logger = logging.getLogger(__name__)
+def trim_silence(
+    audio_np: np.ndarray,
+    sr: int,
+    max_silence: float = 0.5,
+    threshold_db: float = -40,
+) -> np.ndarray:
+    """Trim silence exceeding max_silence from start and end of audio.
+    Keeps up to max_silence seconds of silence at boundaries. The signal is
+    analysed in 20 ms windows over its whole length, including the final
+    (possibly partial) window, so speech at the very end is never missed.
+    Args:
+        audio_np: Audio samples, shape (samples,) or (samples, channels).
+        sr: Sample rate in Hz.
+        max_silence: Maximum silence to keep at head/tail in seconds.
+        threshold_db: Amplitude threshold below which audio is considered silence.
+    Returns:
+        Trimmed audio array with the same number of dimensions as input.
+        The input is returned unchanged when it is shorter than one window
+        or contains no window above the threshold.
+    """
+    threshold = 10 ** (threshold_db / 20.0)
+    max_silent_samples = int(max_silence * sr)
+    # 20ms analysis window; at least one sample so very low rates cannot
+    # produce a zero step below.
+    window = max(1, int(0.02 * sr))
+    if audio_np.ndim == 2:
+        mono = audio_np.mean(axis=1)
+    else:
+        mono = audio_np
+    if len(mono) < window:
+        return audio_np
+    # Peak amplitude per window. Iterating up to len(mono) lets the slice pick
+    # up the last full window and any shorter remainder.
+    energy = np.array(
+        [
+            np.abs(mono[i : i + window]).max()
+            for i in range(0, len(mono), window)
+        ]
+    )
+    voiced = np.where(energy > threshold)[0]
+    if len(voiced) == 0:
+        return audio_np
+    first_voiced = max(0, int(voiced[0]) * window - max_silent_samples)
+    last_voiced = min(
+        len(audio_np), (int(voiced[-1]) + 1) * window + max_silent_samples
+    )
+    return audio_np[first_voiced:last_voiced]
+def normalize_volume(
+    audio_np: np.ndarray,
+    sr: int,
+    target_lufs: float = -23.0,
+) -> np.ndarray:
+    """Normalize audio volume to target LUFS (approximate via RMS).
+    Uses a simplified RMS-based LUFS approximation suitable for
+    per-chunk normalization before concatenation.
+    Args:
+        audio_np: Audio samples, shape (samples,) or (samples, channels).
+        sr: Sample rate in Hz (unused by the RMS approximation; kept for a
+            uniform signature).
+        target_lufs: Target loudness in LUFS (default -23, EBU R128).
+    Returns:
+        Volume-normalized audio array. If the gained signal would peak above
+        0.99 it is scaled down linearly so the peak is 0.99. Empty or
+        effectively silent input (RMS below 1e-8) is returned unchanged.
+    """
+    # An empty array has no RMS and would make the peak search below raise.
+    if audio_np.size == 0:
+        return audio_np
+    if audio_np.ndim == 2:
+        mono = audio_np.mean(axis=1)
+    else:
+        mono = audio_np
+    rms = np.sqrt(np.mean(mono**2))
+    if rms < 1e-8:
+        return audio_np
+    current_lufs = 20 * math.log10(rms) - 0.691
+    gain_db = target_lufs - current_lufs
+    gain = 10 ** (gain_db / 20.0)
+    # Limit the correction to -20 dB .. +20 dB so noise is never blown up.
+    gain = max(0.1, min(gain, 10.0))
+    result = audio_np * gain
+    peak = np.abs(result).max()
+    if peak > 0.99:
+        result = result * (0.99 / peak)
+    return result
+def extract_wav(audio_obj) -> tuple[np.ndarray, int]:
+    """Pull the waveform out of an LTX Audio object as a numpy array.
+    A 3-D waveform (B,C,samples) has its leading axis squeezed away, and a
+    2-D waveform (C,samples) is transposed to (samples,C).
+    Args:
+        audio_obj: LTX pipeline Audio object with .waveform and .sampling_rate.
+    Returns:
+        Tuple of (waveform as float32 numpy, sample_rate).
+    """
+    samples = audio_obj.waveform.cpu().float().numpy()
+    # Drop the batch axis first, then put the sample axis in front.
+    samples = samples.squeeze(0) if samples.ndim == 3 else samples
+    if samples.ndim == 2:
+        samples = samples.T
+    return samples, audio_obj.sampling_rate
+def save_wav(audio_np: np.ndarray, sr: int, path: str) -> None:
+    """Save audio to WAV file.
+    Thin wrapper around ``soundfile.write``.
+    Args:
+        audio_np: Audio samples, shape (samples,) or (samples, channels).
+        sr: Sample rate in Hz.
+        path: Output file path.
+    """
+    sf.write(path, audio_np, sr)
+def load_wav(path: str) -> tuple[np.ndarray, int]:
+    """Read an audio file from disk via ``soundfile.read``.
+    Args:
+        path: Input file path.
+    Returns:
+        Tuple of (audio samples as float64 numpy, sample_rate).
+    """
+    samples, sample_rate = sf.read(path)
+    return samples, sample_rate
+def to_mono(audio_np: np.ndarray) -> np.ndarray:
+    """Convert multi-channel audio to mono by averaging channels.
+    Any 2-D input is downmixed, not only stereo, so single-channel
+    (samples, 1) and surround inputs also come back one-dimensional.
+    Args:
+        audio_np: Audio samples, shape (samples, channels) or (samples,) for mono.
+    Returns:
+        Mono audio array, shape (samples,). Input that is not 2-D is
+        returned unchanged.
+    """
+    if audio_np.ndim == 2:
+        return audio_np.mean(axis=1)
+    return audio_np
+def shorten_long_silence(
+    audio_np: np.ndarray,
+    sr: int,
+    max_duration: float = 1.0,
+    target_duration: float = 0.3,
+    threshold_db: float = -35,
+) -> np.ndarray:
+    """Shorten silence regions longer than max_duration to target_duration.
+    Unlike silenceremove which deletes silence entirely, this preserves
+    a natural pause of target_duration seconds. Prevents chunk boundary
+    artifacts while keeping the audio flow natural.
+    The signal is analysed in 20 ms windows over its whole length, including
+    the final (possibly partial) window, so audio after a long trailing
+    silence is not mistaken for part of that silence and dropped.
+    Args:
+        audio_np: Audio samples, shape (samples,) or (samples, channels).
+        sr: Sample rate in Hz.
+        max_duration: Silence longer than this is shortened.
+        target_duration: Silence is shortened to this duration.
+        threshold_db: Amplitude threshold below which audio is silence.
+    Returns:
+        Audio with long silence regions shortened. The input is returned
+        unchanged when it is shorter than one window or has no long silence.
+    """
+    threshold = 10 ** (threshold_db / 20.0)
+    # 20ms analysis window; at least one sample so the range step is never 0.
+    window = max(1, int(0.02 * sr))
+    max_samples = int(max_duration * sr)
+    target_samples = max(0, int(target_duration * sr))
+    if audio_np.ndim == 2:
+        mono = audio_np.mean(axis=1)
+    else:
+        mono = audio_np
+    total = len(mono)
+    if total < window:
+        return audio_np
+    # Peak amplitude per window, covering the tail as well.
+    energy = np.array(
+        [
+            np.abs(mono[i : i + window]).max()
+            for i in range(0, total, window)
+        ]
+    )
+    is_silent = energy < threshold
+    # Collect (start_sample, end_sample) of every silent run above max_samples.
+    silence_regions = []
+    run_start = None
+    for idx, silent in enumerate(is_silent):
+        if silent:
+            if run_start is None:
+                run_start = idx * window
+            continue
+        if run_start is not None:
+            run_end = idx * window
+            if run_end - run_start > max_samples:
+                silence_regions.append((run_start, run_end))
+            run_start = None
+    # A run still open here extends to the end of the signal.
+    if run_start is not None and total - run_start > max_samples:
+        silence_regions.append((run_start, total))
+    if not silence_regions:
+        return audio_np
+    # Build output by keeping non-silence and shortening long silence
+    parts = []
+    prev_end = 0
+    for s_start, s_end in silence_regions:
+        # Keep audio before this silence
+        parts.append(audio_np[prev_end:s_start])
+        # Keep at most target_samples of the silence, never more than the
+        # region holds, so audio after it cannot be duplicated.
+        keep = min(target_samples, s_end - s_start)
+        parts.append(audio_np[s_start : s_start + keep])
+        prev_end = s_end
+    # Keep remaining audio after last silence
+    parts.append(audio_np[prev_end:])
+    result = np.concatenate(parts, axis=0)
+    shortened = (len(audio_np) - len(result)) / sr
+    if shortened > 0:
+        logger.info(
+            "Shortened %d silence regions, removed %.1fs",
+            len(silence_regions),
+            shortened,
+        )
+    return result
+def ensure_stereo(audio_np: np.ndarray) -> np.ndarray:
+    """Convert mono to stereo by duplicating the channel.
+    Both mono layouts are handled: a 1-D array (samples,) and a 2-D
+    single-channel array (samples, 1).
+    Args:
+        audio_np: Audio samples, shape (samples,) or (samples, 1) for mono,
+            or (samples, 2) for stereo.
+    Returns:
+        Stereo audio array, shape (samples, 2). Input that already has two
+        or more channels is returned unchanged.
+    """
+    if audio_np.ndim == 1:
+        return np.stack([audio_np, audio_np], axis=-1)
+    if audio_np.ndim == 2 and audio_np.shape[1] == 1:
+        channel = audio_np[:, 0]
+        return np.stack([channel, channel], axis=-1)
+    return audio_np

src/audio_core/chunker.py ADDED Viewed

	@@ -0,0 +1,334 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Text chunking and duration estimation for Scenema Audio.
+Splits long text into chunks at sentence boundaries using Kokoro TTS
+phoneme-level timing as the source of truth for duration. No word counting.
+Algorithm:
+  1. Split text into sentences
+  2. Estimate each sentence's duration via Kokoro (one call per sentence)
+  3. Greedily merge: accumulate sentence durations, start a new chunk
+     when running_sum * LTX_MULTIPLIER exceeds MAX_CHUNK_DURATION_S
+"""
+import logging
+import random
+from dataclasses import dataclass
+from .compiler import compile_chunk_prompt, compile_prompt, extract_sentence_actions
+from .validator import validate_prompt
+logger = logging.getLogger(__name__)
+# Tuning constants for duration estimation and chunk planning.
+FALLBACK_WORDS_PER_SEC = 2.2  # Test-environment-only fallback when Kokoro is mocked
+ACTION_DURATION_S = 1.5  # Extra time per action block
+MAX_CHUNK_DURATION_S = (
+    15.0  # Safe generation limit — model trained on 20s but repeats beyond ~15s
+)
+LTX_MULTIPLIER = 1.5  # LTX speaks slower than Kokoro; overshoot for trimming
+# Kokoro singleton (loaded once, reused)
+_kokoro_pipeline = None
+# Tri-state flag: None = not tried yet, True = loaded, False = load failed.
+_kokoro_available: bool | None = None
+def _get_kokoro():
+    """Get or initialize the Kokoro TTS pipeline for duration estimation.
+    Kokoro is 82M params, runs on CPU. Loaded once and cached.
+    Falls back to word-count heuristic only in test environments.
+    Returns:
+        The cached KPipeline, or None when a previous attempt failed or the
+        pipeline object was detected as a test mock.
+    Raises:
+        RuntimeError: On the first failed attempt to import or construct
+            Kokoro (any exception other than TypeError).
+    """
+    global _kokoro_pipeline, _kokoro_available
+    # A previous attempt already failed: do not retry, signal "unavailable".
+    # NOTE(review): after the RuntimeError below has been raised once, later
+    # calls land here and return None, so callers silently switch to the
+    # word-count fallback — confirm this is intended outside tests.
+    if _kokoro_available is False:
+        return None
+    if _kokoro_pipeline is not None:
+        return _kokoro_pipeline
+    try:
+        from kokoro import KPipeline
+        pipe = KPipeline(lang_code="a")
+        # Verify it's a real Kokoro pipeline (not a mock in tests)
+        if not hasattr(pipe, "__module__") or "kokoro" not in str(
+            getattr(pipe, "__module__", "")
+        ):
+            raise TypeError("Kokoro pipeline is not genuine (test mock)")
+        _kokoro_pipeline = pipe
+        _kokoro_available = True
+        logger.info("Kokoro TTS loaded for duration estimation")
+        return _kokoro_pipeline
+    except TypeError:
+        # Test environment with mocks, fall back silently
+        # (a TypeError raised by KPipeline itself is swallowed here as well).
+        _kokoro_available = False
+        return None
+    except (ImportError, Exception) as e:
+        # ImportError is already an Exception subclass; this catches everything else.
+        _kokoro_available = False
+        logger.error("Kokoro is required but not available: %s", e)
+        raise RuntimeError(
+            f"Kokoro TTS is a required dependency for duration estimation. "
+            f"Install it with: pip install kokoro. Error: {e}"
+        ) from e
+def _kokoro_duration(text: str) -> float | None:
+    """Measure how long Kokoro takes to speak *text*.
+    The text is synthesised with the "af_heart" voice and the audio lengths
+    of all yielded results are summed.
+    Args:
+        text: Speech text to estimate duration for
+    Returns:
+        Duration in seconds, or None if Kokoro is unavailable or synthesis fails
+    """
+    pipe = _get_kokoro()
+    if pipe is None:
+        return None
+    try:
+        # Results without usable audio contribute nothing to the total.
+        sample_count = sum(
+            len(piece.audio)
+            for piece in pipe(text, voice="af_heart")
+            if getattr(piece, "audio", None) is not None
+        )
+    except Exception as e:
+        logger.warning("Kokoro estimation failed: %s", e)
+        return None
+    # Kokoro outputs at 24000Hz
+    return sample_count / 24000.0
+# One planned generation chunk.
+# NOTE(review): field meanings below are inferred from their names; the code
+# that fills them is outside this view — confirm.
+@dataclass
+class ChunkSpec:
+    # presumably the fully compiled prompt passed to the model for this chunk
+    compiled_prompt: str
+    # presumably the audio duration to generate, in seconds
+    duration_s: float
+    # presumably the random seed used for this chunk
+    seed: int
+    # presumably the plain speech text the chunk should contain
+    expected_text: str
+    # language code; defaults to English
+    language: str = "en"
+def _split_into_sentences(text: str) -> list[str]:
+    """Split text into individual sentences at .!? boundaries.
+    A run of terminators ("...", "?!", "!!") ends a sentence only once, after
+    its last character, and a period between two digits ("3.14") is treated
+    as a decimal point rather than a boundary. This avoids punctuation-only
+    fragments such as "." being emitted as sentences.
+    Args:
+        text: Raw speech text.
+    Returns:
+        Stripped, non-empty sentences in their original order. Trailing text
+        without a terminator becomes the final sentence.
+    """
+    terminators = ".!?"
+    sentences: list[str] = []
+    buffer: list[str] = []
+    last_index = len(text) - 1
+    for i, char in enumerate(text):
+        buffer.append(char)
+        if char not in terminators:
+            continue
+        nxt = text[i + 1] if i < last_index else ""
+        # Inside a run of terminators: wait for its final character.
+        if nxt and nxt in terminators:
+            continue
+        # Decimal point inside a number.
+        if char == "." and nxt.isdigit() and i > 0 and text[i - 1].isdigit():
+            continue
+        sentence = "".join(buffer).strip()
+        if sentence:
+            sentences.append(sentence)
+        buffer = []
+    remainder = "".join(buffer).strip()
+    if remainder:
+        sentences.append(remainder)
+    return sentences
+def _estimate_sentence_durations(sentences: list[str]) -> list[float]:
+    """Return one raw Kokoro duration per sentence, in input order.
+    Each sentence is measured with its own Kokoro call; the LTX multiplier
+    is not applied here. When Kokoro yields no estimate (mocked test
+    environments), a word-count heuristic plus 0.3 s is used instead.
+    """
+    def _measure(sentence: str) -> float:
+        measured = _kokoro_duration(sentence)
+        if measured is not None:
+            return measured
+        # Test environment fallback only
+        return len(sentence.split()) / FALLBACK_WORDS_PER_SEC + 0.3
+    return [_measure(sentence) for sentence in sentences]
+def split_text_by_duration(
+    text: str,
+    multiplier: float = LTX_MULTIPLIER,
+    max_duration: float = MAX_CHUNK_DURATION_S,
+) -> list[tuple[str, float]]:
+    """Split text into chunks using Kokoro duration estimation.
+    Kokoro is the source of truth for duration. No word counting.
+    Algorithm:
+      1. Split text into sentences
+      2. Estimate each sentence's duration via Kokoro (one call per sentence)
+      3. Sentences that alone exceed max_duration are split at commas and the
+         clauses regrouped; only those regrouped pieces are estimated again
+      4. Greedily merge: accumulate durations, start a new chunk when
+         running_sum * multiplier would exceed max_duration
+    Duration is additive across sentences because Kokoro estimates are
+    phoneme-level with no cross-sentence dependencies.
+    Args:
+        text: Full speech text.
+        multiplier: LTX speaks slower than Kokoro; applied to estimates.
+        max_duration: Max audio duration per chunk (model training limit).
+    Returns:
+        List of (chunk_text, estimated_ltx_duration) tuples. Each duration is
+        capped at max_duration. Empty when the text has no sentences.
+    """
+    sentences = _split_into_sentences(text)
+    if not sentences:
+        return []
+    # Pieces to merge and their raw Kokoro durations, kept in lockstep so an
+    # unsplit sentence is measured once instead of twice.
+    expanded: list[str] = []
+    durations: list[float] = []
+    for sent, dur in zip(sentences, _estimate_sentence_durations(sentences)):
+        if dur * multiplier > max_duration and "," in sent:
+            # Split at commas and regroup clauses under the limit.
+            clauses = [c.strip() for c in sent.split(",") if c.strip()]
+            clause_durs = _estimate_sentence_durations(clauses)
+            groups: list[str] = []
+            sub_texts: list[str] = []
+            sub_dur = 0.0
+            for clause, cdur in zip(clauses, clause_durs):
+                if sub_texts and (sub_dur + cdur) * multiplier > max_duration:
+                    groups.append(", ".join(sub_texts))
+                    sub_texts = []
+                    sub_dur = 0.0
+                sub_texts.append(clause)
+                sub_dur += cdur
+            if sub_texts:
+                groups.append(", ".join(sub_texts))
+            # Re-measure the rejoined text rather than summing clause estimates.
+            expanded.extend(groups)
+            durations.extend(_estimate_sentence_durations(groups))
+        else:
+            expanded.append(sent)
+            durations.append(dur)
+    chunks: list[tuple[str, float]] = []
+    current_texts: list[str] = []
+    current_dur = 0.0
+    for sent, dur in zip(expanded, durations):
+        if current_texts and (current_dur + dur) * multiplier > max_duration:
+            chunk_text = " ".join(current_texts)
+            chunks.append((chunk_text, min(current_dur * multiplier, max_duration)))
+            current_texts = []
+            current_dur = 0.0
+        current_texts.append(sent)
+        current_dur += dur
+    if current_texts:
+        chunk_text = " ".join(current_texts)
+        chunks.append((chunk_text, min(current_dur * multiplier, max_duration)))
+    return chunks
+def estimate_duration(
+    text: str,
+    num_actions: int = 0,
+    multiplier: float = LTX_MULTIPLIER,
+) -> float:
+    """Estimate audio duration for a single chunk of text.
+    Used for single-chunk prompts that don't need splitting.
+    Args:
+        text: Speech text (no actions)
+        num_actions: Number of action blocks (adds time for breaths/pauses)
+        multiplier: Duration multiplier (LTX speaks slower than Kokoro)
+    """
+    kokoro_dur = _kokoro_duration(text)
+    if kokoro_dur is not None:
+        base_duration = kokoro_dur
+        logger.debug("Kokoro estimate: %.1fs for '%s'", kokoro_dur, text[:40])
+    else:
+        words = len(text.split())
+        base_duration = words / FALLBACK_WORDS_PER_SEC + 0.5
+    action_time = num_actions * ACTION_DURATION_S
+    duration = (base_duration + action_time) * multiplier
+    return min(duration, MAX_CHUNK_DURATION_S)
+def plan_chunks(
+    xml_string: str,
+    base_seed: int = -1,
+    pace: float = LTX_MULTIPLIER,
+) -> list[ChunkSpec]:
+    """Plan generation chunks from an XML prompt.
+    Validates XML, extracts text, splits into duration-based chunks
+    using Kokoro, and builds per-chunk compiled prompts.
+    Args:
+        xml_string: Valid <speak> XML string
+        base_seed: Base seed (-1 for random, otherwise sequential per chunk)
+        pace: Duration multiplier (default 1.5). Higher = slower speech.
+    """
+    result = validate_prompt(xml_string)
+    if not result.valid:
+        raise ValueError(f"Invalid prompt: {'; '.join(result.errors)}")
+    compiled = compile_prompt(xml_string)
+    if base_seed == -1:
+        base_seed = random.randint(0, 999999)
+    # Check if entire text fits in a single chunk (uncapped duration for this check)
+    kokoro_dur = _kokoro_duration(compiled.speech_text)
+    if kokoro_dur is not None:
+        total_dur = kokoro_dur * pace
+    else:
+        words = len(compiled.speech_text.split())
+        total_dur = (words / FALLBACK_WORDS_PER_SEC + 0.5) * pace
+    if total_dur <= MAX_CHUNK_DURATION_S:
+        return [
+            ChunkSpec(
+                compiled_prompt=compiled.prompt,
+                duration_s=min(total_dur, MAX_CHUNK_DURATION_S),
+                seed=base_seed,
+                expected_text=compiled.speech_text,
+                language=compiled.language,
+            )
+        ]
+    # Extract action-to-sentence mapping before splitting
+    sentence_action_map = extract_sentence_actions(xml_string)
+    # Split by Kokoro-estimated duration
+    text_chunks = split_text_by_duration(compiled.speech_text, multiplier=pace)
+    # Track which global sentence index each chunk starts at
+    global_sentence_idx = 0
+    specs: list[ChunkSpec] = []
+    for i, (chunk_text, chunk_dur) in enumerate(text_chunks):
+        # Find actions that belong to this chunk's first sentence
+        actions_before = sentence_action_map.get(global_sentence_idx)
+        chunk_prompt = compile_chunk_prompt(
+            speech_text=chunk_text,
+            voice=compiled.voice,
+            scene=compiled.scene,
+            actions_before=actions_before,
+            gender=compiled.gender,
+            shot=compiled.shot,
+        )
+        specs.append(
+            ChunkSpec(
+                compiled_prompt=chunk_prompt,
+                duration_s=chunk_dur,
+                seed=base_seed + i * 1000,
+                expected_text=chunk_text,
+                language=compiled.language,
+            )
+        )
+        # Count sentences in this chunk to advance global index
+        chunk_sentences = _split_into_sentences(chunk_text)
+        global_sentence_idx += len(chunk_sentences)
+    logger.info(
+        "Planned %d chunks (%.1fs total estimated)",
+        len(specs),
+        sum(s.duration_s for s in specs),
+    )
+    return specs

src/audio_core/compiler.py ADDED Viewed

	@@ -0,0 +1,305 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""XML prompt compiler for Scenema Audio.
+Compiles a <speak> XML prompt into the video-style flat text prompt
+that the LTX 2.3 audio model expects.
+Supports three block types inside <speak>:
+  <action>  — delivery/performance cues (how the person speaks/acts)
+  <sound>   — audio events that should be heard (SFX, ambient sounds)
+  Text      — the actual speech content
+And three shot modes via the shot attribute:
+  closeup (default) — speech-focused, no SFX, clean audio
+  wide              — environment + speech, SFX prominent
+  scene             — raw scene description, maximum SFX
+Example (closeup mode):
+  Input:
+    <speak voice="Deep male voice" scene="A dimly lit room" gender="male">
+      <action>He takes a slow breath</action>
+      Many years later, as he faced the firing squad...
+    </speak>
+  Output:
+    Close-up in a dimly lit room. He takes a slow breath.
+    "Many years later, as he faced the firing squad..."
+    Deep male voice.
+Example (scene mode with SFX):
+  Input:
+    <speak voice="Tense male whisper" scene="Dark room, heavy rain"
+           gender="male" shot="scene">
+      <sound>A phone rings twice then stops</sound>
+      <action>He picks up the receiver and speaks in a low whisper</action>
+      Its done. The package is at the location.
+      <sound>Thunder rumbles in the distance</sound>
+      <action>He continues urgently</action>
+      You have thirty minutes.
+    </speak>
+  Output:
+    Dark room, heavy rain. A phone rings twice then stops.
+    He picks up the receiver and speaks in a low whisper:
+    "Its done. The package is at the location."
+    Thunder rumbles in the distance. He continues urgently:
+    "You have thirty minutes."
+    Tense male whisper. Dark room, heavy rain.
+"""
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+# Scene description used by _compile_blocks when the prompt supplies none.
+DEFAULT_SCENE = "a person speaking to camera"
+@dataclass
+class CompiledPrompt:
+    """Result of compiling a <speak> prompt: the flat prompt plus its metadata.
+    Field notes describe the values as populated by compile_prompt.
+    """
+    prompt: str  # Compiled video-style prompt text
+    speech_text: str  # Speech-only text (actions and sounds excluded)
+    voice: str  # Stripped voice attribute ("" when absent)
+    scene: str | None  # Stripped scene attribute, or None when absent
+    language: str  # language attribute (defaults to "en")
+    gender: str  # gender attribute (defaults to "male")
+    shot: str  # shot attribute (defaults to "closeup")
+@dataclass
+class TextBlock:
+    """Speech content: text that is spoken and quoted in the compiled prompt."""
+    text: str
+@dataclass
+class ActionBlock:
+    """Delivery/performance cue taken from an <action> element."""
+    text: str
+@dataclass
+class SoundBlock:
+    """Audio event (SFX or ambient sound) taken from a <sound> element."""
+    text: str
+# Any element of a parsed <speak> body: speech text, an action cue, or a sound event.
+Block = TextBlock | ActionBlock | SoundBlock
+def _extract_blocks(root: ET.Element) -> list[Block]:
+    """Walk <speak> children in document order, extract text, action, and sound blocks."""
+    blocks: list[Block] = []
+    if root.text and root.text.strip():
+        blocks.append(TextBlock(text=root.text.strip()))
+    for child in root:
+        if child.tag == "action" and child.text and child.text.strip():
+            blocks.append(ActionBlock(text=child.text.strip()))
+        elif child.tag == "sound" and child.text and child.text.strip():
+            blocks.append(SoundBlock(text=child.text.strip()))
+        if child.tail and child.tail.strip():
+            blocks.append(TextBlock(text=child.tail.strip()))
+    return blocks
+def _ensure_trailing_punctuation(text: str) -> str:
+    """Ensure text ends with sentence-ending punctuation."""
+    if text and text[-1] not in ".!?\"'":
+        return text + "."
+    return text
+# Opening phrase placed before the scene text for each shot mode. An empty
+# prefix means the scene description starts the prompt on its own.
+SHOT_PREFIXES = {
+    "closeup": "Close-up in",
+    "wide": "Wide shot of",
+    "scene": "",
+}
+def _compile_blocks(
+    blocks: list[Block],
+    voice: str,
+    scene: str | None,
+    gender: str = "male",
+    shot: str = "closeup",
+) -> str:
+    """Compile blocks into the video-style prompt string."""
+    parts: list[str] = []
+    is_scene_mode = shot in ("scene", "wide")
+    pronoun = "She" if gender == "female" else "He"
+    scene_text = scene if scene else DEFAULT_SCENE
+    prefix = SHOT_PREFIXES.get(shot, SHOT_PREFIXES["closeup"])
+    if prefix:
+        parts.append(f"{prefix} {scene_text}.")
+    else:
+        parts.append(f"{scene_text}.")
+    first_speech = True
+    for block in blocks:
+        if isinstance(block, SoundBlock):
+            # Sound events compile as standalone sentences
+            parts.append(_ensure_trailing_punctuation(block.text))
+        elif isinstance(block, ActionBlock):
+            if is_scene_mode:
+                # In scene/wide mode, action flows into speech with connector
+                # Don't add punctuation — the colon before the quote handles it
+                parts.append(block.text + ":")
+            else:
+                # In closeup mode, action is a standalone sentence
+                parts.append(_ensure_trailing_punctuation(block.text))
+        elif isinstance(block, TextBlock):
+            clean_text = _ensure_trailing_punctuation(block.text)
+            if (
+                is_scene_mode
+                and first_speech
+                and not any(isinstance(b, ActionBlock) for b in blocks)
+            ):
+                # No action before first speech in scene mode — add pronoun
+                parts.append(f'{pronoun} speaks: "{clean_text}"')
+            else:
+                parts.append(f'"{clean_text}"')
+            first_speech = False
+    parts.append(_ensure_trailing_punctuation(voice))
+    # In scene/wide mode, repeat scene as SFX reinforcement at the end
+    if is_scene_mode and scene:
+        parts.append(_ensure_trailing_punctuation(scene))
+    return " ".join(parts)
+def _extract_speech_only(blocks: list[Block]) -> str:
+    """Extract only speech text (no actions or sounds) for duration estimation."""
+    texts = [b.text for b in blocks if isinstance(b, TextBlock)]
+    return " ".join(texts)
+def compile_prompt(xml_string: str) -> CompiledPrompt:
+    """Compile a <speak> XML prompt into a video-style text prompt.
+    Args:
+        xml_string: Valid <speak> XML string (must pass validate_prompt first)
+    Returns:
+        CompiledPrompt with the compiled prompt and extracted metadata
+    """
+    root = ET.fromstring(xml_string)
+    voice = root.get("voice", "").strip()
+    scene = root.get("scene")
+    if scene:
+        scene = scene.strip()
+    language = root.get("language", "en").strip()
+    gender = root.get("gender", "male").strip()
+    shot = root.get("shot", "closeup").strip()
+    blocks = _extract_blocks(root)
+    prompt = _compile_blocks(blocks, voice, scene, gender, shot)
+    speech_text = _extract_speech_only(blocks)
+    return CompiledPrompt(
+        prompt=prompt,
+        speech_text=speech_text,
+        voice=voice,
+        scene=scene,
+        language=language,
+        gender=gender,
+        shot=shot,
+    )
+def extract_sentence_actions(xml_string: str) -> dict[int, list[str]]:
+    """Map sentence indices to their preceding action blocks.
+    Walks the XML blocks in order, tracking the most recent action(s).
+    When a text block is encountered, its sentences inherit the pending actions.
+    Only the first sentence of each text block gets the actions (the action
+    precedes the text block in the XML).
+    Returns:
+        Dict mapping sentence index (0-based across all speech text) to a list
+        of action strings that precede that sentence.
+    """
+    root = ET.fromstring(xml_string)
+    blocks = _extract_blocks(root)
+    sentence_actions: dict[int, list[str]] = {}
+    pending_actions: list[str] = []
+    sentence_idx = 0
+    for block in blocks:
+        if isinstance(block, ActionBlock):
+            pending_actions.append(block.text)
+        elif isinstance(block, TextBlock):
+            # Split this text block into sentences to count them
+            text = block.text.strip()
+            sentences = []
+            current = ""
+            for char in text:
+                current += char
+                if char in ".!?":
+                    s = current.strip()
+                    if s:
+                        sentences.append(s)
+                    current = ""
+            if current.strip():
+                sentences.append(current.strip())
+            if pending_actions and sentences:
+                sentence_actions[sentence_idx] = pending_actions.copy()
+                pending_actions.clear()
+            sentence_idx += len(sentences)
+    return sentence_actions
+def extract_speech_text(xml_string: str) -> str:
+    """Extract only the speech text from XML, ignoring actions and sounds.
+    Useful for duration estimation (Kokoro) without compiling the full prompt.
+    """
+    root = ET.fromstring(xml_string)
+    blocks = _extract_blocks(root)
+    return _extract_speech_only(blocks)
+def compile_chunk_prompt(
+    speech_text: str,
+    voice: str,
+    scene: str | None = None,
+    actions_before: list[str] | None = None,
+    actions_after: list[str] | None = None,
+    gender: str = "male",
+    shot: str = "closeup",
+) -> str:
+    """Compile a single chunk's prompt from pre-split text.
+    Used by the chunker to build per-chunk prompts after text splitting.
+    Args:
+        speech_text: The chunk's speech text portion.
+        voice: Voice description string.
+        scene: Scene description string (optional).
+        actions_before: Action blocks to prepend before speech.
+        actions_after: Action blocks to append after speech.
+    Returns:
+        Compiled video-style prompt string.
+    """
+    blocks: list[Block] = []
+    if actions_before:
+        for a in actions_before:
+            blocks.append(ActionBlock(text=a))
+    blocks.append(TextBlock(text=speech_text))
+    if actions_after:
+        for a in actions_after:
+            blocks.append(ActionBlock(text=a))
+    return _compile_blocks(blocks, voice, scene, gender, shot)

src/audio_core/engine.py ADDED Viewed

	@@ -0,0 +1,911 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Audio generation engine for Scenema Audio.
+Loads the LTX 2.3 audio-only checkpoint, Audio VAE encoder, and
+Gemma 3 12B text encoder. VRAM management is auto-detected: models
+are moved between GPU and CPU as needed per inference phase.
+"""
+import gc
+import json
+import logging
+import os
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, replace as dc_replace
+import numpy as np
+import psutil
+import torch
+import torchaudio
+from safetensors import safe_open
+from safetensors.torch import load_file
+from ltx_core.batch_split import BatchSplitAdapter, BatchedPerturbationConfig
+from ltx_core.components.diffusion_steps import EulerDiffusionStep
+from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.components.patchifiers import AudioPatchifier, VideoLatentPatchifier
+from ltx_core.model.audio_vae.audio_vae import Audio, encode_audio
+from ltx_core.model.audio_vae.model_configurator import AudioEncoderConfigurator
+from ltx_core.model.transformer.model import X0Model
+from ltx_core.model.transformer.model_configurator import LTXModelConfigurator
+from ltx_core.model.transformer.transformer import BasicAVTransformerBlock, rms_norm
+from ltx_core.tools import AudioLatentTools, LatentState, VideoLatentTools
+from ltx_core.types import AudioLatentShape, VideoLatentShape, VideoPixelShape
+from ltx_pipelines.distilled import DISTILLED_SIGMAS, DistilledPipeline
+from ltx_pipelines.utils.blocks import ModalitySpec, _build_state
+from ltx_pipelines.utils.denoisers import SimpleDenoiser
+from ltx_pipelines.utils.samplers import euler_denoising_loop
+from ltx_pipelines.utils.types import OffloadMode
+from ltx_core.text_encoders.gemma.tokenizer import LTXVGemmaTokenizer
+import bitsandbytes  # noqa: F401
+from transformers import BitsAndBytesConfig, Gemma3ForConditionalGeneration
+from .audio_utils import extract_wav
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# NOTE(review): presumably the frame rate used when sizing video-shaped
+# inputs for the pipeline; the use site is outside this excerpt — confirm.
+FPS = 24
+# NOTE(review): presumably the maximum length, in seconds, of reference audio
+# that is used; the use site is outside this excerpt — confirm.
+MAX_REF_SECONDS = 5
+class _Int8Linear(torch.nn.Module):
+    """Linear layer with INT8 weights, dequantized to input dtype during forward.
+    Keeps weights as int8 buffers in VRAM (~50% of bf16). Dequantization
+    happens per forward pass: weight = int8 * scale, then cast to input dtype.
+    Ported from bench_full_quantized.py.
+    """
+    def __init__(self, weight_int8, scale, bias=None):
+        super().__init__()
+        self.register_buffer("weight_int8", weight_int8)
+        self.register_buffer("scale", scale)
+        if bias is not None:
+            self.register_parameter("bias", torch.nn.Parameter(bias))
+        else:
+            self.bias = None
+    def forward(self, x):
+        w = self.weight_int8.float() * self.scale.unsqueeze(1)
+        w = w.to(x.dtype)
+        return torch.nn.functional.linear(x, w, self.bias)
+# VRAM threshold: cards with this much VRAM keep all models GPU-resident
+# (Gemma bf16 on GPU, no offloading, MelBandRoFormer + SeedVC preloaded).
+# Below this: Gemma streams from CPU, models load/unload per request.
+# NOTE(review): VRAMManager.finalize in this file decides offloading from the
+# registered model sizes, not from this constant — confirm where it is consumed.
+HIGH_VRAM_THRESHOLD_GB = 40
+@dataclass
+class AudioResult:
+    """Generated audio returned as a NumPy waveform plus its metadata."""
+    waveform_np: np.ndarray  # (samples,) or (samples, channels) float32
+    sample_rate: int  # presumably samples per second of waveform_np — TODO confirm
+    duration_s: float  # presumably the clip length in seconds — TODO confirm
+def _materialize_meta_tensors(module, device="cpu"):
+    """Replace meta tensors with zeros on the specified device."""
+    for name, param in list(module.named_parameters()):
+        if param.is_meta:
+            parts = name.split(".")
+            mod = module
+            for p in parts[:-1]:
+                mod = getattr(mod, p)
+            mod._parameters[parts[-1]] = torch.nn.Parameter(
+                torch.zeros(param.shape, dtype=torch.bfloat16, device=device)
+            )
+    for name, buf in list(module.named_buffers()):
+        if buf.is_meta:
+            parts = name.split(".")
+            mod = module
+            for p in parts[:-1]:
+                mod = getattr(mod, p)
+            mod._buffers[parts[-1]] = torch.zeros(
+                buf.shape, dtype=torch.bfloat16, device=device
+            )
+def _audio_only_forward(self, video, audio, perturbations=None):
+    """Monkey-patched forward for audio-only transformer blocks.
+    Skips all video computation (attn1, attn2, ff, audio_to_video_attn)
+    and only runs audio self-attention, cross-attention, and feedforward.
+    Args:
+        self: The transformer block this function is bound to.
+        video: Video modality state or None; its x is passed through unchanged.
+        audio: Audio modality state or None; its x is updated when the audio
+            branch runs.
+        perturbations: Optional perturbation config; defaulted when None but
+            not otherwise consulted in this function.
+    Returns:
+        The (video, audio) pair, with x written back on each non-None input.
+    Raises:
+        ValueError: If both video and audio are None.
+    """
+    if video is None and audio is None:
+        raise ValueError("Need at least one modality")
+    batch_size = (video or audio).x.shape[0]
+    if perturbations is None:
+        perturbations = BatchedPerturbationConfig.empty(batch_size)
+    vx = video.x if video is not None else None
+    ax = audio.x if audio is not None else None
+    # The audio branch runs only for an enabled, non-empty audio input.
+    run_ax = audio is not None and audio.enabled and ax.numel() > 0
+    if run_ax:
+        # Self-attention: modulation values come from slots 0-2 of the table.
+        ashift_msa, ascale_msa, agate_msa = self.get_ada_values(
+            self.audio_scale_shift_table, ax.shape[0], audio.timesteps, slice(0, 3)
+        )
+        norm_ax = rms_norm(ax, eps=self.norm_eps) * (1 + ascale_msa) + ashift_msa
+        # Intermediates are deleted as soon as they are no longer needed.
+        del ashift_msa, ascale_msa
+        ax = (
+            ax
+            + self.audio_attn1(
+                norm_ax, pe=audio.positional_embeddings, mask=audio.self_attention_mask
+            )
+            * agate_msa
+        )
+        del agate_msa, norm_ax
+        # Cross-attention against the text context, added residually.
+        ax = ax + self._apply_text_cross_attention(
+            ax,
+            audio.context,
+            self.audio_attn2,
+            self.audio_scale_shift_table,
+            getattr(self, "audio_prompt_scale_shift_table", None),
+            audio.timesteps,
+            audio.prompt_timestep,
+            audio.context_mask,
+            cross_attention_adaln=self.cross_attention_adaln,
+        )
+        # Feedforward: modulation values come from slots 3-5 of the table.
+        ashift_ff, ascale_ff, agate_ff = self.get_ada_values(
+            self.audio_scale_shift_table, ax.shape[0], audio.timesteps, slice(3, 6)
+        )
+        norm_ax_ff = rms_norm(ax, eps=self.norm_eps) * (1 + ascale_ff) + ashift_ff
+        del ashift_ff, ascale_ff
+        ax = ax + self.audio_ff(norm_ax_ff) * agate_ff
+        del agate_ff, norm_ax_ff
+    # object.__setattr__ bypasses the inputs' own __setattr__
+    # (presumably they are frozen dataclasses — TODO confirm).
+    if video is not None:
+        object.__setattr__(video, "x", vx)
+    if audio is not None:
+        object.__setattr__(audio, "x", ax)
+    return video, audio
+# ── VRAM Manager ────────────────────────────────────────────────────────
+class VRAMManager:
+    """Manages model placement between GPU and CPU based on available VRAM.
+    Tracks which models are on GPU and moves them as needed per inference phase.
+    Offloading is determined by comparing total registered model size against
+    available VRAM. If all models fit, no offloading occurs.
+    """
+    def __init__(self, vram_gb: float):
+        self.vram_gb = vram_gb
+        self._models: dict[str, torch.nn.Module] = {}
+        self._model_sizes: dict[str, float] = {}  # GB per model
+        self._on_gpu: set[str] = set()
+        self.needs_offload = False  # Determined after all models registered
+    def register(self, name: str, model: torch.nn.Module, on_gpu: bool = True) -> None:
+        """Register a model for VRAM management.
+        Args:
+            name: Identifier for the model.
+            model: The PyTorch module.
+            on_gpu: Whether the model is currently on GPU.
+        """
+        self._models[name] = model
+        size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / 1e9
+        self._model_sizes[name] = size_gb
+        if on_gpu:
+            self._on_gpu.add(name)
+    def finalize(self) -> None:
+        """Determine offloading strategy based on total model size vs VRAM.
+        Call after all models are registered. Sets needs_offload based on
+        whether all registered models fit in VRAM simultaneously with
+        headroom for activations and pipeline overhead (~5GB).
+        """
+        total_model_gb = sum(self._model_sizes.values())
+        # Gemma overhead depends on quantization mode:
+        #   bf16 streaming: ~16GB peak (13GB Gemma + 2GB embeddings + 1GB safety)
+        #   NF4: ~11GB peak (8GB NF4 model on GPU + 2GB embeddings + 1GB safety)
+        gemma_nf4 = os.environ.get("GEMMA_QUANTIZE", "").lower() == "nf4"
+        gemma_overhead_gb = 11.0 if gemma_nf4 else 16.0
+        self.needs_offload = (total_model_gb + gemma_overhead_gb) > self.vram_gb
+        logger.info(
+            "VRAM strategy: %.1f GB models + %.1f GB Gemma overhead (%s) vs %.1f GB VRAM -> offload=%s",
+            total_model_gb,
+            gemma_overhead_gb,
+            "nf4" if gemma_nf4 else "bf16",
+            self.vram_gb,
+            "yes" if self.needs_offload else "no",
+        )
+    def to_gpu(self, *names: str) -> None:
+        """Move specified models to GPU, offloading others if needed.
+        If offloading is required (VRAM < 40GB), all models NOT in the
+        requested set are moved to CPU first to free VRAM.
+        Args:
+            names: Model names that should be on GPU for the current phase.
+        """
+        if not self.needs_offload:
+            # High VRAM: just ensure requested models are on GPU
+            for name in names:
+                if name not in self._on_gpu and name in self._models:
+                    self._models[name].cuda()
+                    self._on_gpu.add(name)
+            return
+        # Offload models that shouldn't be on GPU
+        needed = set(names)
+        to_offload = self._on_gpu - needed
+        for name in to_offload:
+            if name in self._models:
+                self._models[name].cpu()
+                self._on_gpu.discard(name)
+                logger.debug("Offloaded %s to CPU", name)
+        torch.cuda.empty_cache()
+        # Load requested models to GPU
+        for name in names:
+            if name not in self._on_gpu and name in self._models:
+                self._models[name].cuda()
+                self._on_gpu.add(name)
+                logger.debug("Loaded %s to GPU", name)
+    def free_all(self) -> None:
+        """Move all models to CPU."""
+        for name in list(self._on_gpu):
+            if name in self._models:
+                self._models[name].cpu()
+        self._on_gpu.clear()
+        torch.cuda.empty_cache()
+    @contextmanager
+    def phase(self, *names: str):
+        """Context manager for a VRAM phase.
+        Ensures specified models are on GPU for the duration, then
+        returns to previous state on exit.
+        Args:
+            names: Model names needed on GPU for this phase.
+        """
+        prev_on_gpu = set(self._on_gpu)
+        self.to_gpu(*names)
+        try:
+            yield
+        finally:
+            # Restore previous state only if offloading is needed
+            if self.needs_offload:
+                to_restore = prev_on_gpu - set(names)
+                to_remove = set(names) - prev_on_gpu
+                for name in to_remove:
+                    if name in self._models and name in self._on_gpu:
+                        self._models[name].cpu()
+                        self._on_gpu.discard(name)
+                for name in to_restore:
+                    if name in self._models and name not in self._on_gpu:
+                        self._models[name].cuda()
+                        self._on_gpu.add(name)
+                torch.cuda.empty_cache()
+# ── Audio Engine ────────────────────────────────────────────────────────
+class AudioEngine:
+    """LTX 2.3 audio-only generation engine.
+    Loads the baked audio checkpoint, Audio VAE encoder, and Gemma 3 12B
+    text encoder. VRAM is managed automatically per inference phase.
+    """
+    def __init__(
+        self,
+        audio_ckpt_path: str,
+        vae_encoder_path: str,
+        gemma_root: str,
+        pipeline_ckpt_path: str | None = None,
+    ):
+        """Initialize AudioEngine.
+        Args:
+            audio_ckpt_path: Path to the audio-only transformer checkpoint.
+            vae_encoder_path: Path to the standalone Audio VAE encoder checkpoint.
+            gemma_root: Path to the Gemma 3 12B model directory.
+            pipeline_ckpt_path: Path to checkpoint for DistilledPipeline.
+        """
+        self.audio_ckpt_path = audio_ckpt_path
+        self.vae_encoder_path = vae_encoder_path
+        self.gemma_root = gemma_root
+        self.pipeline_ckpt_path = pipeline_ckpt_path or audio_ckpt_path
+        self._config = None
+        self._mdl_wrapper = None
+        self._audio_encoder = None
+        self._pipeline = None
+        self._vram: VRAMManager | None = None
+        self._vae_sr = None
+        self._loaded = False
+    @property
+    def vae_sample_rate(self) -> int:
+        return self._vae_sr or 16000
+    def load(self) -> None:
+        """Load all models. Call once at startup."""
+        if self._loaded:
+            return
+        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
+        ram_gb = psutil.virtual_memory().total / 1e9
+        logger.info(
+            "System: %.1f GB VRAM, %.1f GB RAM, GPU: %s",
+            vram_gb,
+            ram_gb,
+            torch.cuda.get_device_name(0),
+        )
+        if vram_gb < 11:
+            raise RuntimeError(
+                f"Insufficient VRAM: {vram_gb:.0f}GB. Minimum 11GB required."
+            )
+        if ram_gb < 24:
+            raise RuntimeError(
+                f"Insufficient RAM: {ram_gb:.0f}GB. Minimum 24GB required."
+            )
+        self._vram = VRAMManager(vram_gb)
+        self._load_audio_model()
+        self._load_vae_encoder()
+        self._patch_transformer_blocks()
+        self._build_pipeline()
+        # Determine offloading strategy based on actual model sizes vs VRAM
+        self._vram.finalize()
+        self._loaded = True
+        logger.info("AudioEngine loaded")
+    def _load_audio_model(self) -> None:
+        """Load the audio-only checkpoint to GPU.
+        Supports both bf16 and INT8 quantized checkpoints. INT8 checkpoints
+        store weights as .weight.int8 (int8) + .weight.scale (float32) pairs.
+        For INT8, nn.Linear layers are replaced with _Int8Linear modules that
+        keep weights quantized in VRAM (~5GB vs 9.8GB) and dequantize during
+        the forward pass.
+        For non-INT8 checkpoints, setting TRANSFORMER_QUANTIZE=int8 quantizes
+        large Linear layers at load time via bitsandbytes instead.
+        Side effects: sets self._config and self._mdl_wrapper, and registers
+        the model with the VRAM manager under the name "audio_model".
+        """
+        t0 = time.time()
+        # The model config is stored as JSON in the checkpoint's metadata.
+        with safe_open(self.audio_ckpt_path, framework="pt") as f:
+            self._config = json.loads(f.metadata()["config"])
+        # Build the module structure on the meta device; weights are assigned below.
+        with torch.device("meta"):
+            mdl = LTXModelConfigurator.from_config(self._config)
+        sd = load_file(self.audio_ckpt_path, device="cpu")
+        # Detect INT8 checkpoint format
+        # Both maps go from a layer's dotted path to its state-dict key.
+        int8_map = {
+            k.replace(".weight.int8", ""): k for k in sd if k.endswith(".weight.int8")
+        }
+        scale_map = {
+            k.replace(".weight.scale", ""): k for k in sd if k.endswith(".weight.scale")
+        }
+        is_int8 = len(int8_map) > 0
+        if is_int8:
+            # Load only non-quantized keys first (biases, norms, embeddings)
+            regular_sd = {
+                k: v
+                for k, v in sd.items()
+                if not k.endswith(".int8") and not k.endswith(".scale")
+            }
+            mdl_wrapper = X0Model(mdl)
+            mdl_wrapper.load_state_dict(regular_sd, strict=False, assign=True)
+            # Replace nn.Linear with Int8Linear for quantized weights
+            n_replaced = 0
+            for name in int8_map:
+                w_int8 = sd[int8_map[name]]
+                # A quantized weight without a matching scale key raises KeyError here.
+                w_scale = sd[scale_map[name]]
+                # Walk the dotted path to the module that owns the layer.
+                parts = name.split(".")
+                parent = mdl_wrapper
+                for p in parts[:-1]:
+                    parent = getattr(parent, p)
+                old = getattr(parent, parts[-1])
+                bias_key = name + ".bias"
+                bias = sd.get(bias_key)
+                # Fall back to the bias already on the layer being replaced.
+                if bias is None and hasattr(old, "bias") and old.bias is not None:
+                    bias = old.bias.data
+                setattr(parent, parts[-1], _Int8Linear(w_int8, w_scale, bias))
+                n_replaced += 1
+            logger.info("INT8: replaced %d Linear layers with Int8Linear", n_replaced)
+        else:
+            mdl_wrapper = X0Model(mdl)
+            mdl_wrapper.load_state_dict(sd, strict=False, assign=True)
+            # Runtime INT8 quantization via BnB (bf16 checkpoint → INT8 on GPU)
+            if os.environ.get("TRANSFORMER_QUANTIZE", "").lower() == "int8":
+                import bitsandbytes as bnb
+                n_quantized = 0
+                for name, module in list(mdl_wrapper.named_modules()):
+                    for cn, child in list(module.named_children()):
+                        # Only Linear layers with more than 1M weights are quantized.
+                        if (
+                            isinstance(child, torch.nn.Linear)
+                            and child.weight.numel() > 1_000_000
+                        ):
+                            int8_layer = bnb.nn.Linear8bitLt(
+                                child.in_features,
+                                child.out_features,
+                                bias=child.bias is not None,
+                                has_fp16_weights=False,
+                            )
+                            int8_layer.weight = bnb.nn.Int8Params(
+                                child.weight.data,
+                                requires_grad=False,
+                                has_fp16_weights=False,
+                            )
+                            if child.bias is not None:
+                                int8_layer.bias = child.bias
+                            setattr(module, cn, int8_layer)
+                            n_quantized += 1
+                logger.info(
+                    "Runtime INT8: quantized %d Linear layers via BnB", n_quantized
+                )
+        del sd
+        gc.collect()
+        # Drop the video-side sublayers of every block; the audio-only forward
+        # never calls them.
+        # NOTE(review): this runs after the quantization passes above, so
+        # video-side Linear layers may be quantized and then discarded — confirm.
+        for block in mdl.transformer_blocks:
+            block.attn1 = torch.nn.Identity()
+            block.attn2 = torch.nn.Identity()
+            block.ff = torch.nn.Identity()
+            block.audio_to_video_attn = torch.nn.Identity()
+        gc.collect()
+        # Anything still on the meta device is replaced with bf16 zeros on CPU.
+        _materialize_meta_tensors(mdl_wrapper)
+        cross_pe = max(
+            mdl.positional_embedding_max_pos[0],
+            mdl.audio_positional_embedding_max_pos[0],
+        )
+        mdl._init_preprocessors(cross_pe)
+        self._mdl_wrapper = mdl_wrapper.cuda().eval()
+        self._vram.register("audio_model", self._mdl_wrapper, on_gpu=True)
+        # The logged size is torch.cuda.memory_allocated(), i.e. everything
+        # currently allocated by this process, not this model alone.
+        logger.info(
+            "Audio model loaded: %.1f GB, %.1fs",
+            torch.cuda.memory_allocated() / 1e9,
+            time.time() - t0,
+        )
+    def _load_vae_encoder(self) -> None:
+        """Load the Audio VAE encoder from its standalone checkpoint.
+        Builds the encoder from ``self._config["audio_vae"]`` on the meta
+        device, assigns the tensors read from ``self.vae_encoder_path``, moves
+        the module to CUDA in bfloat16 and registers it with the VRAM manager
+        under the name ``"vae_encoder"``. Also stores the VAE sampling rate in
+        ``self._vae_sr`` for later resampling of reference audio.
+        """
+        t0 = time.time()
+        avae_cfg = self._config["audio_vae"]
+        preproc = avae_cfg["preprocessing"]
+        self._vae_sr = preproc["audio"]["sampling_rate"]
+        # Construct under the meta device; the checkpoint tensors are attached
+        # afterwards via load_state_dict(assign=True).
+        with torch.device("meta"):
+            encoder = AudioEncoderConfigurator().from_config(avae_cfg)
+        sd = load_file(self.vae_encoder_path, device="cpu")
+        encoder.load_state_dict(sd, strict=False, assign=True)
+        pcs = encoder.per_channel_statistics
+        # The statistics buffers use hyphenated names, so they are written into
+        # ``_buffers`` directly.
+        # NOTE(review): only "std-of-means" is checked; "mean-of-means" is
+        # assumed to be present whenever it is - confirm against the checkpoint.
+        if "per_channel_statistics.std-of-means" in sd:
+            pcs._buffers["std-of-means"] = sd["per_channel_statistics.std-of-means"]
+            pcs._buffers["mean-of-means"] = sd["per_channel_statistics.mean-of-means"]
+        del sd
+        dd = avae_cfg["model"]["params"]["ddconfig"]
+        encoder.mel_bins = dd["mel_bins"]
+        # The mid-block attention is replaced by a pass-through module.
+        # NOTE(review): presumably the checkpoint carries no weights for it - confirm.
+        encoder.mid.attn_1 = torch.nn.Identity()
+        # NOTE(review): presumably gives any tensor still on the meta device
+        # real CPU storage before the .cuda() move below - verify in the helper.
+        _materialize_meta_tensors(encoder, device="cpu")
+        self._audio_encoder = encoder.cuda().eval().to(torch.bfloat16)
+        self._vram.register("vae_encoder", self._audio_encoder, on_gpu=True)
+        logger.info(
+            "Audio VAE encoder loaded: %.1fM params, %.1fs",
+            sum(p.numel() for p in self._audio_encoder.parameters()) / 1e6,
+            time.time() - t0,
+        )
+    def _patch_transformer_blocks(self) -> None:
+        """Monkey-patch transformer blocks for audio-only forward pass."""
+        BasicAVTransformerBlock.forward = _audio_only_forward
+        logger.info("Transformer blocks patched for audio-only forward")
+    def _build_pipeline(self) -> None:
+        """Build the DistilledPipeline and set up the Gemma text-encoding path.
+        The pipeline's transformer context is overridden so that it always
+        yields the already-loaded audio model (``self._mdl_wrapper``).
+        One of three Gemma strategies is then selected:
+        * ``GEMMA_QUANTIZE=nf4``: a BitsAndBytes NF4 model plus a cached
+          embeddings processor and tokenizer, all on CUDA.
+        * VRAM >= ``HIGH_VRAM_THRESHOLD_GB``: the pipeline's own text encoder
+          is built once on CUDA and kept in ``self._resident_text_encoder``.
+        * otherwise: nothing is cached here and ``pipeline.prompt_encoder``
+          is used at encode time.
+        Sets ``self._pipeline``, ``self._gemma_nf4`` and ``self._gemma_on_gpu``.
+        """
+        t0 = time.time()
+        mdl_wrapper = self._mdl_wrapper
+        # Use NONE offload when VRAM is sufficient so Gemma stays GPU-resident
+        # for fast encoding (~0.5s vs ~7s streaming). Fall back to CPU streaming
+        # on smaller cards.
+        offload = (
+            OffloadMode.NONE
+            if self._vram.vram_gb >= HIGH_VRAM_THRESHOLD_GB
+            else OffloadMode.CPU
+        )
+        self._pipeline = DistilledPipeline(
+            distilled_checkpoint_path=self.pipeline_ckpt_path,
+            gemma_root=self.gemma_root,
+            spatial_upsampler_path=None,
+            loras=[],
+            offload_mode=offload,
+        )
+        # Replacement transformer context: ignores its keyword arguments and
+        # hands back the audio model captured above instead of building one.
+        @contextmanager
+        def _gpu_ctx(**kw):
+            yield mdl_wrapper
+        self._pipeline.stage._transformer_ctx = _gpu_ctx
+        pe = self._pipeline.prompt_encoder
+        # Gemma loading strategy:
+        #   NF4: BitsAndBytes int4 quantization (~8GB on GPU, ~0.1s encode)
+        #   bf16 GPU: full precision on GPU (~24GB, ~1-2s encode) — when VRAM >= 40GB
+        #   bf16 streaming: streams from CPU RAM layer-by-layer (~7s encode) — when VRAM < 40GB
+        self._gemma_nf4 = os.environ.get("GEMMA_QUANTIZE", "").lower() == "nf4"
+        self._gemma_on_gpu = False
+        if self._gemma_nf4:
+            self._build_nf4_gemma()
+            # NF4 needs its own embeddings processor and tokenizer
+            self._cached_emb_proc = pe._embeddings_processor_builder.build(
+                device="cuda",
+                dtype=torch.bfloat16,
+            ).eval()
+            self._cached_tokenizer = LTXVGemmaTokenizer(self.gemma_root)
+            logger.info("Embeddings processor cached on CUDA (NF4 mode)")
+        elif self._vram.vram_gb >= HIGH_VRAM_THRESHOLD_GB:
+            # Build pipeline's text encoder ONCE on GPU and keep it resident.
+            # This uses the same builder as pipeline.prompt_encoder but
+            # avoids the build/destroy cycle that makes each call ~30s.
+            t_gemma = time.time()
+            self._resident_text_encoder = pe._text_encoder_builder.build(
+                device=torch.device("cuda"),
+                dtype=torch.bfloat16,
+            ).eval()
+            self._cached_emb_proc = pe._embeddings_processor_builder.build(
+                device="cuda",
+                dtype=torch.bfloat16,
+            ).eval()
+            # encode_text() takes its GPU-resident branch when this flag is set.
+            self._gemma_on_gpu = True
+            vram_gb = torch.cuda.memory_allocated() / (1024**3)
+            logger.info(
+                "Gemma bf16 (pipeline encoder) GPU-resident: %.1fGB VRAM, %.1fs",
+                vram_gb,
+                time.time() - t_gemma,
+            )
+        else:
+            # Low VRAM: pipeline.prompt_encoder streams from CPU (~7s/encode)
+            logger.info("Gemma managed by pipeline prompt_encoder (CPU streaming)")
+        logger.info("Pipeline built: %.1fs", time.time() - t0)
+    def _build_nf4_gemma(self) -> None:
+        """Load Gemma 3 12B with BitsAndBytes NF4 quantization (~8GB on GPU).
+        NF4 Gemma stays on GPU permanently. Encode is near-instant (~0.1s)
+        since there's no CPU->GPU streaming. Slight quality tradeoff vs bf16
+        but acceptable for production use.
+        """
+        t0 = time.time()
+        quant_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4",
+        )
+        self._nf4_gemma_model = Gemma3ForConditionalGeneration.from_pretrained(
+            self.gemma_root,
+            quantization_config=quant_config,
+            device_map="cuda",
+            dtype=torch.bfloat16,
+        ).eval()
+        # No streaming text encoder needed — _cached_text_encoder stays None
+        self._cached_text_encoder = None
+        vram_gb = torch.cuda.memory_allocated() / (1024**3)
+        logger.info(
+            "Gemma NF4 loaded on GPU: %.1fGB VRAM, %.1fs", vram_gb, time.time() - t0
+        )
+    def _build_bf16_gemma_gpu(self) -> None:
+        """Load Gemma 3 12B bf16 directly on GPU (~24GB).
+        For cards with >= 40GB VRAM. Gemma stays on GPU permanently.
+        Encode is ~1-2s (pure inference, no CPU->GPU streaming).
+        NOTE(review): nothing in the visible part of this file calls this
+        method; the high-VRAM branch of ``_build_pipeline`` builds
+        ``_resident_text_encoder`` instead. Confirm whether it is still needed.
+        """
+        t0 = time.time()
+        # NOTE(review): the bf16 model is stored under the NF4 attribute name.
+        self._nf4_gemma_model = Gemma3ForConditionalGeneration.from_pretrained(
+            self.gemma_root,
+            device_map="cuda",
+            torch_dtype=torch.bfloat16,
+        ).eval()
+        self._cached_text_encoder = None
+        # NOTE(review): encode_text() reads ``_resident_text_encoder`` when this
+        # flag is set, and this method does not assign that attribute.
+        self._gemma_on_gpu = True
+        vram_gb = torch.cuda.memory_allocated() / (1024**3)
+        logger.info(
+            "Gemma bf16 loaded on GPU: %.1fGB VRAM, %.1fs", vram_gb, time.time() - t0
+        )
+    def unload(self) -> None:
+        """Free all GPU and CPU memory."""
+        if self._vram:
+            self._vram.free_all()
+        if (
+            hasattr(self, "_cached_text_encoder")
+            and self._cached_text_encoder is not None
+        ):
+            self._cached_text_encoder.teardown()
+            self._cached_text_encoder = None
+        if hasattr(self, "_nf4_gemma_model"):
+            del self._nf4_gemma_model
+            self._nf4_gemma_model = None
+        if hasattr(self, "_cached_emb_proc"):
+            self._cached_emb_proc = None
+        if hasattr(self, "_cached_tokenizer"):
+            self._cached_tokenizer = None
+        self._mdl_wrapper = None
+        self._audio_encoder = None
+        self._pipeline = None
+        self._vram = None
+        self._loaded = False
+        gc.collect()
+        torch.cuda.empty_cache()
+        logger.info("AudioEngine unloaded")
+    def encode_text(self, prompt: str):
+        """Encode a text prompt via Gemma 3 12B.
+        The path taken depends on the flags set in ``_build_pipeline``:
+        * ``_gemma_nf4``: tokenize with the cached tokenizer, run the NF4
+          model and post-process its hidden states with the cached
+          embeddings processor.
+        * ``_gemma_on_gpu``: run the GPU-resident text encoder built from the
+          pipeline's own builder, then the cached embeddings processor.
+        * otherwise: call ``self._pipeline.prompt_encoder`` directly.
+        Args:
+            prompt: Compiled video-style text prompt.
+        Returns:
+            Tuple of (video_context, audio_context) tensors for denoising.
+        """
+        t0 = time.time()
+        with torch.inference_mode():
+            if self._gemma_nf4:
+                # NF4: use BitsAndBytes quantized Gemma (fast, ~0.1s)
+                # Each tokenizer entry is indexed as [0] for the token id and
+                # [1] for the value used as attention mask.
+                tp = self._cached_tokenizer.tokenize_with_weights(prompt)["gemma"]
+                ids = torch.tensor([[t[0] for t in tp]], device="cuda")
+                mask = torch.tensor([[w[1] for w in tp]], device="cuda")
+                out = self._nf4_gemma_model.model(
+                    input_ids=ids,
+                    attention_mask=mask,
+                    output_hidden_states=True,
+                )
+                hs = out.hidden_states
+                am = mask
+                # Drop references early so the intermediates can be freed.
+                del out, ids
+                emb = self._cached_emb_proc.process_hidden_states(hs, am)
+                vc = emb.video_encoding
+                ac = emb.audio_encoding
+                del hs, am, emb
+            elif self._gemma_on_gpu:
+                # bf16 GPU-resident: use pipeline's text encoder (fast, ~0.5s)
+                hs, am = self._resident_text_encoder.encode(prompt)
+                emb = self._cached_emb_proc.process_hidden_states(hs, am)
+                vc = emb.video_encoding
+                ac = emb.audio_encoding
+                del hs, am, emb
+            else:
+                # CPU streaming: use pipeline's prompt encoder (~7s)
+                # The encoder takes a list of prompts; exactly one result is
+                # expected back for the single prompt passed in.
+                (emb,) = self._pipeline.prompt_encoder([prompt])
+                vc = emb.video_encoding
+                ac = emb.audio_encoding
+        logger.info("Gemma encode: %.1fs", time.time() - t0)
+        return vc, ac
+    def encode_reference(self, waveform_np: np.ndarray, sample_rate: int):
+        """Encode reference audio to latent via Audio VAE encoder.
+        Args:
+            waveform_np: Audio samples, shape (samples,) or (samples, channels).
+            sample_rate: Sample rate of the input audio in Hz.
+        Returns:
+            Reference latent tensor [B, C, T, F] on GPU.
+        """
+        # Ensure VAE encoder is on GPU
+        self._vram.to_gpu("vae_encoder")
+        if waveform_np.ndim == 1:
+            waveform_np = np.stack([waveform_np, waveform_np], axis=-1)
+        if waveform_np.ndim == 2 and waveform_np.shape[1] == 2:
+            wav = torch.from_numpy(waveform_np.T).float()
+        else:
+            wav = torch.from_numpy(waveform_np).float()
+        if sample_rate != self._vae_sr:
+            wav = torchaudio.functional.resample(wav, sample_rate, self._vae_sr)
+        max_samples = MAX_REF_SECONDS * self._vae_sr
+        if wav.shape[1] > max_samples:
+            wav = wav[:, :max_samples]
+        audio_obj = Audio(waveform=wav.unsqueeze(0), sampling_rate=self._vae_sr)
+        with torch.inference_mode():
+            latent = encode_audio(audio_obj, self._audio_encoder)
+        logger.info("Reference encoded: %s", latent.shape)
+        return latent
+    def generate(
+        self,
+        vc,
+        ac,
+        duration: float,
+        seed: int,
+        ref_latent=None,
+    ) -> AudioResult:
+        """Generate audio with optional A2V reference conditioning.
+        Args:
+            vc: Video context from encode_text().
+            ac: Audio context from encode_text().
+            duration: Target duration in seconds.
+            seed: Random seed for reproducibility.
+            ref_latent: Optional reference latent from encode_reference()
+                for A2V voice conditioning.
+        Returns:
+            AudioResult with waveform numpy array and metadata.
+        """
+        return self._generate_impl(vc, ac, duration, seed, ref_latent)
+    @torch.inference_mode()
+    def _generate_impl(self, vc, ac, duration, seed, ref_latent=None):
+        # Implementation behind generate(): builds the video and audio latent
+        # states, optionally prepends a reference latent to the audio sequence,
+        # runs the Euler denoising loop, strips the reference frames again and
+        # decodes the result to a waveform.
+        # Ensure audio model is on GPU for denoising
+        self._vram.to_gpu("audio_model")
+        # Frame count: duration * FPS rounded up to a multiple of 8, plus one.
+        num_frames = ((int(duration * FPS) + 7) // 8) * 8 + 1
+        device = torch.device("cuda")
+        gen = torch.Generator(device=device).manual_seed(seed)
+        noiser = GaussianNoiser(generator=gen)
+        sigmas = DISTILLED_SIGMAS.to(dtype=torch.float32, device=device)
+        # A fixed 64x64 pixel shape is used for the video side; the audio
+        # latent shape below is derived from the same pixel shape.
+        pixel_shape = VideoPixelShape(
+            batch=1, frames=num_frames, width=64, height=64, fps=FPS
+        )
+        v_shape = VideoLatentShape.from_pixel_shape(pixel_shape)
+        video_tools = VideoLatentTools(
+            VideoLatentPatchifier(patch_size=1), v_shape, fps=FPS
+        )
+        video_state = _build_state(
+            ModalitySpec(context=vc, conditionings=[]),
+            video_tools,
+            noiser,
+            torch.bfloat16,
+            device,
+        )
+        a_shape = AudioLatentShape.from_video_pixel_shape(pixel_shape)
+        audio_tools = AudioLatentTools(AudioPatchifier(patch_size=1), a_shape)
+        audio_state = _build_state(
+            ModalitySpec(context=ac),
+            audio_tools,
+            noiser,
+            torch.bfloat16,
+            device,
+        )
+        ref_frames = 0
+        if ref_latent is not None:
+            ref = ref_latent.to(device=device, dtype=torch.bfloat16)
+            # Dimension 2 of the reference latent is taken as its frame count.
+            ref_frames = ref.shape[2]
+            total_t = ref_frames + audio_state.latent.shape[1]
+            # Flatten the reference to (1, ref_frames, features) so it can be
+            # concatenated in front of the audio latent along dimension 1.
+            ref_patchified = ref.permute(0, 2, 1, 3).reshape(1, ref_frames, -1)
+            combined_latent = torch.cat([ref_patchified, audio_state.latent], dim=1)
+            # Reference positions get an all-zero denoise mask.
+            # NOTE(review): presumably zero means "keep as-is, do not denoise" - confirm.
+            ref_mask = torch.zeros(
+                1, ref_frames, 1, device=device, dtype=audio_state.denoise_mask.dtype
+            )
+            combined_mask = torch.cat([ref_mask, audio_state.denoise_mask], dim=1)
+            combined_clean = torch.cat(
+                [ref_patchified, torch.zeros_like(audio_state.clean_latent)], dim=1
+            )
+            # NOTE(review): channels=8 and mel_bins=16 are hard-coded here;
+            # presumably they must match the audio latent layout - confirm.
+            combined_a_shape = AudioLatentShape(
+                batch=1, channels=8, frames=total_t, mel_bins=16
+            )
+            combined_audio_tools = AudioLatentTools(
+                AudioPatchifier(patch_size=1), combined_a_shape
+            )
+            # A throwaway state over the combined length is built only to
+            # obtain positions covering reference + generated frames. It uses
+            # its own generator so the main one is not advanced.
+            gen2 = torch.Generator(device=device).manual_seed(seed)
+            noiser2 = GaussianNoiser(generator=gen2)
+            tmp_state = _build_state(
+                ModalitySpec(context=ac),
+                combined_audio_tools,
+                noiser2,
+                torch.bfloat16,
+                device,
+            )
+            combined_positions = tmp_state.positions
+            del tmp_state
+            audio_state_final = LatentState(
+                latent=combined_latent,
+                denoise_mask=combined_mask,
+                positions=combined_positions,
+                clean_latent=combined_clean,
+                attention_mask=None,
+            )
+        else:
+            audio_state_final = audio_state
+        stepper = EulerDiffusionStep()
+        with self._pipeline.stage._transformer_ctx() as transformer:
+            wrapped = BatchSplitAdapter(transformer, max_batch_size=1)
+            t0 = time.time()
+            # Only the audio state is kept; the video state output is discarded.
+            _, audio_state_out = euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state_final,
+                stepper=stepper,
+                transformer=wrapped,
+                denoiser=SimpleDenoiser(vc, ac),
+            )
+            logger.debug("Denoise: %.2fs", time.time() - t0)
+        # Cut the reference frames back off so only generated audio remains.
+        # Positions are sliced on their third dimension, the others on dim 1.
+        if ref_latent is not None and audio_state_out is not None and ref_frames > 0:
+            audio_state_out = dc_replace(
+                audio_state_out,
+                latent=audio_state_out.latent[:, ref_frames:],
+                denoise_mask=audio_state_out.denoise_mask[:, ref_frames:],
+                positions=audio_state_out.positions[:, :, ref_frames:],
+                clean_latent=(
+                    audio_state_out.clean_latent[:, ref_frames:]
+                    if audio_state_out.clean_latent is not None
+                    else None
+                ),
+            )
+        audio_state_out = audio_tools.clear_conditioning(audio_state_out)
+        audio_state_out = audio_tools.unpatchify(audio_state_out)
+        # A NaN latent is only logged; decoding still proceeds.
+        if torch.isnan(audio_state_out.latent).any():
+            logger.warning("NaN detected in denoised latent")
+        # Offload audio model before VAE decode (pipeline handles decoder GPU usage)
+        # NOTE(review): this is a to_gpu() call with no model name; whether it
+        # actually offloads depends on the VRAM manager - confirm.
+        self._vram.to_gpu()
+        audio = self._pipeline.audio_decoder(audio_state_out.latent)
+        # Restore audio model after decode
+        self._vram.to_gpu("audio_model")
+        w, sr = extract_wav(audio)
+        return AudioResult(waveform_np=w, sample_rate=sr, duration_s=w.shape[0] / sr)

src/audio_core/enhancer.py ADDED Viewed

	@@ -0,0 +1,121 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""VoiceFixer audio post-processing for Scenema Audio.
+Applies neural speech restoration to improve clarity, remove artifacts,
+and bring speech to studio quality. Runs on GPU after SeedVC as the
+final processing step.
+Model is downloaded on first use and cached to disk for subsequent runs.
+"""
+import logging
+import os
+import subprocess
+import sys
+import tempfile
+import numpy as np
+import soundfile as sf
+import torchaudio
+logger = logging.getLogger(__name__)
+_voicefixer = None
+def _ensure_installed():
+    """Install voicefixer if not available."""
+    try:
+        import voicefixer  # noqa: F401
+    except ImportError:
+        logger.info("Installing voicefixer...")
+        try:
+            subprocess.check_call(
+                [sys.executable, "-m", "pip", "install", "voicefixer", "--quiet"],
+            )
+            logger.info("voicefixer installed")
+        except subprocess.CalledProcessError:
+            logger.warning("Failed to install voicefixer, enhancement will be skipped")
+            raise ImportError("voicefixer not available")
+def _get_voicefixer():
+    """Get or initialize the VoiceFixer model.
+    Downloaded on first use and cached by the library's default cache.
+    """
+    global _voicefixer
+    if _voicefixer is not None:
+        return _voicefixer
+    _ensure_installed()
+    from voicefixer import VoiceFixer  # noqa: E402
+    _voicefixer = VoiceFixer()
+    logger.info("VoiceFixer model loaded")
+    return _voicefixer
+def enhance_audio(audio_np: np.ndarray, sr: int) -> np.ndarray:
+    """Apply VoiceFixer to audio for studio-quality output.
+    VoiceFixer works on WAV files, so we write to temp, process, and read back.
+    Args:
+        audio_np: Audio array (mono or stereo), any sample rate.
+        sr: Sample rate.
+    Returns:
+        Enhanced audio array at original sample rate.
+    """
+    try:
+        vf = _get_voicefixer()
+    except (ImportError, Exception) as e:
+        logger.warning("VoiceFixer unavailable: %s, skipping", e)
+        return audio_np
+    is_stereo = audio_np.ndim == 2 and audio_np.shape[1] == 2
+    with tempfile.TemporaryDirectory() as tmp:
+        input_path = os.path.join(tmp, "input.wav")
+        output_path = os.path.join(tmp, "output.wav")
+        sf.write(input_path, audio_np, sr)
+        try:
+            vf.restore(
+                input=input_path,
+                output=output_path,
+                cuda=True,
+                mode=0,  # 0=general, 1=speech-specific
+            )
+            enhanced, enhanced_sr = sf.read(output_path)
+            # Resample back to original sr if needed
+            if enhanced_sr != sr:
+                import torch
+                t = torch.from_numpy(
+                    enhanced.T if enhanced.ndim == 2 else enhanced
+                ).float()
+                if t.ndim == 1:
+                    t = t.unsqueeze(0)
+                t = torchaudio.functional.resample(t, enhanced_sr, sr)
+                enhanced = t.squeeze(0).numpy()
+                if enhanced.ndim == 1 and is_stereo:
+                    enhanced = np.stack([enhanced, enhanced], axis=1)
+                elif enhanced.ndim == 2:
+                    enhanced = enhanced.T
+            logger.info("Enhanced audio: %.1fs", len(enhanced) / sr)
+            return enhanced
+        except Exception as e:
+            logger.warning("VoiceFixer failed: %s, returning original", e)
+            return audio_np

src/audio_core/inference.py ADDED Viewed

	@@ -0,0 +1,183 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Inference orchestration for Scenema Audio.
+Generates audio for planned chunks with A2V voice conditioning between
+chunks and concatenates the results. A2V reference from each chunk's tail
+guides the next chunk toward a consistent voice, which SeedVC then
+polishes for exact identity matching.
+"""
+import logging
+import numpy as np
+from .audio_utils import normalize_volume, trim_silence
+from .chunker import ChunkSpec
+from .engine import AudioEngine, AudioResult
+from .whisper_aligner import validate_text
+logger = logging.getLogger(__name__)
+REF_TAIL_SECONDS = 3.0
+MAX_RETRIES = 3
+RETRY_DURATION_FACTOR = 1.3
+MIN_WORD_MATCH_RATIO = 0.90
+def generate_chunks(
+    engine: AudioEngine,
+    chunks: list[ChunkSpec],
+    ref_latent=None,
+    ref_duration_s: float = REF_TAIL_SECONDS,
+    validate: bool = False,
+    min_match_ratio: float = MIN_WORD_MATCH_RATIO,
+    anchor_ref: bool = False,
+) -> list[AudioResult]:
+    """Generate audio for all chunks with A2V voice conditioning.
+    Each chunk gets its own Gemma encode (since each has different text).
+    The tail of each chunk's audio is encoded via Audio VAE and used as
+    A2V reference for the next chunk, guiding voice consistency. SeedVC
+    is applied afterward by the processor for exact identity matching.
+    Args:
+        engine: AudioEngine instance
+        chunks: List of ChunkSpec from plan_chunks()
+        ref_latent: Initial reference latent (from user-provided voice URL)
+        ref_duration_s: Seconds of tail audio to use as A2V reference
+        validate: If True, run Whisper validation with retry loop.
+            If False (default), generate once without validation.
+        anchor_ref: If True, every chunk uses ref_latent instead of
+            chaining from the previous chunk's tail. Keeps voice
+            anchored to the external reference.
+    """
+    results: list[AudioResult] = []
+    for i, chunk in enumerate(chunks):
+        label = "with ref" if ref_latent is not None else "no ref"
+        logger.info(
+            "Chunk %d/%d (%s, %.1fs): %s",
+            i + 1,
+            len(chunks),
+            label,
+            chunk.duration_s,
+            chunk.expected_text[:60] + ("..." if len(chunk.expected_text) > 60 else ""),
+        )
+        # Gemma encode once per chunk (reused across retries)
+        logger.info("Compiled prompt: %s", chunk.compiled_prompt)
+        vc, ac = engine.encode_text(chunk.compiled_prompt)
+        duration = chunk.duration_s
+        seed = chunk.seed
+        if not validate:
+            # Single generation, no whisper validation
+            result = engine.generate(vc, ac, duration, seed, ref_latent=ref_latent)
+            best_result = result
+        else:
+            # Validation retry loop with whisper
+            best_result = None
+            best_ratio = -1.0
+            for attempt in range(MAX_RETRIES + 1):
+                result = engine.generate(vc, ac, duration, seed, ref_latent=ref_latent)
+                passed, transcribed, ratio = validate_text(
+                    result.waveform_np,
+                    result.sample_rate,
+                    chunk.expected_text,
+                    language=chunk.language,
+                    min_word_ratio=min_match_ratio,
+                )
+                if ratio > best_ratio:
+                    best_result = result
+                    best_ratio = ratio
+                if passed:
+                    logger.info(
+                        "  Chunk %d validated: %.0f%% word match",
+                        i + 1,
+                        ratio * 100,
+                    )
+                    break
+                if attempt < MAX_RETRIES:
+                    duration = min(duration * RETRY_DURATION_FACTOR, 20.0)
+                    seed += 1
+                    logger.info(
+                        "  Chunk %d retry %d: %.0f%% match, extending to %.1fs, seed=%d",
+                        i + 1,
+                        attempt + 1,
+                        ratio * 100,
+                        duration,
+                        seed,
+                    )
+                else:
+                    logger.warning(
+                        "  Chunk %d: best %.0f%% match after %d retries, accepting",
+                        i + 1,
+                        best_ratio * 100,
+                        MAX_RETRIES,
+                    )
+        results.append(best_result)
+        # A2V: use tail of this chunk as reference for the next
+        # In anchor mode, keep using the original ref_latent for every chunk
+        if i < len(chunks) - 1 and not anchor_ref:
+            tail_samples = int(ref_duration_s * result.sample_rate)
+            tail_wav = result.waveform_np[-tail_samples:]
+            ref_latent = engine.encode_reference(tail_wav, result.sample_rate)
+    return results
+def concatenate_chunks(
+    results: list[AudioResult],
+    trim: bool = True,
+    normalize: bool = True,
+) -> tuple[np.ndarray, int]:
+    """Concatenate audio chunks with silence trimming and volume normalization.
+    Trims excess silence from chunk boundaries and normalizes volume
+    per-chunk to ensure consistent loudness across the full output.
+    Chunks are hard-concatenated (no crossfade).
+    Args:
+        results: List of AudioResult from generate_chunks().
+        trim: Whether to trim silence from chunk boundaries.
+        normalize: Whether to normalize volume per chunk.
+    Returns:
+        Tuple of (concatenated waveform numpy array, sample_rate).
+    """
+    if not results:
+        raise ValueError("No chunks to concatenate")
+    sr = results[0].sample_rate
+    processed: list[np.ndarray] = []
+    for i, r in enumerate(results):
+        w = r.waveform_np
+        if trim:
+            w = trim_silence(w, sr, max_silence=0.5)
+        if normalize:
+            w = normalize_volume(w, sr)
+        processed.append(w)
+        logger.debug(
+            "Chunk %d: %.1fs -> %.1fs",
+            i,
+            r.duration_s,
+            w.shape[0] / sr,
+        )
+    result = np.concatenate(processed, axis=0)
+    logger.info(
+        "Concatenated: %.1fs from %d chunks", result.shape[0] / sr, len(processed)
+    )
+    return result, sr

src/audio_core/main.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Scenema Audio entry point.
+CRITICAL: CUDA memory config must happen before torch imports.
+"""
+import os
+if "expandable_segments" not in os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""):
+    _alloc = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
+        (_alloc + ",expandable_segments:True") if _alloc else "expandable_segments:True"
+    )
+import logging
+logging.basicConfig(
+    level=logging.DEBUG if os.environ.get("DEBUG") else logging.INFO,
+    format="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
+)
+logger = logging.getLogger(__name__)
+def main():
+    # These imports are inside main() because CUDA config above
+    # must execute before torch is imported (processor -> engine -> torch)
+    from common.runner import run
+    from .processor import AudioProcessor
+    handler_mode = os.environ.get("HANDLER_MODE", "http")
+    logger.info("Starting Scenema Audio in %s mode", handler_mode)
+    processor = AudioProcessor()
+    run(processor, service_type="scenema_audio")
+if __name__ == "__main__":
+    main()

src/audio_core/processor.py ADDED Viewed

	@@ -0,0 +1,484 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Scenema Audio processor. Processor protocol implementation.
+Handles HTTP sync/async requests for audio generation and voice design.
+Follows the pattern of gpu_x2v/processor.py.
+"""
+import io
+import logging
+import os
+import random
+import shutil
+import tempfile
+import time
+from datetime import datetime, timezone
+import httpx
+import numpy as np
+import psutil
+import soundfile as sf
+import torch
+import torchaudio
+from common.handlers.base import ProcessJob, ProcessOutput, ProcessResult
+from .audio_utils import (
+    ensure_stereo,
+    load_wav,
+    normalize_volume,
+    shorten_long_silence,
+    save_wav,
+    to_mono,
+    trim_silence,
+)
+from .chunker import plan_chunks
+from .compiler import compile_prompt
+from .engine import AudioEngine, HIGH_VRAM_THRESHOLD_GB
+from .inference import concatenate_chunks, generate_chunks
+from .seedvc import SeedVC
+from .validate_and_patch import validate_and_patch
+from .validator import validate_prompt
+from .vocal_separator import VocalSeparator
+logger = logging.getLogger(__name__)
+VOICE_DESIGN_DURATION_S = 15.0
+class AudioProcessor:
+    """Processor for Scenema Audio generation.
+    Implements the Processor protocol (startup/shutdown/process).
+    """
+    def __init__(self) -> None:
+        # Nothing is loaded at construction time; startup() fills these in.
+        self.engine: AudioEngine | None = None
+        # Created in startup(); loaded eagerly there only on high-VRAM cards.
+        self.vocal_separator = None
+        self.seedvc = None
+        # NOTE(review): not assigned anywhere else in the visible part of the
+        # file; presumably created lazily where downloads happen - confirm.
+        self._http_client = None
+    def startup(self) -> None:
+        """Load models. Called once by handler at startup."""
+        if self.engine is not None:
+            return
+        audio_ckpt = os.environ.get(
+            "AUDIO_CKPT",
+            "/app/models/scenema-audio-transformer.safetensors",
+        )
+        vae_encoder = os.environ.get(
+            "VAE_ENCODER_CKPT",
+            "/app/models/scenema-audio-vae-encoder.safetensors",
+        )
+        gemma_root = os.environ.get(
+            "GEMMA_ROOT",
+            "/app/models/gemma-3-12b-it",
+        )
+        pipeline_ckpt = os.environ.get(
+            "PIPELINE_CKPT",
+            "/app/models/ltx-2.3-22b-distilled.safetensors",
+        )
+        self.engine = AudioEngine(
+            audio_ckpt_path=audio_ckpt,
+            vae_encoder_path=vae_encoder,
+            gemma_root=gemma_root,
+            pipeline_ckpt_path=pipeline_ckpt,
+        )
+        self.engine.load()
+        self.vocal_separator = VocalSeparator()
+        self.seedvc = SeedVC()
+        # Preload all models on high-VRAM cards (>= 40GB), keep resident
+        vram_gb = (
+            torch.cuda.get_device_properties(0).total_memory / 1e9
+            if torch.cuda.is_available()
+            else 0
+        )
+        self._keep_resident = vram_gb >= HIGH_VRAM_THRESHOLD_GB
+        if self._keep_resident:
+            self.vocal_separator.load()
+            self.seedvc.load()
+            logger.info("All models preloaded and resident (%.0fGB VRAM)", vram_gb)
+        else:
+            logger.info("Low VRAM (%.0fGB), models loaded on-demand", vram_gb)
+        logger.info("AudioProcessor ready")
+    def shutdown(self) -> None:
+        """Unload all models."""
+        if self.engine:
+            self.engine.unload()
+            self.engine = None
+        if self.vocal_separator:
+            self.vocal_separator.unload()
+            self.vocal_separator = None
+        if self.seedvc and self.seedvc._loaded:
+            self.seedvc.unload()
+        logger.info("AudioProcessor shutdown")
+    async def process(self, job: ProcessJob) -> ProcessResult:
+        """Process an audio generation job."""
+        start_time = time.time()
+        started_at = datetime.now(timezone.utc).isoformat()
+        torch.cuda.reset_peak_memory_stats()
+        try:
+            if self.engine is None:
+                self.startup()
+            config = self._parse_input(job)
+            if config["mode"] == "voice_design":
+                wav_np, sr = await self._voice_design(config)
+            else:
+                wav_np, sr = await self._generate(config)
+            wav_bytes = self._encode_wav(wav_np, sr)
+            processing_ms = int((time.time() - start_time) * 1000)
+            return ProcessResult(
+                job_id=job.job_id,
+                success=True,
+                output=ProcessOutput(
+                    success=True,
+                    data=wav_bytes,
+                    content_type="audio/wav",
+                    metadata=self._build_metadata(
+                        config, wav_np, sr, processing_ms, started_at
+                    ),
+                ),
+                processing_ms=processing_ms,
+            )
+        except Exception as e:
+            logger.error("Processing failed: %s", e, exc_info=True)
+            processing_ms = int((time.time() - start_time) * 1000)
+            return ProcessResult(
+                job_id=job.job_id,
+                success=False,
+                output=ProcessOutput(success=False, error=str(e)),
+                error=str(e),
+                processing_ms=processing_ms,
+            )
+    def _parse_input(self, job: ProcessJob) -> dict:
+        """Parse and validate job input.
+        Input schema:
+            prompt: str           - Required. <speak> XML string.
+            mode: str             - "generate" (default) or "voice_design".
+            reference_voice_url: str | None - URL to reference audio for voice cloning.
+            background_sfx: bool  - Keep background SFX (default: false, strips via MelBandRoFormer).
+            validate: bool        - Enable Whisper speech validation (default: false).
+                                    When true, each generated chunk is transcribed by faster-whisper
+                                    (GPU, float16, ~1GB VRAM) and compared against the expected text.
+                                    If word match ratio falls below 60%, the chunk is regenerated with
+                                    extended duration and a new seed (up to 3 retries), keeping the
+                                    best result. Adds <1s per chunk on GPU. When false, each chunk is
+                                    generated once with no quality gate, which is faster and sufficient
+                                    for most prompts.
+            seed: int             - Base seed (-1 for random).
+        """
+        inp = job.input
+        prompt = inp.get("prompt")
+        if not prompt:
+            raise ValueError("Missing required 'prompt' field")
+        mode = inp.get("mode", "generate")
+        if mode not in ("generate", "voice_design"):
+            raise ValueError(
+                f"Invalid mode: {mode}. Must be 'generate' or 'voice_design'"
+            )
+        result = validate_prompt(prompt)
+        if not result.valid:
+            raise ValueError(f"Invalid prompt XML: {'; '.join(result.errors)}")
+        seed = inp.get("seed", -1)
+        if seed == -1:
+            seed = random.randint(0, 999999)
+        return {
+            "prompt": prompt,
+            "mode": mode,
+            "reference_voice_url": inp.get("reference_voice_url"),
+            "background_sfx": inp.get("background_sfx", False),
+            "validate": inp.get("validate", True),
+            "seed": seed,
+            "pace": inp.get("pace", 1.5),
+            "min_match_ratio": inp.get("min_match_ratio", 0.90),
+            "vc_cfg_rate": inp.get("vc_cfg_rate", 0.5),
+            "vc_steps": inp.get("vc_steps", 25),
+            "skip_vc": inp.get("skip_vc", False),
+        }
+    async def _voice_design(self, config: dict) -> tuple[np.ndarray, int]:
+        """Generate a 15s voice sample for voice design."""
+        compiled = compile_prompt(config["prompt"])
+        vc, ac = self.engine.encode_text(compiled.prompt)
+        result = self.engine.generate(vc, ac, VOICE_DESIGN_DURATION_S, config["seed"])
+        wav = result.waveform_np
+        sr = result.sample_rate
+        if not config["background_sfx"]:
+            wav = self._strip_background(wav, sr)
+        wav = trim_silence(wav, sr)
+        wav = shorten_long_silence(wav, sr)
+        wav = normalize_volume(wav, sr)
+        return wav, sr
+    async def _generate(self, config: dict) -> tuple[np.ndarray, int]:
+        """Full generation pipeline with chunking and post-processing."""
+        chunks = plan_chunks(
+            config["prompt"], base_seed=config["seed"], pace=config["pace"]
+        )
+        logger.info("Planned %d chunk(s)", len(chunks))
+        ref_wav_path = None
+        if config["reference_voice_url"]:
+            ref_wav_path = await self._download_reference(config["reference_voice_url"])
+        # skip_vc: seed every chunk with the reference audio's tail latent,
+        # identical to how inter-chunk chaining works. The model sees the
+        # reference as "what I just generated" and continues in that voice.
+        # Disables the normal chaining (each chunk chains from the ref, not
+        # from the previous chunk) to keep the voice anchored to the reference.
+        anchor_latent = None
+        if config["skip_vc"] and ref_wav_path:
+            ref_wav, ref_sr = load_wav(ref_wav_path)
+            ref_mono = to_mono(ref_wav)
+            tail_seconds = 3.0
+            tail_samples = int(tail_seconds * ref_sr)
+            if ref_mono.shape[0] > tail_samples:
+                ref_tail = ref_mono[-tail_samples:]
+            else:
+                ref_tail = ref_mono
+            anchor_latent = self.engine.encode_reference(ref_tail, ref_sr)
+            logger.info(
+                "Anchor mode: every chunk seeded from %.1fs reference tail",
+                ref_tail.shape[0] / ref_sr,
+            )
+        with torch.inference_mode():
+            results = generate_chunks(
+                self.engine,
+                chunks,
+                ref_latent=anchor_latent,
+                anchor_ref=anchor_latent is not None,
+                validate=config["validate"],
+                min_match_ratio=config["min_match_ratio"],
+            )
+        wav, sr = concatenate_chunks(results)
+        # Strip background music/SFX from the concatenated audio (single pass)
+        if not config["background_sfx"]:
+            wav = self._strip_background(wav, sr)
+        # Cap silence — scale with pace
+        max_silence = min(0.5 * config["pace"], 1.5)
+        wav = shorten_long_silence(
+            wav, sr, max_duration=max_silence, target_duration=max_silence * 0.6
+        )
+        # Apply SeedVC when: reference voice provided, or multiple chunks (voice consistency).
+        # Skip for single-chunk generations without reference (preserves SFX).
+        needs_vc = ref_wav_path or len(results) > 1
+        if not config["skip_vc"] and needs_vc:
+            wav = self._apply_seedvc(
+                wav,
+                sr,
+                results,
+                ref_wav_path,
+                vc_steps=config["vc_steps"],
+                vc_cfg_rate=config["vc_cfg_rate"],
+            )
+        # Post-SeedVC alignment trimming (disabled by default, needs refinement)
+        if config.get("patch", False):
+            expected_text = " ".join(c.expected_text for c in chunks)
+            wav = validate_and_patch(wav, sr, expected_text)
+        # Ensure stereo final output
+        wav = ensure_stereo(wav)
+        if ref_wav_path and os.path.exists(ref_wav_path):
+            os.unlink(ref_wav_path)
+        return wav, sr
+    def _strip_background(self, wav_np: np.ndarray, sr: int) -> np.ndarray:
+        """Strip background music/SFX using MelBandRoFormer.
+        Loads the model on-demand and unloads after to free VRAM.
+        """
+        if self.vocal_separator is None:
+            return wav_np
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+            input_path = f.name
+        vocals_path = input_path.replace(".wav", "_vocals.wav")
+        try:
+            if not self._keep_resident:
+                self.vocal_separator.load()
+            stereo = ensure_stereo(wav_np)
+            save_wav(stereo, sr, input_path)
+            self.vocal_separator.separate(input_path, vocals_path, None)
+            vocals, _ = load_wav(vocals_path)
+            return vocals
+        except Exception as e:
+            logger.warning("Vocal separation failed: %s", e)
+            return wav_np
+        finally:
+            if not self._keep_resident:
+                self.vocal_separator.unload()
+            for p in [input_path, vocals_path]:
+                if os.path.exists(p):
+                    os.unlink(p)
+    def _apply_seedvc(
+        self,
+        wav: np.ndarray,
+        sr: int,
+        chunk_results: list,
+        ref_wav_path: str | None,
+        vc_steps: int = 20,
+        vc_cfg_rate: float = 0.5,
+    ) -> np.ndarray:
+        """Apply SeedVC voice cloning.
+        If reference_voice_url provided: convert against reference.
+        If no reference: convert all against chunk 0 (first chunk sets identity).
+        """
+        if self.seedvc is None:
+            logger.info("SeedVC not available, skipping voice cloning")
+            return wav
+        try:
+            if not self._keep_resident:
+                self.seedvc.load()
+            with tempfile.TemporaryDirectory() as tmp:
+                source_path = os.path.join(tmp, "source_22k.wav")
+                target_path = os.path.join(tmp, "target_22k.wav")
+                source_mono = to_mono(wav)
+                source_t = torch.from_numpy(source_mono).float().unsqueeze(0)
+                source_22k = torchaudio.functional.resample(source_t, sr, 22050)
+                save_wav(source_22k.squeeze(0).numpy(), 22050, source_path)
+                if ref_wav_path:
+                    target_wav, target_sr = load_wav(ref_wav_path)
+                    target_mono = to_mono(target_wav)
+                    target_t = torch.from_numpy(target_mono).float().unsqueeze(0)
+                    target_22k = torchaudio.functional.resample(
+                        target_t, target_sr, 22050
+                    )
+                    save_wav(target_22k.squeeze(0).numpy(), 22050, target_path)
+                else:
+                    chunk0 = chunk_results[0].waveform_np
+                    chunk0_mono = to_mono(chunk0)
+                    chunk0_t = torch.from_numpy(chunk0_mono).float().unsqueeze(0)
+                    chunk0_22k = torchaudio.functional.resample(
+                        chunk0_t, chunk_results[0].sample_rate, 22050
+                    )
+                    save_wav(chunk0_22k.squeeze(0).numpy(), 22050, target_path)
+                converted = self.seedvc.convert(
+                    source_path,
+                    target_path,
+                    diffusion_steps=vc_steps,
+                    cfg_rate=vc_cfg_rate,
+                )
+                conv_t = torch.from_numpy(converted).float().unsqueeze(0)
+                result = torchaudio.functional.resample(conv_t, 22050, sr)
+                wav = result.squeeze(0).numpy()
+                wav = ensure_stereo(wav)
+        except Exception as e:
+            logger.error("SeedVC failed: %s", e, exc_info=True)
+        finally:
+            if not self._keep_resident:
+                try:
+                    self.seedvc.unload()
+                except Exception:
+                    pass
+        return wav
+    async def _download_reference(self, url: str) -> str:
+        """Download reference audio from URL to temp file."""
+        if self._http_client is None:
+            self._http_client = httpx.AsyncClient(timeout=60.0, follow_redirects=True)
+        response = await self._http_client.get(url)
+        response.raise_for_status()
+        suffix = ".wav"
+        if "mp3" in url.lower() or "mpeg" in response.headers.get("content-type", ""):
+            suffix = ".mp3"
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
+            f.write(response.content)
+            logger.info(
+                "Downloaded reference: %d bytes to %s", len(response.content), f.name
+            )
+            return f.name
+    def _encode_wav(self, wav_np: np.ndarray, sr: int) -> bytes:
+        """Encode numpy array to WAV bytes."""
+        buf = io.BytesIO()
+        sf.write(buf, wav_np, sr, format="WAV")
+        return buf.getvalue()
+    def _build_metadata(
+        self,
+        config: dict,
+        wav_np: np.ndarray,
+        sr: int,
+        processing_ms: int,
+        started_at: str = "",
+    ) -> dict:
+        """Build comprehensive metadata matching x2v pattern."""
+        gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A"
+        vram_total_mb = 0
+        vram_peak_mb = 0
+        if torch.cuda.is_available():
+            vram_total_mb = round(
+                torch.cuda.get_device_properties(0).total_memory / 1024**2
+            )
+            vram_peak_mb = round(torch.cuda.max_memory_allocated() / 1024**2)
+        cpu_cores_total = os.cpu_count() or 0
+        system_ram_gb = round(psutil.virtual_memory().total / 1024**3)
+        disk = shutil.disk_usage("/")
+        return {
+            "duration_s": round(wav_np.shape[0] / sr, 2),
+            "sample_rate": sr,
+            "mode": config["mode"],
+            "seed": config["seed"],
+            "background_sfx": config["background_sfx"],
+            "has_reference_voice": config["reference_voice_url"] is not None,
+            "validate": config["validate"],
+            "processing_ms": processing_ms,
+            "vram_peak_mb": vram_peak_mb,
+            "vram_total_mb": vram_total_mb,
+            "gpu": gpu_name,
+            "cpu_cores_total": cpu_cores_total,
+            "system_ram_gb": system_ram_gb,
+            "disk_total_gb": round(disk.total / 1024**3, 1),
+            "disk_free_gb": round(disk.free / 1024**3, 1),
+            "started_at": started_at,
+            "completed_at": datetime.now(timezone.utc).isoformat(),
+        }

src/audio_core/seedvc.py ADDED Viewed

	@@ -0,0 +1,194 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""SeedVC voice conversion for Scenema Audio.
+Converts the voice identity of generated audio to match a reference speaker
+while preserving prosody, rhythm, and emotion. Uses the Seed-VC model with
+DiT backbone, CAMPPlus speaker encoder, and BigVGAN vocoder.
+Expects 22050Hz mono WAV input for both source and target.
+"""
+import inspect
+import logging
+import os
+import sys
+import types
+from argparse import Namespace
+from pathlib import Path
+import numpy as np
+import torch
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Seed-VC checkout directory: the SEEDVC_PATH environment variable, or
+# /app/seed-vc when it is unset.
+DEFAULT_SEEDVC_PATH = Path(os.environ.get("SEEDVC_PATH", "/app/seed-vc"))
+# Defaults for the diffusion_steps and cfg_rate parameters of SeedVC.convert().
+DEFAULT_DIFFUSION_STEPS = 25
+DEFAULT_CFG_RATE = 0.5
+class SeedVC:
+    """Voice conversion engine using Seed-VC.
+    Converts source audio voice identity to match a target speaker
+    while preserving the source's delivery, emotion, and pacing.
+    """
+    def __init__(self, seedvc_path: Path = DEFAULT_SEEDVC_PATH):
+        self.seedvc_path = seedvc_path
+        self._loaded = False
+        self._original_cwd: str | None = None
+        self._app_vc = None
+    def load(self) -> None:
+        """Load SeedVC models to GPU.
+        Changes working directory to seedvc_path (required by SeedVC internals),
+        stubs gradio, and loads all models via app_vc.load_models().
+        """
+        if self._loaded:
+            return
+        logger.info("Loading SeedVC from %s", self.seedvc_path)
+        self._original_cwd = os.getcwd()
+        os.chdir(self.seedvc_path)
+        if "gradio" not in sys.modules:
+            sys.modules["gradio"] = types.ModuleType("gradio")
+        seedvc_str = str(self.seedvc_path)
+        if seedvc_str not in sys.path:
+            sys.path.insert(0, seedvc_str)
+        os.environ.setdefault(
+            "HF_HUB_CACHE",
+            str(self.seedvc_path / "checkpoints" / "hf_cache"),
+        )
+        # Patch BigVGAN for huggingface_hub compat (same as gpu_vc)
+        import modules.bigvgan.bigvgan as _bigvgan_mod
+        _orig = _bigvgan_mod.BigVGAN._from_pretrained
+        @classmethod
+        def _patched(cls, **kwargs):
+            kwargs.setdefault("proxies", None)
+            kwargs.setdefault("resume_download", False)
+            return _orig.__func__(cls, **kwargs)
+        _bigvgan_mod.BigVGAN._from_pretrained = _patched
+        # Load models (exact pattern from gpu_vc/seedvc_engine.py)
+        import app_vc
+        self._app_vc = app_vc
+        app_vc.device = torch.device("cuda")
+        args = Namespace(checkpoint=None, config=None, fp16=True, gpu=0)
+        (
+            app_vc.model,
+            app_vc.semantic_fn,
+            app_vc.vocoder_fn,
+            app_vc.campplus_model,
+            app_vc.to_mel,
+            app_vc.mel_fn_args,
+        ) = app_vc.load_models(args)
+        app_vc.max_context_window = app_vc.sr // app_vc.hop_length * 30
+        app_vc.overlap_wave_len = app_vc.overlap_frame_len * app_vc.hop_length
+        self._loaded = True
+        logger.info("SeedVC loaded: sr=%d, device=%s", app_vc.sr, app_vc.device)
+    def unload(self) -> None:
+        """Free SeedVC models from GPU."""
+        if not self._loaded:
+            return
+        if self._app_vc is not None:
+            for attr in [
+                "model",
+                "semantic_fn",
+                "vocoder_fn",
+                "campplus_model",
+                "to_mel",
+            ]:
+                if hasattr(self._app_vc, attr):
+                    delattr(self._app_vc, attr)
+            self._app_vc = None
+        torch.cuda.empty_cache()
+        if self._original_cwd:
+            os.chdir(self._original_cwd)
+            self._original_cwd = None
+        self._loaded = False
+        logger.info("SeedVC unloaded")
+    def convert(
+        self,
+        source_wav_path: str,
+        target_wav_path: str,
+        diffusion_steps: int = DEFAULT_DIFFUSION_STEPS,
+        cfg_rate: float = DEFAULT_CFG_RATE,
+    ) -> np.ndarray:
+        """Convert voice identity of source to match target.
+        Both files must be 22050Hz mono WAV.
+        Args:
+            source_wav_path: Path to source audio (generated speech)
+            target_wav_path: Path to target audio (reference voice)
+            diffusion_steps: Number of diffusion steps (quality vs speed)
+            cfg_rate: Classifier-free guidance rate
+        Returns:
+            Converted audio as float32 numpy array at 22050Hz mono
+        """
+        if not self._loaded:
+            raise RuntimeError("SeedVC not loaded. Call load() first.")
+        logger.info(
+            "Converting voice: %s -> %s (%d steps, cfg_rate=%.2f)",
+            source_wav_path,
+            target_wav_path,
+            diffusion_steps,
+            cfg_rate,
+        )
+        audio_tuple = None
+        vc_kwargs = {
+            "source": source_wav_path,
+            "target": target_wav_path,
+            "diffusion_steps": diffusion_steps,
+            "length_adjust": 1.0,
+            "inference_cfg_rate": cfg_rate,
+        }
+        # n_quantizers removed in newer SeedVC versions
+        sig = inspect.signature(self._app_vc.voice_conversion)
+        if "n_quantizers" in sig.parameters:
+            vc_kwargs["n_quantizers"] = 3
+        for result in self._app_vc.voice_conversion(**vc_kwargs):
+            if isinstance(result, tuple) and len(result) == 2:
+                _, audio_tuple = result
+        if audio_tuple is None:
+            raise RuntimeError("SeedVC produced no output")
+        sample_rate, samples = audio_tuple
+        if samples.dtype == np.int16:
+            samples = samples.astype(np.float32) / 32768.0
+        elif samples.dtype != np.float32:
+            samples = samples.astype(np.float32)
+        peak = np.abs(samples).max()
+        if peak > 1.0:
+            samples = samples / peak
+        logger.info("Converted: %.1fs at %dHz", len(samples) / sample_rate, sample_rate)
+        return samples

src/audio_core/validate_and_patch.py ADDED Viewed

	@@ -0,0 +1,402 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Forced alignment and hallucination trimming for Scenema Audio.
+Uses Needleman-Wunsch sequence alignment (same algorithm as DNA matching)
+to optimally align Whisper-transcribed words against expected text. Words
+in the transcription that are INSERTIONS (not in the expected text) are
+trimmed at silence boundaries. Substitutions (misrecognized words) are kept.
+"""
+import logging
+import re
+import numpy as np
+from .audio_utils import to_mono
+from .whisper_aligner import _get_whisper
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# RMS level below which a short frame counts as silence
+# (compared in _find_silence_boundary).
+SILENCE_THRESHOLD = 0.015
+# Extra seconds removed on each side of a trimmed range (see _build_trim_mask).
+TRIM_PAD_S = 0.02
+# Alignment scoring
+MATCH_SCORE = 2
+MISMATCH_SCORE = -1
+GAP_SCORE = -1  # Cost of insertion or deletion
+def _normalize_words(text: str) -> list[str]:
+    """Normalize text to lowercase words without punctuation."""
+    text = text.lower()
+    text = re.sub(r"[^\w\s]", "", text)
+    return text.split()
+def _fuzzy_match(a: str, b: str) -> bool:
+    """Check if two words are similar enough (edit distance based)."""
+    if a == b:
+        return True
+    if not a or not b or len(a) < 4 or len(b) < 4:
+        return False
+    m, n = len(a), len(b)
+    dp = list(range(n + 1))
+    for i in range(1, m + 1):
+        prev = dp[0]
+        dp[0] = i
+        for j in range(1, n + 1):
+            temp = dp[j]
+            dp[j] = prev if a[i - 1] == b[j - 1] else 1 + min(prev, dp[j], dp[j - 1])
+            prev = temp
+    return 1 - (dp[n] / max(m, n)) >= 0.5
+def _score(a: str, b: str) -> int:
+    """Score for aligning word a with word b."""
+    if a == b:
+        return MATCH_SCORE
+    if _fuzzy_match(a, b):
+        return MATCH_SCORE  # Treat fuzzy matches same as exact
+    return MISMATCH_SCORE
+def _needleman_wunsch(
+    transcribed: list[str],
+    expected: list[str],
+) -> list[str]:
+    """Needleman-Wunsch global alignment.
+    Returns a list of labels for each transcribed word:
+    - "match": word aligns to an expected word (exact or fuzzy)
+    - "substitution": word replaces an expected word (poor match)
+    - "insertion": word has no counterpart in expected text (hallucinated)
+    Expected words that have no counterpart are deletions (not returned
+    since we only label transcribed words).
+    """
+    m = len(transcribed)
+    n = len(expected)
+    # Build score matrix
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+    for i in range(1, m + 1):
+        dp[i][0] = dp[i - 1][0] + GAP_SCORE
+    for j in range(1, n + 1):
+        dp[0][j] = dp[0][j - 1] + GAP_SCORE
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            match = dp[i - 1][j - 1] + _score(transcribed[i - 1], expected[j - 1])
+            delete = dp[i - 1][j] + GAP_SCORE  # transcribed word is insertion
+            insert = dp[i][j - 1] + GAP_SCORE  # expected word is deletion
+            dp[i][j] = max(match, delete, insert)
+    # Traceback
+    labels = []
+    i, j = m, n
+    while i > 0 or j > 0:
+        if (
+            i > 0
+            and j > 0
+            and dp[i][j]
+            == dp[i - 1][j - 1] + _score(transcribed[i - 1], expected[j - 1])
+        ):
+            s = _score(transcribed[i - 1], expected[j - 1])
+            labels.append("match" if s == MATCH_SCORE else "substitution")
+            i -= 1
+            j -= 1
+        elif i > 0 and dp[i][j] == dp[i - 1][j] + GAP_SCORE:
+            labels.append("insertion")
+            i -= 1
+        else:
+            j -= 1  # Deletion in expected — skip
+    labels.reverse()
+    return labels
+def _transcribe_with_timestamps(
+    audio_mono: np.ndarray,
+    sr: int,
+    language: str,
+) -> list[dict]:
+    """Transcribe audio with word-level timestamps."""
+    if sr != 16000:
+        import librosa
+        audio_16k = librosa.resample(audio_mono, orig_sr=sr, target_sr=16000)
+    else:
+        audio_16k = audio_mono
+    model = _get_whisper()
+    segments, _ = model.transcribe(
+        audio_16k,
+        language=language,
+        word_timestamps=True,
+        vad_filter=True,
+    )
+    words = []
+    for seg in segments:
+        if seg.words:
+            for w in seg.words:
+                words.append(
+                    {
+                        "word": w.word.strip().lower(),
+                        "start": w.start,
+                        "end": w.end,
+                    }
+                )
+    return words
+def _find_silence_boundary(
+    audio: np.ndarray,
+    sr: int,
+    center_sample: int,
+    direction: str = "left",
+    window_s: float = 0.3,
+) -> int:
+    """Find nearest silence point from center position."""
+    hop = int(0.01 * sr)
+    window_samples = int(window_s * sr)
+    if direction == "left":
+        positions = range(center_sample, max(0, center_sample - window_samples), -hop)
+    else:
+        positions = range(
+            center_sample, min(len(audio), center_sample + window_samples), hop
+        )
+    for pos in positions:
+        chunk = audio[max(0, pos - hop // 2) : min(len(audio), pos + hop // 2)]
+        if (
+            len(chunk) > 0
+            and np.sqrt(np.mean(chunk.astype(np.float64) ** 2)) < SILENCE_THRESHOLD
+        ):
+            return pos
+    return center_sample
+def _merge_ranges(
+    ranges: list[tuple[float, float]], gap: float = 0.15
+) -> list[tuple[float, float]]:
+    """Merge consecutive time ranges that are close together."""
+    if not ranges:
+        return []
+    merged = []
+    for start, end in sorted(ranges):
+        if merged and start - merged[-1][1] < gap:
+            merged[-1] = (merged[-1][0], end)
+        else:
+            merged.append((start, end))
+    return merged
+def _detect_audio_repetition(
+    mono: np.ndarray,
+    sr: int,
+    expected_words: list[str],
+    min_duration_s: float = 1.5,
+    similarity_threshold: float = 0.85,
+) -> list[tuple[float, float]]:
+    """Detect repeated audio segments via mel spectrogram cross-correlation.
+    Slides a window across the audio and compares each segment against
+    all subsequent segments. If two non-overlapping segments have high
+    cosine similarity and the expected text does NOT contain that phrase
+    repeated, the second segment is marked for removal.
+    Only detects segments >= min_duration_s to avoid false positives on
+    short common sounds (breaths, pauses).
+    """
+    import torch
+    total_s = len(mono) / sr
+    if total_s < min_duration_s * 3:
+        return []
+    # Compute mel spectrogram
+    hop_length = int(0.02 * sr)  # 20ms hops
+    n_fft = int(0.04 * sr)  # 40ms window
+    audio_t = torch.from_numpy(mono).float()
+    try:
+        mel_spec = torch.stft(
+            audio_t,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            window=torch.hann_window(n_fft),
+            return_complex=True,
+        ).abs()
+    except Exception:
+        return []
+    # Reduce to energy per time frame
+    energy = mel_spec.mean(dim=0).numpy()  # (time_frames,)
+    frames_per_sec = sr / hop_length
+    # Slide window: check segments of varying length
+    repeated_ranges = []
+    for window_s in [3.0, 2.0, 1.5]:
+        win_frames = int(window_s * frames_per_sec)
+        if win_frames >= len(energy):
+            continue
+        step = win_frames // 2
+        for i in range(0, len(energy) - win_frames, step):
+            seg_a = energy[i : i + win_frames]
+            norm_a = np.linalg.norm(seg_a)
+            if norm_a < 1e-6:
+                continue
+            for j in range(i + win_frames, len(energy) - win_frames, step):
+                seg_b = energy[j : j + win_frames]
+                norm_b = np.linalg.norm(seg_b)
+                if norm_b < 1e-6:
+                    continue
+                similarity = np.dot(seg_a, seg_b) / (norm_a * norm_b)
+                if similarity >= similarity_threshold:
+                    start_s = j / frames_per_sec
+                    end_s = (j + win_frames) / frames_per_sec
+                    repeated_ranges.append((start_s, end_s))
+    # Deduplicate overlapping ranges
+    if not repeated_ranges:
+        return []
+    merged = _merge_ranges(repeated_ranges, gap=0.5)
+    logger.debug("Audio fingerprint candidates: %d segments", len(merged))
+    return merged
+def _build_trim_mask(
+    mono: np.ndarray,
+    sr: int,
+    insertion_ranges: list[tuple[float, float]],
+) -> np.ndarray:
+    """Build boolean mask removing insertion segments at silence boundaries."""
+    total_samples = len(mono)
+    keep_mask = np.ones(total_samples, dtype=bool)
+    pad_samples = int(TRIM_PAD_S * sr)
+    for start_s, end_s in insertion_ranges:
+        trim_start = _find_silence_boundary(mono, sr, int(start_s * sr), "left")
+        trim_end = _find_silence_boundary(mono, sr, int(end_s * sr), "right")
+        trim_start = max(0, trim_start - pad_samples)
+        trim_end = min(total_samples, trim_end + pad_samples)
+        keep_mask[trim_start:trim_end] = False
+    return keep_mask
+def validate_and_patch(
+    audio_np: np.ndarray,
+    sr: int,
+    expected_text: str,
+    language: str = "en",
+) -> np.ndarray:
+    """Trim hallucinated content using Needleman-Wunsch sequence alignment.
+    1. Transcribe audio with Whisper (word timestamps)
+    2. Align transcribed words against expected text (NW algorithm)
+    3. Label each transcribed word: match, substitution, or insertion
+    4. Trim insertion words (hallucinated) at silence boundaries
+    5. Keep substitutions (misrecognized real speech)
+    Args:
+        audio_np: Audio array (mono or stereo).
+        sr: Sample rate.
+        expected_text: Full expected plain text.
+        language: Language code.
+    Returns:
+        Trimmed audio array.
+    """
+    expected_words = _normalize_words(expected_text)
+    if not expected_words:
+        return audio_np
+    mono = to_mono(audio_np).astype(np.float32)
+    try:
+        transcribed = _transcribe_with_timestamps(mono, sr, language)
+    except Exception as e:
+        logger.warning("Forced alignment failed: %s, skipping", e)
+        return audio_np
+    if not transcribed:
+        logger.info("No words transcribed, skipping trim")
+        return audio_np
+    # Extract just the words for alignment
+    transcribed_words = [re.sub(r"[^\w]", "", tw["word"]) for tw in transcribed]
+    transcribed_words = [w for w in transcribed_words if w]  # Remove empty
+    # Build index mapping: filtered word index -> original transcribed index
+    word_indices = [
+        i for i, tw in enumerate(transcribed) if re.sub(r"[^\w]", "", tw["word"])
+    ]
+    # Run Needleman-Wunsch alignment
+    labels = _needleman_wunsch(transcribed_words, expected_words)
+    # Collect insertion ranges (hallucinated words)
+    insertion_ranges = []
+    n_match = 0
+    n_sub = 0
+    n_ins = 0
+    for idx, label in enumerate(labels):
+        orig_idx = word_indices[idx]
+        if label == "insertion":
+            insertion_ranges.append(
+                (transcribed[orig_idx]["start"], transcribed[orig_idx]["end"])
+            )
+            n_ins += 1
+        elif label == "match":
+            n_match += 1
+        else:
+            n_sub += 1
+    logger.info(
+        "NW alignment: %d matched, %d substituted, %d inserted (of %d transcribed vs %d expected)",
+        n_match,
+        n_sub,
+        n_ins,
+        len(transcribed_words),
+        len(expected_words),
+    )
+    # Audio fingerprint: detect repeated audio segments that Whisper missed
+    fingerprint_ranges = _detect_audio_repetition(mono, sr, expected_words)
+    if fingerprint_ranges:
+        logger.info(
+            "Audio fingerprint found %d repeated segments", len(fingerprint_ranges)
+        )
+        insertion_ranges.extend(fingerprint_ranges)
+    if not insertion_ranges:
+        logger.info("No insertions detected, audio clean")
+        return audio_np
+    # Merge consecutive insertions and trim
+    merged = _merge_ranges(insertion_ranges)
+    keep_mask = _build_trim_mask(mono, sr, merged)
+    result = audio_np[keep_mask]
+    trimmed_s = (len(mono) - np.sum(keep_mask)) / sr
+    logger.info(
+        "Trimmed %.1fs of hallucinated content (%.1fs -> %.1fs)",
+        trimmed_s,
+        len(mono) / sr,
+        np.sum(keep_mask) / sr,
+    )
+    return result

src/audio_core/validator.py ADDED Viewed

	@@ -0,0 +1,105 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""XML prompt validation for Scenema Audio.
+Validates the <speak> XML format:
+  <speak voice="..." gender="male|female" scene="..." language="...">
+    <action>delivery/stage direction</action>
+    Speech text here.
+    <action>more direction</action>
+    More speech text.
+  </speak>
+Only a <speak> root with <action> and <sound> children is allowed. All content is freeform.
+"""
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass, field
+# Tag names accepted as direct children of <speak>; validate_prompt() reports
+# any other child tag as an error.
+ALLOWED_CHILD_TAGS = {"action", "sound"}
+@dataclass
+class ValidationResult:
+    """Outcome of validate_prompt().
+    When ``valid`` is True the attribute fields hold the stripped values read
+    from <speak>; when it is False ``errors`` lists what was wrong and the
+    attribute fields keep their None defaults.
+    """
+    valid: bool
+    # Human-readable problems; empty for a valid prompt.
+    errors: list[str] = field(default_factory=list)
+    # Values of the matching <speak> attributes, filled in only on success.
+    voice: str | None = None
+    scene: str | None = None
+    language: str | None = None
+def validate_prompt(xml_string: str) -> ValidationResult:
+    """Validate a Scenema Audio XML prompt.
+    Checks for valid XML structure, required <speak> root element,
+    required voice attribute, and only <action> child elements.
+    Args:
+        xml_string: Raw XML string to validate.
+    Returns:
+        ValidationResult with parsed attributes if valid,
+        or a list of errors if invalid.
+    """
+    errors: list[str] = []
+    if not xml_string or not xml_string.strip():
+        return ValidationResult(valid=False, errors=["Prompt is empty"])
+    try:
+        root = ET.fromstring(xml_string)
+    except ET.ParseError as e:
+        return ValidationResult(valid=False, errors=[f"Invalid XML: {e}"])
+    if root.tag != "speak":
+        errors.append(f"Root element must be <speak>, got <{root.tag}>")
+        return ValidationResult(valid=False, errors=errors)
+    voice = root.get("voice")
+    if not voice or not voice.strip():
+        errors.append("Missing required 'voice' attribute on <speak>")
+    gender = root.get("gender")
+    if not gender or gender.strip() not in ("male", "female"):
+        errors.append(
+            "Missing or invalid 'gender' attribute on <speak>. Must be 'male' or 'female'"
+        )
+    scene = root.get("scene")
+    language = root.get("language", "en")
+    allowed_attrs = {"voice", "scene", "language", "gender", "shot"}
+    for attr in root.attrib:
+        if attr not in allowed_attrs:
+            errors.append(f"Unknown attribute '{attr}' on <speak>")
+    for child in root:
+        if child.tag not in ALLOWED_CHILD_TAGS:
+            errors.append(
+                f"Unsupported tag <{child.tag}>. Only <action> and <sound> are allowed inside <speak>"
+            )
+        if len(list(child)) > 0:
+            errors.append(f"<{child.tag}> must contain only text, no nested elements")
+    has_text = False
+    if root.text and root.text.strip():
+        has_text = True
+    for child in root:
+        if child.tail and child.tail.strip():
+            has_text = True
+            break
+    if not has_text:
+        errors.append("Prompt must contain at least one speech text node")
+    if errors:
+        return ValidationResult(valid=False, errors=errors)
+    return ValidationResult(
+        valid=True,
+        voice=voice.strip() if voice else None,
+        scene=scene.strip() if scene else None,
+        language=language.strip() if language else None,
+    )

src/audio_core/vocal_separator.py ADDED Viewed

	@@ -0,0 +1,244 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""MelBandRoFormer vocal separation for Scenema Audio.
+Separates vocals from background music/SFX in audio. Used to clean
+generated audio that may contain unwanted background sounds from the
+diffusion model (which was trained on video with ambient audio).
+Expects stereo 44100Hz input. Processes in overlapping chunks for
+smooth transitions.
+"""
+import logging
+import os
+import subprocess
+import sys
+from pathlib import Path
+import numpy as np
+import torch
+from safetensors.torch import load_file
+logger = logging.getLogger(__name__)
+# Checkpoint file loaded by VocalSeparator.load(); override with the
+# MELBAND_MODEL_PATH environment variable.
+DEFAULT_MODEL_PATH = Path(
+    os.environ.get("MELBAND_MODEL_PATH", "/app/models/MelBandRoformer_fp16.safetensors")
+)
+# Directory that VocalSeparator.load() puts on sys.path so that
+# ``model.mel_band_roformer`` can be imported; override with MELBAND_NODE_PATH.
+DEFAULT_NODE_PATH = Path(
+    os.environ.get("MELBAND_NODE_PATH", "/app/melband_roformer_node")
+)
+# Keyword arguments passed as-is to the MelBandRoformer constructor.
+# "sample_rate" is also the rate VocalSeparator.separate() decodes input at.
+# NOTE(review): these values presumably have to match the checkpoint's
+# architecture — confirm before changing any of them.
+MODEL_CONFIG = {
+    "dim": 384,
+    "depth": 6,
+    "stereo": True,
+    "num_stems": 1,
+    "time_transformer_depth": 1,
+    "freq_transformer_depth": 1,
+    "num_bands": 60,
+    "dim_head": 64,
+    "heads": 8,
+    "attn_dropout": 0,
+    "ff_dropout": 0,
+    "flash_attn": True,
+    "dim_freqs_in": 1025,
+    "sample_rate": 44100,
+    "stft_n_fft": 2048,
+    "stft_hop_length": 441,
+    "stft_win_length": 2048,
+    "stft_normalized": False,
+    "mask_estimator_depth": 2,
+    "multi_stft_resolution_loss_weight": 1.0,
+    "multi_stft_resolutions_window_sizes": (4096, 2048, 1024, 512, 256),
+    "multi_stft_hop_size": 147,
+    "multi_stft_normalized": False,
+}
+# Number of samples fed to the model per inference window.
+CHUNK_SIZE = 352800  # ~8 seconds at 44100Hz
+# Consecutive windows overlap by CHUNK_SIZE // OVERLAP_FACTOR samples.
+OVERLAP_FACTOR = 2
+class VocalSeparator:
+    """Separates vocals from background audio using MelBandRoFormer.
+    Processes audio in overlapping chunks with fade windows for
+    smooth transitions. Keeps model loaded on GPU for repeated use.
+    """
+    def __init__(
+        self,
+        model_path: Path = DEFAULT_MODEL_PATH,
+        node_path: Path = DEFAULT_NODE_PATH,
+    ):
+        self.model_path = model_path
+        self.node_path = node_path
+        self._model = None
+        self._loaded = False
+    def load(self) -> None:
+        """Load MelBandRoFormer model to GPU."""
+        if self._loaded:
+            return
+        # Lazy import: model architecture only available after node_path added to sys.path
+        node_str = str(self.node_path)
+        if node_str not in sys.path:
+            sys.path.insert(0, node_str)
+        from model.mel_band_roformer import MelBandRoformer
+        logger.info("Loading MelBandRoFormer from %s", self.model_path)
+        model = MelBandRoformer(**MODEL_CONFIG)
+        sd = load_file(str(self.model_path))
+        model.load_state_dict(sd)
+        del sd
+        self._model = model.cuda().eval().float()
+        self._loaded = True
+        param_count = sum(p.numel() for p in self._model.parameters())
+        logger.info("MelBandRoFormer loaded: %.1fM params", param_count / 1e6)
+    def unload(self) -> None:
+        """Free model from GPU."""
+        if not self._loaded:
+            return
+        self._model = None
+        torch.cuda.empty_cache()
+        self._loaded = False
+        logger.info("MelBandRoFormer unloaded")
+    def separate(
+        self,
+        input_path: str,
+        vocals_path: str,
+        sfx_path: str | None = None,
+    ) -> dict:
+        """Separate vocals from background audio.
+        Args:
+            input_path: Path to input audio file (any format ffmpeg supports)
+            vocals_path: Output path for isolated vocals
+            sfx_path: Output path for isolated SFX/background (optional)
+        Returns:
+            Dict with metadata: input_duration, sample_rate
+        """
+        if not self._loaded:
+            raise RuntimeError("VocalSeparator not loaded. Call load() first.")
+        sr = MODEL_CONFIG["sample_rate"]
+        audio = self._load_audio_ffmpeg(input_path, sr)
+        input_duration = audio.shape[1] / sr
+        logger.info("Separating: %.1fs audio", input_duration)
+        with torch.inference_mode():
+            vocals = self._chunked_inference(audio, sr)
+        self._save_audio_ffmpeg(vocals, sr, vocals_path)
+        if sfx_path:
+            sfx = audio - vocals
+            self._save_audio_ffmpeg(sfx, sr, sfx_path)
+        return {
+            "input_duration": input_duration,
+            "sample_rate": sr,
+        }
+    def _chunked_inference(self, audio: np.ndarray, sr: int) -> np.ndarray:
+        """Run model inference in overlapping chunks with fade windows."""
+        total_samples = audio.shape[1]
+        chunk_size = CHUNK_SIZE
+        overlap = chunk_size // OVERLAP_FACTOR
+        step = chunk_size - overlap
+        fade_in = np.linspace(0, 1, overlap, dtype=np.float32)
+        fade_out = np.linspace(1, 0, overlap, dtype=np.float32)
+        result = np.zeros_like(audio)
+        weight = np.zeros(total_samples, dtype=np.float32)
+        pos = 0
+        while pos < total_samples:
+            end = min(pos + chunk_size, total_samples)
+            chunk = audio[:, pos:end]
+            if chunk.shape[1] < chunk_size:
+                pad_width = chunk_size - chunk.shape[1]
+                chunk = np.pad(chunk, ((0, 0), (0, pad_width)))
+            chunk_t = torch.from_numpy(chunk.copy()).unsqueeze(0).cuda().float()
+            out = self._model(chunk_t)
+            out_np = out.squeeze(0).cpu().float().numpy()[:, : end - pos]
+            chunk_len = end - pos
+            w = np.ones(chunk_len, dtype=np.float32)
+            if pos > 0:
+                fade_len = min(overlap, chunk_len)
+                w[:fade_len] *= fade_in[:fade_len]
+            if end < total_samples:
+                fade_len = min(overlap, chunk_len)
+                w[-fade_len:] *= fade_out[:fade_len]
+            result[:, pos:end] += out_np * w[np.newaxis, :]
+            weight[pos:end] += w
+            pos += step
+        weight = np.maximum(weight, 1e-8)
+        result /= weight[np.newaxis, :]
+        return result
+    def _load_audio_ffmpeg(self, path: str, target_sr: int) -> np.ndarray:
+        """Load audio to stereo float32 numpy via ffmpeg."""
+        cmd = [
+            "ffmpeg",
+            "-i",
+            path,
+            "-f",
+            "f32le",
+            "-acodec",
+            "pcm_f32le",
+            "-ac",
+            "2",
+            "-ar",
+            str(target_sr),
+            "-v",
+            "quiet",
+            "pipe:1",
+        ]
+        proc = subprocess.run(cmd, capture_output=True, check=True)
+        audio = np.frombuffer(proc.stdout, dtype=np.float32)
+        return audio.reshape(-1, 2).T  # (2, samples)
+    def _save_audio_ffmpeg(self, audio: np.ndarray, sr: int, path: str) -> None:
+        """Save stereo float32 numpy to WAV via ffmpeg."""
+        interleaved = audio.T.astype(np.float32).tobytes()
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-f",
+            "f32le",
+            "-acodec",
+            "pcm_f32le",
+            "-ac",
+            "2",
+            "-ar",
+            str(sr),
+            "-i",
+            "pipe:0",
+            "-acodec",
+            "pcm_s16le",
+            path,
+            "-v",
+            "quiet",
+        ]
+        subprocess.run(cmd, input=interleaved, check=True)

src/audio_core/whisper_aligner.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Whisper alignment for audio validation in Scenema Audio.
+Uses faster-whisper (CTranslate2) on GPU to transcribe generated audio
+and validate that the expected text was spoken. Whisper-small is 244M
+params (~1GB VRAM, float16). Runs after denoise when VRAM is free.
+"""
+import logging
+import re
+import unicodedata
+import numpy as np
+logger = logging.getLogger(__name__)
+# Singleton whisper model (loaded once, reused)
+_whisper_model = None
+def _get_whisper():
+    """Get or initialize the whisper-small model.
+    Loaded once and cached for the process lifetime.
+    Runs on GPU with float16 — whisper-small is 244M params (~1GB VRAM).
+    By the time validation runs, denoise is complete and VRAM is free.
+    CTranslate2 uses its own CUDA allocator so no conflict with PyTorch.
+    """
+    global _whisper_model
+    if _whisper_model is not None:
+        return _whisper_model
+    from faster_whisper import WhisperModel
+    logger.info("Loading whisper-small for alignment validation (GPU, float16)...")
+    _whisper_model = WhisperModel("small", device="cuda", compute_type="float16")
+    logger.info("whisper-small loaded (GPU)")
+    return _whisper_model
+def transcribe(audio_np: np.ndarray, sr: int, language: str = "en") -> str:
+    """Transcribe audio and return the text.
+    Args:
+        audio_np: Audio samples, shape (samples,) or (samples, channels).
+        sr: Sample rate in Hz.
+        language: Language code for transcription.
+    Returns:
+        Transcribed text string.
+    """
+    model = _get_whisper()
+    # Convert to mono float32 if needed
+    if audio_np.ndim == 2:
+        audio_mono = audio_np.mean(axis=1).astype(np.float32)
+    else:
+        audio_mono = audio_np.astype(np.float32)
+    # Resample to 16kHz if needed
+    if sr != 16000:
+        import librosa
+        audio_mono = librosa.resample(audio_mono, orig_sr=sr, target_sr=16000)
+    try:
+        segments, _ = model.transcribe(
+            audio_mono,
+            language=language,
+            word_timestamps=False,
+            vad_filter=True,
+        )
+        text = " ".join(seg.text.strip() for seg in segments).strip()
+    except (ValueError, TypeError):
+        # Mocked model in tests returns wrong types
+        logger.debug("Whisper transcribe returned unexpected type (test env?)")
+        text = ""
+    return text
+def validate_text(
+    audio_np: np.ndarray,
+    sr: int,
+    expected_text: str,
+    language: str = "en",
+    min_word_ratio: float = 0.6,
+) -> tuple[bool, str, float]:
+    """Validate that generated audio contains the expected text.
+    Transcribes the audio and checks what fraction of expected words
+    appear in the transcription.
+    Args:
+        audio_np: Audio samples.
+        sr: Sample rate.
+        expected_text: The text that should have been spoken.
+        language: Language code.
+        min_word_ratio: Minimum fraction of expected words that must
+            appear in transcription (0.0 to 1.0).
+    Returns:
+        Tuple of (passed, transcribed_text, word_match_ratio).
+    """
+    transcribed = transcribe(audio_np, sr, language)
+    # Normalize both texts for comparison (strip accents for cross-locale matching)
+    def normalize(t):
+        t = unicodedata.normalize("NFD", t)
+        t = "".join(c for c in t if unicodedata.category(c) != "Mn")
+        t = t.lower()
+        t = re.sub(r"[^\w\s]", "", t)
+        return set(t.split())
+    expected_words = normalize(expected_text)
+    transcribed_words = normalize(transcribed)
+    if not expected_words:
+        return True, transcribed, 1.0
+    matched = expected_words & transcribed_words
+    ratio = len(matched) / len(expected_words)
+    passed = ratio >= min_word_ratio
+    if not passed:
+        logger.warning(
+            "Validation failed: %.0f%% word match (need %.0f%%). "
+            "Expected: %s... Got: %s...",
+            ratio * 100,
+            min_word_ratio * 100,
+            expected_text[:60],
+            transcribed[:60],
+        )
+    return passed, transcribed, ratio

src/common/__init__.py ADDED Viewed

File without changes

src/common/handlers/__init__.py ADDED Viewed

File without changes

src/common/handlers/base.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Minimal handler types for standalone deployment.
+Drop-in replacement for the production common.handlers.base module.
+Provides ProcessJob, ProcessOutput, and ProcessResult so that
+audio_core.processor imports resolve without modification.
+"""
+from dataclasses import dataclass
+from typing import Any, Optional
+@dataclass
+class ProcessJob:
+    """One unit of work handed to a processor."""
+    # Identifier of the job.
+    job_id: str
+    # Request payload for the job.
+    input: dict[str, Any]
+    # NOTE(review): presumably where results are uploaded / which URL is
+    # notified on completion; nothing in this module reads them — confirm
+    # against the production handlers.
+    upload_url: Optional[str] = None
+    webhook_url: Optional[str] = None
+@dataclass
+class ProcessOutput:
+    """Payload produced by a processor for one job."""
+    success: bool = True
+    # Raw output bytes and their MIME type, when there is binary output.
+    data: Optional[bytes] = None
+    content_type: Optional[str] = None
+    # NOTE(review): presumably a structured (non-binary) result — confirm with the producer.
+    result: Optional[dict] = None
+    # Extra information about the output.
+    metadata: Optional[dict] = None
+    # Error message, if any.
+    error: Optional[str] = None
+@dataclass
+class ProcessResult:
+    """Result envelope for one processed job."""
+    # Identifier of the job this result belongs to.
+    job_id: str
+    success: bool
+    # Output payload, if any.
+    output: Optional[ProcessOutput] = None
+    # NOTE(review): presumably the processing time in milliseconds — confirm with the producer.
+    processing_ms: int = 0
+    # Error message, if any.
+    error: Optional[str] = None

src/server.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# Copyright (c) 2026 Scenema AI
+# https://scenema.ai
+# SPDX-License-Identifier: MIT
+"""Scenema Audio standalone server.
+Thin FastAPI wrapper around the production AudioProcessor.
+"""
+import asyncio
+import base64
+import logging
+import os
+import uuid
+from contextlib import asynccontextmanager
+from pathlib import Path
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse
+from huggingface_hub import hf_hub_download, snapshot_download
+import uvicorn
+logger = logging.getLogger("scenema-audio")
+# Must be set before any torch import
+os.environ.setdefault(
+    "PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True"
+)
+from audio_core.processor import AudioProcessor  # noqa: E402
+from common.handlers.base import ProcessJob  # noqa: E402
+# ── Model download ──────────────────────────────────────────────
+# Hugging Face repositories that _download_models() pulls from.
+HF_REPO = "ScenemaAI/scenema-audio"
+GEMMA_REPO = "google/gemma-3-12b-it"
+SEEDVC_REPO = "Plachta/Seed-VC"
+BIGVGAN_REPO = "nvidia/bigvgan_v2_22khz_80band_256x"
+WHISPER_REPO = "openai/whisper-small"
+# Default root for downloaded checkpoints; override with the MODEL_DIR
+# environment variable.
+MODEL_DIR = Path(os.environ.get("MODEL_DIR", "/app/models"))
+def _download_models():
+    """Download missing model checkpoints from HuggingFace."""
+    token = os.environ.get("HF_TOKEN")
+    # Audio transformer (INT8 by default)
+    audio_ckpt = Path(os.environ.get(
+        "AUDIO_CKPT",
+        str(MODEL_DIR / "scenema-audio-transformer-int8.safetensors"),
+    ))
+    if not audio_ckpt.exists():
+        logger.info("Downloading audio transformer (INT8, ~4.9 GB)...")
+        hf_hub_download(
+            HF_REPO,
+            "scenema-audio-transformer-int8.safetensors",
+            local_dir=str(audio_ckpt.parent),
+            token=token,
+        )
+    # Pipeline checkpoint
+    pipeline_ckpt = Path(os.environ.get(
+        "PIPELINE_CKPT",
+        str(MODEL_DIR / "scenema-audio-pipeline.safetensors"),
+    ))
+    if not pipeline_ckpt.exists():
+        logger.info("Downloading pipeline checkpoint (~7.1 GB)...")
+        hf_hub_download(
+            HF_REPO,
+            "scenema-audio-pipeline.safetensors",
+            local_dir=str(pipeline_ckpt.parent),
+            token=token,
+        )
+    # VAE encoder (small, may already be baked)
+    vae_ckpt = Path(os.environ.get(
+        "VAE_ENCODER_CKPT",
+        str(MODEL_DIR / "scenema-audio-vae-encoder.safetensors"),
+    ))
+    if not vae_ckpt.exists():
+        logger.info("Downloading VAE encoder (~42 MB)...")
+        hf_hub_download(
+            HF_REPO,
+            "scenema-audio-vae-encoder.safetensors",
+            local_dir=str(vae_ckpt.parent),
+            token=token,
+        )
+    # Gemma 3 12B IT
+    gemma_root = Path(os.environ.get("GEMMA_ROOT", str(MODEL_DIR / "gemma-3-12b-it")))
+    if not gemma_root.exists() or not any(gemma_root.glob("*.safetensors")):
+        logger.info("Downloading Gemma 3 12B IT (~24 GB, gated model)...")
+        snapshot_download(
+            GEMMA_REPO,
+            local_dir=str(gemma_root),
+            ignore_patterns=["*.gguf"],
+            token=token,
+        )
+    # SeedVC
+    seedvc_path = Path(os.environ.get("SEEDVC_PATH", "/app/seed-vc"))
+    seedvc_cache = seedvc_path / "checkpoints"
+    if not seedvc_cache.exists() or not any(seedvc_cache.glob("*.pth")):
+        logger.info("Downloading SeedVC checkpoints (~1.6 GB)...")
+        hf_cache = seedvc_cache / "hf_cache"
+        hf_cache.mkdir(parents=True, exist_ok=True)
+        os.environ["HF_HUB_CACHE"] = str(hf_cache)
+        hf_hub_download(
+            SEEDVC_REPO,
+            "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
+            local_dir=str(seedvc_cache),
+            token=token,
+        )
+        hf_hub_download(
+            SEEDVC_REPO,
+            "config_dit_mel_seed_uvit_whisper_small_wavenet.yml",
+            local_dir=str(seedvc_cache),
+            token=token,
+        )
+        snapshot_download(BIGVGAN_REPO, local_dir=str(hf_cache / "bigvgan"))
+        snapshot_download(WHISPER_REPO, local_dir=str(hf_cache / "whisper-small"))
+# ── FastAPI app ─────────────────────────────────────────────────
+# Single processor instance shared by all requests; lifespan() starts it up
+# and shuts it down.
+processor = AudioProcessor()
+# Lets only one /generate request at a time into processor.process().
+_semaphore = asyncio.Semaphore(1)
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    _download_models()
+    processor.startup()
+    logger.info("Scenema Audio ready on port %s", os.environ.get("PORT", "8000"))
+    yield
+    processor.shutdown()
+# ASGI application; startup and shutdown work is delegated to lifespan().
+app = FastAPI(title="Scenema Audio", lifespan=lifespan)
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+@app.post("/generate")
+async def generate(request: Request):
+    body = await request.json()
+    job = ProcessJob(
+        job_id=str(uuid.uuid4()),
+        input=body,
+    )
+    async with _semaphore:
+        result = await processor.process(job)
+    if not result.success:
+        return JSONResponse(
+            status_code=500,
+            content={
+                "status": "failed",
+                "error": result.error or "Generation failed",
+            },
+        )
+    output = result.output
+    audio_b64 = base64.b64encode(output.data).decode() if output.data else None
+    return {
+        "status": "succeeded",
+        "audio": audio_b64,
+        "content_type": output.content_type or "audio/wav",
+        "metadata": output.metadata or {},
+    }
+if __name__ == "__main__":
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(name)s %(levelname)s %(message)s",
+    )
+    port = int(os.environ.get("PORT", "8000"))
+    uvicorn.run(app, host="0.0.0.0", port=port)