scenema-audio

Runtime error

File size: 11,019 Bytes

cdc4405

# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT

"""Text chunking and duration estimation for Scenema Audio.

Splits long text into chunks at sentence boundaries using Kokoro TTS
phoneme-level timing as the source of truth for duration. No word counting.

Algorithm:
  1. Split text into sentences
  2. Estimate each sentence's duration via Kokoro (one call per sentence)
  3. Greedily merge: accumulate sentence durations, start a new chunk
     when running_sum * LTX_MULTIPLIER exceeds MAX_CHUNK_DURATION_S
"""

import logging
import random
from dataclasses import dataclass

from .compiler import compile_chunk_prompt, compile_prompt, extract_sentence_actions
from .validator import validate_prompt

logger = logging.getLogger(__name__)

FALLBACK_WORDS_PER_SEC = 2.2  # Test-environment-only fallback when Kokoro is mocked
ACTION_DURATION_S = 1.5  # Extra time per action block
MAX_CHUNK_DURATION_S = (
    15.0  # Safe generation limit — model trained on 20s but repeats beyond ~15s
)
LTX_MULTIPLIER = 1.5  # LTX speaks slower than Kokoro; overshoot for trimming

# Kokoro singleton (loaded once, reused)
_kokoro_pipeline = None
_kokoro_available: bool | None = None


def _get_kokoro():
    """Get or initialize the Kokoro TTS pipeline for duration estimation.

    Kokoro is 82M params, runs on CPU. Loaded once and cached.
    Falls back to word-count heuristic only in test environments.
    """
    global _kokoro_pipeline, _kokoro_available

    if _kokoro_available is False:
        return None

    if _kokoro_pipeline is not None:
        return _kokoro_pipeline

    try:
        from kokoro import KPipeline

        pipe = KPipeline(lang_code="a")
        # Verify it's a real Kokoro pipeline (not a mock in tests)
        if not hasattr(pipe, "__module__") or "kokoro" not in str(
            getattr(pipe, "__module__", "")
        ):
            raise TypeError("Kokoro pipeline is not genuine (test mock)")
        _kokoro_pipeline = pipe
        _kokoro_available = True
        logger.info("Kokoro TTS loaded for duration estimation")
        return _kokoro_pipeline
    except TypeError:
        # Test environment with mocks, fall back silently
        _kokoro_available = False
        return None
    except (ImportError, Exception) as e:
        _kokoro_available = False
        logger.error("Kokoro is required but not available: %s", e)
        raise RuntimeError(
            f"Kokoro TTS is a required dependency for duration estimation. "
            f"Install it with: pip install kokoro. Error: {e}"
        ) from e


def _kokoro_duration(text: str) -> float | None:
    """Estimate speech duration using Kokoro TTS phoneme-level timing.

    Args:
        text: Speech text to estimate duration for

    Returns:
        Duration in seconds, or None if Kokoro unavailable
    """
    pipe = _get_kokoro()
    if pipe is None:
        return None

    try:
        total_frames = 0
        for result in pipe(text, voice="af_heart"):
            if hasattr(result, "audio") and result.audio is not None:
                total_frames += len(result.audio)

        # Kokoro outputs at 24000Hz
        duration = total_frames / 24000.0
        return duration
    except Exception as e:
        logger.warning("Kokoro estimation failed: %s", e)
        return None


@dataclass
class ChunkSpec:
    compiled_prompt: str
    duration_s: float
    seed: int
    expected_text: str
    language: str = "en"


def _split_into_sentences(text: str) -> list[str]:
    """Split text into individual sentences at .!? boundaries."""
    sentences = []
    current = ""
    for char in text:
        current += char
        if char in ".!?":
            stripped = current.strip()
            if stripped:
                sentences.append(stripped)
            current = ""
    if current.strip():
        sentences.append(current.strip())
    return sentences


def _estimate_sentence_durations(sentences: list[str]) -> list[float]:
    """Estimate Kokoro duration for each sentence individually.

    One Kokoro call per sentence. Returns raw Kokoro durations (before
    LTX multiplier). Falls back to word-count heuristic per sentence
    only in test environments where Kokoro is mocked.
    """
    durations = []
    for sent in sentences:
        dur = _kokoro_duration(sent)
        if dur is None:
            # Test environment fallback only
            dur = len(sent.split()) / FALLBACK_WORDS_PER_SEC + 0.3
        durations.append(dur)
    return durations


def split_text_by_duration(
    text: str,
    multiplier: float = LTX_MULTIPLIER,
    max_duration: float = MAX_CHUNK_DURATION_S,
) -> list[tuple[str, float]]:
    """Split text into chunks using Kokoro duration estimation.

    Kokoro is the source of truth for duration. No word counting.

    Algorithm:
      1. Split text into sentences
      2. Estimate each sentence's duration via Kokoro (one call per sentence)
      3. Greedily merge: accumulate durations, start a new chunk when
         running_sum * multiplier would exceed max_duration

    Duration is additive across sentences because Kokoro estimates are
    phoneme-level with no cross-sentence dependencies.

    Args:
        text: Full speech text.
        multiplier: LTX speaks slower than Kokoro; applied to estimates.
        max_duration: Max audio duration per chunk (model training limit).

    Returns:
        List of (chunk_text, estimated_ltx_duration) tuples.
    """
    sentences = _split_into_sentences(text)
    if not sentences:
        return []

    # Split long sentences at commas if they exceed max_duration on their own
    expanded = []
    for sent in sentences:
        dur = _estimate_sentence_durations([sent])[0]
        if dur * multiplier > max_duration and "," in sent:
            # Split at commas and re-estimate
            clauses = [c.strip() for c in sent.split(",") if c.strip()]
            clause_durs = _estimate_sentence_durations(clauses)
            sub_texts: list[str] = []
            sub_dur = 0.0
            for clause, cdur in zip(clauses, clause_durs):
                if sub_texts and (sub_dur + cdur) * multiplier > max_duration:
                    expanded.append(", ".join(sub_texts))
                    sub_texts = []
                    sub_dur = 0.0
                sub_texts.append(clause)
                sub_dur += cdur
            if sub_texts:
                expanded.append(", ".join(sub_texts))
        else:
            expanded.append(sent)

    durations = _estimate_sentence_durations(expanded)

    chunks: list[tuple[str, float]] = []
    current_texts: list[str] = []
    current_dur = 0.0

    for sent, dur in zip(expanded, durations):
        if current_texts and (current_dur + dur) * multiplier > max_duration:
            chunk_text = " ".join(current_texts)
            chunks.append((chunk_text, min(current_dur * multiplier, max_duration)))
            current_texts = []
            current_dur = 0.0

        current_texts.append(sent)
        current_dur += dur

    if current_texts:
        chunk_text = " ".join(current_texts)
        chunks.append((chunk_text, min(current_dur * multiplier, max_duration)))

    return chunks


def estimate_duration(
    text: str,
    num_actions: int = 0,
    multiplier: float = LTX_MULTIPLIER,
) -> float:
    """Estimate audio duration for a single chunk of text.

    Used for single-chunk prompts that don't need splitting.

    Args:
        text: Speech text (no actions)
        num_actions: Number of action blocks (adds time for breaths/pauses)
        multiplier: Duration multiplier (LTX speaks slower than Kokoro)
    """
    kokoro_dur = _kokoro_duration(text)

    if kokoro_dur is not None:
        base_duration = kokoro_dur
        logger.debug("Kokoro estimate: %.1fs for '%s'", kokoro_dur, text[:40])
    else:
        words = len(text.split())
        base_duration = words / FALLBACK_WORDS_PER_SEC + 0.5

    action_time = num_actions * ACTION_DURATION_S
    duration = (base_duration + action_time) * multiplier
    return min(duration, MAX_CHUNK_DURATION_S)


def plan_chunks(
    xml_string: str,
    base_seed: int = -1,
    pace: float = LTX_MULTIPLIER,
) -> list[ChunkSpec]:
    """Plan generation chunks from an XML prompt.

    Validates XML, extracts text, splits into duration-based chunks
    using Kokoro, and builds per-chunk compiled prompts.

    Args:
        xml_string: Valid <speak> XML string
        base_seed: Base seed (-1 for random, otherwise sequential per chunk)
        pace: Duration multiplier (default 1.5). Higher = slower speech.
    """
    result = validate_prompt(xml_string)
    if not result.valid:
        raise ValueError(f"Invalid prompt: {'; '.join(result.errors)}")

    compiled = compile_prompt(xml_string)

    if base_seed == -1:
        base_seed = random.randint(0, 999999)

    # Check if entire text fits in a single chunk (uncapped duration for this check)
    kokoro_dur = _kokoro_duration(compiled.speech_text)
    if kokoro_dur is not None:
        total_dur = kokoro_dur * pace
    else:
        words = len(compiled.speech_text.split())
        total_dur = (words / FALLBACK_WORDS_PER_SEC + 0.5) * pace

    if total_dur <= MAX_CHUNK_DURATION_S:
        return [
            ChunkSpec(
                compiled_prompt=compiled.prompt,
                duration_s=min(total_dur, MAX_CHUNK_DURATION_S),
                seed=base_seed,
                expected_text=compiled.speech_text,
                language=compiled.language,
            )
        ]

    # Extract action-to-sentence mapping before splitting
    sentence_action_map = extract_sentence_actions(xml_string)

    # Split by Kokoro-estimated duration
    text_chunks = split_text_by_duration(compiled.speech_text, multiplier=pace)

    # Track which global sentence index each chunk starts at
    global_sentence_idx = 0

    specs: list[ChunkSpec] = []
    for i, (chunk_text, chunk_dur) in enumerate(text_chunks):
        # Find actions that belong to this chunk's first sentence
        actions_before = sentence_action_map.get(global_sentence_idx)

        chunk_prompt = compile_chunk_prompt(
            speech_text=chunk_text,
            voice=compiled.voice,
            scene=compiled.scene,
            actions_before=actions_before,
            gender=compiled.gender,
            shot=compiled.shot,
        )
        specs.append(
            ChunkSpec(
                compiled_prompt=chunk_prompt,
                duration_s=chunk_dur,
                seed=base_seed + i * 1000,
                expected_text=chunk_text,
                language=compiled.language,
            )
        )

        # Count sentences in this chunk to advance global index
        chunk_sentences = _split_into_sentences(chunk_text)
        global_sentence_idx += len(chunk_sentences)

    logger.info(
        "Planned %d chunks (%.1fs total estimated)",
        len(specs),
        sum(s.duration_s for s in specs),
    )
    return specs