# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT
"""Text chunking and duration estimation for Scenema Audio.

Splits long text into chunks at sentence boundaries using Kokoro TTS
phoneme-level timing as the source of truth for duration. A word-count
heuristic is used only when Kokoro yields no estimate (mocked test
environments, or a failed estimation call).

Algorithm:
    1. Split text into sentences
    2. Estimate each sentence's duration via Kokoro (one call per sentence)
    3. Greedily merge: accumulate sentence durations, start a new chunk
       when running_sum * LTX_MULTIPLIER exceeds MAX_CHUNK_DURATION_S
"""

import logging
import random
from dataclasses import dataclass

from .compiler import compile_chunk_prompt, compile_prompt, extract_sentence_actions
from .validator import validate_prompt

logger = logging.getLogger(__name__)

FALLBACK_WORDS_PER_SEC = 2.2  # Test-environment-only fallback when Kokoro is mocked
ACTION_DURATION_S = 1.5  # Extra time per action block
MAX_CHUNK_DURATION_S = (
    15.0  # Safe generation limit — model trained on 20s but repeats beyond ~15s
)
LTX_MULTIPLIER = 1.5  # LTX speaks slower than Kokoro; overshoot for trimming

# Kokoro singleton (loaded once, reused)
_kokoro_pipeline = None
# Tri-state: None = not tried yet, True = loaded, False = unavailable.
_kokoro_available: bool | None = None


def _get_kokoro():
    """Get or initialize the Kokoro TTS pipeline for duration estimation.

    Kokoro is 82M params, runs on CPU. Loaded once and cached. Falls back
    to the word-count heuristic only in test environments.

    Returns:
        The cached ``KPipeline`` instance, or None when Kokoro has been
        marked unavailable (mocked pipeline, or an earlier failed load).

    Raises:
        RuntimeError: On the first failed attempt to import or construct
            the pipeline for any reason other than ``TypeError``.
    """
    global _kokoro_pipeline, _kokoro_available

    if _kokoro_available is False:
        # NOTE(review): this branch is also reached after a hard load failure
        # (the handler below sets the flag before raising), so only the FIRST
        # call raises; later calls silently return None and callers fall back
        # to the word-count heuristic. Confirm that this is intended.
        return None
    if _kokoro_pipeline is not None:
        return _kokoro_pipeline

    try:
        from kokoro import KPipeline

        pipe = KPipeline(lang_code="a")
        # Verify it's a real Kokoro pipeline (not a mock in tests)
        if not hasattr(pipe, "__module__") or "kokoro" not in str(
            getattr(pipe, "__module__", "")
        ):
            raise TypeError("Kokoro pipeline is not genuine (test mock)")
        _kokoro_pipeline = pipe
        _kokoro_available = True
        logger.info("Kokoro TTS loaded for duration estimation")
        return _kokoro_pipeline
    except TypeError:
        # Test environment with mocks, fall back silently.
        # NOTE(review): a TypeError raised by the real KPipeline constructor
        # would take this silent path as well.
        _kokoro_available = False
        return None
    except Exception as e:  # ImportError is already a subclass of Exception
        _kokoro_available = False
        logger.error("Kokoro is required but not available: %s", e)
        raise RuntimeError(
            "Kokoro TTS is a required dependency for duration estimation. "
            f"Install it with: pip install kokoro. Error: {e}"
        ) from e


def _kokoro_duration(text: str) -> float | None:
    """Estimate speech duration using Kokoro TTS phoneme-level timing.

    Args:
        text: Speech text to estimate duration for

    Returns:
        Duration in seconds, or None if Kokoro is unavailable or the
        estimation call raised (the failure is logged as a warning).
    """
    pipe = _get_kokoro()
    if pipe is None:
        return None

    try:
        total_frames = 0
        for result in pipe(text, voice="af_heart"):
            if hasattr(result, "audio") and result.audio is not None:
                total_frames += len(result.audio)
        # Kokoro outputs at 24000Hz
        return total_frames / 24000.0
    except Exception as e:
        logger.warning("Kokoro estimation failed: %s", e)
        return None


@dataclass
class ChunkSpec:
    """One planned generation chunk produced by ``plan_chunks``."""

    # Prompt string returned by compile_prompt / compile_chunk_prompt.
    compiled_prompt: str
    # Estimated audio duration in seconds (pace multiplier already applied).
    duration_s: float
    # Seed for this chunk (base seed plus a per-chunk offset).
    seed: int
    # The speech text this chunk is expected to contain.
    expected_text: str
    # Language code taken from the compiled prompt; defaults to "en".
    language: str = "en"


def _split_into_sentences(text: str) -> list[str]:
    """Split text into individual sentences at .!? boundaries.

    Every ``.``, ``!`` or ``?`` closes a sentence. Pieces are stripped of
    surrounding whitespace and empty pieces are dropped. Any trailing text
    without a terminator is returned as a final sentence.
    """
    sentences: list[str] = []
    start = 0
    for idx, char in enumerate(text):
        if char in ".!?":
            piece = text[start : idx + 1].strip()
            if piece:
                sentences.append(piece)
            start = idx + 1

    tail = text[start:].strip()
    if tail:
        sentences.append(tail)
    return sentences


def _estimate_sentence_durations(sentences: list[str]) -> list[float]:
    """Estimate Kokoro duration for each sentence individually.

    One Kokoro call per sentence. Returns raw Kokoro durations (before the
    LTX multiplier). Falls back to the word-count heuristic for a sentence
    whenever ``_kokoro_duration`` returns None.
    """
    durations = []
    for sent in sentences:
        dur = _kokoro_duration(sent)
        if dur is None:
            # Test environment fallback only
            dur = len(sent.split()) / FALLBACK_WORDS_PER_SEC + 0.3
        durations.append(dur)
    return durations


def _greedy_merge(
    pieces: list[str],
    durations: list[float],
    multiplier: float,
    max_duration: float,
    joiner: str,
) -> list[tuple[str, float]]:
    """Greedily pack consecutive pieces into groups under the duration cap.

    A new group is started whenever adding the next piece would push
    ``running_sum * multiplier`` above ``max_duration``. A single piece that
    exceeds the cap on its own still forms its own group.

    Returns:
        List of (joined_text, summed_raw_duration) tuples; the durations
        are NOT multiplied or capped.
    """
    groups: list[tuple[str, float]] = []
    texts: list[str] = []
    total = 0.0
    for piece, dur in zip(pieces, durations):
        if texts and (total + dur) * multiplier > max_duration:
            groups.append((joiner.join(texts), total))
            texts = []
            total = 0.0
        texts.append(piece)
        total += dur
    if texts:
        groups.append((joiner.join(texts), total))
    return groups


def split_text_by_duration(
    text: str,
    multiplier: float = LTX_MULTIPLIER,
    max_duration: float = MAX_CHUNK_DURATION_S,
) -> list[tuple[str, float]]:
    """Split text into chunks using Kokoro duration estimation.

    Kokoro is the source of truth for duration.

    Algorithm:
        1. Split text into sentences
        2. Estimate each sentence's duration via Kokoro (one call per
           sentence; only sentences that had to be split at commas are
           estimated again, per clause and per re-joined clause group)
        3. Greedily merge: accumulate durations, start a new chunk when
           running_sum * multiplier would exceed max_duration

    Duration is additive across sentences because Kokoro estimates are
    phoneme-level with no cross-sentence dependencies.

    Args:
        text: Full speech text.
        multiplier: LTX speaks slower than Kokoro; applied to estimates.
        max_duration: Max audio duration per chunk (model training limit).

    Returns:
        List of (chunk_text, estimated_ltx_duration) tuples. Each duration
        is capped at ``max_duration``. Empty list if the text contains no
        sentences.
    """
    sentences = _split_into_sentences(text)
    if not sentences:
        return []

    sentence_durs = _estimate_sentence_durations(sentences)

    # Split long sentences at commas if they exceed max_duration on their own.
    # The estimate of every sentence that is NOT split is carried forward, so
    # those sentences cost exactly one Kokoro call.
    expanded: list[str] = []
    durations: list[float] = []
    for sent, dur in zip(sentences, sentence_durs):
        if dur * multiplier > max_duration and "," in sent:
            clauses = [c.strip() for c in sent.split(",") if c.strip()]
            clause_durs = _estimate_sentence_durations(clauses)
            group_texts = [
                group_text
                for group_text, _ in _greedy_merge(
                    clauses, clause_durs, multiplier, max_duration, ", "
                )
            ]
            expanded.extend(group_texts)
            # Re-estimate each re-joined clause group as a whole so chunk
            # durations come from the exact text that will be spoken.
            durations.extend(_estimate_sentence_durations(group_texts))
        else:
            expanded.append(sent)
            durations.append(dur)

    return [
        (chunk_text, min(raw_dur * multiplier, max_duration))
        for chunk_text, raw_dur in _greedy_merge(
            expanded, durations, multiplier, max_duration, " "
        )
    ]


def estimate_duration(
    text: str,
    num_actions: int = 0,
    multiplier: float = LTX_MULTIPLIER,
) -> float:
    """Estimate audio duration for a single chunk of text.

    Used for single-chunk prompts that don't need splitting.

    Args:
        text: Speech text (no actions)
        num_actions: Number of action blocks (adds time for breaths/pauses)
        multiplier: Duration multiplier (LTX speaks slower than Kokoro)

    Returns:
        Estimated duration in seconds, capped at MAX_CHUNK_DURATION_S.
    """
    kokoro_dur = _kokoro_duration(text)
    if kokoro_dur is not None:
        base_duration = kokoro_dur
        logger.debug("Kokoro estimate: %.1fs for '%s'", kokoro_dur, text[:40])
    else:
        words = len(text.split())
        base_duration = words / FALLBACK_WORDS_PER_SEC + 0.5

    action_time = num_actions * ACTION_DURATION_S
    duration = (base_duration + action_time) * multiplier
    return min(duration, MAX_CHUNK_DURATION_S)


def plan_chunks(
    xml_string: str,
    base_seed: int = -1,
    pace: float = LTX_MULTIPLIER,
) -> list[ChunkSpec]:
    """Plan generation chunks from an XML prompt.

    Validates XML, extracts text, splits into duration-based chunks using
    Kokoro, and builds per-chunk compiled prompts.

    Args:
        xml_string: Valid XML string
        base_seed: Base seed (-1 for random, otherwise sequential per chunk)
        pace: Duration multiplier (default 1.5). Higher = slower speech.

    Returns:
        One ChunkSpec per chunk, in text order. A prompt whose whole text
        fits within MAX_CHUNK_DURATION_S yields a single ChunkSpec that
        reuses the fully compiled prompt.

    Raises:
        ValueError: If ``validate_prompt`` reports the XML as invalid.
    """
    result = validate_prompt(xml_string)
    if not result.valid:
        raise ValueError(f"Invalid prompt: {'; '.join(result.errors)}")

    compiled = compile_prompt(xml_string)

    if base_seed == -1:
        base_seed = random.randint(0, 999999)

    # Check if entire text fits in a single chunk (uncapped duration for this check)
    kokoro_dur = _kokoro_duration(compiled.speech_text)
    if kokoro_dur is not None:
        total_dur = kokoro_dur * pace
    else:
        words = len(compiled.speech_text.split())
        total_dur = (words / FALLBACK_WORDS_PER_SEC + 0.5) * pace

    if total_dur <= MAX_CHUNK_DURATION_S:
        return [
            ChunkSpec(
                compiled_prompt=compiled.prompt,
                duration_s=min(total_dur, MAX_CHUNK_DURATION_S),
                seed=base_seed,
                expected_text=compiled.speech_text,
                language=compiled.language,
            )
        ]

    # Extract action-to-sentence mapping before splitting
    sentence_action_map = extract_sentence_actions(xml_string)

    # Split by Kokoro-estimated duration
    text_chunks = split_text_by_duration(compiled.speech_text, multiplier=pace)

    # Track which global sentence index each chunk starts at
    global_sentence_idx = 0
    specs: list[ChunkSpec] = []

    for i, (chunk_text, chunk_dur) in enumerate(text_chunks):
        # Find actions that belong to this chunk's first sentence
        actions_before = sentence_action_map.get(global_sentence_idx)

        chunk_prompt = compile_chunk_prompt(
            speech_text=chunk_text,
            voice=compiled.voice,
            scene=compiled.scene,
            actions_before=actions_before,
            gender=compiled.gender,
            shot=compiled.shot,
        )

        specs.append(
            ChunkSpec(
                compiled_prompt=chunk_prompt,
                duration_s=chunk_dur,
                # Chunk seeds are spaced 1000 apart from the base seed.
                seed=base_seed + i * 1000,
                expected_text=chunk_text,
                language=compiled.language,
            )
        )

        # Count sentences in this chunk to advance global index.
        # NOTE(review): when one long sentence was split at commas across
        # chunks, each fragment is counted as a sentence here, so the index
        # may run ahead of the original sentence numbering — presumably what
        # sentence_action_map is keyed by; verify against
        # extract_sentence_actions.
        chunk_sentences = _split_into_sentences(chunk_text)
        global_sentence_idx += len(chunk_sentences)

    logger.info(
        "Planned %d chunks (%.1fs total estimated)",
        len(specs),
        sum(s.duration_s for s in specs),
    )

    return specs