Spaces:

multimodalart
/

scenema-audio

Running on Zero

scenema-audio / src /audio_core /chunker.py

multimodalart

Initial Gradio ZeroGPU app for Scenema Audio

cdc4405 10 days ago

11 kB

	# Copyright (c) 2026 Scenema AI
	# https://scenema.ai
	# SPDX-License-Identifier: MIT

	"""Text chunking and duration estimation for Scenema Audio.

	Splits long text into chunks at sentence boundaries using Kokoro TTS
	phoneme-level timing as the source of truth for duration. No word counting.

	Algorithm:
	1. Split text into sentences
	2. Estimate each sentence's duration via Kokoro (one call per sentence)
	3. Greedily merge: accumulate sentence durations, start a new chunk
	when running_sum * LTX_MULTIPLIER exceeds MAX_CHUNK_DURATION_S
	"""

	import logging
	import random
	from dataclasses import dataclass

	from .compiler import compile_chunk_prompt, compile_prompt, extract_sentence_actions
	from .validator import validate_prompt

	logger = logging.getLogger(__name__)

	FALLBACK_WORDS_PER_SEC = 2.2 # Test-environment-only fallback when Kokoro is mocked
	ACTION_DURATION_S = 1.5 # Extra time per action block
	MAX_CHUNK_DURATION_S = (
	15.0 # Safe generation limit — model trained on 20s but repeats beyond ~15s
	)
	LTX_MULTIPLIER = 1.5 # LTX speaks slower than Kokoro; overshoot for trimming

	# Kokoro singleton (loaded once, reused)
	_kokoro_pipeline = None
	_kokoro_available: bool \| None = None


	def _get_kokoro():
	    """Return the cached Kokoro pipeline, creating it on first use.

	    The pipeline is built with ``KPipeline(lang_code="a")`` and stored in the
	    module-level ``_kokoro_pipeline`` so later calls reuse it.

	    Returns:
	        The Kokoro pipeline, or None when Kokoro has been marked unavailable
	        (the object returned by ``KPipeline`` does not come from a ``kokoro``
	        module, i.e. it is a test mock, or an earlier load attempt failed).

	    Raises:
	        RuntimeError: If importing or constructing the Kokoro pipeline fails.
	    """
	    global _kokoro_pipeline, _kokoro_available

	    if _kokoro_available is False:
	        # NOTE(review): a hard failure below also sets this flag, so only the
	        # first failing call raises; later calls return None and callers fall
	        # back to the word-count heuristic. Confirm this is intended.
	        return None

	    if _kokoro_pipeline is not None:
	        return _kokoro_pipeline

	    # Keep only the import and construction inside the try block so that a
	    # genuine error (including a TypeError raised by Kokoro itself) is
	    # reported instead of being mistaken for the "mocked in tests" case.
	    try:
	        from kokoro import KPipeline

	        pipe = KPipeline(lang_code="a")
	    except Exception as e:
	        _kokoro_available = False
	        logger.error("Kokoro is required but not available: %s", e)
	        raise RuntimeError(
	            "Kokoro TTS is a required dependency for duration estimation. "
	            f"Install it with: pip install kokoro. Error: {e}"
	        ) from e

	    # A real pipeline's class lives in a "kokoro" module; a test mock does
	    # not. In that case fall back silently.
	    if "kokoro" not in str(getattr(pipe, "__module__", "")):
	        _kokoro_available = False
	        return None

	    _kokoro_pipeline = pipe
	    _kokoro_available = True
	    logger.info("Kokoro TTS loaded for duration estimation")
	    return _kokoro_pipeline


	def _kokoro_duration(text: str) -> float | None:
	    """Measure how long ``text`` takes to speak by synthesizing it with Kokoro.

	    The text is run through the Kokoro pipeline with the ``af_heart`` voice
	    and the lengths of all returned audio segments are added up.

	    Args:
	        text: Speech text to estimate duration for.

	    Returns:
	        Duration in seconds, or None if Kokoro is unavailable or the
	        synthesis raised an exception (a warning is logged in that case).
	    """
	    pipeline = _get_kokoro()
	    if pipeline is None:
	        return None

	    try:
	        sample_count = 0
	        for segment in pipeline(text, voice="af_heart"):
	            audio = getattr(segment, "audio", None)
	            if audio is None:
	                # Segment carries no audio; nothing to count.
	                continue
	            sample_count += len(audio)

	        # Kokoro outputs at 24000Hz
	        return sample_count / 24000.0
	    except Exception as e:
	        # Best-effort: the caller decides how to handle a missing estimate.
	        logger.warning("Kokoro estimation failed: %s", e)
	        return None


	@dataclass
	class ChunkSpec:
	    """Plan for generating one audio chunk, as produced by ``plan_chunks``."""

	    # Compiled prompt text for this chunk.
	    compiled_prompt: str
	    # Estimated audio duration for this chunk, in seconds.
	    duration_s: float
	    # Seed assigned to this chunk.
	    seed: int
	    # Speech text this chunk is expected to contain.
	    expected_text: str
	    # Language code of the prompt.
	    language: str = "en"


	def _split_into_sentences(text: str) -> list[str]:
	    """Split text into individual sentences at ``.``, ``!`` and ``?`` boundaries.

	    A run of consecutive terminators (``...``, ``?!``) is kept attached to the
	    sentence it ends instead of producing punctuation-only "sentences".
	    Trailing text without a terminator is returned as a final sentence.
	    Whitespace-only pieces are dropped.

	    Note that a terminator is a boundary regardless of context, so decimals
	    and abbreviations ("3.14", "Mr. Smith") are still split.

	    Args:
	        text: Text to split.

	    Returns:
	        The stripped sentences in their original order.
	    """
	    terminators = ".!?"
	    sentences: list[str] = []
	    start = 0
	    pos = 0
	    length = len(text)

	    while pos < length:
	        if text[pos] in terminators:
	            # Absorb the rest of a terminator run so "..." stays together.
	            while pos + 1 < length and text[pos + 1] in terminators:
	                pos += 1
	            piece = text[start : pos + 1].strip()
	            if piece:
	                sentences.append(piece)
	            start = pos + 1
	        pos += 1

	    tail = text[start:].strip()
	    if tail:
	        sentences.append(tail)
	    return sentences


	def _estimate_sentence_durations(sentences: list[str]) -> list[float]:
	    """Return one raw duration estimate (seconds) per sentence.

	    Each sentence is measured with its own Kokoro call; the values are not
	    scaled by the LTX multiplier. When Kokoro yields no estimate, a
	    word-count heuristic is used instead (intended for test environments
	    where Kokoro is mocked).
	    """

	    def _duration_of(sentence: str) -> float:
	        measured = _kokoro_duration(sentence)
	        if measured is not None:
	            return measured
	        # Test environment fallback only
	        return len(sentence.split()) / FALLBACK_WORDS_PER_SEC + 0.3

	    return [_duration_of(sentence) for sentence in sentences]


	def split_text_by_duration(
	    text: str,
	    multiplier: float = LTX_MULTIPLIER,
	    max_duration: float = MAX_CHUNK_DURATION_S,
	) -> list[tuple[str, float]]:
	    """Split text into chunks whose estimated duration fits ``max_duration``.

	    Algorithm:
	        1. Split text into sentences.
	        2. Estimate each sentence's duration (one estimate per sentence).
	        3. A sentence that is too long on its own and contains commas is
	           broken into comma-delimited pieces, each re-estimated.
	        4. Greedily merge: accumulate durations, start a new chunk when
	           ``running_sum * multiplier`` would exceed ``max_duration``.

	    Args:
	        text: Full speech text.
	        multiplier: LTX speaks slower than Kokoro; applied to estimates.
	        max_duration: Max audio duration per chunk, in seconds.

	    Returns:
	        List of ``(chunk_text, estimated_ltx_duration)`` tuples. Each
	        duration is capped at ``max_duration``; a single piece that cannot
	        be split further may therefore be reported at the cap.
	    """
	    sentences = _split_into_sentences(text)
	    if not sentences:
	        return []

	    # Estimate every sentence exactly once; the result is reused below for
	    # sentences that do not need comma splitting (estimation is expensive).
	    sentence_durs = _estimate_sentence_durations(sentences)

	    expanded: list[str] = []
	    durations: list[float] = []
	    for sent, dur in zip(sentences, sentence_durs):
	        if dur * multiplier > max_duration and "," in sent:
	            # Too long on its own: split at commas and greedily regroup the
	            # clauses into pieces that fit.
	            clauses = [c.strip() for c in sent.split(",") if c.strip()]
	            clause_durs = _estimate_sentence_durations(clauses)
	            pieces: list[str] = []
	            sub_texts: list[str] = []
	            sub_dur = 0.0
	            for clause, cdur in zip(clauses, clause_durs):
	                if sub_texts and (sub_dur + cdur) * multiplier > max_duration:
	                    pieces.append(", ".join(sub_texts))
	                    sub_texts = []
	                    sub_dur = 0.0
	                sub_texts.append(clause)
	                sub_dur += cdur
	            if sub_texts:
	                pieces.append(", ".join(sub_texts))
	            expanded.extend(pieces)
	            # Only the newly formed pieces need a fresh estimate.
	            durations.extend(_estimate_sentence_durations(pieces))
	        else:
	            expanded.append(sent)
	            durations.append(dur)

	    chunks: list[tuple[str, float]] = []
	    current_texts: list[str] = []
	    current_dur = 0.0

	    for sent, dur in zip(expanded, durations):
	        if current_texts and (current_dur + dur) * multiplier > max_duration:
	            # Adding this piece would overflow: close the current chunk.
	            chunk_text = " ".join(current_texts)
	            chunks.append((chunk_text, min(current_dur * multiplier, max_duration)))
	            current_texts = []
	            current_dur = 0.0

	        current_texts.append(sent)
	        current_dur += dur

	    if current_texts:
	        chunk_text = " ".join(current_texts)
	        chunks.append((chunk_text, min(current_dur * multiplier, max_duration)))

	    return chunks


	def estimate_duration(
	    text: str,
	    num_actions: int = 0,
	    multiplier: float = LTX_MULTIPLIER,
	) -> float:
	    """Estimate the audio duration of one chunk of text, in seconds.

	    Intended for single-chunk prompts that don't need splitting.

	    Args:
	        text: Speech text (no actions).
	        num_actions: Number of action blocks; each adds ``ACTION_DURATION_S``.
	        multiplier: Duration multiplier (LTX speaks slower than Kokoro).

	    Returns:
	        The scaled estimate, capped at ``MAX_CHUNK_DURATION_S``.
	    """
	    measured = _kokoro_duration(text)

	    if measured is None:
	        # No Kokoro estimate: fall back to a word-count heuristic.
	        speech_s = len(text.split()) / FALLBACK_WORDS_PER_SEC + 0.5
	    else:
	        speech_s = measured
	        logger.debug("Kokoro estimate: %.1fs for '%s'", measured, text[:40])

	    scaled = (speech_s + num_actions * ACTION_DURATION_S) * multiplier
	    return min(scaled, MAX_CHUNK_DURATION_S)


	def plan_chunks(
	    xml_string: str,
	    base_seed: int = -1,
	    pace: float = LTX_MULTIPLIER,
	) -> list[ChunkSpec]:
	    """Turn an XML prompt into a list of per-chunk generation specs.

	    The prompt is validated and compiled. If the estimated duration of the
	    whole speech text fits within ``MAX_CHUNK_DURATION_S`` a single spec is
	    returned; otherwise the text is split by estimated duration and one
	    compiled prompt is built per chunk.

	    Args:
	        xml_string: Valid <speak> XML string.
	        base_seed: Base seed (-1 picks a random one). Chunk ``i`` of a
	            multi-chunk plan uses ``base_seed + i * 1000``.
	        pace: Duration multiplier (default 1.5). Higher = slower speech.

	    Returns:
	        The planned chunk specs, in order.

	    Raises:
	        ValueError: If the prompt fails validation.
	    """
	    validation = validate_prompt(xml_string)
	    if not validation.valid:
	        raise ValueError(f"Invalid prompt: {'; '.join(validation.errors)}")

	    compiled = compile_prompt(xml_string)
	    speech = compiled.speech_text

	    if base_seed == -1:
	        base_seed = random.randint(0, 999999)

	    # Uncapped estimate for the whole text, used to decide whether a single
	    # chunk is enough.
	    measured = _kokoro_duration(speech)
	    if measured is None:
	        total_dur = (len(speech.split()) / FALLBACK_WORDS_PER_SEC + 0.5) * pace
	    else:
	        total_dur = measured * pace

	    if total_dur <= MAX_CHUNK_DURATION_S:
	        single = ChunkSpec(
	            compiled_prompt=compiled.prompt,
	            duration_s=min(total_dur, MAX_CHUNK_DURATION_S),
	            seed=base_seed,
	            expected_text=speech,
	            language=compiled.language,
	        )
	        return [single]

	    # Extract action-to-sentence mapping before splitting
	    sentence_action_map = extract_sentence_actions(xml_string)

	    # Split by estimated duration
	    text_chunks = split_text_by_duration(speech, multiplier=pace)

	    specs: list[ChunkSpec] = []
	    # Index of the sentence at which the current chunk starts.
	    # NOTE(review): the index advances by re-splitting each chunk's text, so
	    # a sentence that was broken at commas across chunks is counted more than
	    # once. If the action map is keyed by original sentence index, later
	    # chunks may look up the wrong entry. TODO confirm against the compiler.
	    sentence_cursor = 0
	    for chunk_no, (chunk_text, chunk_dur) in enumerate(text_chunks):
	        # Only actions attached to the chunk's first sentence are passed on.
	        chunk_prompt = compile_chunk_prompt(
	            speech_text=chunk_text,
	            voice=compiled.voice,
	            scene=compiled.scene,
	            actions_before=sentence_action_map.get(sentence_cursor),
	            gender=compiled.gender,
	            shot=compiled.shot,
	        )
	        spec = ChunkSpec(
	            compiled_prompt=chunk_prompt,
	            duration_s=chunk_dur,
	            seed=base_seed + chunk_no * 1000,
	            expected_text=chunk_text,
	            language=compiled.language,
	        )
	        specs.append(spec)

	        sentence_cursor += len(_split_into_sentences(chunk_text))

	    logger.info(
	        "Planned %d chunks (%.1fs total estimated)",
	        len(specs),
	        sum(s.duration_s for s in specs),
	    )
	    return specs