# Copyright (c) 2026 Scenema AI
# https://scenema.ai
# SPDX-License-Identifier: MIT
"""XML prompt compiler for Scenema Audio.
Compiles a <speak> XML prompt into the video-style flat text prompt
that the LTX 2.3 audio model expects.
Supports three block types inside <speak>:
<action> — delivery/performance cues (how the person speaks/acts)
<sound> — audio events that should be heard (SFX, ambient sounds)
Text — the actual speech content
And three shot modes via the shot attribute:
closeup (default) — speech-focused, no SFX, clean audio
wide — environment + speech, SFX prominent
scene — raw scene description, maximum SFX
Example (closeup mode):
Input:
<speak voice="Deep male voice" scene="a dimly lit room" gender="male">
<action>He takes a slow breath</action>
Many years later, as he faced the firing squad...
</speak>
Output:
Close-up in a dimly lit room. He takes a slow breath.
"Many years later, as he faced the firing squad..."
Deep male voice.
Example (scene mode with SFX):
Input:
<speak voice="Tense male whisper" scene="Dark room, heavy rain"
gender="male" shot="scene">
<sound>A phone rings twice then stops</sound>
<action>He picks up the receiver and speaks in a low whisper</action>
Its done. The package is at the location.
<sound>Thunder rumbles in the distance</sound>
<action>He continues urgently</action>
You have thirty minutes.
</speak>
Output:
Dark room, heavy rain. A phone rings twice then stops.
He picks up the receiver and speaks in a low whisper:
"Its done. The package is at the location."
Thunder rumbles in the distance. He continues urgently:
"You have thirty minutes."
Tense male whisper. Dark room, heavy rain.
"""
import xml.etree.ElementTree as ET
from dataclasses import dataclass
# Fallback scene description substituted into the opening sentence of the
# compiled prompt when no (or an empty) scene is supplied.
DEFAULT_SCENE = "a person speaking to camera"
@dataclass
class CompiledPrompt:
prompt: str
speech_text: str
voice: str
scene: str | None
language: str
gender: str
shot: str
@dataclass
class TextBlock:
    """A run of spoken text found directly inside <speak>."""

    # The speech content.
    text: str
@dataclass
class ActionBlock:
    """A delivery/performance cue taken from an <action> element."""

    # The cue text.
    text: str
@dataclass
class SoundBlock:
    """An audio event taken from a <sound> element."""

    # Description of the sound.
    text: str
# Union of the three block kinds that can appear, in document order, in the
# list built from a <speak> element.
Block = TextBlock | ActionBlock | SoundBlock
def _extract_blocks(root: ET.Element) -> list[Block]:
    """Walk <speak> children in document order and build the block list.

    Text directly inside <speak> (before the first child and after each
    child) becomes a TextBlock; <action> and <sound> children become
    ActionBlock / SoundBlock. Children with any other tag are ignored, but
    the text that follows them is still collected as speech. Empty or
    whitespace-only content never produces a block.

    Runs of whitespace (including the newlines and indentation that come
    from pretty-printed XML) are collapsed to single spaces so the blocks
    can be joined into a flat, single-line prompt.

    Args:
        root: The parsed <speak> element.

    Returns:
        Blocks in document order.
    """

    def _flatten(raw: str | None) -> str:
        # str.split() with no argument splits on any whitespace run and
        # drops leading/trailing whitespace, so this both strips and
        # collapses in one step.
        return " ".join(raw.split()) if raw else ""

    blocks: list[Block] = []

    # Speech that appears before the first child element.
    leading = _flatten(root.text)
    if leading:
        blocks.append(TextBlock(text=leading))

    for child in root:
        content = _flatten(child.text)
        if content:
            if child.tag == "action":
                blocks.append(ActionBlock(text=content))
            elif child.tag == "sound":
                blocks.append(SoundBlock(text=content))

        # ElementTree stores the text following a child's closing tag on
        # the child itself (its "tail"); that text is speech.
        trailing = _flatten(child.tail)
        if trailing:
            blocks.append(TextBlock(text=trailing))

    return blocks
def _ensure_trailing_punctuation(text: str) -> str:
"""Ensure text ends with sentence-ending punctuation."""
if text and text[-1] not in ".!?\"'":
return text + "."
return text
# Text placed in front of the scene description in the opening sentence,
# keyed by shot mode. An empty prefix means the scene description is used
# on its own.
SHOT_PREFIXES = {
    "closeup": "Close-up in",
    "wide": "Wide shot of",
    "scene": "",
}
def _compile_blocks(
    blocks: list[Block],
    voice: str,
    scene: str | None,
    gender: str = "male",
    shot: str = "closeup",
) -> str:
    """Compile blocks into the flat, video-style prompt string.

    The prompt is a space-joined sequence of sentences:

    1. An opening sentence built from the shot prefix and the scene
       (DEFAULT_SCENE when no scene is given).
    2. One sentence per block, in order. Sounds are standalone sentences.
       Speech is wrapped in double quotes. Actions are standalone
       sentences in closeup mode; in scene/wide mode an action that is
       immediately followed by speech ends with ":" so it introduces the
       quote.
    3. The voice description.
    4. In scene/wide mode, the scene again as reinforcement.

    Args:
        blocks: Blocks in document order.
        voice: Voice description; omitted from the prompt when empty.
        scene: Scene description, or None/empty to use DEFAULT_SCENE.
        gender: "female" selects the pronoun "She"; anything else "He".
        shot: "closeup", "wide" or "scene". Unknown values are compiled
            like "closeup".

    Returns:
        The compiled prompt.
    """
    is_scene_mode = shot in ("scene", "wide")
    pronoun = "She" if gender == "female" else "He"

    scene_text = scene if scene else DEFAULT_SCENE
    prefix = SHOT_PREFIXES.get(shot, SHOT_PREFIXES["closeup"])
    opening = f"{prefix} {scene_text}" if prefix else scene_text
    # Same punctuation rule as every other sentence, so a scene that
    # already ends with "." does not produce "..".
    parts: list[str] = [_ensure_trailing_punctuation(opening)]

    speech_seen = False
    action_seen = False
    last_index = len(blocks) - 1
    for index, block in enumerate(blocks):
        if isinstance(block, SoundBlock):
            # Sound events compile as standalone sentences.
            parts.append(_ensure_trailing_punctuation(block.text))
        elif isinstance(block, ActionBlock):
            action_seen = True
            introduces_speech = index < last_index and isinstance(
                blocks[index + 1], TextBlock
            )
            if is_scene_mode and introduces_speech:
                # The colon connects the action to the quote that follows,
                # so no sentence punctuation is added.
                parts.append(block.text + ":")
            else:
                # Closeup mode, or nothing quoted follows: a colon would
                # dangle, so close the action as a normal sentence.
                parts.append(_ensure_trailing_punctuation(block.text))
        elif isinstance(block, TextBlock):
            quoted = f'"{_ensure_trailing_punctuation(block.text)}"'
            if is_scene_mode and not speech_seen and not action_seen:
                # No action precedes the first speech in scene mode, so
                # introduce the speaker with a pronoun.
                parts.append(f"{pronoun} speaks: {quoted}")
            else:
                parts.append(quoted)
            speech_seen = True

    voice_sentence = _ensure_trailing_punctuation(voice)
    if voice_sentence:
        parts.append(voice_sentence)

    # In scene/wide mode, repeat the scene at the end as SFX reinforcement.
    if is_scene_mode and scene:
        parts.append(_ensure_trailing_punctuation(scene))

    return " ".join(parts)
def _extract_speech_only(blocks: list[Block]) -> str:
    """Join the text of every TextBlock with single spaces.

    Action and sound blocks are skipped, leaving only what is spoken
    (used for duration estimation).
    """
    return " ".join(
        block.text for block in blocks if isinstance(block, TextBlock)
    )
def compile_prompt(xml_string: str) -> CompiledPrompt:
    """Compile a <speak> XML prompt into a video-style text prompt.

    Reads the voice, scene, language, gender and shot attributes from the
    root element (defaulting to "", None, "en", "male" and "closeup"
    respectively), extracts the blocks and compiles them.

    Args:
        xml_string: Valid <speak> XML string (must pass validate_prompt first)

    Returns:
        CompiledPrompt with the compiled prompt and extracted metadata
    """
    root = ET.fromstring(xml_string)

    def _attr(name: str, default: str) -> str:
        # Attribute value with surrounding whitespace removed.
        return root.get(name, default).strip()

    # The scene has no default: a missing attribute stays None and an
    # empty one stays empty.
    raw_scene = root.get("scene")
    scene = raw_scene.strip() if raw_scene else raw_scene

    voice = _attr("voice", "")
    language = _attr("language", "en")
    gender = _attr("gender", "male")
    shot = _attr("shot", "closeup")

    blocks = _extract_blocks(root)
    return CompiledPrompt(
        prompt=_compile_blocks(blocks, voice, scene, gender, shot),
        speech_text=_extract_speech_only(blocks),
        voice=voice,
        scene=scene,
        language=language,
        gender=gender,
        shot=shot,
    )
def extract_sentence_actions(xml_string: str) -> dict[int, list[str]]:
    """Map sentence indices to their preceding action blocks.

    Walks the XML blocks in order, collecting actions until a text block is
    reached. The first sentence of that text block receives all collected
    actions; the remaining sentences of the block only advance the running
    sentence index. Sound blocks are ignored, and actions that are never
    followed by a text block are not reported.

    A sentence ends at ".", "!" or "?". A run of consecutive terminators
    ("...", "?!") closes a single sentence rather than one sentence per
    character, so an ellipsis does not shift the indices of later
    sentences. Text left over after the last terminator counts as one
    more sentence.

    Args:
        xml_string: <speak> XML string.

    Returns:
        Dict mapping sentence index (0-based across all speech text) to a list
        of action strings that precede that sentence.
    """
    terminators = ".!?"

    root = ET.fromstring(xml_string)
    sentence_actions: dict[int, list[str]] = {}
    pending_actions: list[str] = []
    sentence_idx = 0

    for block in _extract_blocks(root):
        if isinstance(block, ActionBlock):
            pending_actions.append(block.text)
            continue
        if not isinstance(block, TextBlock):
            continue

        # Only the number of sentences matters here, so count boundaries
        # instead of building the sentence strings.
        text = block.text.strip()
        last_pos = len(text) - 1
        sentence_count = 0
        remainder_start = 0
        for pos, char in enumerate(text):
            if char not in terminators:
                continue
            if pos < last_pos and text[pos + 1] in terminators:
                # Still inside a run such as "..." — wait for its end.
                continue
            sentence_count += 1
            remainder_start = pos + 1
        if text[remainder_start:].strip():
            # Unterminated trailing text is a sentence too.
            sentence_count += 1

        if pending_actions and sentence_count:
            sentence_actions[sentence_idx] = pending_actions.copy()
            pending_actions.clear()
        sentence_idx += sentence_count

    return sentence_actions
def extract_speech_text(xml_string: str) -> str:
    """Return only the spoken text of a <speak> XML prompt.

    Actions and sounds are left out. Handy for duration estimation
    (Kokoro) when the full compiled prompt is not needed.
    """
    return _extract_speech_only(_extract_blocks(ET.fromstring(xml_string)))
def compile_chunk_prompt(
    speech_text: str,
    voice: str,
    scene: str | None = None,
    actions_before: list[str] | None = None,
    actions_after: list[str] | None = None,
    gender: str = "male",
    shot: str = "closeup",
) -> str:
    """Compile a single chunk's prompt from pre-split text.

    Used by the chunker to build per-chunk prompts after text splitting.
    Surrounding whitespace is removed from the speech text and from each
    action, and pieces that are empty after stripping are left out, so a
    chunk never contributes an empty quote or a blank action sentence.

    Args:
        speech_text: The chunk's speech text portion.
        voice: Voice description string.
        scene: Scene description string (optional).
        actions_before: Action blocks to prepend before speech.
        actions_after: Action blocks to append after speech.
        gender: Speaker gender, forwarded to the block compiler.
        shot: Shot mode, forwarded to the block compiler.

    Returns:
        Compiled video-style prompt string.
    """

    def _as_actions(items: list[str] | None) -> list[Block]:
        # Wrap each non-blank action string in an ActionBlock.
        return [
            ActionBlock(text=item.strip())
            for item in items or []
            if item and item.strip()
        ]

    blocks: list[Block] = _as_actions(actions_before)

    spoken = speech_text.strip() if speech_text else ""
    if spoken:
        blocks.append(TextBlock(text=spoken))

    blocks.extend(_as_actions(actions_after))
    return _compile_blocks(blocks, voice, scene, gender, shot)
|