Spaces:

ResembleAI
/

Dramabox

Running on Zero

App Files Files Community

Dramabox / src /text_chunker.py

Manmay

Long-form chunking + RE-USE on reference

7e0eb32 3 days ago

raw

history blame contribute delete

7.76 kB

	"""Prompt chunking for long-form DramaBox generation.

	The base LTX-2.3 audio DiT was trained on clips <= ~20 s. The silence-prior
	patch in ``inference_server.py`` keeps generations sane up to ~45 s, but the
	prior re-emerges past that boundary. For arbitrary-length prompts we split the
	text into < 45 s chunks, generate each conditioned on the same voice reference,
	and crossfade them back together.

	Chunking is quote-aware (sentence terminators inside ``"..."`` don't count)
	and preserves the speaker-description prefix on every chunk so the model keeps
	the same persona / delivery style across joins.
	"""
	from __future__ import annotations

	import re
	from dataclasses import dataclass
	from typing import List, Optional


	# Matches the leading speaker description at the very start of the prompt:
	# the shortest run of at least three non-quote characters (group 1), followed
	# by a comma plus optional whitespace (group 2), followed by an opening single
	# or double quote (lookahead only, so the quote itself is not consumed).
	# Group 1 is treated as persona/style metadata and re-attached to every chunk.
	#
	#   A shadowy villain speaks with cold menace, "You have entered..."
	#   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^  <- group 1 (the prefix)
	#
	# NOTE: because group 1 excludes both quote characters, a persona containing
	# an apostrophe (e.g. "A man who's angry, ...") does not match.
	_PREFIX_RE = re.compile(r'^([^"\']{3,}?)(,\s*)(?=["\'])', re.DOTALL)


	@dataclass
	class PromptChunk:
	    """One slice of a prompt, paired with its estimated spoken duration.

	    Attributes:
	        text: The prompt text to generate for this chunk.
	        est_duration_s: Estimated duration of ``text`` in seconds, as computed
	            by whoever built the chunk.
	    """

	    text: str
	    est_duration_s: float


	def extract_speaker_prefix(prompt: str) -> tuple[Optional[str], str]:
	    """Return ``(prefix, body)`` where ``prefix`` is the speaker description.

	    If the prompt has the canonical ``<persona>, "<dialogue>"...`` form, the
	    persona (stripped, without the trailing comma) is returned as the prefix
	    and everything from the opening quote onward as the body.

	    Args:
	        prompt: Full prompt text.

	    Returns:
	        ``(prefix, body)`` when a non-blank persona is found, otherwise
	        ``(None, prompt)`` with the prompt untouched, meaning there is no
	        prefix to propagate and the whole prompt is the body.
	    """
	    match = _PREFIX_RE.match(prompt)
	    if match is None:
	        return None, prompt

	    persona = match.group(1).strip()
	    if not persona:
	        # The pattern can match three or more whitespace characters; a blank
	        # persona carries no information, so treat it as "no prefix" instead
	        # of returning an empty string alongside a truncated body.
	        return None, prompt

	    # match.end() sits just before the opening quote (the quote is only a
	    # lookahead), so the body starts with the quote character.
	    return persona, prompt[match.end():]


	def split_sentences_outside_quotes(text: str) -> List[str]:
	    """Split ``text`` into sentences, ignoring terminators inside quotes.

	    A sentence is a span ending in ``.``, ``!`` or ``?`` (optionally followed
	    by closing quotes / brackets) at the top level, i.e. not inside an open
	    double- or single-quoted span, and followed by whitespace or the end of
	    the string. A closing double quote that directly follows a terminator and
	    is itself followed by whitespace / end of string also ends a sentence.
	    Empty or whitespace-only fragments are dropped; any trailing text without
	    a terminator is returned as the final element.

	    Args:
	        text: Text to split.

	    Returns:
	        The stripped sentences in their original order.

	    Examples:
	        >>> split_sentences_outside_quotes('He says, "Hi, how are you?" Then leaves.')
	        ['He says, "Hi, how are you?"', 'Then leaves.']
	    """
	    sentences: List[str] = []
	    buf: List[str] = []
	    in_double = False
	    in_single = False

	    def flush() -> None:
	        # Emit the buffered characters as one sentence (if non-blank) and
	        # start a fresh buffer.
	        sentence = "".join(buf).strip()
	        if sentence:
	            sentences.append(sentence)
	        buf.clear()

	    i = 0
	    n = len(text)
	    while i < n:
	        ch = text[i]
	        buf.append(ch)

	        if ch == '"' and not in_single:
	            closing = in_double
	            in_double = not in_double
	            # A closing quote right after a terminator (``...how are you?"``)
	            # ends the sentence, provided whitespace or end-of-string follows.
	            ends_sentence = closing and len(buf) >= 2 and buf[-2] in ".!?"
	            if ends_sentence and (i + 1 >= n or text[i + 1].isspace()):
	                flush()

	        elif ch == "'" and not in_double:
	            # Apostrophes inside a word (don't, it's) are not quote toggles.
	            prev = text[i - 1] if i > 0 else " "
	            nxt = text[i + 1] if i + 1 < n else " "
	            if not (prev.isalpha() and nxt.isalpha()):
	                in_single = not in_single

	        elif ch in ".!?" and not in_double and not in_single:
	            # Greedily pull trailing periods / quotes / closing brackets into
	            # the current sentence.
	            j = i + 1
	            while j < n and text[j] in '."\')]':
	                buf.append(text[j])
	                if text[j] == '"':
	                    # Keep the quote state in sync with the consumed character.
	                    in_double = not in_double
	                j += 1
	            if j >= n or text[j].isspace():
	                flush()
	            # Always resume after the consumed run, whether or not a boundary
	            # was found; re-scanning those characters would append them to
	            # the buffer a second time.
	            i = j
	            continue

	        i += 1

	    # Whatever is left has no terminator; keep it as the final fragment.
	    flush()
	    return sentences


	def _assemble(prefix: Optional[str], sentences: List[str]) -> str:
	body = " ".join(s.strip() for s in sentences if s.strip())
	if not prefix:
	return body
	# Re-attach prefix in the canonical "persona, body" form. If the first
	# sentence already starts with a stage direction (no opening quote), drop
	# the comma + use a period so the syntax reads naturally.
	if body.lstrip().startswith(("'", '"')):
	return f"{prefix}, {body}"
	return f"{prefix}. {body}"


	def chunk_prompt_for_duration(
	    prompt: str,
	    max_duration_s: float = 45.0,
	    target_duration_s: float = 37.0,
	    duration_multiplier: float = 1.1,
	) -> List[PromptChunk]:
	    """Split ``prompt`` into chunks of roughly <= ``max_duration_s`` each.

	    The prompt is split into top-level sentences (quote-aware) and sentences
	    are packed greedily into chunks; the speaker prefix, if any, is re-attached
	    to every chunk.

	    Args:
	        prompt: Full scene prompt (DramaBox format or plain text).
	        max_duration_s: Hard cap per chunk, compared against the estimator
	            output after ``duration_multiplier``. The only chunks that may
	            exceed it are single sentences that are already too long on their
	            own; those are emitted as-is (see the note in the loop).
	        target_duration_s: Soft cap; the current chunk is closed when adding
	            the next sentence would push it past this. Leaving 5-10 s of
	            headroom below ``max_duration_s`` keeps us safe against the
	            estimator under-shooting by ~10-15% on action-heavy prompts.
	            Values above ``max_duration_s`` are clamped to it.
	        duration_multiplier: Same breathing-room multiplier the inference
	            server applies in ``estimate_duration``; matches the per-chunk
	            target the model is actually asked to generate.

	    Returns:
	        A non-empty list of :class:`PromptChunk`. Prompts that already fit
	        under ``max_duration_s`` return a 1-element list with the original
	        prompt unchanged.
	    """
	    # Project-local estimator, imported at call time.
	    from duration_estimator import estimate_speech_duration

	    def _est(t: str) -> float:
	        return estimate_speech_duration(t) * duration_multiplier

	    total = _est(prompt)
	    if total <= max_duration_s:
	        return [PromptChunk(text=prompt, est_duration_s=total)]

	    # A soft target above the hard cap would let multi-sentence chunks grow
	    # past the cap, so never let it exceed max_duration_s.
	    soft_cap = min(target_duration_s, max_duration_s)

	    prefix, body = extract_speaker_prefix(prompt)
	    sentences = split_sentences_outside_quotes(body)

	    chunks: List[PromptChunk] = []
	    current: List[str] = []
	    # Estimate for _assemble(prefix, current); kept in step with ``current``
	    # so closing a chunk does not need another estimator call.
	    current_dur = 0.0

	    for sent in sentences:
	        cand_dur = _est(_assemble(prefix, current + [sent]))

	        if current and cand_dur > soft_cap:
	            # Close the current chunk before adding this sentence.
	            chunks.append(
	                PromptChunk(
	                    text=_assemble(prefix, current),
	                    est_duration_s=current_dur,
	                )
	            )
	            current = [sent]
	            current_dur = _est(_assemble(prefix, current))
	        else:
	            current.append(sent)
	            current_dur = cand_dur

	        # Pathological case: a single sentence whose estimator output is
	        # already past max_duration_s. Emit it on its own and let downstream
	        # generate() truncate the request at the model's hard limit; the user
	        # gets a degraded but non-crashing result instead of an exception.
	        if len(current) == 1 and current_dur > max_duration_s:
	            chunks.append(
	                PromptChunk(
	                    text=_assemble(prefix, current),
	                    est_duration_s=current_dur,
	                )
	            )
	            current = []
	            current_dur = 0.0

	    if current:
	        chunks.append(
	            PromptChunk(
	                text=_assemble(prefix, current),
	                est_duration_s=current_dur,
	            )
	        )

	    if not chunks:
	        # No sentences were found (blank body): hand back the prompt as one
	        # chunk so callers always have something to generate.
	        return [PromptChunk(text=prompt, est_duration_s=total)]
	    return chunks