Dramabox / ltx2 /ltx_pipelines /utils /constants.py
Manmay's picture
DramaBox Space — initial app + vendored ltx2
08c5e28 verified
import logging
from dataclasses import dataclass, field, replace
from safetensors import safe_open
from ltx_core.components.guiders import MultiModalGuiderParams
from ltx_core.types import SpatioTemporalScaleFactors
# =============================================================================
# Diffusion Schedule
# =============================================================================
# Noise schedule for the distilled pipeline. These sigma values control noise
# levels at each denoising step and were tuned to match the distillation process.
DISTILLED_SIGMA_VALUES = [1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0]
# Reduced schedule for super-resolution stage 2 (subset of distilled values)
STAGE_2_DISTILLED_SIGMA_VALUES = [0.909375, 0.725, 0.421875, 0.0]
# =============================================================================
# Pipeline Parameters
# =============================================================================
@dataclass(frozen=True)
class PipelineParams:
seed: int = 10
stage_1_height: int = 512
stage_1_width: int = 768
num_frames: int = 121
frame_rate: float = 24.0
num_inference_steps: int = 40
video_guider_params: MultiModalGuiderParams = field(
default_factory=lambda: MultiModalGuiderParams(
cfg_scale=3.0,
stg_scale=1.0,
rescale_scale=0.7,
modality_scale=3.0,
skip_step=0,
stg_blocks=[29],
)
)
audio_guider_params: MultiModalGuiderParams = field(
default_factory=lambda: MultiModalGuiderParams(
cfg_scale=7.0,
stg_scale=1.0,
rescale_scale=0.7,
modality_scale=3.0,
skip_step=0,
stg_blocks=[29],
)
)
@property
def stage_2_height(self) -> int:
return int(self.stage_1_height * 2)
@property
def stage_2_width(self) -> int:
return int(self.stage_1_width * 2)
# Default params for LTX-2.0 non-distilled models. These can be overridden by detecting from checkpoint metadata.
LTX_2_PARAMS = PipelineParams()
# Default params for LTX-2.3 non-distilled models. These override some of the LTX-2.0 defaults.
LTX_2_3_PARAMS = replace(
LTX_2_PARAMS,
num_inference_steps=30,
video_guider_params=replace(LTX_2_PARAMS.video_guider_params, stg_blocks=[28]),
audio_guider_params=replace(LTX_2_PARAMS.audio_guider_params, stg_blocks=[28]),
)
LTX_2_3_HQ_PARAMS = PipelineParams(
num_inference_steps=15,
stage_1_height=1088 // 2,
stage_1_width=1920 // 2,
video_guider_params=MultiModalGuiderParams(
cfg_scale=3.0,
stg_scale=0.0,
rescale_scale=0.45,
modality_scale=3.0,
skip_step=0,
stg_blocks=[],
),
audio_guider_params=MultiModalGuiderParams(
cfg_scale=7.0,
stg_scale=0.0,
rescale_scale=1.0,
modality_scale=3.0,
skip_step=0,
stg_blocks=[],
),
)
DEFAULT_LORA_STRENGTH = 1.0
DEFAULT_IMAGE_CRF = 33
VIDEO_SCALE_FACTORS = SpatioTemporalScaleFactors.default()
VIDEO_LATENT_CHANNELS = 128
_LTX_2_3_MODEL_VERSION_PREFIX = "2.3"
def detect_params(checkpoint_path: str) -> PipelineParams:
"""Detect pipeline params from checkpoint metadata.
Reads the ``model_version`` field from the safetensors config metadata.
Returns ``LTX_2_3_PARAMS`` when the version starts with "2.3",
otherwise falls back to ``LTX_2_PARAMS``.
"""
logger = logging.getLogger(__name__)
try:
with safe_open(checkpoint_path, framework="pt") as f:
metadata = f.metadata() or {}
version = metadata.get("model_version", "")
except Exception:
logger.warning("Could not read checkpoint metadata from %s, using LTX-2 defaults", checkpoint_path)
return LTX_2_PARAMS
if version.startswith(_LTX_2_3_MODEL_VERSION_PREFIX):
return LTX_2_3_PARAMS
logger.info("Using LTX_2_PARAMS for checkpoint (version=%s)", version or "unknown")
return LTX_2_PARAMS
# =============================================================================
# Prompts
# =============================================================================
DEFAULT_NEGATIVE_PROMPT = (
"blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, "
"grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, "
"deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, "
"wrong hand count, artifacts around text, inconsistent perspective, camera shake, incorrect depth of "
"field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent "
"lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny "
"valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, wrong gaze direction, "
"mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, "
"off-sync audio, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward "
"pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, flat lighting, "
"inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
)