Spaces:

bytedance-research
/

Lance

Running on Zero

File size: 170,287 Bytes

from __future__ import annotations

import argparse
import base64
import concurrent.futures
import gc
import hashlib
import html
import math
import json
import os
import random
import re
import shutil
import subprocess
import sys
import threading
import time
import traceback
from collections import deque
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Optional
from urllib.parse import quote

# Supply PyTorch CUDA allocator settings only when the surrounding environment
# has not already configured them. This statement sits above the
# ``import torch`` further down the module.
if "PYTORCH_CUDA_ALLOC_CONF" not in os.environ:
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

try:
    import spaces
except ImportError:  # pragma: no cover - keeps local CPU runs working
    class _SpacesShim:
        """No-op stand-in used when the ``spaces`` package cannot be imported.

        Only the ``GPU`` decorator is provided; it never wraps or alters the
        decorated function.
        """

        @staticmethod
        def GPU(*args, **kwargs):
            """Pass-through replacement for the ``spaces.GPU`` decorator.

            Accepted call shapes:

            * ``@spaces.GPU`` -- bare decorator, returns the function itself.
            * ``@spaces.GPU(duration=...)`` -- factory form, returns a
              decorator that hands the function back unchanged.
            * ``spaces.GPU(fn, duration=...)`` -- function plus options,
              returns ``fn`` unchanged.

            All options are ignored.
            """
            # A callable first positional argument is the function being
            # decorated, whether or not keyword options accompany it.
            # Requiring "no kwargs" here would hand back the inner decorator
            # for ``GPU(fn, duration=...)`` instead of ``fn`` itself.
            if args and callable(args[0]):
                return args[0]

            def decorator(fn):
                return fn

            return decorator

    spaces = _SpacesShim()

import gradio as gr
import torch
from huggingface_hub import snapshot_download
from safetensors import safe_open
from safetensors.torch import load_file, save_file
from transformers import set_seed
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig

from common.utils.logging import get_logger
from common.utils.misc import AutoEncoderParams, tuple_mul
from config.config_factory import DataArguments, InferenceArguments, ModelArguments
from data.data_utils import add_special_tokens
from data.dataset_base import DataConfig, simple_custom_collate
from data.datasets_custom import ValidationDataset
from inference_lance import (
    PROMPT_JSON_FILENAME,
    apply_inference_defaults,
    clean_memory,
    init_from_model_path_if_needed,
    save_prompt_results,
    validate_on_fixed_batch,
)
from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
from modeling.qwen2 import Qwen2Tokenizer
from modeling.qwen2.modeling_qwen2 import Qwen2Config
from modeling.vae.wan.model import WanVideoVAE
from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel


# Directory that contains this module (symlinks resolved).
REPO_ROOT = Path(__file__).resolve().parents[0]

# Root of the app's scratch space. The LANCE_GRADIO_TMP_ROOT environment
# variable overrides the default, and a leading "~" is expanded.
GRADIO_TMP_ROOT = Path(os.environ.get("LANCE_GRADIO_TMP_ROOT", "/tmp/lance_gradio")).expanduser()

# Fixed sub-locations under the scratch root.
TMP_INPUT_DIR = GRADIO_TMP_ROOT.joinpath("inputs")
RESULTS_ROOT = GRADIO_TMP_ROOT.joinpath("results")
GLOBAL_RECORDS_FILE = GRADIO_TMP_ROOT.joinpath("generation_records.jsonl")

# File name only (no directory component).
RUN_RECORD_FILENAME = "generation_record.json"

# --- Model locations ---------------------------------------------------------
# One relative and one absolute base directory for model files, plus the Hub
# repository id of the model.
LOCAL_MODEL_BASE_DIR = Path("downloads")
SPACE_MODEL_BASE_DIR = Path("/data/lance_models")
DEFAULT_MODEL_REPO_ID = "bytedance-research/Lance"

# --- flash-attn wheel --------------------------------------------------------
# The wheel file name embeds the same version together with the torch 2.8,
# CUDA 12 and cp310 tags, so the URL must be updated alongside the version.
DEFAULT_FLASH_ATTN_VERSION = "2.8.3"
DEFAULT_FLASH_ATTN_WHEEL_URL = (
    "https://huggingface.co/strangertoolshf/flash_attention_2_wheelhouse/resolve/main/"
    "wheelhouse-flash_attn-2.8.3/linux_x86_64/torch2.8/cu12/abiTRUE/cp310/"
    "flash_attn-2.8.3+cu12torch2.8cxx11abiTRUE-cp310-cp310-linux_x86_64.whl"
)

# --- Model variants ----------------------------------------------------------
# Variant identifiers, the directory name used for each, and the default.
MODEL_VARIANT_VIDEO = "video"
MODEL_VARIANT_IMAGE = "image"
DEFAULT_MODEL_VARIANT = MODEL_VARIANT_VIDEO
MODEL_VARIANT_TO_DIR = {
    MODEL_VARIANT_VIDEO: "Lance_3B_Video",
    MODEL_VARIANT_IMAGE: "Lance_3B",
}
DEFAULT_MODEL_PATH = LOCAL_MODEL_BASE_DIR.joinpath(MODEL_VARIANT_TO_DIR[DEFAULT_MODEL_VARIANT])
DEFAULT_VIT_TYPE = "qwen_2_5_vl_original"

# --- Generation defaults -----------------------------------------------------
DEFAULT_TASK = "t2v"
DEFAULT_TIMESTEPS = 30
DEFAULT_TIMESTEP_SHIFT = 3.5
DEFAULT_CFG_TEXT_SCALE = 4.0
DEFAULT_BASIC_SEED = 42

# --- Resolution presets and fallback sizes -----------------------------------
DEFAULT_RESOLUTION = "video_480p"
DEFAULT_VIDEO_EDIT_RESOLUTION = "video_480p"
DEFAULT_IMAGE_RESOLUTION = "image_768x768"
DEFAULT_HEIGHT, DEFAULT_WIDTH = 352, 640
DEFAULT_IMAGE_SIZE = 768

# --- Video length ------------------------------------------------------------
# Frame counts are derived from a duration in seconds as 12 * seconds + 1
# (presumably 12 frames per second plus the initial frame -- confirm).
DEFAULT_VIDEO_DURATION_SECONDS = 5
MAX_VIDEO_DURATION_SECONDS = 10
DEFAULT_NUM_FRAMES = DEFAULT_VIDEO_DURATION_SECONDS * 12 + 1
MAX_VIDEO_NUM_FRAMES = MAX_VIDEO_DURATION_SECONDS * 12 + 1

# --- Aspect ratios -----------------------------------------------------------
# Choices run from the widest landscape ratio to the tallest portrait ratio.
DEFAULT_VIDEO_ASPECT_RATIO = "16:9"
DEFAULT_IMAGE_ASPECT_RATIO = "1:1"
ASPECT_RATIO_CHOICES = [
    "21:9",
    "16:9",
    "3:2",
    "4:3",
    "1:1",
    "3:4",
    "2:3",
    "9:16",
]

# Pixel sizes per aspect-ratio label. Landscape labels list the larger number
# first, so each tuple reads as (width, height). Key order is kept as
# landscape, square, portrait.

VIDEO_360P_ASPECT_RATIO_TO_SIZE = {
    # landscape
    "21:9": (672, 288),
    "16:9": (640, 352),
    "3:2": (528, 352),
    "4:3": (560, 416),
    # square
    "1:1": (480, 480),
    # portrait
    "3:4": (416, 560),
    "2:3": (352, 528),
    "9:16": (352, 640),
}

VIDEO_480P_ASPECT_RATIO_TO_SIZE = {
    # landscape
    "21:9": (976, 416),
    "16:9": (848, 480),
    "3:2": (784, 528),
    "4:3": (736, 560),
    # square
    "1:1": (640, 640),
    # portrait
    "3:4": (560, 736),
    "2:3": (528, 784),
    "9:16": (480, 848),
}

# Resolution preset name -> the aspect-ratio table for that preset.
VIDEO_RESOLUTION_TO_SIZE_MAP = dict(
    video_360p=VIDEO_360P_ASPECT_RATIO_TO_SIZE,
    video_480p=VIDEO_480P_ASPECT_RATIO_TO_SIZE,
)

IMAGE_ASPECT_RATIO_TO_SIZE = {
    # landscape
    "21:9": (1168, 496),
    "16:9": (1024, 576),
    "3:2": (944, 624),
    "4:3": (880, 672),
    # square
    "1:1": (768, 768),
    # portrait
    "3:4": (672, 880),
    "2:3": (624, 944),
    "9:16": (576, 1024),
}
# Default GPU selection, kept as a string.
DEFAULT_GPUS = "0"

# Queue size and concurrency limit defaults.
DEFAULT_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT = 32, 1

# Feature switches; both are enabled.
USE_KVCACHE = TEXT_TEMPLATE = True

# Module-wide lock. NOTE(review): presumably serialises writes to the
# generation record files -- confirm against its users.
RECORD_WRITE_LOCK = threading.Lock()

# External links for the Lance project. Every link uses HTTPS; the paper link
# previously used plain ``http://``, unlike its siblings.
LANCE_HOMEPAGE_URL = "https://lance-project.github.io/"
LANCE_PAPER_URL = "https://arxiv.org/abs/2605.18678"
LANCE_HUGGING_FACE_URL = "https://huggingface.co/bytedance-research/Lance"
LANCE_GITHUB_URL = "https://github.com/bytedance/Lance"
# Logo image path, located under the module's own directory.
LANCE_LOGO_PATH = REPO_ROOT.joinpath("assets", "logo", "lance-logo.png")

APP_CSS = """
:root {
  color-scheme: light;
  --lance-accent: #fb923c;
  --lance-accent-hover: #f97316;
  --lance-surface: #ffffff;
  --lance-surface-muted: #f8fafc;
  --lance-border: rgba(148, 163, 184, .36);
  --lance-text: #111827;
  --lance-text-muted: #475569;
  --lance-shadow: 0 8px 24px rgba(15, 23, 42, .08);
  --body-background-fill: var(--lance-surface);
  --background-fill-primary: var(--lance-surface);
  --block-background-fill: var(--lance-surface);
  --input-background-fill: var(--lance-surface);
  --button-primary-background-fill: var(--lance-accent);
  --button-primary-background-fill-hover: var(--lance-accent-hover);
  --button-primary-text-color: #0f172a;
}
body, .gradio-container, .contain { background: var(--lance-surface) !important; color: var(--lance-text) !important; }
.gradio-container, .contain { max-width: 1180px !important; margin: 0 auto !important; }
.lance-hero { text-align: center; padding: 8px 12px 4px; }
.lance-logo { width: min(150px, 34vw); height: auto; display: block; margin: 0 auto 4px; }
.lance-title { margin: 0 auto 5px; font-size: clamp(22px, 2.4vw, 32px); line-height: 1.08; font-weight: 800; }
.lance-badges { display: flex; flex-wrap: wrap; justify-content: center; gap: 6px; margin: 4px auto 0; }
.lance-badges a { line-height: 0; }
.lance-badges img { height: 20px; width: auto; display: block; }
.lance-status, .lance-run-status { max-width: 1120px; margin: 8px auto !important; }
.lance-run-status p { margin: 0 !important; }
.lance-run-status-pill { display: inline-flex; align-items: center; gap: 8px; padding: 8px 12px; border-radius: 999px; border: 1px solid var(--lance-border); background: var(--lance-surface); color: var(--lance-text-muted); font-size: 14px; font-weight: 700; box-shadow: var(--lance-shadow); }
.lance-run-status-chip { width: 8px; height: 8px; border-radius: 999px; background: var(--lance-accent); box-shadow: 0 0 0 4px rgba(251,146,60,.18); }
.lance-run-status-dots i { display: inline-block; width: 4px; height: 4px; margin-left: 3px; border-radius: 999px; background: currentColor; opacity: .45; animation: lance-dot-pulse 1.1s infinite ease-in-out; }
.lance-run-status-dots i:nth-child(2) { animation-delay: .15s; }
.lance-run-status-dots i:nth-child(3) { animation-delay: .3s; }
@keyframes lance-dot-pulse { 40% { transform: translateY(-1px); opacity: 1; } }

.lance-main-row { display: grid !important; grid-template-columns: minmax(0, 1.16fr) minmax(0, 0.84fr) !important; gap: 18px !important; align-items: start !important; }
.lance-main-column { min-width: 0 !important; width: 100% !important; }
.lance-panel, .lance-control-field, .example-panel { border: 0 !important; box-shadow: none !important; background: transparent !important; padding: 0 !important; }
.lance-panel > .form, .lance-control-field > .form, .lance-label-html, .lance-label-html > div, .lance-label-html .wrap { border: 0 !important; background: transparent !important; box-shadow: none !important; padding: 0 !important; margin: 0 !important; min-height: 0 !important; }
.lance-section-label, .lance-generation-label { margin: 0 0 10px !important; font-weight: 800 !important; color: var(--body-text-color) !important; }
.lance-section-label { font-size: 18px !important; }
.lance-generation-label { font-size: 14px !important; }
.lance-label-icon { display: none !important; }
.lance-output-label { display: inline-flex !important; align-items: center !important; gap: 8px !important; }
.lance-output-label .lance-label-icon { display: inline-flex !important; align-items: center !important; justify-content: center !important; width: 20px !important; height: 20px !important; color: var(--lance-accent) !important; }
.lance-output-label .lance-label-icon svg { width: 18px !important; height: 18px !important; display: block !important; }

.lance-taskbar-wrap { max-width: 1120px; margin: 0 auto 12px !important; }
.task-selector {
  overflow-x: auto !important;
  padding: 4px 0 12px !important;
  scrollbar-width: thin;
  display: flex !important;
  justify-content: center !important;
}
.task-selector > .wrap, .task-selector .wrap {
  width: max-content !important;
  max-width: min(100%, 1080px) !important;
  margin: 0 auto !important;
  padding: 4px !important;
  display: flex !important;
  justify-content: center !important;
  flex-wrap: nowrap !important;
  gap: 10px !important;
  border-radius: 999px !important;
  background: transparent !important;
  border: 0 !important;
  box-shadow: none !important;
}
.task-selector label {
  min-width: max-content !important;
  min-height: 38px !important;
  padding: 9px 18px !important;
  border: 0 !important;
  border-radius: 999px !important;
  background: #f1f5f9 !important;
  color: var(--lance-text-muted) !important;
  justify-content: center !important;
  white-space: nowrap !important;
}
.task-selector label:has(input:checked) { background: var(--lance-accent) !important; color: #0f172a !important; box-shadow: 0 6px 16px rgba(251,146,60,.22) !important; }
.task-selector input:checked + span { color: #0f172a !important; font-weight: 800 !important; }

.lance-taskbar-wrap,
.lance-taskbar-wrap > div,
.lance-taskbar-wrap > .form,
.lance-taskbar-wrap .block,
.task-selector,
.task-selector > div,
.task-selector > .form,
.task-selector .form,
.task-selector .wrap {
  background: transparent !important;
  border: 0 !important;
  box-shadow: none !important;
}
.task-selector > .wrap,
.task-selector .wrap {
  padding: 0 !important;
}
.task-selector label {
  background: #f8fafc !important;
  border: 1px solid rgba(148,163,184,.25) !important;
  box-shadow: 0 3px 10px rgba(15,23,42,.04) !important;
}
.task-selector label:has(input:checked) {
  background: var(--lance-accent) !important;
  border-color: transparent !important;
  color: #0f172a !important;
  box-shadow: 0 8px 18px rgba(249,115,22,.24) !important;
}
.task-selector input:checked + span { color: #0f172a !important; }

.lance-task-prompt-panel { max-width: 1040px; margin: 0 auto 10px !important; }
.main-prompt-control, .main-prompt-control > div, .main-prompt-control .wrap { border: 0 !important; background: transparent !important; box-shadow: none !important; }
.main-prompt-control textarea { min-height: 160px !important; padding: 18px !important; border: 1px solid var(--lance-border) !important; border-radius: 16px !important; background: var(--lance-surface) !important; color: var(--lance-text) !important; font-size: 15px !important; line-height: 1.45 !important; box-shadow: var(--lance-shadow) !important; }
.main-prompt-control textarea::placeholder { color: #94a3b8 !important; }
.prompt-options {
  position: relative !important;
  z-index: 2 !important;
  margin: 8px 0 16px !important;
  padding: 0 !important;
}
.prompt-options > .form {
  display: grid !important;
  grid-template-columns: repeat(4, max-content) !important;
  align-items: center !important;
  justify-content: start !important;
  justify-items: start !important;
  gap: 6px !important;
  width: max-content !important;
  max-width: 100% !important;
}

.prompt-chip,
.prompt-chip > .form,
.prompt-chip > div,
.prompt-chip .block,
.prompt-chip .form,
.prompt-chip .container,
.prompt-chip .wrap {
  width: 100% !important;
  min-width: 0 !important;
  background: transparent !important;
  border: 0 !important;
  box-shadow: none !important;
  padding: 0 !important;
  margin: 0 !important;
}
.prompt-chip {
  display: block !important;
  min-width: 0 !important;
  width: auto !important;
  flex: 0 0 auto !important;
}
.prompt-chip .wrap,
.prompt-chip .container,
.prompt-chip > .form,
.prompt-chip .form {
  display: inline-flex !important;
  align-items: center !important;
  width: auto !important;
}
.prompt-chip button,
.prompt-chip [role="button"],
.prompt-chip select,
.prompt-chip input {
  width: auto !important;
  min-width: 58px !important;
  min-height: 32px !important;
  height: 32px !important;
  border-radius: 999px !important;
  border: 1px solid var(--lance-border) !important;
  outline: 0 !important;
  background: var(--lance-surface-muted) !important;
  color: var(--lance-text) !important;
  font-size: 10px !important;
  font-weight: 800 !important;
  box-shadow: none !important;
  padding: 0 8px !important;
}
.frame-interpolation-row button,
.frame-interpolation-row [role="button"],
.frame-interpolation-row select,
.frame-interpolation-row input { min-width: 82px !important; }
.video-resolution-row button,
.video-resolution-row [role="button"],
.video-resolution-row select,
.video-resolution-row input { min-width: 58px !important; }
.aspect-ratio-row button,
.aspect-ratio-row [role="button"],
.aspect-ratio-row select,
.aspect-ratio-row input { min-width: 48px !important; }
.video-duration-row button,
.video-duration-row [role="button"],
.video-duration-row select,
.video-duration-row input { min-width: 44px !important; }
.output-resolution-row button,
.output-resolution-row [role="button"],
.output-resolution-row select,
.output-resolution-row input { min-width: 70px !important; }
.prompt-chip button,
.prompt-chip [role="button"] { white-space: nowrap !important; }
.prompt-chip .icon-wrap,
.prompt-chip .select-arrow,
.prompt-chip .label-wrap,
.prompt-chip .block-title,
.prompt-chip .block-info,
.prompt-chip label {
  background: transparent !important;
  border: 0 !important;
  box-shadow: none !important;
}
@media (max-width: 1200px) {
  .lance-main-row { grid-template-columns: minmax(0, 1.24fr) minmax(0, 0.76fr) !important; }
  .prompt-options > .form {
    grid-template-columns: repeat(4, max-content) !important;
    justify-content: start !important;
    gap: 4px !important;
  }
  .prompt-chip button, .prompt-chip [role="button"], .prompt-chip select, .prompt-chip input {
    font-size: 9.5px !important;
    min-width: 50px !important;
    padding: 0 6px !important;
  }
  .frame-interpolation-row button,
  .frame-interpolation-row [role="button"],
  .frame-interpolation-row select,
  .frame-interpolation-row input { min-width: 76px !important; }
  .aspect-ratio-row button,
  .aspect-ratio-row [role="button"],
  .aspect-ratio-row select,
  .aspect-ratio-row input { min-width: 42px !important; }
  .video-duration-row button,
  .video-duration-row [role="button"],
  .video-duration-row select,
  .video-duration-row input { min-width: 40px !important; }
}

.prompt-options {
  margin: 8px 0 16px !important;
  padding: 0 !important;
}
.prompt-options > .form {
  display: inline-flex !important;
  flex-wrap: nowrap !important;
  justify-content: flex-start !important;
  justify-items: start !important;
  align-items: center !important;
  gap: 6px !important;
  width: auto !important;
  max-width: 100% !important;
}
.prompt-chip,
.prompt-chip > .form,
.prompt-chip > div,
.prompt-chip .block,
.prompt-chip .form,
.prompt-chip .container,
.prompt-chip .wrap {
  width: auto !important;
  min-width: 0 !important;
  max-width: none !important;
}
.prompt-chip button,
.prompt-chip [role="button"],
.prompt-chip select,
.prompt-chip input {
  width: auto !important;
  min-width: 0 !important;
  height: 30px !important;
  min-height: 30px !important;
  font-size: 9.5px !important;
  padding: 0 8px !important;
  border-radius: 999px !important;
}
.frame-interpolation-row button,
.frame-interpolation-row [role="button"],
.frame-interpolation-row select,
.frame-interpolation-row input { min-width: 74px !important; max-width: 82px !important; }
.video-resolution-row button,
.video-resolution-row [role="button"],
.video-resolution-row select,
.video-resolution-row input { min-width: 50px !important; max-width: 58px !important; }
.aspect-ratio-row button,
.aspect-ratio-row [role="button"],
.aspect-ratio-row select,
.aspect-ratio-row input { min-width: 44px !important; max-width: 52px !important; }
.video-duration-row button,
.video-duration-row [role="button"],
.video-duration-row select,
.video-duration-row input { min-width: 38px !important; max-width: 46px !important; }
.output-resolution-row button,
.output-resolution-row [role="button"],
.output-resolution-row select,
.output-resolution-row input { min-width: 64px !important; max-width: 80px !important; }
@media (max-width: 1200px) {
  .prompt-options > .form {
    display: inline-flex !important;
    flex-wrap: nowrap !important;
    justify-content: flex-start !important;
    gap: 4px !important;
    width: auto !important;
  }
  .prompt-chip button,
  .prompt-chip [role="button"],
  .prompt-chip select,
  .prompt-chip input {
    font-size: 9px !important;
    padding: 0 6px !important;
    height: 29px !important;
    min-height: 29px !important;
  }
}

.lance-display-frame, .lance-display-frame > div, .lance-display-frame textarea, .output-media-control { width: 100% !important; }
.lance-output-panel { background: transparent !important; }
.lance-output-panel .lance-display-frame > div,
.lance-output-panel .lance-display-frame .wrap,
.lance-output-panel .output-media-control,
.lance-output-panel .output-media-control > div {
  border: 0 !important;
  background: transparent !important;
  box-shadow: none !important;
  padding: 0 !important;
}
.lance-output-panel .output-media-control video,
.lance-output-panel .output-media-control img,
.lance-output-panel .lance-display-frame textarea {
  border-radius: 18px !important;
  border: 1px solid rgba(116, 126, 140, .34) !important;
  background: linear-gradient(180deg, rgba(250,251,253,.94), rgba(244,246,249,.9)) !important;
  box-shadow: 0 10px 28px rgba(15,23,42,.10), inset 0 0 0 1px rgba(255,255,255,.75) !important;
}
.lance-output-panel .lance-display-frame textarea { color: #101828 !important; }
.output-media-control video, .output-media-control img { border-radius: 18px !important; }
.lance-run-button { max-width: 1040px !important; margin: 10px auto 16px !important; border-radius: 12px !important; font-size: 18px !important; font-weight: 800 !important; }
.lance-quota-note {
  max-width: 1040px !important;
  margin: -8px auto 16px !important;
  text-align: center !important;
  color: var(--lance-text-muted) !important;
  font-size: 13px !important;
  line-height: 1.45 !important;
}
.lance-quota-note p {
  margin: 0 !important;
}
button.lance-run-button, .lance-run-button button { width: 100% !important; border: 0 !important; border-radius: 12px !important; background: var(--lance-accent) !important; color: #0f172a !important; font-size: 18px !important; font-weight: 800 !important; box-shadow: 0 10px 24px rgba(249,115,22,.22) !important; }
button.lance-run-button:hover, .lance-run-button button:hover { background: var(--lance-accent-hover) !important; color: #0f172a !important; }

button.lance-run-button, .lance-run-button button {
  background: var(--lance-accent) !important;
  color: #0f172a !important;
  box-shadow: 0 10px 24px rgba(249,115,22,.22) !important;
}
button.lance-run-button:hover, .lance-run-button button:hover {
  background: var(--lance-accent-hover) !important;
  color: #0f172a !important;
}

.lance-advanced-accordion { max-width: 1040px; margin: 8px auto 0 !important; }
.lance-advanced-accordion .label-wrap, .lance-advanced-accordion summary { font-weight: 800 !important; }

.lance-recommended-section { max-width: 1040px; margin: 20px auto 0 !important; }
.lance-recommended-section .lance-section-label { text-align: left !important; font-size: 20px !important; margin-bottom: 12px !important; }
.prompt-example-full-table {
  max-height: 420px !important;
  overflow: auto !important;
  border: 1px solid rgba(148,163,184,.24) !important;
  border-radius: 18px !important;
  background: linear-gradient(180deg, #ffffff, #f8fafc) !important;
  box-shadow: 0 12px 28px rgba(15,23,42,.07) !important;
  padding: 12px !important;
}
.prompt-example-full-table > .form { gap: 10px !important; }
.prompt-examples .prompt-example-row-button,
.prompt-examples .prompt-example-row-button button {
  width: 100% !important;
  height: auto !important;
  min-height: 52px !important;
  max-height: 150px !important;
  padding: 12px 14px !important;
  border: 1px solid rgba(148,163,184,.22) !important;
  border-radius: 14px !important;
  background: #fff !important;
  color: var(--lance-text) !important;
  text-align: left !important;
  justify-content: flex-start !important;
  align-items: flex-start !important;
  white-space: normal !important;
  overflow-y: auto !important;
  box-shadow: 0 6px 16px rgba(15,23,42,.045) !important;
  transition: transform .12s ease, box-shadow .12s ease, border-color .12s ease !important;
}
.prompt-examples .prompt-example-row-button:hover,
.prompt-examples .prompt-example-row-button button:hover {
  transform: translateY(-1px) !important;
  border-color: rgba(251,146,60,.48) !important;
  box-shadow: 0 10px 22px rgba(15,23,42,.075) !important;
}
.prompt-examples .prompt-example-row-button span,
.prompt-examples .prompt-example-row-button p,
.prompt-examples .prompt-example-row-button div {
  white-space: pre-wrap !important;
  overflow-wrap: anywhere !important;
  word-break: break-word !important;
  line-height: 1.38 !important;
  color: var(--lance-text) !important;
}

.prompt-example-multimodal-row,
.prompt-example-multimodal-row > .form {
  width: 100% !important;
  min-width: 0 !important;
  margin: 0 !important;
  gap: 12px !important;
  align-items: stretch !important;
}
.prompt-example-multimodal-row > .form {
  display: grid !important;
  grid-template-columns: minmax(0, 1fr) 230px !important;
  padding: 8px !important;
  border: 1px solid rgba(148,163,184,.20) !important;
  border-radius: 16px !important;
  background: #fff !important;
  box-shadow: 0 6px 16px rgba(15,23,42,.045) !important;
}
.prompt-example-prompt-cell,
.prompt-example-prompt-cell > .form,
.prompt-example-media-cell,
.prompt-example-media-cell > .form {
  min-width: 0 !important;
  width: 100% !important;
  margin: 0 !important;
  padding: 0 !important;
  border: 0 !important;
  background: transparent !important;
  box-shadow: none !important;
}
.prompt-example-multimodal-row .prompt-example-row-button,
.prompt-example-multimodal-row .prompt-example-row-button button {
  height: 100% !important;
  min-height: 132px !important;
  max-height: 132px !important;
  border: 0 !important;
  box-shadow: none !important;
  background: #f8fafc !important;
}
.prompt-example-media-html,
.prompt-example-media-html > div,
.prompt-example-media-html .wrap {
  width: 100% !important;
  height: 132px !important;
  min-height: 132px !important;
  max-height: 132px !important;
  margin: 0 !important;
  padding: 0 !important;
  border: 1px solid rgba(148,163,184,.22) !important;
  border-radius: 14px !important;
  background: #fff !important;
  box-shadow: none !important;
  overflow: hidden !important;
}
.prompt-example-media-html video,
.prompt-example-media-html img,
.example-preview-video,
.example-preview-image {
  width: 100% !important;
  height: 132px !important;
  border-radius: 12px !important;
  display: block !important;
  background: var(--lance-surface-muted) !important;
  object-fit: contain !important;
  object-position: center center !important;
}
.reference-media-fallback {
  width: 100% !important;
  height: 132px !important;
  border-radius: 12px !important;
  display: flex !important;
  align-items: center !important;
  justify-content: center !important;
  background: var(--lance-surface-muted) !important;
  color: var(--lance-text-muted) !important;
  font-size: 12px !important;
  font-weight: 700 !important;
  text-align: center !important;
}
@media (max-width: 760px) {
  .prompt-example-multimodal-row > .form { grid-template-columns: minmax(0, 1fr) 140px !important; }
  .prompt-example-multimodal-row .prompt-example-row-button,
  .prompt-example-multimodal-row .prompt-example-row-button button,
  .prompt-example-media-html,
  .prompt-example-media-html > div,
  .prompt-example-media-html .wrap,
  .prompt-example-media-html video,
  .prompt-example-media-html img,
  .example-preview-video,
  .example-preview-image {
    height: 108px !important;
    min-height: 108px !important;
    max-height: 108px !important;
  }
}

@media (max-width: 900px) { .lance-main-row { grid-template-columns: minmax(0, 1fr) !important; } .prompt-options { margin-top: 8px !important; } }

.prompt-example-full-table {
  max-height: none !important;
  overflow: visible !important;
  padding: 18px !important;
}
.prompt-example-full-table > .form {
  gap: 18px !important;
}
.prompt-examples .prompt-example-row-button,
.prompt-examples .prompt-example-row-button button {
  min-height: 168px !important;
  height: auto !important;
  max-height: none !important;
  padding: 22px 24px !important;
  line-height: 1.62 !important;
  overflow: hidden !important;
  display: flex !important;
  align-items: flex-start !important;
}
.prompt-examples .prompt-example-row-button span,
.prompt-examples .prompt-example-row-button p,
.prompt-examples .prompt-example-row-button div {
  line-height: 1.62 !important;
  overflow: hidden !important;
}
.prompt-example-multimodal-row .prompt-example-row-button,
.prompt-example-multimodal-row .prompt-example-row-button button,
.prompt-example-media-html,
.prompt-example-media-html > div,
.prompt-example-media-html .wrap,
.prompt-example-media-html video,
.prompt-example-media-html img,
.example-preview-video,
.example-preview-image,
.reference-media-fallback {
  min-height: 160px !important;
  height: 160px !important;
  max-height: 160px !important;
}

.prompt-example-full-table {
  max-height: 560px !important;
}
.prompt-examples .prompt-example-row-button,
.prompt-examples .prompt-example-row-button button {
  min-height: 96px !important;
  max-height: none !important;
  padding: 18px 20px !important;
  overflow-y: visible !important;
}
.prompt-examples .prompt-example-row-button span,
.prompt-examples .prompt-example-row-button p,
.prompt-examples .prompt-example-row-button div {
  line-height: 1.55 !important;
}

.task-selector label:has(input:checked) {
  box-shadow: 0 4px 10px rgba(249,115,22,.12) !important;
}

.prompt-options {
  margin: 5px 0 14px !important;
}
.prompt-options > .form {
  gap: 7px !important;
}
.prompt-chip button,
.prompt-chip [role="button"],
.prompt-chip select,
.prompt-chip input {
  height: 31px !important;
  min-height: 31px !important;
  font-size: 10.5px !important;
  padding: 0 9px !important;
}
.frame-interpolation-row button,
.frame-interpolation-row [role="button"],
.frame-interpolation-row select,
.frame-interpolation-row input { min-width: 78px !important; max-width: 88px !important; }
.video-resolution-row button,
.video-resolution-row [role="button"],
.video-resolution-row select,
.video-resolution-row input { min-width: 54px !important; max-width: 62px !important; }
.aspect-ratio-row button,
.aspect-ratio-row [role="button"],
.aspect-ratio-row select,
.aspect-ratio-row input { min-width: 48px !important; max-width: 56px !important; }
.video-duration-row button,
.video-duration-row [role="button"],
.video-duration-row select,
.video-duration-row input { min-width: 42px !important; max-width: 50px !important; }
.output-resolution-row button,
.output-resolution-row [role="button"],
.output-resolution-row select,
.output-resolution-row input { min-width: 68px !important; max-width: 86px !important; }

.lance-recommended-section { margin-top: 24px !important; }
.prompt-example-full-table {
  max-height: 480px !important;
  padding: 16px !important;
}
.prompt-example-full-table > .form {
  gap: 12px !important;
}
.prompt-examples .prompt-example-row-button,
.prompt-examples .prompt-example-row-button button {
  min-height: 66px !important;
  padding: 16px 18px !important;
  line-height: 1.48 !important;
}
.prompt-examples .prompt-example-row-button span,
.prompt-examples .prompt-example-row-button p,
.prompt-examples .prompt-example-row-button div {
  line-height: 1.48 !important;
}
.prompt-example-multimodal-row,
.prompt-example-multimodal-row > .form {
  gap: 14px !important;
}
.prompt-example-multimodal-row > .form {
  padding: 12px !important;
}
.prompt-example-multimodal-row .prompt-example-row-button,
.prompt-example-multimodal-row .prompt-example-row-button button,
.prompt-example-media-html,
.prompt-example-media-html > div,
.prompt-example-media-html .wrap,
.prompt-example-media-html video,
.prompt-example-media-html img,
.example-preview-video,
.example-preview-image,
.reference-media-fallback {
  min-height: 148px !important;
  height: 148px !important;
  max-height: 148px !important;
}

@media (max-width: 1200px) {
  .prompt-options { margin-top: 5px !important; }
  .prompt-chip button,
  .prompt-chip [role="button"],
  .prompt-chip select,
  .prompt-chip input {
    font-size: 10px !important;
    height: 30px !important;
    min-height: 30px !important;
    padding: 0 7px !important;
  }
}

.prompt-example-full-table,
.prompt-example-full-table > .form,
.prompt-examples,
.prompt-examples > .form {
  max-height: none !important;
  height: auto !important;
  overflow: visible !important;
}

.prompt-example-full-table {
  padding: 16px !important;
}

.prompt-example-full-table > .form {
  gap: 14px !important;
}

.prompt-examples .prompt-example-row-button,
.prompt-examples .prompt-example-row-button button {
  min-height: 96px !important;
  height: auto !important;
  max-height: none !important;
  padding: 18px 22px !important;
  overflow: visible !important;
  white-space: normal !important;
  display: block !important;
  text-align: left !important;
}

.prompt-examples .prompt-example-row-button span,
.prompt-examples .prompt-example-row-button p,
.prompt-examples .prompt-example-row-button div {
  max-height: none !important;
  height: auto !important;
  overflow: visible !important;
  white-space: normal !important;
  overflow-wrap: anywhere !important;
  word-break: normal !important;
  line-height: 1.5 !important;
  text-overflow: unset !important;
  -webkit-line-clamp: unset !important;
  line-clamp: unset !important;
}

.prompt-example-multimodal-row,
.prompt-example-multimodal-row > .form {
  max-height: none !important;
  overflow: visible !important;
  gap: 12px !important;
}

.prompt-example-multimodal-row > .form {
  padding: 12px !important;
}

.prompt-example-multimodal-row .prompt-example-row-button,
.prompt-example-multimodal-row .prompt-example-row-button button,
.prompt-example-media-html,
.prompt-example-media-html > div,
.prompt-example-media-html .wrap,
.prompt-example-media-html video,
.prompt-example-media-html img,
.example-preview-video,
.example-preview-image,
.reference-media-fallback {
  min-height: 148px !important;
  height: 148px !important;
  max-height: 148px !important;
}

.lance-output-panel .output-media-control {
  min-height: 220px !important;
  border: 1px solid rgba(116,126,140,.34) !important;
  border-radius: 18px !important;
  background: linear-gradient(180deg, rgba(250,251,253,.94), rgba(244,246,249,.9)) !important;
  box-shadow: 0 10px 28px rgba(15,23,42,.10), inset 0 0 0 1px rgba(255,255,255,.75) !important;
  overflow: hidden !important;
}

.lance-output-panel .output-media-control > div,
.lance-output-panel .output-media-control .wrap {
  border: 0 !important;
  background: transparent !important;
  box-shadow: none !important;
}

.lance-output-panel .output-media-control video,
.lance-output-panel .output-media-control img {
  border: 0 !important;
  background: transparent !important;
  box-shadow: none !important;
  border-radius: 18px !important;
  width: 100% !important;
  height: 100% !important;
  object-fit: contain !important;
}

.frame-interpolation-row button,
.frame-interpolation-row [role="button"],
.frame-interpolation-row select,
.frame-interpolation-row input {
  min-width: 138px !important;
  max-width: 158px !important;
  width: auto !important;
  font-size: 10.5px !important;
  padding-left: 12px !important;
  padding-right: 12px !important;
}

@media (max-width: 1200px) {
  .frame-interpolation-row button,
  .frame-interpolation-row [role="button"],
  .frame-interpolation-row select,
  .frame-interpolation-row input {
    min-width: 126px !important;
    max-width: 146px !important;
    font-size: 10px !important;
    padding-left: 10px !important;
    padding-right: 10px !important;
  }
}

.lance-output-panel .output-text-control {
  min-height: 220px !important;
  border: 1px solid rgba(116,126,140,.34) !important;
  border-radius: 18px !important;
  background: linear-gradient(180deg, rgba(250,251,253,.94), rgba(244,246,249,.9)) !important;
  box-shadow: 0 10px 28px rgba(15,23,42,.10), inset 0 0 0 1px rgba(255,255,255,.75) !important;
  overflow: hidden !important;
  padding: 0 !important;
}

.lance-output-panel .output-text-control > div,
.lance-output-panel .output-text-control .wrap,
.lance-output-panel .output-text-control .container {
  border: 0 !important;
  background: transparent !important;
  box-shadow: none !important;
  padding: 0 !important;
}

.lance-output-panel .output-text-control textarea {
  min-height: 220px !important;
  border: 0 !important;
  border-radius: 18px !important;
  background: transparent !important;
  box-shadow: none !important;
  color: #101828 !important;
  padding: 18px !important;
  resize: none !important;
}

.prompt-options > .form {
  display: inline-flex !important;
  flex-wrap: nowrap !important;
  justify-content: flex-start !important;
  align-items: center !important;
  gap: 8px !important;
  width: auto !important;
  max-width: 100% !important;
}

.prompt-chip button,
.prompt-chip [role="button"],
.prompt-chip select,
.prompt-chip input {
  height: 36px !important;
  min-height: 36px !important;
  font-size: 12px !important;
  font-weight: 800 !important;
  padding-left: 12px !important;
  padding-right: 12px !important;
}

.frame-interpolation-row button,
.frame-interpolation-row [role="button"],
.frame-interpolation-row select,
.frame-interpolation-row input {
  min-width: 166px !important;
  max-width: 184px !important;
}

.video-resolution-row button,
.video-resolution-row [role="button"],
.video-resolution-row select,
.video-resolution-row input {
  min-width: 74px !important;
  max-width: 84px !important;
}

.aspect-ratio-row button,
.aspect-ratio-row [role="button"],
.aspect-ratio-row select,
.aspect-ratio-row input {
  min-width: 72px !important;
  max-width: 82px !important;
}

.video-duration-row button,
.video-duration-row [role="button"],
.video-duration-row select,
.video-duration-row input {
  min-width: 62px !important;
  max-width: 72px !important;
}

.output-resolution-row button,
.output-resolution-row [role="button"],
.output-resolution-row select,
.output-resolution-row input {
  min-width: 92px !important;
  max-width: 114px !important;
}

@media (max-width: 1200px) {
  .prompt-options > .form {
    gap: 6px !important;
  }
  .prompt-chip button,
  .prompt-chip [role="button"],
  .prompt-chip select,
  .prompt-chip input {
    height: 34px !important;
    min-height: 34px !important;
    font-size: 11px !important;
    padding-left: 9px !important;
    padding-right: 9px !important;
  }
  .frame-interpolation-row button,
  .frame-interpolation-row [role="button"],
  .frame-interpolation-row select,
  .frame-interpolation-row input {
    min-width: 148px !important;
    max-width: 166px !important;
  }
  .video-resolution-row button,
  .video-resolution-row [role="button"],
  .video-resolution-row select,
  .video-resolution-row input {
    min-width: 66px !important;
    max-width: 76px !important;
  }
  .aspect-ratio-row button,
  .aspect-ratio-row [role="button"],
  .aspect-ratio-row select,
  .aspect-ratio-row input {
    min-width: 64px !important;
    max-width: 74px !important;
  }
  .video-duration-row button,
  .video-duration-row [role="button"],
  .video-duration-row select,
  .video-duration-row input {
    min-width: 56px !important;
    max-width: 66px !important;
  }
}

.lance-run-button {
  margin-bottom: 6px !important;
}

.lance-quota-note,
.lance-quota-note > div,
.lance-quota-note .wrap,
.lance-quota-note .prose {
  min-height: 0 !important;
  padding-top: 0 !important;
  padding-bottom: 0 !important;
}

.lance-quota-note {
  max-width: 1040px !important;
  margin: 0 auto 8px !important;
  text-align: center !important;
  color: var(--lance-text-muted) !important;
  font-size: 12px !important;
  line-height: 1.1 !important;
}

.lance-quota-note p {
  margin: 0 !important;
  padding: 0 !important;
  line-height: 1.1 !important;
}

.frame-interpolation-row,
.frame-interpolation-disabled {
  display: none !important;
  visibility: hidden !important;
  width: 0 !important;
  max-width: 0 !important;
  height: 0 !important;
  max-height: 0 !important;
  min-height: 0 !important;
  margin: 0 !important;
  padding: 0 !important;
  overflow: hidden !important;
}

"""

# No custom JavaScript is configured at this point in the module.
APP_JS = None

# Internal task identifiers used by the backend helpers below.
TASK_T2V = "t2v"
TASK_T2I = "t2i"
TASK_V2T = "v2t"
TASK_X2T = "x2t"
TASK_X2T_VIDEO = "x2t_video"
TASK_X2T_IMAGE = "x2t_image"
TASK_IMAGE_EDIT = "image_edit"
TASK_VIDEO_EDIT = "video_edit"
# Human-readable task labels; these are the values listed in TASK_CHOICES.
TASK_LABEL_VIDEO_GENERATION = "Video Generation"
TASK_LABEL_VIDEO_EDIT = "Video Edit"
TASK_LABEL_VIDEO_UNDERSTANDING = "Video Understanding"
TASK_LABEL_IMAGE_GENERATION = "Image Generation"
TASK_LABEL_IMAGE_EDIT = "Image Edit"
TASK_LABEL_IMAGE_UNDERSTANDING = "Image Understanding"
TASK_CHOICES = [
    TASK_LABEL_VIDEO_GENERATION,
    TASK_LABEL_VIDEO_EDIT,
    TASK_LABEL_VIDEO_UNDERSTANDING,
    TASK_LABEL_IMAGE_GENERATION,
    TASK_LABEL_IMAGE_EDIT,
    TASK_LABEL_IMAGE_UNDERSTANDING,
]
# Lookup table used by normalize_task(): accepts either a display label or an
# internal identifier and yields the canonical internal identifier. The
# identifiers "v2t" and "x2t" are both mapped to "x2t_video".
TASK_LABEL_TO_INTERNAL = {
    TASK_LABEL_VIDEO_GENERATION: TASK_T2V,
    TASK_LABEL_VIDEO_EDIT: TASK_VIDEO_EDIT,
    TASK_LABEL_VIDEO_UNDERSTANDING: TASK_X2T_VIDEO,
    TASK_LABEL_IMAGE_GENERATION: TASK_T2I,
    TASK_LABEL_IMAGE_EDIT: TASK_IMAGE_EDIT,
    TASK_LABEL_IMAGE_UNDERSTANDING: TASK_X2T_IMAGE,
    TASK_T2V: TASK_T2V,
    TASK_VIDEO_EDIT: TASK_VIDEO_EDIT,
    TASK_V2T: TASK_X2T_VIDEO,
    TASK_X2T: TASK_X2T_VIDEO,
    TASK_X2T_VIDEO: TASK_X2T_VIDEO,
    TASK_T2I: TASK_T2I,
    TASK_IMAGE_EDIT: TASK_IMAGE_EDIT,
    TASK_X2T_IMAGE: TASK_X2T_IMAGE,
}
# Groupings of canonical internal identifiers. Every canonical task belongs to
# exactly one of GENERATION_TASKS / UNDERSTANDING_TASKS and exactly one of
# IMAGE_TASKS / VIDEO_TASKS.
GENERATION_TASKS = {TASK_T2V, TASK_T2I, TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
# Resolution choice lists; each currently holds only the corresponding default.
VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
VIDEO_EDIT_RESOLUTION_CHOICES = [DEFAULT_VIDEO_EDIT_RESOLUTION]
IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
# (label, value) pairs returned for the text-to-video task by
# get_resolution_choices_for_task().
VIDEO_RESOLUTION_DISPLAY_CHOICES = [("360p", "video_360p"), ("480p", "video_480p")]
# Fixed system prompts for the understanding tasks.
# NOTE(review): the video prompt contains a double space ("video  attentively");
# it is a runtime string sent with requests, so confirm it is intentional
# before normalizing it.
V2T_QA_SYSTEM_PROMPT = "View the video  attentively and provide a suitable answer to the posed question."
I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."


def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
    """Return the aspect-ratio choices for ``task`` as ``(label, value)`` pairs.

    Every label is the ratio itself: no default/recommended marker is added,
    so the result is the same for every valid task. The previous
    implementation computed the task's default ratio but produced the same
    label in both branches of its conditional; that dead code is removed here.

    Args:
        task: A task label or internal task identifier.

    Returns:
        One ``(ratio, ratio)`` pair per entry of ``ASPECT_RATIO_CHOICES``.

    Raises:
        ValueError: If ``task`` is not a supported task (via ``normalize_task``).
    """
    # Kept purely for validation so unknown tasks still raise as before.
    normalize_task(task)
    # assumes ASPECT_RATIO_CHOICES holds plain strings — TODO confirm
    return [(ratio, ratio) for ratio in ASPECT_RATIO_CHOICES]


def get_video_duration_choices() -> list[tuple[str, int]]:
    """Return ``(label, seconds)`` choices for durations of 1 through 10 seconds."""
    duration_choices = []
    for duration in range(1, 11):
        duration_choices.append((f"{duration}s", duration))
    return duration_choices

def env_flag(name: str, default: bool) -> bool:
    """Read a boolean flag from the environment.

    Args:
        name: Environment variable name.
        default: Value returned when the variable is not set at all.

    Returns:
        ``default`` if the variable is unset; otherwise ``True`` only when the
        trimmed, lower-cased value is one of ``1``, ``true``, ``yes`` or ``on``.
    """
    raw_value = os.environ.get(name)
    if raw_value is not None:
        return raw_value.strip().lower() in ("1", "true", "yes", "on")
    return default


def running_on_space() -> bool:
    """Return True when ``SPACE_ID`` or ``SPACE_HOST`` is set to a non-empty value."""
    return any(os.getenv(variable) for variable in ("SPACE_ID", "SPACE_HOST"))


def display_path(path: Path) -> str:
    """Render ``path`` as a POSIX string, preferring a ``./``-relative form.

    Absolute paths under the current working directory are rewritten relative
    to it; absolute paths elsewhere are returned unchanged. Any relative
    result that is not ``.`` and does not already start with ``./`` gets a
    ``./`` prefix.
    """
    if path.is_absolute():
        try:
            rendered = path.relative_to(Path.cwd()).as_posix()
        except ValueError:
            # Outside the working directory: keep the absolute form.
            return path.as_posix()
    else:
        rendered = path.as_posix()

    already_dot_relative = rendered == "." or rendered.startswith("./")
    return rendered if already_dot_relative else f"./{rendered}"


def get_model_base_dir() -> Path:
    """Choose the directory that holds (or will hold) the model assets.

    Resolution order:
      1. ``LANCE_MODEL_BASE_DIR`` when set and the path is an existing writable
         directory or could be created under a writable ancestor.
      2. ``LOCAL_MODEL_BASE_DIR`` when it already exists.
      3. ``SPACE_MODEL_BASE_DIR`` when running on a Space and that directory
         exists and is writable.
      4. ``LOCAL_MODEL_BASE_DIR`` as the final fallback.
    """
    env_override = os.getenv("LANCE_MODEL_BASE_DIR")
    if env_override:
        override_dir = Path(env_override).expanduser()
        if _path_can_be_created_or_written(override_dir):
            return override_dir

    use_space_dir = (
        not LOCAL_MODEL_BASE_DIR.exists()
        and running_on_space()
        and SPACE_MODEL_BASE_DIR.exists()
        and os.access(SPACE_MODEL_BASE_DIR, os.W_OK)
    )
    return SPACE_MODEL_BASE_DIR if use_space_dir else LOCAL_MODEL_BASE_DIR


def _path_can_be_created_or_written(path: Path) -> bool:
    """Report whether ``path`` is usable as a writable directory.

    An existing path must be a directory with write access. For a missing
    path, the nearest existing ancestor (stopping at the filesystem root or
    ``.``) must exist and be writable.
    """
    if path.exists():
        return path.is_dir() and os.access(path, os.W_OK)

    ancestor = path.parent
    # Climb until something exists or the top is reached (parent == self).
    while not (ancestor.exists() or ancestor == ancestor.parent):
        ancestor = ancestor.parent
    return ancestor.exists() and os.access(ancestor, os.W_OK)


def normalize_model_variant(model_variant: Optional[str] = None) -> str:
    """Map a variant name onto ``MODEL_VARIANT_IMAGE`` or ``MODEL_VARIANT_VIDEO``.

    When ``model_variant`` is empty or ``None`` the ``LANCE_MODEL_VARIANT``
    environment variable is consulted, falling back to
    ``DEFAULT_MODEL_VARIANT``. The values ``image``, ``t2i`` and ``i2t``
    (case-insensitive, surrounding whitespace ignored) select the image
    variant; everything else selects the video variant.
    """
    requested = model_variant or os.getenv("LANCE_MODEL_VARIANT", DEFAULT_MODEL_VARIANT)
    if requested.strip().lower() in ("image", "t2i", "i2t"):
        return MODEL_VARIANT_IMAGE
    return MODEL_VARIANT_VIDEO


def get_model_path(model_variant: Optional[str] = None) -> Path:
    """Resolve the checkpoint directory for the requested model variant.

    Precedence: the variant-specific environment variable
    (``LANCE_IMAGE_MODEL_PATH`` or ``LANCE_VIDEO_MODEL_PATH``), then the
    generic ``LANCE_MODEL_PATH``, then the variant's directory name under
    ``get_model_base_dir()``.
    """
    variant = normalize_model_variant(model_variant)
    if variant == MODEL_VARIANT_IMAGE:
        specific_env_name = "LANCE_IMAGE_MODEL_PATH"
    else:
        specific_env_name = "LANCE_VIDEO_MODEL_PATH"

    for env_name in (specific_env_name, "LANCE_MODEL_PATH"):
        override = os.getenv(env_name)
        if override:
            return Path(override).expanduser()

    variant_dir_name = MODEL_VARIANT_TO_DIR[variant]
    return get_model_base_dir() / variant_dir_name


def get_required_model_asset_paths(model_base_dir: Path, model_path: Path) -> list[Path]:
    """List the files that must exist before the model can be used.

    Two files live inside ``model_path`` (LLM config and weights) and two are
    shared assets under ``model_base_dir`` (ViT weights and the VAE checkpoint).
    """
    per_model_files = [model_path / file_name for file_name in ("llm_config.json", "model.safetensors")]
    shared_files = [
        model_base_dir / "Qwen2.5-VL-ViT" / "vit.safetensors",
        model_base_dir / "Wan2.2_VAE.pth",
    ]
    return per_model_files + shared_files


def get_model_download_allow_patterns(model_variant: Optional[str] = None) -> list[str]:
    """Build the allow-pattern list used when downloading model assets.

    The first pattern covers the selected variant's directory; the remaining
    patterns cover the shared ViT directory, the VAE checkpoint, and a fixed
    set of top-level config/tokenizer files.
    """
    variant_dir_name = MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]
    patterns = [f"{variant_dir_name}/**"]
    patterns.extend(
        (
            "Qwen2.5-VL-ViT/**",
            "Wan2.2_VAE.pth",
            "generation_config.json",
            "llm_config.json",
            "tokenizer.json",
            "tokenizer_config.json",
            "vocab.json",
            "merges.txt",
            "config.json",
        )
    )
    return patterns


def _get_safetensors_first_tensor_dtype(path: Path) -> Optional[torch.dtype]:
    """Return the dtype of the first tensor stored in a safetensors file.

    Returns ``None`` when the file does not exist or contains no tensors.
    """
    if not path.exists():
        return None
    with safe_open(str(path), framework="pt", device="cpu") as handle:
        tensor_names = list(handle.keys())
        if tensor_names:
            return handle.get_tensor(tensor_names[0]).dtype
        return None


def convert_model_weights_to_bf16_inplace(model_path: Path) -> bool:
    """Rewrite ``model_path/model.safetensors`` as bfloat16 when it is float32.

    The decision is based on the dtype of the first tensor only. When it is
    float32, every float32 tensor is cast to bfloat16 (tensors of other dtypes
    are copied unchanged), the result is written to a temporary file next to
    the original, and the original is then replaced via ``os.replace``.

    If anything fails before the replacement completes, the temporary file is
    removed so a failed conversion does not leave a partial copy of the
    weights on disk; the original exception is re-raised.

    Args:
        model_path: Directory expected to contain ``model.safetensors``.

    Returns:
        ``True`` if the weights were converted and replaced; ``False`` if the
        file is missing, empty, already bfloat16, or starts with a tensor of
        any dtype other than float32.
    """
    weight_path = model_path / "model.safetensors"
    if not weight_path.exists():
        return False

    first_dtype = _get_safetensors_first_tensor_dtype(weight_path)
    if first_dtype is None or first_dtype == torch.bfloat16:
        return False

    if first_dtype != torch.float32:
        print(
            f"[startup] Skipping bf16 conversion for {weight_path} because the first tensor dtype is {first_dtype}.",
            flush=True,
        )
        return False

    temp_path = weight_path.with_suffix(".bf16.safetensors.tmp")
    print(f"[startup] Converting {weight_path} to bf16 to reduce disk usage.", flush=True)
    try:
        with safe_open(str(weight_path), framework="pt", device="cpu") as f:
            metadata = f.metadata()
            tensors = {}
            for name in f.keys():
                tensor = f.get_tensor(name)
                tensors[name] = tensor.to(torch.bfloat16) if tensor.dtype == torch.float32 else tensor
            # Saved while the source file is still open, as in the original flow.
            save_file(tensors, str(temp_path), metadata=metadata)

        os.replace(temp_path, weight_path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error
        # is what the caller needs to see, so cleanup failures are ignored.
        try:
            temp_path.unlink()
        except OSError:
            pass
        raise

    print(f"[startup] Replaced original fp32 weights with bf16 weights at {weight_path}.", flush=True)
    return True


def compact_downloaded_model_weights(model_base_dir: Path, variants: Optional[list[str]] = None) -> None:
    """Attempt the bf16 conversion for each model directory under ``model_base_dir``.

    Args:
        model_base_dir: Directory containing the per-variant model directories.
        variants: Model directory names to process. When empty or ``None``,
            both the image and the video variant directories are processed.

    Failures are logged and skipped, so one bad directory never aborts startup.
    """
    if variants:
        target_dir_names = variants
    else:
        target_dir_names = [MODEL_VARIANT_TO_DIR[MODEL_VARIANT_IMAGE], MODEL_VARIANT_TO_DIR[MODEL_VARIANT_VIDEO]]

    for dir_name in target_dir_names:
        target_dir = model_base_dir / dir_name
        try:
            convert_model_weights_to_bf16_inplace(target_dir)
        except Exception as error:
            print(f"[startup] bf16 compaction skipped for {display_path(target_dir)}: {error}", flush=True)


def ensure_model_assets(model_variant: Optional[str] = None) -> Path:
    """Make sure the model assets for ``model_variant`` are available locally.

    Steps, in order:
      1. Resolve the base directory and publish it through the
         ``LANCE_MODEL_BASE_DIR`` environment variable.
      2. If every required file already exists, run the bf16 compaction and
         return the model path.
      3. If the base directory is ``.`` and a complete copy exists under
         ``downloads/``, switch to that copy.
      4. Otherwise download the assets with ``snapshot_download`` when
         ``LANCE_AUTO_DOWNLOAD`` is enabled (it defaults to on when running on
         a Space), or raise listing the missing files.

    Args:
        model_variant: Variant name; normalized with ``normalize_model_variant``.

    Returns:
        Path to the model directory for the selected variant.

    Raises:
        FileNotFoundError: If assets are missing and auto-download is disabled.
    """
    model_base_dir = get_model_base_dir()
    # Written before get_model_path() so later lookups that read this
    # variable see the same base directory.
    os.environ["LANCE_MODEL_BASE_DIR"] = display_path(model_base_dir)
    model_path = get_model_path(model_variant)

    required_paths = get_required_model_asset_paths(model_base_dir, model_path)
    if all(path.exists() for path in required_paths):
        compact_downloaded_model_weights(model_base_dir, [MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]])
        return model_path

    # Fallback: a complete asset set under ./downloads, considered only when
    # the base directory is the current directory.
    downloads_model_base_dir = Path("downloads")
    if model_base_dir == Path(".") and downloads_model_base_dir.exists():
        downloads_model_path = downloads_model_base_dir / MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]
        downloads_required_paths = get_required_model_asset_paths(downloads_model_base_dir, downloads_model_path)
        if all(path.exists() for path in downloads_required_paths):
            model_base_dir = downloads_model_base_dir
            model_path = downloads_model_path
            required_paths = downloads_required_paths
            os.environ["LANCE_MODEL_BASE_DIR"] = display_path(model_base_dir)
            compact_downloaded_model_weights(model_base_dir, [MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]])
            return model_path

    auto_download = env_flag("LANCE_AUTO_DOWNLOAD", running_on_space())
    if not auto_download:
        missing = "\n".join(f"- {display_path(path)}" for path in required_paths if not path.exists())
        raise FileNotFoundError(
            "Lance model assets are missing. Set LANCE_MODEL_BASE_DIR or enable "
            f"LANCE_AUTO_DOWNLOAD=1.\nMissing files:\n{missing}"
        )

    model_base_dir.mkdir(parents=True, exist_ok=True)
    repo_id = os.getenv("LANCE_MODEL_REPO_ID", DEFAULT_MODEL_REPO_ID)
    print(f"[startup] Downloading Lance model assets from {repo_id} to {display_path(model_base_dir)}", flush=True)
    hub_token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN")
    # NOTE(review): `local_dir_use_symlinks` and `resume_download` appear to be
    # deprecated in recent huggingface_hub releases — confirm against the
    # pinned version before upgrading.
    snapshot_path = Path(
        snapshot_download(
            repo_id=repo_id,
            local_dir=str(model_base_dir),
            local_dir_use_symlinks=False,
            resume_download=True,
            token=hub_token,
            allow_patterns=get_model_download_allow_patterns(model_variant),
        )
    )
    # If the download landed somewhere other than the expected base directory
    # and the model path is still missing, re-point the base directory at the
    # snapshot location and resolve the model path again.
    if snapshot_path != model_base_dir and not model_path.exists():
        os.environ["LANCE_MODEL_BASE_DIR"] = display_path(snapshot_path)
        model_path = get_model_path(model_variant)
    # NOTE(review): compaction still targets the original `model_base_dir`
    # even when the branch above switched to `snapshot_path` — confirm whether
    # the snapshot location should be compacted instead.
    compact_downloaded_model_weights(model_base_dir, [MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]])
    return model_path


def ensure_dirs() -> None:
    """Create the temporary-input and results directories if they are missing."""
    for directory in (TMP_INPUT_DIR, RESULTS_ROOT):
        directory.mkdir(parents=True, exist_ok=True)


def save_generation_record(record: dict, save_dir: Path) -> None:
    """Persist a run record in two places.

    The record is written as indented JSON to ``save_dir / RUN_RECORD_FILENAME``
    and appended as a single JSON line to ``GLOBAL_RECORDS_FILE``. The append
    is performed while holding ``RECORD_WRITE_LOCK``.
    """
    ensure_dirs()

    per_run_path = save_dir / RUN_RECORD_FILENAME
    with open(per_run_path, "w", encoding="utf-8") as run_file:
        json.dump(record, run_file, ensure_ascii=False, indent=2)

    with RECORD_WRITE_LOCK, GLOBAL_RECORDS_FILE.open("a", encoding="utf-8") as global_file:
        global_file.write(json.dumps(record, ensure_ascii=False) + "\n")


def normalize_seed(seed: int) -> int:
    """Return ``seed`` unchanged, or a random seed in ``[0, 2**31 - 1]`` when it is ``-1``."""
    if seed == -1:
        return random.randint(0, 2**31 - 1)
    return seed


def video_seconds_to_num_frames(seconds: int) -> int:
    """Convert a duration in seconds to a frame count of ``12 * seconds + 1``.

    The duration is converted with ``int()`` and clamped to the range 1..10
    before the conversion.
    """
    clamped_seconds = int(seconds)
    if clamped_seconds > 10:
        clamped_seconds = 10
    elif clamped_seconds < 1:
        clamped_seconds = 1
    return clamped_seconds * 12 + 1


def normalize_task(task: str) -> str:
    """Translate a task label or identifier into its canonical internal identifier.

    An empty or ``None`` task falls back to the "Video Generation" label. The
    stripped value is looked up in ``TASK_LABEL_TO_INTERNAL`` as given, then
    lower-cased.

    Args:
        task: Display label (e.g. "Video Edit") or internal identifier.

    Returns:
        One of the identifiers in ``GENERATION_TASKS | UNDERSTANDING_TASKS``.

    Raises:
        ValueError: If the task cannot be resolved. The message reports the
            caller's original value; previously it interpolated the failed
            lookup result, which was always an empty string.
    """
    task_key = (task or TASK_LABEL_VIDEO_GENERATION).strip()
    internal_task = TASK_LABEL_TO_INTERNAL.get(task_key)
    if internal_task is None:
        # Second chance: lower-cased form of the input.
        internal_task = TASK_LABEL_TO_INTERNAL.get(task_key.lower(), "")
    if internal_task not in GENERATION_TASKS | UNDERSTANDING_TASKS:
        raise ValueError(f"Unsupported task type: {task!r}")
    return internal_task


def normalize_resolution_choice_value(resolution: str, task: str) -> str:
    """Map a resolution label or value onto the choice value for ``task``.

    Choices may be plain values or ``(label, value)`` tuples; a match on either
    the label or the value yields the value as a string. When nothing matches,
    the stripped input text is returned as-is.
    """
    wanted = str(resolution or "").strip()
    for entry in get_resolution_choices_for_task(task):
        if isinstance(entry, tuple):
            label, value = entry
            accepted = {str(label), str(value)}
            canonical = str(value)
        else:
            canonical = str(entry)
            accepted = {canonical}
        if wanted in accepted:
            return canonical
    return wanted


def get_resolution_choice_values_for_task(task: str) -> list[str]:
    """Return only the values of the resolution choices for ``task``.

    ``(label, value)`` tuples contribute their second element; plain entries
    are passed through unchanged.
    """
    values = []
    for entry in get_resolution_choices_for_task(task):
        if isinstance(entry, tuple):
            values.append(entry[1])
        else:
            values.append(entry)
    return values


def get_resolution_choices_for_task(task: str) -> list[str | tuple[str, str]]:
    """Return the resolution choices that apply to ``task``.

    Image tasks get the image list, text-to-video gets the display pairs, and
    the remaining video tasks get the video-edit list. The module-level list
    objects themselves are returned, not copies.
    """
    internal_task = normalize_task(task)
    if internal_task in IMAGE_TASKS:
        return IMAGE_RESOLUTION_CHOICES
    if internal_task == TASK_T2V:
        return VIDEO_RESOLUTION_DISPLAY_CHOICES
    if internal_task in VIDEO_TASKS:
        return VIDEO_EDIT_RESOLUTION_CHOICES
    return VIDEO_RESOLUTION_CHOICES


def get_default_resolution_for_task(task: str) -> str:
    """Return the default resolution value for ``task``.

    Image tasks use ``DEFAULT_IMAGE_RESOLUTION``, text-to-video uses
    ``DEFAULT_RESOLUTION``, and the remaining video tasks use
    ``DEFAULT_VIDEO_EDIT_RESOLUTION``.
    """
    internal_task = normalize_task(task)
    if internal_task in IMAGE_TASKS:
        return DEFAULT_IMAGE_RESOLUTION
    if internal_task == TASK_T2V:
        return DEFAULT_RESOLUTION
    if internal_task in VIDEO_TASKS:
        return DEFAULT_VIDEO_EDIT_RESOLUTION
    return DEFAULT_RESOLUTION


def normalize_resolution_for_backend(resolution: str, task: str) -> str:
    """Return a resolution value that is valid for ``task``.

    The input is first mapped from label to value; if the result is not among
    the task's choice values, the task's default resolution is used instead.
    """
    internal_task = normalize_task(task)
    candidate = normalize_resolution_choice_value(resolution, internal_task)
    if candidate in get_resolution_choice_values_for_task(internal_task):
        return candidate
    return get_default_resolution_for_task(internal_task)


def get_default_aspect_ratio(task: str) -> str:
    """Return the default aspect ratio: the image default for image tasks, else the video default."""
    if normalize_task(task) in IMAGE_TASKS:
        return DEFAULT_IMAGE_ASPECT_RATIO
    return DEFAULT_VIDEO_ASPECT_RATIO


def normalize_video_resolution(resolution: Optional[str], task: Optional[str] = None) -> str:
    """Return a usable video resolution value.

    Without a task, ``resolution`` is kept only if it is listed in
    ``VIDEO_RESOLUTION_CHOICES``; otherwise ``DEFAULT_RESOLUTION`` is returned.
    With a task, the value is normalized against that task's choices and falls
    back to the task's default resolution.
    """
    if task is not None:
        candidate = normalize_resolution_choice_value(resolution, task)
        allowed_values = get_resolution_choice_values_for_task(task)
        if candidate in allowed_values:
            return candidate
        return get_default_resolution_for_task(task)

    if resolution in VIDEO_RESOLUTION_CHOICES:
        return resolution
    return DEFAULT_RESOLUTION


def get_size_for_aspect_ratio(task: str, aspect_ratio: str, video_resolution: Optional[str] = None) -> tuple[int, int]:
    """Look up the size configured for an aspect ratio.

    Unknown aspect ratios fall back to the task's default ratio. Image tasks
    read from ``IMAGE_ASPECT_RATIO_TO_SIZE``; other tasks read from the map
    selected by the normalized video resolution. The tuple is unpacked by
    callers in this module as ``(width, height)``.
    """
    internal_task = normalize_task(task)
    if aspect_ratio not in ASPECT_RATIO_CHOICES:
        aspect_ratio = get_default_aspect_ratio(internal_task)

    if internal_task in IMAGE_TASKS:
        return IMAGE_ASPECT_RATIO_TO_SIZE[aspect_ratio]

    resolution_key = normalize_video_resolution(video_resolution, internal_task)
    return VIDEO_RESOLUTION_TO_SIZE_MAP[resolution_key][aspect_ratio]


def format_size_markdown(task: str, width: int, height: int) -> str:
    """Format a size as ``"<width> x <height>"``; understanding tasks get an empty string."""
    if normalize_task(task) in UNDERSTANDING_TASKS:
        return ""
    return f"{width} x {height}"


def get_size_map_for_task(task: str, video_resolution: Optional[str] = None) -> dict[str, tuple[int, int]]:
    """Return the aspect-ratio-to-size map for ``task``.

    Image tasks share one map; other tasks use the map selected by the
    normalized video resolution.
    """
    internal_task = normalize_task(task)
    if internal_task not in IMAGE_TASKS:
        resolution_key = normalize_video_resolution(video_resolution, internal_task)
        return VIDEO_RESOLUTION_TO_SIZE_MAP[resolution_key]
    return IMAGE_ASPECT_RATIO_TO_SIZE


def get_output_resolution_choices_for_task(task: str, video_resolution: Optional[str] = None) -> list[tuple[str, str]]:
    """Return output-resolution choices, one per entry of ``ASPECT_RATIO_CHOICES``.

    Each choice is a ``(label, value)`` pair in which both elements are the
    size text produced by ``format_size_markdown`` (an empty string for
    understanding tasks). No marker is added for the default ratio: the
    previous implementation looked the default up but produced the same label
    in both branches, so that dead code is removed here.

    Args:
        task: A task label or internal task identifier.
        video_resolution: Video resolution used to pick the size map for
            non-image tasks.

    Raises:
        ValueError: If ``task`` is not a supported task (via ``normalize_task``).
    """
    internal_task = normalize_task(task)
    size_map = get_size_map_for_task(internal_task, video_resolution)
    choices = []
    for ratio in ASPECT_RATIO_CHOICES:
        width, height = size_map[ratio]
        resolution_text = format_size_markdown(internal_task, width, height)
        choices.append((resolution_text, resolution_text))
    return choices




def build_lance_label_html(text: str, *extra_classes: str) -> str:
    """Build a ``<div>`` section label with HTML-escaped text.

    The ``lance-section-label`` class always comes first, followed by any
    ``extra_classes``. Class names are inserted without escaping.
    """
    css_classes = ["lance-section-label"]
    css_classes.extend(extra_classes)
    class_attribute = " ".join(css_classes).strip()
    escaped_text = html.escape(text)
    return f'<div class="{class_attribute}">{escaped_text}</div>'


def build_lance_icon_label_html(text: str, icon: str, *extra_classes: str) -> str:
    """Build a section label ``<div>`` with an optional inline SVG icon.

    Args:
        text: Label text; HTML-escaped before insertion.
        icon: One of ``video``, ``image``, ``text`` or ``logs``. Any other
            value produces a label without an icon.
        *extra_classes: Additional CSS classes appended after
            ``lance-section-label lance-icon-label``.

    Returns:
        The HTML fragment as a string.
    """
    svg_by_icon = {
        "video": """
            <span class="lance-label-icon" aria-hidden="true">
                <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round">
                    <rect x="3.5" y="6" width="11" height="12" rx="2.2"></rect>
                    <path d="M15 10.2 20.5 7v10L15 13.8z" fill="currentColor" stroke="none"></path>
                </svg>
            </span>
        """,
        "image": """
            <span class="lance-label-icon" aria-hidden="true">
                <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round">
                    <rect x="3.5" y="5.5" width="17" height="13" rx="2.2"></rect>
                    <circle cx="9" cy="10" r="1.5" fill="currentColor" stroke="none"></circle>
                    <path d="M5.5 16.5 10 12l2.7 2.7 2.1-2.1 3.7 3.9"></path>
                </svg>
            </span>
        """,
        "text": """
            <span class="lance-label-icon" aria-hidden="true">
                <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round">
                    <rect x="3.5" y="5.5" width="17" height="13" rx="2.2"></rect>
                    <path d="M7 9h10"></path>
                    <path d="M7 12h7.5"></path>
                    <path d="M7 15h5.5"></path>
                </svg>
            </span>
        """,
        "logs": """
            <span class="lance-label-icon" aria-hidden="true">
                <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round">
                    <rect x="3.5" y="5.5" width="17" height="13" rx="2.2"></rect>
                    <path d="M7 10.2 10 12l-3 1.8"></path>
                    <path d="M12.5 15h4"></path>
                </svg>
            </span>
        """,
    }
    css_classes = ["lance-section-label", "lance-icon-label"]
    css_classes.extend(extra_classes)
    class_attribute = " ".join(css_classes).strip()
    # Unknown icon names render the label without an icon.
    svg_markup = svg_by_icon.get(icon, "")
    escaped_text = html.escape(text)
    return f'<div class="{class_attribute}">{svg_markup}<span>{escaped_text}</span></div>'


def update_size_from_aspect_ratio(task: str, aspect_ratio: str, video_resolution: Optional[str] = None):
    """Recompute the size after an aspect-ratio change.

    Returns:
        ``(height, width, update)`` where ``update`` is a ``gr.update`` that
        refreshes the output-resolution choices and selects the new size text.
    """
    width, height = get_size_for_aspect_ratio(task, aspect_ratio, video_resolution)
    resolution_choices = get_output_resolution_choices_for_task(task, video_resolution)
    selected_text = format_size_markdown(task, width, height)
    return height, width, gr.update(choices=resolution_choices, value=selected_text)


def update_output_resolution_from_video_profile(task: str, aspect_ratio: str, video_resolution: str):
    """Recompute the size after a video-resolution change.

    Returns:
        ``(update, height, width)`` where ``update`` is a ``gr.update`` that
        refreshes the output-resolution choices and selects the new size text.
    """
    width, height = get_size_for_aspect_ratio(task, aspect_ratio, video_resolution)
    resolution_choices = get_output_resolution_choices_for_task(task, video_resolution)
    selected_text = format_size_markdown(task, width, height)
    dropdown_update = gr.update(choices=resolution_choices, value=selected_text)
    return dropdown_update, height, width


def reset_generation_defaults_for_task(task: str):
    """Return the default generation settings for ``task``.

    Returns:
        A 6-tuple ``(aspect_ratio, height, width, duration, resolution, update)``
        where ``duration`` is ``DEFAULT_VIDEO_DURATION_SECONDS`` and ``update``
        is a ``gr.update`` carrying the output-resolution choices and the
        default size text.
    """
    internal_task = normalize_task(task)
    default_ratio = get_default_aspect_ratio(internal_task)
    default_resolution = get_default_resolution_for_task(internal_task)
    width, height = get_size_for_aspect_ratio(internal_task, default_ratio, default_resolution)
    resolution_update = gr.update(
        choices=get_output_resolution_choices_for_task(internal_task, default_resolution),
        value=format_size_markdown(internal_task, width, height),
    )
    return (
        default_ratio,
        height,
        width,
        DEFAULT_VIDEO_DURATION_SECONDS,
        default_resolution,
        resolution_update,
    )


def make_prompt_example_click_handler(prompt_text: str, cache_key: str = ""):
    """Build the click callback for a text-only prompt-example row.

    The row is rendered with an ordinary button so long prompts are not
    truncated the way gr.Dataset / gr.Examples preview cells truncate them;
    the complete prompt is held in the returned closure and emitted on click.

    The callback takes the current task and returns the prompt text, the
    packed cache carrier for ``cache_key``, and then the task's default
    generation settings.
    """

    def _handler(task: str):
        generation_defaults = reset_generation_defaults_for_task(task)
        cache_carrier = pack_recommended_cache_carrier(cache_key, task)
        return (prompt_text, cache_carrier, *generation_defaults)

    return _handler


def make_media_prompt_example_click_handler(
    prompt_text: str,
    input_video_path: Optional[str] = None,
    input_image_path: Optional[str] = None,
    cache_key: str = "",
):
    """Build the click callback for an edit/understanding example row.

    Besides the full prompt text, the closure keeps the example's media paths
    so a single click can populate every required input.

    The callback takes the current task and returns the prompt text, the video
    path, the image path, the packed cache carrier for ``cache_key``, and then
    the task's default generation settings.
    """

    def _handler(task: str):
        generation_defaults = reset_generation_defaults_for_task(task)
        cache_carrier = pack_recommended_cache_carrier(cache_key, task)
        return (prompt_text, input_video_path, input_image_path, cache_carrier, *generation_defaults)

    return _handler


def get_understanding_system_prompt_choices(task: str) -> list[str]:
    """Return the single-entry system-prompt list for ``task``.

    Image understanding gets the image QA prompt; every other valid task gets
    the video QA prompt.
    """
    if normalize_task(task) == TASK_X2T_IMAGE:
        selected_prompt = I2T_QA_SYSTEM_PROMPT
    else:
        selected_prompt = V2T_QA_SYSTEM_PROMPT
    return [selected_prompt]


def normalize_understanding_system_prompt(task: str, system_prompt: Optional[str]) -> str:
    """Return the fixed system prompt for ``task``.

    ``system_prompt`` is accepted but ignored: the first (and only) choice for
    the task is always returned.
    """
    available_prompts = get_understanding_system_prompt_choices(task)
    return available_prompts[0]


# Marker placed at the start of the system-prompt string when it carries a
# recommended-case cache key (see pack/unpack_recommended_cache_carrier).
RECOMMENDED_CACHE_CARRIER_PREFIX = "__LANCE_RECOMMENDED_CASE_KEY__="


def pack_recommended_cache_carrier(cache_key: str, task: str) -> str:
    """Encode a recommended-case key into the hidden system_prompt value.

    Only the example's identity travels this way, so Generate keeps its
    original Gradio inputs. Whether a cached output may be reused is decided
    later against the full request signature, which prevents user-edited
    parameters from matching the wrong output.

    Returns:
        The base prompt alone when ``cache_key`` is empty; otherwise the
        carrier prefix, the key, a newline, and the base prompt. The base
        prompt is the task's system prompt for understanding tasks and an
        empty string for all other tasks.
    """
    internal_task = normalize_task(task)
    if internal_task in UNDERSTANDING_TASKS:
        base_prompt = normalize_understanding_system_prompt(internal_task, None)
    else:
        base_prompt = ""

    if cache_key:
        return f"{RECOMMENDED_CACHE_CARRIER_PREFIX}{cache_key}\n{base_prompt}"
    return base_prompt


def unpack_recommended_cache_carrier(system_prompt: Optional[str]) -> tuple[str, Optional[str]]:
    """Split a carrier string into ``(cache_key, base_prompt)``.

    When the value does not start with the carrier prefix, the key is empty
    and ``system_prompt`` is returned untouched. Otherwise the key is the
    stripped text up to the first newline, and the base prompt is whatever
    follows it (``None`` when nothing follows).
    """
    carrier_text = str(system_prompt or "")
    if carrier_text.startswith(RECOMMENDED_CACHE_CARRIER_PREFIX):
        remainder = carrier_text[len(RECOMMENDED_CACHE_CARRIER_PREFIX):]
        key_part, _, prompt_part = remainder.partition("\n")
        return key_part.strip(), (prompt_part or None)
    return "", system_prompt


def create_request_json(
    task: str,
    prompt: str,
    input_video: Optional[str],
    input_image: Optional[str],
    system_prompt: Optional[str] = None,
) -> Path:
    """Write the request payload for ``task`` to a timestamped JSON file.

    Text-to-video and text-to-image requests map one output filename to the
    prompt. Edit and understanding requests hold a single ``"000000"`` sample
    with parallel ``interleave_array`` / ``element_dtype_array`` /
    ``istarget_in_interleave`` lists.

    Args:
        task: Internal task identifier (compared directly; not normalized here).
        prompt: Prompt, instruction, or question text.
        input_video: Video path, required for the video edit/understanding tasks.
        input_image: Image path, required for the image edit/understanding tasks.
        system_prompt: Passed to ``normalize_understanding_system_prompt`` for
            understanding tasks.

    Returns:
        Path of the JSON file written under ``TMP_INPUT_DIR``.

    Raises:
        ValueError: If required media is missing or the task is unsupported.
    """
    ensure_dirs()
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    request_path = TMP_INPUT_DIR / f"{task}_{stamp}.json"

    def _single_sample(items, dtypes, target_flags):
        # One sample keyed "000000" with the three parallel lists.
        return {
            "000000": {
                "interleave_array": items,
                "element_dtype_array": dtypes,
                "istarget_in_interleave": target_flags,
            }
        }

    if task == TASK_T2V:
        payload = {"000000.mp4": prompt}
    elif task == TASK_T2I:
        payload = {"000000.png": prompt}
    elif task == TASK_VIDEO_EDIT:
        if not input_video:
            raise ValueError("The video edit task requires an input video.")
        payload = _single_sample(
            [prompt, input_video, input_video],
            ["text", "video", "video"],
            [0, 0, 1],
        )
    elif task == TASK_IMAGE_EDIT:
        if not input_image:
            raise ValueError("The image edit task requires an input image.")
        payload = _single_sample(
            [prompt, input_image, input_image],
            ["text", "image", "image"],
            [0, 0, 1],
        )
    elif task == TASK_X2T_VIDEO:
        if not input_video:
            raise ValueError("The video understanding task requires an input video.")
        system_prompt = normalize_understanding_system_prompt(task, system_prompt)
        payload = _single_sample(
            [input_video, [system_prompt, prompt, ""]],
            ["video", "text"],
            [0, 1],
        )
    elif task == TASK_X2T_IMAGE:
        if not input_image:
            raise ValueError("The image understanding task requires an input image.")
        system_prompt = normalize_understanding_system_prompt(task, system_prompt)
        payload = _single_sample(
            [input_image, [system_prompt, prompt, ""]],
            ["image", "text"],
            [0, 1],
        )
    else:
        raise ValueError(f"Unsupported task type: {task}")

    with open(request_path, "w", encoding="utf-8") as request_file:
        json.dump(payload, request_file, ensure_ascii=False, indent=2)
    return request_path


def resolve_example_path(path: str) -> str:
    """Resolve an example path to an absolute string where possible.

    Absolute paths are returned as-is (existence is not checked). Relative
    paths are tried under ``REPO_ROOT`` first, then as given; the first one
    that exists is returned resolved. If neither exists, the input string is
    returned unchanged.
    """
    candidate = Path(path)
    if candidate.is_absolute():
        return str(candidate)
    for option in (REPO_ROOT / candidate, candidate):
        if option.exists():
            return str(option.resolve())
    return path


def resolve_browser_video_example_path(path: str) -> str:
    """Resolve the video file to use for in-browser preview.

    A sibling file named ``<stem>_h264<suffix>`` is preferred when it exists;
    otherwise the original file is used. Relative paths are looked up under
    ``REPO_ROOT``. If none of those exist, the result of
    ``resolve_example_path`` is returned.
    """
    original = Path(path)
    h264_variant = original.with_name(f"{original.stem}_h264{original.suffix}")

    for option in (h264_variant, original):
        # Relative options are only checked under the repository root.
        located = option if option.is_absolute() else REPO_ROOT / option
        if located.exists():
            return str(located.resolve())
    return resolve_example_path(path)


def resolve_video_example_paths(path: str) -> tuple[str, str]:
    """Resolve a reference video to ``(browser_preview_path, model_input_path)``."""
    browser_preview_path = resolve_browser_video_example_path(path)
    model_input_path = resolve_example_path(path)
    return browser_preview_path, model_input_path


def _resolve_existing_media_path(media_path: Optional[str]) -> Optional[Path]:
    """Return the resolved path of an existing media file, or ``None``.

    Absolute paths are checked directly. Relative paths are tried under
    ``REPO_ROOT`` first and then as given. Candidates that fail to
    expand/resolve are skipped.
    """
    if not media_path:
        return None

    raw_path = Path(str(media_path))
    if raw_path.is_absolute():
        search_order = [raw_path]
    else:
        search_order = [REPO_ROOT / raw_path, raw_path]

    for option in search_order:
        try:
            absolute = option.expanduser().resolve()
        except Exception:
            continue
        if absolute.exists():
            return absolute
    return None


def build_gradio_media_url(media_path: Optional[str]) -> str:
    """Build a Gradio file-serving URL for local recommended-case media.

    The path is resolved to an existing file when possible; otherwise the raw
    input is used. An empty input yields an empty string. The URL has the form
    ``<API_PREFIX>/file=<percent-encoded path>`` where ``/`` and ``:`` are left
    unescaped, and ``API_PREFIX`` is taken from ``gradio.route_utils`` when
    that import succeeds (empty otherwise).
    """
    located = _resolve_existing_media_path(media_path)
    if located:
        target = str(located)
    else:
        target = str(media_path or "")
    if not target:
        return ""
    try:
        from gradio.route_utils import API_PREFIX as prefix
    except Exception:
        prefix = ""
    encoded = quote(target, safe="/:")
    return f"{prefix or ''}/file={encoded}"


def build_example_media_html(media_path: Optional[str], media_type: str, fallback_media_path: Optional[str] = None) -> str:
    """Build a lightweight complete-fit media preview for recommended cases.

    Args:
        media_path: Primary media file to preview.
        media_type: ``"video"`` renders a ``<video>`` element; any other value
            renders an ``<img>`` element.
        fallback_media_path: Optional second video source; only used for
            videos and skipped when it maps to the same URL as the primary.

    Returns:
        An HTML snippet, or a fallback ``<div>`` when no URL could be built.
    """
    if media_type != "video":
        image_url = build_gradio_media_url(media_path)
        if not image_url:
            return '<div class="reference-media-fallback">Image file not found</div>'
        alt_text = html.escape(Path(str(media_path)).name or "example image", quote=True)
        return f'<img class="example-preview-image" src="{html.escape(image_url, quote=True)}" alt="{alt_text}" loading="lazy" />'

    video_urls: list[str] = []
    for option in (media_path, fallback_media_path):
        option_url = build_gradio_media_url(option)
        if option_url and option_url not in video_urls:
            video_urls.append(option_url)
    if not video_urls:
        return '<div class="reference-media-fallback">Video file not found</div>'

    pieces = ['<video class="example-preview-video" controls muted preload="metadata" playsinline>']
    pieces.extend(
        f'<source src="{html.escape(video_url, quote=True)}" type="video/mp4">'
        for video_url in video_urls
    )
    pieces.append('Your browser cannot play this reference video.</video>')
    return "".join(pieces)


# Repository-local recommended-case cache (under the app.py directory unless
# LANCE_LOCAL_RECOMMENDED_OUTPUT_CACHE_DIR overrides it). Caches generated at
# runtime land here by default, so each case can be committed with the repo.
LOCAL_RECOMMENDED_OUTPUT_CACHE_DIR = Path(
    os.environ.get(
        "LANCE_LOCAL_RECOMMENDED_OUTPUT_CACHE_DIR",
        str(REPO_ROOT / "lance_gradio" / "recommended_outputs"),
    )
).expanduser()

# Space/runtime cache root (under GRADIO_TMP_ROOT unless
# LANCE_SPACE_RECOMMENDED_OUTPUT_CACHE_DIR overrides it). Kept as a read/query
# fallback so caches previously saved on the running Space instance still hit.
SPACE_RECOMMENDED_OUTPUT_CACHE_DIR = Path(
    os.environ.get(
        "LANCE_SPACE_RECOMMENDED_OUTPUT_CACHE_DIR",
        str(GRADIO_TMP_ROOT / "recommended_outputs"),
    )
).expanduser()

# Writable target used by store_recommended_cached_result(). It defaults to the
# repository-local directory above; set LANCE_RECOMMENDED_OUTPUT_CACHE_DIR to
# point it somewhere else explicitly.
RECOMMENDED_OUTPUT_CACHE_DIR = Path(
    os.environ.get(
        "LANCE_RECOMMENDED_OUTPUT_CACHE_DIR",
        str(LOCAL_RECOMMENDED_OUTPUT_CACHE_DIR),
    )
).expanduser()
# Alias of the repository-local cache directory.
ASSET_RECOMMENDED_OUTPUT_CACHE_DIR = LOCAL_RECOMMENDED_OUTPUT_CACHE_DIR
# In-memory registry of recommended cases, keyed by cache key and filled by
# register_recommended_case_cache().
RECOMMENDED_CASE_CACHE: dict[str, dict] = {}


def _sanitize_cache_token(value: object) -> str:
    """Turn an arbitrary value into a filename-safe cache token.

    Falsy values become the empty string before cleaning. Every run of
    characters outside ``A-Z a-z 0-9 . _ -`` collapses to a single ``-``, and
    leading/trailing dashes are removed. An empty result becomes
    ``"default"``.
    """
    raw = str(value).strip() if value else ""
    collapsed = re.sub(r"[^A-Za-z0-9._-]+", "-", raw).strip("-")
    if collapsed:
        return collapsed
    return "default"


def _recommended_output_type(task: str) -> str:
    """Map a task to its output kind: ``"video"``, ``"image"`` or ``"text"``.

    The task is normalised first. ``TASK_T2V`` and ``TASK_VIDEO_EDIT`` produce
    video, ``TASK_T2I`` and ``TASK_IMAGE_EDIT`` produce images, and every
    other task is treated as text output.
    """
    canonical = normalize_task(task)
    kinds = (
        ("video", (TASK_T2V, TASK_VIDEO_EDIT)),
        ("image", (TASK_T2I, TASK_IMAGE_EDIT)),
    )
    for kind, members in kinds:
        if canonical in members:
            return kind
    return "text"


def _recommended_output_suffixes(output_type: str) -> tuple[str, ...]:
    """Return the accepted file suffixes for an output kind.

    The first suffix of each tuple is the one used when a default filename
    has to be built. Any kind other than ``"video"`` or ``"image"`` gets the
    text suffixes.
    """
    media_suffixes = {
        "video": (".mp4", ".webm", ".mov"),
        "image": (".png", ".jpg", ".jpeg", ".webp"),
    }
    return media_suffixes.get(output_type, (".txt", ".json"))


def _default_recommended_output_name(task: str, example_id: str) -> str:
    """Derive the default cache filename for a recommended case.

    The final path component of ``example_id`` is used as the base name (or
    its sanitised token when that component is empty). If the base already
    carries a suffix valid for the task's output kind (compared
    case-insensitively) it is returned unchanged; otherwise its stem is
    combined with the first suffix of that output kind.
    """
    allowed = _recommended_output_suffixes(_recommended_output_type(task))
    base_name = Path(str(example_id)).name or _sanitize_cache_token(example_id)
    base_path = Path(base_name)
    if base_path.suffix.lower() in allowed:
        return base_name
    stem = base_path.stem or _sanitize_cache_token(example_id)
    return f"{stem}{allowed[0]}"


def _cache_roots() -> list[Path]:
    """List the cache roots to query: writable cache first, then the Space cache.

    Roots that resolve to the same location are reported once, keeping the
    first occurrence. When a root cannot be resolved, its plain string form
    is used for the duplicate check.
    """
    ordered: dict[str, Path] = {}
    for root in (RECOMMENDED_OUTPUT_CACHE_DIR, SPACE_RECOMMENDED_OUTPUT_CACHE_DIR):
        try:
            identity = str(root.expanduser().resolve())
        except Exception:
            identity = str(root)
        # setdefault keeps the first root seen for each identity.
        ordered.setdefault(identity, root)
    return list(ordered.values())


def _infer_aspect_ratio_from_size(task: str, width: int, height: int, resolution: Optional[str]) -> str:
    """Recover the aspect-ratio label matching an explicit width/height.

    The size map for the normalised task and resolution is searched for an
    entry equal to ``(width, height)``. If the lookup fails for any reason or
    nothing matches, the task's default aspect ratio is returned.
    """
    canonical = normalize_task(task)
    try:
        size_map = get_size_map_for_task(canonical, resolution)
        wanted = (int(width), int(height))
        for ratio in size_map:
            if tuple(size_map[ratio]) == wanted:
                return ratio
    except Exception:
        # Best effort only: fall through to the task default below.
        pass
    return get_default_aspect_ratio(canonical)


def _canonical_float_for_cache(value: object) -> str:
    """Render a numeric parameter as a stable string for cache signatures.

    Values convertible to ``float`` are formatted with ten significant digits
    (``.10g``), so equivalent representations such as ``3.5`` and ``"3.500"``
    produce the same text while distinct values stay distinct. Values that
    cannot be converted fall back to their plain string form (empty for falsy
    values).
    """
    try:
        as_float = float(value)
    except Exception:
        return str(value) if value else ""
    return format(as_float, ".10g")


def _cache_media_content_hash_enabled() -> bool:
    """Report whether media inputs are identified by content hash.

    Controlled by the ``LANCE_CACHE_MEDIA_CONTENT_HASH`` flag (default on).
    On Spaces, Gradio may copy example videos to a temporary file before the
    backend receives them, so path/mtime identities differ from local runs
    even when the bytes are identical. Hashing the content keeps example
    media identities stable across repo paths and Gradio temp paths.
    """
    enabled = env_flag("LANCE_CACHE_MEDIA_CONTENT_HASH", True)
    return enabled


def _cache_media_hash_max_bytes() -> int:
    """Return the size limit (in bytes) above which media is not hashed.

    Read from ``LANCE_CACHE_MEDIA_HASH_MAX_BYTES``; the default, also used
    when the variable does not parse as an integer, is 512 MiB.
    """
    fallback = 512 * 1024 * 1024
    configured = os.getenv("LANCE_CACHE_MEDIA_HASH_MAX_BYTES", str(fallback))
    try:
        return int(configured)
    except Exception:
        return fallback


def _media_content_identity_for_cache(path: Path) -> str:
    """Return a ``sha256:<hex>:<size>`` identity for a media file.

    An empty string is returned when content hashing is disabled, when the
    file is larger than the configured positive byte limit, or when the file
    cannot be read. The file is hashed in 1 MiB chunks.
    """
    if not _cache_media_content_hash_enabled():
        return ""
    try:
        size = path.stat().st_size
        limit = _cache_media_hash_max_bytes()
        # A non-positive limit disables the size check entirely.
        if limit > 0 and size > limit:
            return ""
        hasher = hashlib.sha256()
        with path.open("rb") as stream:
            while True:
                piece = stream.read(1024 * 1024)
                if not piece:
                    break
                hasher.update(piece)
        return f"sha256:{hasher.hexdigest()}:{size}"
    except Exception:
        return ""


def _canonical_media_identity_for_cache(media_path: Optional[str]) -> str:
    """Return a stable identity for media inputs used by recommended-case cache.

    Example files can arrive as repo-relative paths from JSON, as resolved
    absolute paths, or as Space/Gradio temp-file paths. For the first
    location that exists (absolute as given; relative under ``REPO_ROOT`` and
    then as given) the identity is chosen in this order:

    1. ``sha256:...`` content identity, so the same bytes match across local
       and Space runs even when Gradio rewrites the path;
    2. ``repo:<relative posix path>`` when the file lives under ``REPO_ROOT``;
    3. ``file:<posix path>:<size>:<mtime_ns>`` (or just ``file:<posix path>``
       when the file cannot be stat'ed).

    Empty input yields ``""``; a path that exists nowhere yields
    ``path:<original text>``.
    """
    if not media_path:
        return ""

    raw_text = str(media_path)
    expanded = Path(raw_text).expanduser()
    if expanded.is_absolute():
        search_order = [expanded]
    else:
        search_order = [REPO_ROOT / expanded, expanded]

    for option in search_order:
        try:
            located = option.resolve()
        except Exception:
            continue
        if not located.exists():
            continue

        hashed = _media_content_identity_for_cache(located)
        if hashed:
            return hashed

        try:
            repo_relative = located.relative_to(REPO_ROOT.resolve()).as_posix()
            return f"repo:{repo_relative}"
        except Exception:
            # Not under the repository root; use a path/stat identity instead.
            pass
        try:
            info = located.stat()
            return f"file:{located.as_posix()}:{info.st_size}:{int(info.st_mtime_ns)}"
        except Exception:
            return f"file:{located.as_posix()}"

    return f"path:{raw_text}"


def _stable_json_for_cache(payload: dict) -> str:
    """Serialise ``payload`` to compact JSON with sorted keys.

    Non-ASCII characters are kept as-is and no whitespace is emitted, so two
    dicts with the same content always produce the same text regardless of
    key insertion order.
    """
    encoder_options = {
        "ensure_ascii": False,
        "sort_keys": True,
        "separators": (",", ":"),
    }
    return json.dumps(payload, **encoder_options)


def _recommended_request_signature_hash(request_signature: Optional[dict]) -> str:
    """Return a 20-hex-character SHA-256 digest of a request signature.

    The signature is serialised with ``_stable_json_for_cache`` and encoded
    as UTF-8 before hashing. A missing or empty signature yields ``""``.
    """
    if request_signature:
        canonical = _stable_json_for_cache(request_signature).encode("utf-8")
        return hashlib.sha256(canonical).hexdigest()[:20]
    return ""


def _recommended_request_cacheable(request_signature: Optional[dict]) -> bool:
    """Tell whether a request may be served from / stored in the cache.

    A seed of -1 intentionally means random: the actual seed is sampled inside
    the generation path, so using a pre-existing cache would be misleading.

    Args:
        request_signature: Signature dict; its ``"seed"`` entry is inspected
            (a missing entry counts as seed 0).

    Returns:
        ``False`` for a missing/empty signature, for seed -1, or when the seed
        cannot be interpreted as an integer; ``True`` otherwise.
    """
    if not request_signature:
        return False
    try:
        seed = int(request_signature.get("seed", 0))
    except (TypeError, ValueError, OverflowError):
        # A seed that is None or not numeric cannot identify a reproducible
        # run, so answer "not cacheable" instead of raising from a predicate.
        return False
    return seed != -1


def _recommended_signatures_equal(left: Optional[dict], right: Optional[dict]) -> bool:
    """Compare two request signatures by their stable JSON form.

    Returns ``False`` whenever either side is missing or empty.
    """
    if left and right:
        return _stable_json_for_cache(left) == _stable_json_for_cache(right)
    return False


def _recommended_cache_media_alias_enabled() -> bool:
    """Report whether media-alias cache matching is allowed.

    Controlled by ``LANCE_RECOMMENDED_CACHE_ALLOW_MEDIA_ALIAS`` (default on).
    Gradio Spaces may copy or transcode example media before the backend sees
    it. For recommended cases this flag lets legacy cache files match when all
    non-media parameters are identical and only the media identity differs.
    """
    allowed = env_flag("LANCE_RECOMMENDED_CACHE_ALLOW_MEDIA_ALIAS", True)
    return allowed


def _recommended_signatures_equal_ignoring_media(left: Optional[dict], right: Optional[dict]) -> bool:
    """Compare two request signatures while ignoring their media identities.

    The ``input_video`` and ``input_image`` entries are dropped from copies of
    both signatures before comparing their stable JSON forms. Returns
    ``False`` whenever either side is missing or empty.
    """
    if not (left and right):
        return False
    media_keys = ("input_video", "input_image")
    left_rest = {key: value for key, value in left.items() if key not in media_keys}
    right_rest = {key: value for key, value in right.items() if key not in media_keys}
    return _stable_json_for_cache(left_rest) == _stable_json_for_cache(right_rest)


def build_recommended_request_signature(
    task: str,
    prompt: Optional[str],
    system_prompt: Optional[str],
    input_video: Optional[str],
    input_image: Optional[str],
    height: int,
    width: int,
    num_frames_ui: int,
    seed: int,
    resolution: Optional[str],
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    enable_frame_interpolation: bool,
) -> dict:
    """Build a complete cache signature for all user-controllable run params.

    Every value is normalised so that equivalent requests serialise to the
    same signature: the task and resolution go through their normalisers,
    sizes/frames/seed/steps are coerced to ``int``, float parameters are
    rendered with ``_canonical_float_for_cache``, the prompt is stripped, and
    media inputs are replaced by their canonical cache identities. The aspect
    ratio is inferred from the width/height pair.

    Args:
        task: Task label or internal task name.
        prompt: User prompt; ``None`` is treated as empty.
        system_prompt: System prompt; normalised for understanding tasks,
            stringified otherwise.
        input_video: Path of the input video, if any.
        input_image: Path of the input image, if any.
        height: Requested output height.
        width: Requested output width.
        num_frames_ui: Frame/duration value from the UI; converted with
            ``video_seconds_to_num_frames`` for ``TASK_T2V`` only.
        seed: Random seed.
        resolution: Resolution label; a missing value falls back to the
            task's default resolution.
        validation_num_timesteps: Number of sampling steps.
        validation_timestep_shift: Timestep shift.
        cfg_text_scale: Text CFG scale.
        enable_frame_interpolation: Whether frame interpolation is enabled.

    Returns:
        A JSON-serialisable dict with ``signature_version`` 2.
    """
    internal_task = normalize_task(task)
    # ``resolution`` is Optional. Fall back to the task default (as the other
    # cache helpers in this file do) rather than stringifying None into the
    # literal "None" before normalisation.
    normalized_resolution = normalize_resolution_for_backend(
        str(resolution or get_default_resolution_for_task(internal_task)),
        internal_task,
    )
    normalized_height = int(height)
    normalized_width = int(width)
    normalized_num_frames_ui = int(num_frames_ui)
    aspect_ratio = _infer_aspect_ratio_from_size(
        internal_task,
        normalized_width,
        normalized_height,
        normalized_resolution,
    )
    normalized_system_prompt = (
        normalize_understanding_system_prompt(internal_task, system_prompt)
        if internal_task in UNDERSTANDING_TASKS
        else str(system_prompt or "")
    )

    return {
        "signature_version": 2,
        "task": internal_task,
        "prompt": str(prompt or "").strip(),
        "system_prompt": normalized_system_prompt,
        "input_video": _canonical_media_identity_for_cache(input_video),
        "input_image": _canonical_media_identity_for_cache(input_image),
        "resolution": normalized_resolution,
        "aspect_ratio": aspect_ratio,
        "height": normalized_height,
        "width": normalized_width,
        "num_frames_ui": normalized_num_frames_ui,
        # Only TASK_T2V converts the UI value with video_seconds_to_num_frames.
        "num_frames_backend": video_seconds_to_num_frames(normalized_num_frames_ui)
        if internal_task == TASK_T2V
        else normalized_num_frames_ui,
        "seed": int(seed),
        "validation_num_timesteps": int(validation_num_timesteps),
        "validation_timestep_shift": _canonical_float_for_cache(validation_timestep_shift),
        "cfg_text_scale": _canonical_float_for_cache(cfg_text_scale),
        "enable_frame_interpolation": bool(enable_frame_interpolation),
    }


def _recommended_variant_tokens(
    task: str,
    resolution: Optional[str],
    aspect_ratio: Optional[str],
    duration_seconds: Optional[int] = None,
) -> list[str]:
    """Return the filename tokens that distinguish a cached output variant.

    The result always holds the sanitised resolution and aspect-ratio tokens
    (missing or unknown values fall back to the task defaults). For
    ``TASK_T2V`` a third ``"<n>s"`` token is appended with the duration
    clamped to 1..10 seconds.
    """
    canonical = normalize_task(task)
    raw_resolution = resolution or get_default_resolution_for_task(canonical)
    backend_resolution = normalize_resolution_for_backend(str(raw_resolution), canonical)
    if aspect_ratio in ASPECT_RATIO_CHOICES:
        chosen_aspect = aspect_ratio
    else:
        chosen_aspect = get_default_aspect_ratio(canonical)

    tokens = [_sanitize_cache_token(part) for part in (backend_resolution, chosen_aspect)]
    # Only Video Generation exposes a user duration selector. Video Editing and
    # Understanding use the input media duration, so the UI duration should not
    # split their cache.
    if canonical != TASK_T2V:
        return tokens

    raw_seconds = DEFAULT_VIDEO_DURATION_SECONDS if duration_seconds is None else duration_seconds
    clamped = max(1, min(10, int(raw_seconds)))
    tokens.append(f"{clamped}s")
    return tokens


def _recommended_output_name_for_variant(
    task: str,
    output_name: str,
    resolution: Optional[str],
    aspect_ratio: Optional[str],
    duration_seconds: Optional[int] = None,
) -> str:
    """Build the variant-specific cache filename for an output.

    The result is ``<stem>__<token>__<token>...<suffix>``, where the tokens
    come from ``_recommended_variant_tokens``. A missing stem falls back to
    the sanitised output name and a missing suffix to the first suffix of the
    task's output kind.
    """
    name_path = Path(str(output_name))
    base = name_path.stem or _sanitize_cache_token(output_name)
    extension = name_path.suffix or _recommended_output_suffixes(_recommended_output_type(task))[0]
    variant = "__".join(_recommended_variant_tokens(task, resolution, aspect_ratio, duration_seconds))
    if variant:
        return f"{base}__{variant}{extension}"
    return f"{base}{extension}"


def _recommended_output_name_for_signature(
    task: str,
    output_name: str,
    request_signature: dict,
) -> str:
    """Build the signature-specific cache filename for an output.

    The result is ``<stem>__sig-<hash><suffix>``, where the hash comes from
    ``_recommended_request_signature_hash``. A missing stem falls back to the
    sanitised output name and a missing suffix to the first suffix of the
    task's output kind.
    """
    name_path = Path(str(output_name))
    base = name_path.stem
    if not base:
        base = _sanitize_cache_token(output_name)
    extension = name_path.suffix
    if not extension:
        extension = _recommended_output_suffixes(_recommended_output_type(task))[0]
    digest = _recommended_request_signature_hash(request_signature)
    return f"{base}__sig-{digest}{extension}"


def register_recommended_case_cache(
    task: str,
    example_id: str,
    output_name: Optional[str] = None,
    aspect_ratio: Optional[str] = None,
    resolution: Optional[str] = None,
    duration_seconds: Optional[int] = None,
    prompt_text: Optional[str] = None,
    input_video_path: Optional[str] = None,
    input_image_path: Optional[str] = None,
) -> str:
    """Register a recommended case in ``RECOMMENDED_CASE_CACHE``.

    The entry records the case's default visible spec (resolution, aspect
    ratio, duration), its prompt and media paths, and the request signature
    produced by running the case with the module-level default seed, steps,
    timestep shift and CFG scale with frame interpolation disabled. An
    existing entry under the same key is overwritten.

    Returns:
        The cache key ``"<task>:<sanitised example id>"``.
    """
    canonical_task = normalize_task(task)
    backend_resolution = normalize_resolution_for_backend(
        str(resolution or get_default_resolution_for_task(canonical_task)),
        canonical_task,
    )
    if aspect_ratio in ASPECT_RATIO_CHOICES:
        chosen_aspect = aspect_ratio
    else:
        chosen_aspect = get_default_aspect_ratio(canonical_task)
    width_px, height_px = get_size_for_aspect_ratio(canonical_task, chosen_aspect, backend_resolution)
    seconds = int(DEFAULT_VIDEO_DURATION_SECONDS if duration_seconds is None else duration_seconds)

    # Understanding tasks get their normalised default system prompt; other
    # tasks register an empty one.
    if canonical_task in UNDERSTANDING_TASKS:
        default_system_prompt = normalize_understanding_system_prompt(canonical_task, None)
    else:
        default_system_prompt = ""

    signature = build_recommended_request_signature(
        task=canonical_task,
        prompt=prompt_text,
        system_prompt=default_system_prompt,
        input_video=input_video_path,
        input_image=input_image_path,
        height=height_px,
        width=width_px,
        num_frames_ui=seconds,
        seed=DEFAULT_BASIC_SEED,
        resolution=backend_resolution,
        validation_num_timesteps=DEFAULT_TIMESTEPS,
        validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
        cfg_text_scale=DEFAULT_CFG_TEXT_SCALE,
        enable_frame_interpolation=False,
    )

    key = f"{canonical_task}:{_sanitize_cache_token(example_id)}"
    entry = {
        "key": key,
        "task": canonical_task,
        "example_id": str(example_id),
        "output_name": output_name or _default_recommended_output_name(canonical_task, str(example_id)),
        "output_type": _recommended_output_type(canonical_task),
        "resolution": backend_resolution,
        "aspect_ratio": chosen_aspect,
        "duration_seconds": seconds,
        "prompt_text": str(prompt_text or ""),
        "input_video_path": str(input_video_path or ""),
        "input_image_path": str(input_image_path or ""),
        "default_request_signature": signature,
        "default_request_signature_hash": _recommended_request_signature_hash(signature),
    }
    RECOMMENDED_CASE_CACHE[key] = entry
    return key


def infer_recommended_case_key_from_request(
    task: str,
    prompt: str,
    input_video: Optional[str] = None,
    input_image: Optional[str] = None,
) -> str:
    """Best-effort fallback for sessions that do not carry the hidden cache key.

    Registered cases are scanned in registration order. A case matches when
    its task and stripped prompt equal the request's and, for each media kind
    the case registered, the request supplies media with the same canonical
    identity. Cases that registered no media of a kind accept any request
    value for that kind.

    Returns:
        The first matching cache key, or ``""`` when nothing matches.
    """
    canonical_task = normalize_task(task)
    wanted_prompt = str(prompt or "").strip()
    request_ids = (
        _canonical_media_identity_for_cache(input_video),
        _canonical_media_identity_for_cache(input_image),
    )

    for key, entry in RECOMMENDED_CASE_CACHE.items():
        if entry.get("task") != canonical_task:
            continue
        if str(entry.get("prompt_text") or "").strip() != wanted_prompt:
            continue

        registered_ids = (
            _canonical_media_identity_for_cache(str(entry.get("input_video_path") or "")),
            _canonical_media_identity_for_cache(str(entry.get("input_image_path") or "")),
        )
        # Registered media must be present in the request with the identical
        # identity; an empty request identity never equals a registered one.
        media_matches = all(
            not registered or registered == supplied
            for registered, supplied in zip(registered_ids, request_ids)
        )
        if media_matches:
            return key

    return ""


def _recommended_cache_candidates(
    meta: dict,
    resolution: Optional[str] = None,
    aspect_ratio: Optional[str] = None,
    duration_seconds: Optional[int] = None,
    request_signature: Optional[dict] = None,
):
    """Yield candidate cache file paths for a recommended case, best match first.

    Candidate filenames are produced in a fixed priority order:

    1. strict signature-based names (only for cacheable signatures);
    2. legacy resolution/aspect/duration names;
    3. the older ``<resolution>__<width>x<height>[__<n>u]`` names;
    4. the generic case filename, only for the case's default visible spec.

    Groups 2-4 are offered only when no signature is given, when the
    signature equals the case's registered default signature, or (if media
    aliasing is enabled) when it differs from the default only in its media
    identities.

    Args:
        meta: Registry entry from ``RECOMMENDED_CASE_CACHE``.
        resolution: Requested resolution; defaults to the case's own.
        aspect_ratio: Requested aspect ratio; defaults to the case's own.
        duration_seconds: Requested duration; defaults to the case's own.
        request_signature: Full request signature, if available.

    Yields:
        ``Path`` objects: for every cache root, the task sub-folder is tried
        before the root itself, and within a folder names follow the priority
        order above.
    """
    task = str(meta["task"])
    output_name = str(meta.get("output_name") or _default_recommended_output_name(task, meta.get("example_id", meta["key"])))
    output_type = str(meta.get("output_type") or _recommended_output_type(task))
    requested_resolution = normalize_resolution_for_backend(str(resolution or meta.get("resolution") or ""), task)
    requested_aspect = aspect_ratio if aspect_ratio in ASPECT_RATIO_CHOICES else str(meta.get("aspect_ratio") or get_default_aspect_ratio(task))
    requested_duration = int(duration_seconds if duration_seconds is not None else meta.get("duration_seconds", DEFAULT_VIDEO_DURATION_SECONDS))
    default_resolution = str(meta.get("resolution") or "")
    default_aspect = str(meta.get("aspect_ratio") or get_default_aspect_ratio(task))
    default_duration = int(meta.get("duration_seconds") or DEFAULT_VIDEO_DURATION_SECONDS)
    default_signature = meta.get("default_request_signature")
    is_default_signature = _recommended_signatures_equal(request_signature, default_signature)
    is_media_alias_signature = (
        _recommended_cache_media_alias_enabled()
        and _recommended_signatures_equal_ignoring_media(request_signature, default_signature)
    )

    stem = Path(output_name).stem or _sanitize_cache_token(meta.get("example_id", meta.get("key", "case")))
    suffixes = _recommended_output_suffixes(output_type)

    # De-duplicated AND insertion-ordered. A plain set would make the probe
    # order depend on string hash randomisation, so a legacy file could be
    # picked over the strict signature-named file from one run to the next.
    names: dict[str, None] = {}

    def remember(name: str) -> None:
        names.setdefault(name, None)

    # New strict cache filenames: every user-controllable parameter is part of
    # request_signature, so a changed seed/steps/CFG/media/size/etc. cannot hit
    # an output generated under different settings.
    if request_signature and _recommended_request_cacheable(request_signature):
        signature_hash = _recommended_request_signature_hash(request_signature)
        remember(_recommended_output_name_for_signature(task, output_name, request_signature))
        for suffix in suffixes:
            remember(f"{stem}__sig-{signature_hash}{suffix}")
            remember(f"{_sanitize_cache_token(meta['key'])}__sig-{signature_hash}{suffix}")

    # Legacy recommended assets were named only by resolution/aspect/duration, or
    # sometimes just by case id. They are safe for the exact default request
    # signature registered for that recommended case. On Spaces, Gradio can
    # rewrite recommended example videos to temp/transcoded files; in that case
    # input_video changes while the user-visible recommended case is still the
    # same. Allow legacy candidates when every non-media parameter still matches.
    allow_legacy_candidates = request_signature is None or is_default_signature or is_media_alias_signature
    if allow_legacy_candidates:
        remember(_recommended_output_name_for_variant(task, output_name, requested_resolution, requested_aspect, requested_duration))

        tokens = "__".join(_recommended_variant_tokens(task, requested_resolution, requested_aspect, requested_duration))
        for suffix in suffixes:
            remember(f"{stem}__{tokens}{suffix}")
            remember(f"{_sanitize_cache_token(meta['key'])}__{tokens}{suffix}")

        # Backward compatibility with the older width/height/duration filename format:
        # stem__video_360p__640x352__3u.mp4
        try:
            width, height = get_size_for_aspect_ratio(task, requested_aspect, requested_resolution)
            old_tokens = f"{_sanitize_cache_token(requested_resolution)}__{int(width)}x{int(height)}"
            if normalize_task(task) == TASK_T2V:
                old_tokens = f"{old_tokens}__{requested_duration}u"
            for suffix in suffixes:
                remember(f"{stem}__{old_tokens}{suffix}")
                remember(f"{_sanitize_cache_token(meta['key'])}__{old_tokens}{suffix}")
        except Exception:
            # Best effort: skip the old-format names when the size lookup fails.
            pass

        # Legacy generic filename is only allowed for the case's default visible spec.
        if (
            requested_resolution == default_resolution
            and requested_aspect == default_aspect
            and (normalize_task(task) != TASK_T2V or requested_duration == default_duration)
        ):
            remember(output_name)
            for suffix in suffixes:
                remember(f"{stem}{suffix}")
                remember(f"{_sanitize_cache_token(meta['key'])}{suffix}")

    for root in _cache_roots():
        for folder in (root / str(task), root):
            for name in names:
                yield folder / name

def _recommended_cache_debug_enabled() -> bool:
    """Report whether the ``LANCE_DEBUG_RECOMMENDED_CACHE`` flag is set (default off)."""
    enabled = env_flag("LANCE_DEBUG_RECOMMENDED_CACHE", False)
    return enabled


def find_recommended_cached_output(
    cache_key: str,
    resolution: Optional[str] = None,
    aspect_ratio: Optional[str] = None,
    duration_seconds: Optional[int] = None,
    request_signature: Optional[dict] = None,
) -> Optional[Path]:
    """Locate an existing cached output file for a registered recommended case.

    Candidate paths come from ``_recommended_cache_candidates`` and are probed
    in the order they are yielded; the first one that exists as a regular
    file is returned resolved. Candidates whose probe raises are skipped.

    When cache debugging is enabled and nothing is found, a single JSON line
    describing the miss (signature hashes, roots, up to 24 probed paths and
    the media/system-prompt fields of both signatures) is printed.

    Returns:
        The resolved path of the cached file, or ``None`` for an unknown
        cache key or a miss.
    """
    entry = RECOMMENDED_CASE_CACHE.get(cache_key or "")
    if not entry:
        return None

    verbose = _recommended_cache_debug_enabled()
    attempted: list[str] = []
    candidates = _recommended_cache_candidates(
        entry,
        resolution=resolution,
        aspect_ratio=aspect_ratio,
        duration_seconds=duration_seconds,
        request_signature=request_signature,
    )
    for path in candidates:
        if verbose and len(attempted) < 24:
            attempted.append(str(path))
        try:
            if path.exists() and path.is_file():
                return path.resolve()
        except Exception:
            continue

    if not verbose:
        return None

    registered = entry.get("default_request_signature")
    requested_fields = request_signature or {}
    registered_fields = registered or {}
    report = {
        "cache_key": cache_key,
        "request_sig": _recommended_request_signature_hash(request_signature),
        "default_sig": _recommended_request_signature_hash(registered),
        "is_default_signature": _recommended_signatures_equal(request_signature, registered),
        "is_media_alias_signature": _recommended_signatures_equal_ignoring_media(request_signature, registered),
        "media_alias_enabled": _recommended_cache_media_alias_enabled(),
        "roots": [str(root) for root in _cache_roots()],
        "sample_candidates": attempted,
        "request_input_video": requested_fields.get("input_video"),
        "default_input_video": registered_fields.get("input_video"),
        "request_input_image": requested_fields.get("input_image"),
        "default_input_image": registered_fields.get("input_image"),
        "request_system_prompt": requested_fields.get("system_prompt"),
        "default_system_prompt": registered_fields.get("system_prompt"),
    }
    print("[recommended-cache] Miss " + json.dumps(report, ensure_ascii=False), flush=True)
    return None


def get_recommended_cached_result(
    cache_key: str,
    task: str,
    resolution: Optional[str],
    aspect_ratio: Optional[str],
    duration_seconds: Optional[int] = None,
    request_signature: Optional[dict] = None,
):
    """Return the cached result tuple for a recommended case, if one exists.

    Matching is sensitive to the full request signature: prompt, media, size,
    seed, steps, shift, CFG scale, duration, resolution, and interpolation
    flag.

    Args:
        cache_key: Key of a case registered in ``RECOMMENDED_CASE_CACHE``.
        task: Task used to derive the output kind when the entry lacks one.
        resolution: Requested resolution.
        aspect_ratio: Requested aspect ratio.
        duration_seconds: Requested duration.
        request_signature: Full request signature; required for a hit.

    Returns:
        ``(video_path, image_path, text, status)`` with exactly one of the
        first three populated and an empty status, or ``None`` when the case
        is unknown, the request is not cacheable, no cached file exists, or a
        cached text file cannot be read.
    """
    meta = RECOMMENDED_CASE_CACHE.get(cache_key or "")
    if not meta:
        return None
    if not _recommended_request_cacheable(request_signature):
        return None

    cached_path = find_recommended_cached_output(
        cache_key,
        resolution=resolution,
        aspect_ratio=aspect_ratio,
        duration_seconds=duration_seconds,
        request_signature=request_signature,
    )
    if cached_path is None:
        return None

    output_type = str(meta.get("output_type") or _recommended_output_type(task))
    cached_text = ""
    if output_type not in ("video", "image"):
        try:
            cached_text = cached_path.read_text(encoding="utf-8")
        except (OSError, ValueError) as exc:
            # An unreadable or undecodable text cache is treated as a miss so
            # the caller regenerates, instead of showing the server-side
            # cache path to the user as if it were the model output.
            print(f"[recommended-cache] Unreadable {cache_key}: {cached_path} ({exc}); treating as a miss", flush=True)
            return None

    signature_hash = _recommended_request_signature_hash(request_signature)
    print(f"[recommended-cache] Hit {cache_key} sig={signature_hash}: {cached_path}", flush=True)
    # Keep cache hits silent in the UI. The output is returned directly without
    # exposing cache paths or cache-matching details to end users.
    status = ""
    if output_type == "video":
        return str(cached_path), None, "", status
    if output_type == "image":
        return None, str(cached_path), "", status
    return None, None, cached_text, status

def store_recommended_cached_result(
    cache_key: str,
    result,
    resolution: Optional[str],
    aspect_ratio: Optional[str],
    duration_seconds: Optional[int] = None,
    request_signature: Optional[dict] = None,
) -> None:
    """Persist a freshly generated result into the recommended-case cache.

    Nothing is stored when the case is unknown, the request is not cacheable,
    a cached output already exists for the request, or the result lacks the
    output matching the case's output kind. The file is written under
    ``RECOMMENDED_OUTPUT_CACHE_DIR/<task>/`` using the signature-based
    filename. Any failure is logged and swallowed.

    Args:
        cache_key: Key of a case registered in ``RECOMMENDED_CASE_CACHE``.
        result: ``(video_path, image_path, text, status)`` tuple.
        resolution: Requested resolution (used for lookup and logging).
        aspect_ratio: Requested aspect ratio (used for lookup and logging).
        duration_seconds: Requested duration (used for lookup and logging).
        request_signature: Full request signature of the run.
    """
    entry = RECOMMENDED_CASE_CACHE.get(cache_key or "")
    if not entry:
        return
    if not _recommended_request_cacheable(request_signature):
        return
    already_cached = find_recommended_cached_output(
        cache_key,
        resolution=resolution,
        aspect_ratio=aspect_ratio,
        duration_seconds=duration_seconds,
        request_signature=request_signature,
    )
    if already_cached is not None:
        return

    try:
        output_video, output_image, output_text, _status = result
        target_name = _recommended_output_name_for_signature(
            entry["task"],
            str(entry["output_name"]),
            request_signature,
        )
        destination = RECOMMENDED_OUTPUT_CACHE_DIR / str(entry["task"]) / target_name
        destination.parent.mkdir(parents=True, exist_ok=True)

        kind = entry["output_type"]
        if kind in ("video", "image"):
            produced = output_video if kind == "video" else output_image
            # Media is only cached when the generated file is actually on disk.
            if not (produced and Path(str(produced)).exists()):
                return
            shutil.copy2(str(produced), str(destination))
        elif kind == "text" and output_text:
            destination.write_text(str(output_text), encoding="utf-8")
        else:
            return

        print(
            f"[recommended-cache] Stored {cache_key} sig={_recommended_request_signature_hash(request_signature)} "
            f"at {destination} (resolution={resolution}, aspect_ratio={aspect_ratio}, duration={duration_seconds})",
            flush=True,
        )
    except Exception as exc:
        print(f"[recommended-cache] Could not store {cache_key}: {exc}", flush=True)

def load_json_examples(relative_path: str) -> dict:
    """Load and parse a UTF-8 JSON file located relative to ``REPO_ROOT``."""
    source = REPO_ROOT / relative_path
    return json.loads(source.read_text(encoding="utf-8"))


# One-line captions for text-to-video examples; the keys look like the example
# output filenames (verify against the examples JSON).
T2V_EXAMPLE_SUMMARIES = dict(
    [
        ("000000.mp4", "Red panda surfing on a bright seaside wave."),
        ("000002.mp4", "Panda cub skateboarding in a creative loft."),
        ("000004.mp4", "Young woman shaping clay in a sunlit pottery workshop."),
        ("000005.mp4", "Panda boxing a robot in a luxurious palace ring."),
        ("000008.mp4", "Fantasy pastel horse stepping through a glowing cloud valley."),
    ]
)


def make_generation_examples(
    task_label: str,
    relative_path: str,
    limit: int,
    image_task: bool,
    selected_keys: Optional[list[str]] = None,
    summaries: Optional[dict[str, str]] = None,
) -> list[list]:
    """Build example rows for a generation task and register their caches.

    The JSON file maps output names to prompts. When ``selected_keys`` is
    given, only those keys (that exist in the file) are used, in the given
    order; otherwise the first ``limit`` entries are used. Each chosen entry
    is registered as a recommended case with the task's default aspect ratio,
    resolution and duration.

    ``image_task`` and ``summaries`` are accepted but not used by this
    function.

    Returns:
        A list of ``[prompt, cache_key]`` rows.
    """
    canonical_task = normalize_task(task_label)
    catalogue = load_json_examples(relative_path)
    if selected_keys:
        chosen = [(key, catalogue[key]) for key in selected_keys if key in catalogue]
    else:
        chosen = list(catalogue.items())[:limit]

    rows = []
    for output_name, prompt in chosen:
        registered_key = register_recommended_case_cache(
            task=canonical_task,
            example_id=output_name,
            output_name=output_name,
            aspect_ratio=get_default_aspect_ratio(canonical_task),
            resolution=get_default_resolution_for_task(canonical_task),
            duration_seconds=DEFAULT_VIDEO_DURATION_SECONDS,
            prompt_text=prompt,
        )
        rows.append([prompt, registered_key])
    return rows


def make_edit_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build example rows for an editing task and register their caches.

    The first ``limit`` samples of the JSON file are used. Each sample's
    ``interleave_array`` holds the prompt at index 0 and the media path at
    index 1. Example ids are ``<json file stem>_<six-digit index>``.

    Returns:
        Rows of ``[prompt, preview_video, input_video, None, None, cache_key]``
        when ``media_type`` is ``"video"``, otherwise
        ``[prompt, None, None, image, image, cache_key]``.
    """
    canonical_task = normalize_task(task_label)
    samples = list(load_json_examples(relative_path).values())[:limit]
    wants_video = media_type == "video"
    wants_image = media_type == "image"

    rows = []
    for position, sample in enumerate(samples):
        interleave = sample["interleave_array"]
        prompt = interleave[0]
        example_id = f"{Path(relative_path).stem}_{position:06d}"
        registered_key = register_recommended_case_cache(
            task=canonical_task,
            example_id=example_id,
            output_name=_default_recommended_output_name(canonical_task, example_id),
            aspect_ratio=get_default_aspect_ratio(canonical_task),
            resolution=get_default_resolution_for_task(canonical_task),
            duration_seconds=DEFAULT_VIDEO_DURATION_SECONDS,
            prompt_text=prompt,
            input_video_path=interleave[1] if wants_video else None,
            input_image_path=interleave[1] if wants_image else None,
        )
        if wants_video:
            preview_video_path, input_video_path = resolve_video_example_paths(interleave[1])
            row = [prompt, preview_video_path, input_video_path, None, None, registered_key]
        else:
            image_path = resolve_example_path(interleave[1])
            row = [prompt, None, None, image_path, image_path, registered_key]
        rows.append(row)
    return rows


def make_understanding_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build the example rows for an understanding (media -> text) task.

    At most ``limit`` samples are taken from the mapping returned by
    ``load_json_examples(relative_path)``.  In each sample's
    ``interleave_array`` index 0 is the media path and index 1 is the text
    payload; when that payload is a list with more than one element, its
    second element is used as the question, otherwise the question is ``""``.

    Args:
        task_label: Task label; converted with ``normalize_task``.
        relative_path: Location of the examples JSON. Its file stem is also
            used as the prefix of each generated example id.
        limit: Maximum number of samples to use.
        media_type: ``"video"`` selects the video row layout; any other value
            takes the image row layout.

    Returns:
        One list per sample laid out as
        ``[question, preview_video, input_video, image, image, cache_key]``,
        where the slots of the unused media kind are ``None``.
    """
    internal_task = normalize_task(task_label)
    samples = list(load_json_examples(relative_path).values())[:limit]
    is_video = media_type == "video"
    is_image = media_type == "image"

    rows = []
    for position, entry in enumerate(samples):
        parts = entry["interleave_array"]
        payload = parts[1]
        if isinstance(payload, list) and len(payload) > 1:
            question = payload[1]
        else:
            question = ""
        case_id = f"{Path(relative_path).stem}_{position:06d}"
        cache_key = register_recommended_case_cache(
            task=internal_task,
            example_id=case_id,
            output_name=_default_recommended_output_name(internal_task, case_id),
            aspect_ratio=get_default_aspect_ratio(internal_task),
            resolution=get_default_resolution_for_task(internal_task),
            duration_seconds=DEFAULT_VIDEO_DURATION_SECONDS,
            prompt_text=question,
            input_video_path=parts[0] if is_video else None,
            input_image_path=parts[0] if is_image else None,
        )
        if is_video:
            preview_path, source_path = resolve_video_example_paths(parts[0])
            row = [question, preview_path, source_path, None, None, cache_key]
        else:
            still_path = resolve_example_path(parts[0])
            # The same resolved image fills both image slots of the row.
            row = [question, None, None, still_path, still_path, cache_key]
        rows.append(row)
    return rows


def make_understanding_system_prompt_map(relative_path: str, task: str) -> dict[str, str]:
    """Map each example question to its normalized system prompt.

    For every sample in ``load_json_examples(relative_path)`` the text payload
    at ``interleave_array[1]`` is inspected.  Payloads that are not lists of at
    least two elements are ignored.  Otherwise element 1 (the question) becomes
    the key and element 0, passed through
    ``normalize_understanding_system_prompt(task, ...)``, becomes the value.
    When two samples share a question, the later one overwrites the earlier.
    """
    prompts_by_question = {}
    for entry in load_json_examples(relative_path).values():
        payload = entry["interleave_array"][1]
        if isinstance(payload, list) and len(payload) >= 2:
            normalized = normalize_understanding_system_prompt(task, payload[0])
            prompts_by_question[payload[1]] = normalized
    return prompts_by_question


# Example tables, built once at import time from the JSON files under
# config/examples/. Each builder call below also registers its samples through
# register_recommended_case_cache as a side effect.
#
# For the two generation tables, `selected_keys` fixes both which prompts are
# used and their order: keys missing from the JSON are skipped, and `limit` is
# only applied when `selected_keys` is empty (see the item selection at the end
# of make_generation_examples).
VIDEO_GENERATION_EXAMPLES = make_generation_examples(
    TASK_LABEL_VIDEO_GENERATION,
    "config/examples/t2v_example.json",
    limit=7,
    image_task=False,
    # NOTE(review): alternative selection left commented out; presumably an
    # earlier ordering -- confirm whether it can be deleted.
    #selected_keys=["000000.mp4", "000002.mp4", "000005.mp4", "000004.mp4", "000008.mp4"],
    selected_keys=["000004.mp4", "000002.mp4", "000000.mp4", "000005.mp4", "000008.mp4", "000007.mp4", "000001.mp4"],
    summaries=T2V_EXAMPLE_SUMMARIES,
)
# Edit tables: rows are [prompt, preview_video, input_video, image, image, cache_key].
VIDEO_EDIT_EXAMPLES = make_edit_examples(
    TASK_LABEL_VIDEO_EDIT,
    "config/examples/video_edit_example.json",
    limit=3,
    media_type="video",
)
# Understanding tables: rows are [question, preview_video, input_video, image, image, cache_key].
VIDEO_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    TASK_LABEL_VIDEO_UNDERSTANDING,
    "config/examples/x2t_video_example.json",
    limit=3,
    media_type="video",
)
# question -> normalized system prompt, built from every sample in the file
# (no `limit` is applied here, unlike the example table above).
VIDEO_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    "config/examples/x2t_video_example.json",
    TASK_X2T_VIDEO,
)
IMAGE_GENERATION_EXAMPLES = make_generation_examples(
    TASK_LABEL_IMAGE_GENERATION,
    "config/examples/t2i_example.json",
    limit=9,
    image_task=True,
    selected_keys=["000000.png", "000003.png", "000002.png", "000005.png", "000006.png", "000007.png", "000008.png", "000009.png", "000010.png"],
)
IMAGE_EDIT_EXAMPLES = make_edit_examples(
    TASK_LABEL_IMAGE_EDIT,
    "config/examples/image_edit_example.json",
    limit=5,
    media_type="image",
)
IMAGE_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    TASK_LABEL_IMAGE_UNDERSTANDING,
    "config/examples/x2t_image_example.json",
    limit=3,
    media_type="image",
)
# question -> normalized system prompt, built from every sample in the file.
IMAGE_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    "config/examples/x2t_image_example.json",
    TASK_X2T_IMAGE,
)


def build_save_dir(task: str) -> Path:
    """Return the output directory path for one request of ``task``.

    The path is ``RESULTS_ROOT / "{task}_{YYYYmmdd_HHMMSS}_{mmm}"`` where
    ``mmm`` is the zero-padded millisecond part of the current local time.
    ``ensure_dirs()`` is called first; this function does not itself create
    the returned directory.

    Args:
        task: Task name used as the directory-name prefix.

    Returns:
        The (not yet created) per-request directory path.
    """
    ensure_dirs()
    # Read the clock once so the seconds and milliseconds parts describe the
    # same instant; two separate reads can straddle a second boundary and
    # yield a name that sorts before an earlier request's directory.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    return RESULTS_ROOT / f"{task}_{timestamp}_{now.microsecond // 1000:03d}"


def find_generated_video(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.mp4`` directly inside ``save_dir``.

    Returns ``None`` when the directory contains no ``.mp4`` file.
    """
    return max(
        save_dir.glob("*.mp4"),
        key=lambda candidate: candidate.stat().st_mtime,
        default=None,
    )


def find_generated_image(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.png`` directly inside ``save_dir``.

    Returns ``None`` when the directory contains no ``.png`` file.
    """
    return max(
        save_dir.glob("*.png"),
        key=lambda candidate: candidate.stat().st_mtime,
        default=None,
    )


def extract_text_result(save_dir: Path) -> str:
    """Read the text answer from the prompt-result JSON in ``save_dir``.

    Looks for ``save_dir / PROMPT_JSON_FILENAME``.  Returns ``""`` when the
    file is missing or its decoded content is empty.  Otherwise the first
    value of the decoded mapping is returned: as-is when it is a string,
    else serialized with ``json.dumps(..., ensure_ascii=False)``.
    """
    result_file = save_dir / PROMPT_JSON_FILENAME
    if not result_file.exists():
        return ""

    payload = json.loads(result_file.read_text(encoding="utf-8"))
    if not payload:
        return ""

    # Only the first entry is used.
    head_value = next(iter(payload.values()))
    if isinstance(head_value, str):
        return head_value
    return json.dumps(head_value, ensure_ascii=False)


class LanceT2VV2TPipeline:
    def __init__(self, device_id: int, model_variant: str = MODEL_VARIANT_VIDEO) -> None:
        """Create an unloaded pipeline bound to one GPU.

        Nothing heavy happens here; the model, VAE and tokenizer slots stay
        ``None`` until ``initialize()`` fills them.

        Args:
            device_id: CUDA device index this pipeline will use.
            model_variant: Variant name; stored after ``normalize_model_variant``.
        """
        variant = normalize_model_variant(model_variant)

        # ``_init_lock`` guards initialize()/unload(); ``_generate_lock`` guards
        # the inference section of generate().
        self._init_lock = threading.Lock()
        self._generate_lock = threading.Lock()
        self.initialized = False
        self.device = device_id
        self.model_variant = variant
        self.logger = get_logger(f"lance_{variant}_gpu{device_id}")

        # Runtime objects, populated by initialize().
        self.model: Optional[Lance] = None
        self.vae_model: Optional[WanVideoVAE] = None
        self.vae_config: Optional[AutoEncoderParams] = None
        self.tokenizer: Optional[Qwen2Tokenizer] = None
        self.new_token_ids: Optional[dict] = None
        self.image_token_id: Optional[int] = None

        # Argument templates, deep-copied per request by generate().
        self.base_model_args: Optional[ModelArguments] = None
        self.base_data_args: Optional[DataArguments] = None
        self.base_inference_args: Optional[InferenceArguments] = None

    def _log_stage(self, stage_name: str, start_time: float, extra: str = "") -> None:
        """Print how long a startup stage took.

        Args:
            stage_name: Label of the stage that just finished.
            start_time: ``time.perf_counter()`` value taken when the stage began.
            extra: Optional detail appended after `` | `` when non-empty.
        """
        elapsed = time.perf_counter() - start_time
        suffix = ""
        if extra:
            suffix = f" | {extra}"
        print(f"[startup][gpu:{self.device}] {stage_name} done in {elapsed:.2f}s{suffix}", flush=True)

    def _build_base_model_args(self) -> ModelArguments:
        """Return the ``ModelArguments`` template used by ``initialize()``.

        The model path is resolved from this pipeline's variant via
        ``get_model_path``; the remaining fields are fixed values.
        """
        overrides = dict(
            model_path=str(get_model_path(self.model_variant)),
            vit_type=DEFAULT_VIT_TYPE,
            llm_qk_norm=True,
            llm_qk_norm_und=True,
            llm_qk_norm_gen=True,
            tie_word_embeddings=False,
            max_num_frames=MAX_VIDEO_NUM_FRAMES,
            max_latent_size=64,
            latent_patch_size=[1, 1, 1],
        )
        return ModelArguments(**overrides)

    def _build_base_inference_args(self) -> InferenceArguments:
        """Return the ``InferenceArguments`` template used by ``initialize()``.

        Both visual understanding and visual generation are enabled; sampling,
        size and task fields take the module-level defaults and are overridden
        per request in ``generate()``.
        """
        settings = {
            # Sampling schedule defaults.
            "validation_num_timesteps": DEFAULT_TIMESTEPS,
            "validation_timestep_shift": DEFAULT_TIMESTEP_SHIFT,
            # Model wiring.
            "copy_init_moe": True,
            "visual_und": True,
            "visual_gen": True,
            "vae_model_type": "wan",
            "apply_qwen_2_5_vl_pos_emb": True,
            "apply_chat_template": False,
            "cfg_type": 0,
            "validation_data_seed": 42,
            # Output geometry and task defaults.
            "video_height": DEFAULT_HEIGHT,
            "video_width": DEFAULT_WIDTH,
            "num_frames": DEFAULT_NUM_FRAMES,
            "task": DEFAULT_TASK,
            "save_path_gen": str(RESULTS_ROOT),
            "resolution": DEFAULT_RESOLUTION,
            "text_template": TEXT_TEMPLATE,
            "use_KVcache": USE_KVCACHE,
        }
        return InferenceArguments(**settings)

    def initialize(self) -> None:
        """Load the Lance model, VAE and tokenizer onto this pipeline's GPU.

        The whole body runs under ``_init_lock`` and returns immediately when
        ``self.initialized`` is already set, so repeated or concurrent calls
        perform the load at most once. ``self.initialized`` is only set at the
        very end; if any step raises, the pipeline stays uninitialized (the
        ``base_*_args`` attributes may already have been assigned by then).

        Raises:
            RuntimeError: CUDA is unavailable, or ``self.device`` is not a
                valid device index.
            ValueError: ``vit_type`` is not one of the supported values while
                visual understanding is enabled.
        """
        with self._init_lock:
            if self.initialized:
                return

            ensure_dirs()
            resolved_model_path = ensure_model_assets(self.model_variant)
            print(
                f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
                flush=True,
            )
            if not torch.cuda.is_available():
                raise RuntimeError("CUDA is unavailable. Lance T2V/V2T Gradio requires a GPU environment.")
            if self.device >= torch.cuda.device_count():
                raise RuntimeError(
                    f"GPU {self.device} is unavailable. Detected {torch.cuda.device_count()} GPU(s)."
                )
            torch.cuda.set_device(self.device)

            model_args = self._build_base_model_args()
            data_args = DataArguments()
            inference_args = self._build_base_inference_args()
            # Fields read below but not set by the two builders (e.g. vit_path,
            # layer_module, global_seed, freeze_und) come from the argument
            # classes' own defaults or from this call -- TODO confirm which.
            apply_inference_defaults(model_args, data_args, inference_args)
            # The noise seed mirrors the data seed.
            inference_args.validation_noise_seed = inference_args.validation_data_seed

            # Kept as templates; generate() deep-copies them for every request.
            self.base_model_args = model_args
            self.base_data_args = data_args
            self.base_inference_args = inference_args

            set_seed(inference_args.global_seed)

            stage_start = time.perf_counter()
            print(
                f"[startup][gpu:{self.device}] Loading LLM config: {Path(model_args.model_path) / 'llm_config.json'}",
                flush=True,
            )
            llm_config: Qwen2Config = Qwen2Config.from_json_file(str(Path(model_args.model_path) / "llm_config.json"))
            self._log_stage("LLM config load", stage_start)

            # Overlay the runtime options onto the config loaded from disk.
            llm_config.layer_module = model_args.layer_module
            llm_config.qk_norm = model_args.llm_qk_norm
            llm_config.qk_norm_und = model_args.llm_qk_norm_und
            llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
            llm_config.tie_word_embeddings = model_args.tie_word_embeddings
            llm_config.freeze_und = inference_args.freeze_und
            llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb

            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Initializing LLM weights: {model_args.model_path}", flush=True)
            # Built from the config only; no checkpoint file is read on this line.
            language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
            self._log_stage("LLM weight init", stage_start)

            vit_model = None
            vit_config = None
            if inference_args.visual_und:
                if model_args.vit_type not in ("qwen2_5_vl", "qwen_2_5_vl_original"):
                    raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
                stage_start = time.perf_counter()
                print(f"[startup][gpu:{self.device}] Loading VIT config: {model_args.vit_path}", flush=True)
                vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
                self._log_stage("VIT config load", stage_start)

                stage_start = time.perf_counter()
                print(
                    f"[startup][gpu:{self.device}] Loading VIT weights: {Path(model_args.vit_path) / 'vit.safetensors'}",
                    flush=True,
                )
                vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
                vit_weights = load_file(str(Path(model_args.vit_path) / "vit.safetensors"))
                vit_model.load_state_dict(vit_weights, strict=True)
                self._log_stage("VIT weight load", stage_start)
                # The loaded state dict is no longer needed once copied into the module.
                clean_memory(vit_weights)

            if inference_args.visual_gen:
                stage_start = time.perf_counter()
                print(f"[startup][gpu:{self.device}] Initializing VAE", flush=True)
                vae_model = WanVideoVAE(device=torch.device("cuda", self.device))
                # Own copy of the VAE config so later use cannot alias the VAE's object.
                vae_config = deepcopy(vae_model.vae_config)
                self._log_stage("VAE init", stage_start)
            else:
                vae_model = None
                vae_config = None

            config = LanceConfig(
                visual_gen=inference_args.visual_gen,
                visual_und=inference_args.visual_und,
                llm_config=llm_config,
                vit_config=vit_config if inference_args.visual_und else None,
                vae_config=vae_config if inference_args.visual_gen else None,
                latent_patch_size=model_args.latent_patch_size,
                max_num_frames=model_args.max_num_frames,
                max_latent_size=model_args.max_latent_size,
                vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
                connector_act=model_args.connector_act,
                interpolate_pos=model_args.interpolate_pos,
                timestep_shift=inference_args.timestep_shift,
            )
            model: Lance = Lance(
                language_model=language_model,
                vit_model=vit_model if inference_args.visual_und else None,
                vit_type=model_args.vit_type,
                config=config,
                training_args=inference_args,
            )

            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Casting Lance model to bf16 on CPU", flush=True)
            model = model.to(dtype=torch.bfloat16)
            self._log_stage("Lance model bf16 cast", stage_start)

            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Loading tokenizer: {model_args.model_path}", flush=True)
            tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
            tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
            self._log_stage("tokenizer load and special token init", stage_start, extra=f"num_new_tokens={num_new_tokens}")

            if inference_args.copy_init_moe:
                language_model.init_moe()

            # presumably loads the checkpoint weights from model_args.model_path -- confirm
            init_from_model_path_if_needed(model, model_args)

            # Grow the embedding table to cover the special tokens added above
            # and keep both config objects in sync with the new vocabulary size.
            if num_new_tokens > 0:
                model.language_model.resize_token_embeddings(len(tokenizer))
                model.config.llm_config.vocab_size = len(tokenizer)
                model.language_model.config.vocab_size = len(tokenizer)

            if model_args.vit_type.lower() == "qwen2_5_vl":
                from common.model.hacks import hack_qwen2_5_vl_config

                language_model = hack_qwen2_5_vl_config(language_model)

            # NOTE(review): the id stored as "image_token_id" is read from
            # `video_token_id`; presumably one placeholder token serves both
            # media kinds -- confirm.
            image_token_id = language_model.config.video_token_id
            new_token_ids.update({"image_token_id": image_token_id})
            model.update_tokenizer(tokenizer=tokenizer)

            # _build_base_model_args sets tie_word_embeddings=False, so unless
            # apply_inference_defaults changes it the `else` branch runs and only
            # checks that input and output embeddings do not share storage.
            if model_args.tie_word_embeddings:
                model.language_model.untie_lm_head()
                model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
                model_args.tie_word_embeddings = False
                llm_config.tie_word_embeddings = False
            else:
                assert (
                    model.language_model.get_input_embeddings().weight.data.data_ptr()
                    != model.language_model.get_output_embeddings().weight.data.data_ptr()
                ), "tie_word_embeddings conflict"

            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Moving Lance model to GPU {self.device}", flush=True)
            model = model.to(device=self.device)
            self._log_stage("Lance model move to GPU", stage_start)
            model.eval()
            if vae_model is not None and hasattr(vae_model, "eval"):
                vae_model.eval()

            # Publish the loaded objects; `initialized` is flipped last.
            self.model = model
            self.vae_model = vae_model
            self.vae_config = vae_config
            self.tokenizer = tokenizer
            self.new_token_ids = new_token_ids
            self.image_token_id = image_token_id
            self.initialized = True
            print(
                f"[startup][gpu:{self.device}][{self.model_variant}] Lance multimodal Gradio model loaded and ready for reuse.",
                flush=True,
            )

    def unload(self) -> None:
        """Drop every loaded object and return the pipeline to its unloaded state.

        Runs under ``_init_lock``.  The Lance model, and the VAE's inner
        ``vae.model`` when those attributes exist, are moved to CPU first.
        All runtime and template attributes are then reset to ``None``,
        ``initialized`` is cleared, and Python and CUDA caches are collected.
        """
        with self._init_lock:
            if self.model is not None:
                self.model.cpu()
            if self.vae_model is not None and hasattr(self.vae_model, "vae"):
                vae_inner = self.vae_model.vae
                if hasattr(vae_inner, "model"):
                    vae_inner.model.cpu()

            released_attrs = (
                "model",
                "vae_model",
                "vae_config",
                "tokenizer",
                "new_token_ids",
                "image_token_id",
                "base_model_args",
                "base_data_args",
                "base_inference_args",
            )
            for attr_name in released_attrs:
                setattr(self, attr_name, None)
            self.initialized = False

            gc.collect()
            if not torch.cuda.is_available():
                return
            # Release cached blocks on this pipeline's own device.
            with torch.cuda.device(self.device):
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()

    def _build_request_batch(
        self,
        prompt_file: Path,
        model_args: ModelArguments,
        data_args: DataArguments,
        inference_args: InferenceArguments,
    ):
        """Turn a single-request prompt file into a collated one-sample batch.

        A ``DataConfig`` is loaded from ``prompt_file`` and overlaid with the
        model and inference settings of this request.  A ``ValidationDataset``
        is then built on the same file and its first item is collated.

        Args:
            prompt_file: Request file written for this call.
            model_args: Per-request model arguments.
            data_args: Per-request data arguments.
            inference_args: Per-request inference arguments.

        Returns:
            The result of ``simple_custom_collate`` on the dataset's first item.
        """
        assert self.tokenizer is not None
        assert self.new_token_ids is not None
        assert self.vae_config is not None

        request_path = str(prompt_file)
        cfg = DataConfig.from_yaml(request_path)

        # ViT patching parameters are only needed for visual understanding.
        if inference_args.visual_und:
            cfg.vit_patch_size = model_args.vit_patch_size
            cfg.vit_patch_size_temporal = model_args.vit_patch_size_temporal
            cfg.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side

        # Latent geometry is only needed for visual generation.
        if inference_args.visual_gen:
            vae_factors = (
                self.vae_config.downsample_temporal,
                self.vae_config.downsample_spatial,
                self.vae_config.downsample_spatial,
            )
            combined_downsample = tuple_mul(tuple(model_args.latent_patch_size), vae_factors)
            cfg.latent_patch_size = model_args.latent_patch_size
            cfg.vae_downsample = combined_downsample
            cfg.max_latent_size = model_args.max_latent_size
            cfg.max_num_frames = model_args.max_num_frames

        cfg.text_cond_dropout_prob = model_args.text_cond_dropout_prob
        cfg.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
        cfg.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob

        cfg.num_frames = inference_args.num_frames
        cfg.H = inference_args.video_height
        cfg.W = inference_args.video_width
        cfg.task = inference_args.task
        cfg.resolution = inference_args.resolution
        cfg.text_template = inference_args.text_template

        dataset = ValidationDataset(
            jsonl_path=request_path,
            tokenizer=self.tokenizer,
            data_args=data_args,
            model_args=model_args,
            training_args=inference_args,
            new_token_ids=self.new_token_ids,
            dataset_config=cfg,
            local_rank=0,
            world_size=1,
        )
        first_sample = dataset[0]
        return simple_custom_collate([first_sample])

    def generate(
        self,
        task: str,
        prompt: str,
        system_prompt: Optional[str],
        input_video: Optional[str],
        input_image: Optional[str],
        height: int,
        width: int,
        num_frames: int,
        seed: int,
        resolution: str,
        validation_num_timesteps: int,
        validation_timestep_shift: float,
        cfg_text_scale: float,
        enable_frame_interpolation: bool,
    ):
        """Run one inference request on this pipeline and report the result.

        The pipeline is initialized on demand, the inputs are validated, and
        the actual inference runs under ``_generate_lock`` so that requests on
        the same pipeline are serialized.  A generation record is saved via
        ``save_generation_record`` on both success and failure.  Exceptions
        raised inside the inference section are caught, printed and turned
        into a status message rather than propagated.

        Args:
            task: Task name or label; converted with ``normalize_task``.
            prompt: Prompt (generation/edit) or question (understanding).
            system_prompt: Optional system prompt, passed to the request file.
            input_video: Input video path for video edit / video understanding.
            input_image: Input image path for image edit / image understanding.
            height: Output height; must be > 0.
            width: Output width; must be > 0.
            num_frames: Number of frames; must be > 0.
            seed: Requested seed; passed through ``normalize_seed``.
            resolution: Display resolution; mapped with
                ``normalize_resolution_for_backend`` before use.
            validation_num_timesteps: Number of sampling timesteps.
            validation_timestep_shift: Timestep shift value.
            cfg_text_scale: Text guidance scale, stored on the model arguments.
            enable_frame_interpolation: Accepted but not read by this method;
                the record always stores ``frame_interpolation`` as ``False``.

        Returns:
            A 4-tuple ``(video_path, image_path, text_result, status)``.
            At most one of the first three carries a result; ``status`` is
            ``""`` on success and a human-readable message on validation
            failure, missing output, or inference failure.
        """
        # NOTE(review): initialization runs before input validation, so the
        # first call may load the model even for a request rejected below.
        self.initialize()
        internal_task = normalize_task(task)
        prompt = (prompt or "").strip()
        input_video = str(input_video).strip() if input_video else ""
        input_image = str(input_image).strip() if input_image else ""

        # Input validation: each failure returns a status message, no exception.
        if internal_task in GENERATION_TASKS and not prompt:
            return None, None, "", "Please enter a prompt."
        if internal_task in UNDERSTANDING_TASKS and not prompt:
            return None, None, "", "Please enter a question."
        if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
            return None, None, "", "Please upload an input video."
        if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
            return None, None, "", "Please upload an input image."
        if height <= 0 or width <= 0:
            return None, None, "", "Height and width must be greater than 0."
        if num_frames <= 0:
            return None, None, "", "The number of frames must be greater than 0."

        assert self.model is not None
        assert self.tokenizer is not None
        assert self.new_token_ids is not None
        assert self.image_token_id is not None
        assert self.base_model_args is not None
        assert self.base_data_args is not None
        assert self.base_inference_args is not None
        active_model_path = self.base_model_args.model_path

        with self._generate_lock:
            torch.cuda.set_device(self.device)
            actual_seed = normalize_seed(int(seed))
            prompt_file = create_request_json(
                task=internal_task,
                prompt=prompt,
                input_video=input_video,
                input_image=input_image,
                system_prompt=system_prompt,
            )
            save_dir = build_save_dir(internal_task)
            save_dir.mkdir(parents=True, exist_ok=True)
            request_started_at = datetime.now().isoformat(timespec="seconds")

            # Per-request copies so the shared templates are never mutated.
            request_model_args = deepcopy(self.base_model_args)
            request_model_args.cfg_text_scale = float(cfg_text_scale)

            request_data_args = deepcopy(self.base_data_args)
            request_data_args.val_dataset_config_file = str(prompt_file)

            request_inference_args = deepcopy(self.base_inference_args)
            request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
            request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
            request_inference_args.validation_data_seed = actual_seed
            request_inference_args.validation_noise_seed = actual_seed
            request_inference_args.video_height = int(height)
            request_inference_args.video_width = int(width)
            request_inference_args.num_frames = int(num_frames)
            display_resolution = str(resolution)
            backend_resolution = normalize_resolution_for_backend(display_resolution, internal_task)
            request_inference_args.resolution = backend_resolution
            request_inference_args.save_path_gen = str(save_dir)
            request_inference_args.task = internal_task
            request_inference_args.text_template = TEXT_TEMPLATE
            # Starts empty and is later handed to save_prompt_results; presumably
            # filled by validate_on_fixed_batch -- confirm.
            request_inference_args.prompt_data_dict = {}

            # Everything the except-branch reads is bound above this point.
            try:
                print(
                    "[lance_gradio_t2v_v2t] Start generation "
                    f"| task={internal_task} | gpu={self.device} | seed={actual_seed} | "
                    f"size={height}x{width} | frames={num_frames} | resolution={display_resolution}",
                    flush=True,
                )
                val_data_cpu = self._build_request_batch(
                    prompt_file=prompt_file,
                    model_args=request_model_args,
                    data_args=request_data_args,
                    inference_args=request_inference_args,
                )
                # Keep the allocator from fragmenting before the heavy forward pass.
                clean_memory()
                generate_start = time.perf_counter()
                validate_on_fixed_batch(
                    fsdp_model=self.model,
                    vae_model=self.vae_model,
                    tokenizer=self.tokenizer,
                    val_data_cpu=val_data_cpu,
                    training_args=request_inference_args,
                    model_args=request_model_args,
                    inference_args=request_inference_args,
                    new_token_ids=self.new_token_ids,
                    image_token_id=self.image_token_id,
                    device=self.device,
                    save_source_video=False,
                    save_path_gen=request_inference_args.save_path_gen,
                    save_path_gt="",
                )
                # Timing covers validate_on_fixed_batch only.
                elapsed = time.perf_counter() - generate_start
                save_prompt_results(request_inference_args.prompt_data_dict, request_inference_args.save_path_gen, self.logger)
                clean_memory()

                # Outputs are discovered by scanning the request's own directory.
                video_path = find_generated_video(save_dir) if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} else None
                original_video_path = video_path
                frame_interpolation_enabled = False
                image_path = find_generated_image(save_dir) if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} else None
                text_result = extract_text_result(save_dir) if internal_task in UNDERSTANDING_TASKS else ""
                record = {
                    "request_started_at": request_started_at,
                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
                    "status": "success",
                    "task": internal_task,
                    "model_variant": self.model_variant,
                    "model_path": active_model_path,
                    "gpu": self.device,
                    "prompt": prompt,
                    "system_prompt": normalize_understanding_system_prompt(internal_task, system_prompt)
                    if internal_task in UNDERSTANDING_TASKS
                    else "",
                    "input_video": input_video,
                    "input_image": input_image,
                    "seed": actual_seed,
                    "height": int(height),
                    "width": int(width),
                    "num_frames": int(num_frames),
                    "resolution": display_resolution,
                    "backend_resolution": backend_resolution,
                    "validation_num_timesteps": int(validation_num_timesteps),
                    "validation_timestep_shift": float(validation_timestep_shift),
                    "cfg_text_scale": float(cfg_text_scale),
                    "frame_interpolation": frame_interpolation_enabled,
                    "elapsed_seconds": round(elapsed, 3),
                    "prompt_file": str(prompt_file),
                    "output_dir": str(save_dir),
                    "original_video_path": str(original_video_path) if original_video_path is not None else "",
                    "video_path": str(video_path) if video_path is not None else "",
                    "image_path": str(image_path) if image_path is not None else "",
                    "text_result": text_result,
                    "rife_error": "",
                }
                # Downgrade the status when the expected artifact is missing.
                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} and video_path is None:
                    record["status"] = "completed_without_video"
                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} and image_path is None:
                    record["status"] = "completed_without_image"
                if internal_task in UNDERSTANDING_TASKS and not text_result:
                    record["status"] = "completed_without_text"
                save_generation_record(record, save_dir)

                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT}:
                    if video_path is None:
                        status = (
                            "Inference completed, but no output video was found.\n\n"
                            f"- Task: `{internal_task}`\n"
                            f"- Model: `{self.model_variant}`\n"
                            f"- Model path: `{active_model_path}`\n"
                            f"- GPU: `{self.device}`\n"
                            f"- Actual seed: `{actual_seed}`\n"
                            f"- Output directory: `{save_dir}`"
                        )
                        return None, None, "", status
                    return str(video_path), None, "", ""

                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT}:
                    if image_path is None:
                        status = (
                            "Inference completed, but no output image was found.\n\n"
                            f"- Task: `{internal_task}`\n"
                            f"- Model: `{self.model_variant}`\n"
                            f"- Model path: `{active_model_path}`\n"
                            f"- GPU: `{self.device}`\n"
                            f"- Actual seed: `{actual_seed}`\n"
                            f"- Output directory: `{save_dir}`"
                        )
                        return None, None, "", status
                    return None, str(image_path), "", ""

                # Remaining tasks return text; an empty answer yields an empty status too.
                return None, None, text_result, ""
            except Exception:
                error_trace = traceback.format_exc()
                print(error_trace, flush=True)
                # NOTE(review): compared with the success record, this one omits
                # system_prompt, frame_interpolation, elapsed_seconds,
                # original_video_path and rife_error, and adds "error".
                record = {
                    "request_started_at": request_started_at,
                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
                    "status": "failed",
                    "task": internal_task,
                    "model_variant": self.model_variant,
                    "model_path": active_model_path,
                    "gpu": self.device,
                    "prompt": prompt,
                    "input_video": input_video,
                    "input_image": input_image,
                    "seed": actual_seed,
                    "height": int(height),
                    "width": int(width),
                    "num_frames": int(num_frames),
                    "resolution": display_resolution,
                    "backend_resolution": backend_resolution,
                    "validation_num_timesteps": int(validation_num_timesteps),
                    "validation_timestep_shift": float(validation_timestep_shift),
                    "cfg_text_scale": float(cfg_text_scale),
                    "prompt_file": str(prompt_file),
                    "output_dir": str(save_dir),
                    "video_path": "",
                    "image_path": "",
                    "text_result": "",
                    "error": error_trace,
                }
                save_generation_record(record, save_dir)
                # The traceback goes to the log and the record, not to the status text.
                status = (
                    "Inference failed.\n\n"
                    f"- Task: `{internal_task}`\n"
                    f"- Model: `{self.model_variant}`\n"
                    f"- Model path: `{active_model_path}`\n"
                    f"- GPU: `{self.device}`\n"
                    f"- Actual seed: `{actual_seed}`\n"
                    f"- Resolution: `{display_resolution}`\n"
                    f"- Output directory: `{save_dir}`"
                )
                return None, None, "", status


class PipelinePool:
    """Pool of per-GPU ``LanceT2VV2TPipeline`` objects serving one model variant.

    One pipeline is constructed for every configured GPU id. A request checks a
    pipeline out with :meth:`acquire`, runs on it, and hands it back with
    :meth:`release`; :meth:`generate` wraps that whole cycle.
    """

    def __init__(self, gpu_ids: list[int], model_variant: str = MODEL_VARIANT_VIDEO) -> None:
        """Build one pipeline per entry of ``gpu_ids`` for the normalized variant.

        Raises:
            ValueError: If ``gpu_ids`` is empty.
        """
        if not gpu_ids:
            raise ValueError("At least one GPU must be configured.")
        self.gpu_ids = gpu_ids
        self.model_variant = normalize_model_variant(model_variant)
        self.pipelines = [
            LanceT2VV2TPipeline(device_id=gpu_id, model_variant=self.model_variant)
            for gpu_id in gpu_ids
        ]
        # Pipelines that are currently idle; guarded by ``_condition``.
        self._available = deque(self.pipelines)
        self._condition = threading.Condition()

    @property
    def size(self) -> int:
        """Number of pipelines owned by the pool."""
        return len(self.pipelines)

    @property
    def gpu_summary(self) -> str:
        """Comma-separated list of the configured GPU ids."""
        return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)

    @property
    def is_initialized(self) -> bool:
        """True when every pipeline reports ``initialized``."""
        return all(pipeline.initialized for pipeline in self.pipelines)

    def initialize_all(self) -> None:
        """Call ``initialize()`` on every pipeline in parallel, one thread each.

        Does nothing when the pool is already initialized.

        Raises:
            RuntimeError: If any pipeline failed to initialize; the first
                collected failure is chained as the cause.
        """
        if self.is_initialized:
            return
        print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
        exceptions: list[Exception] = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
            futures = {
                executor.submit(pipeline.initialize): pipeline.device for pipeline in self.pipelines
            }
            for future in concurrent.futures.as_completed(futures):
                gpu_id = futures[future]
                try:
                    future.result()
                except Exception as exc:
                    # Keep collecting so every failing GPU is logged before raising.
                    print(f"[startup][gpu:{gpu_id}][{self.model_variant}] Preload failed: {exc}", flush=True)
                    exceptions.append(exc)
        if exceptions:
            raise RuntimeError(
                f"{self.model_variant} preload failed on {len(exceptions)} GPU(s). Please check the terminal logs."
            ) from exceptions[0]
        print(
            f"[startup][{self.model_variant}] GPU preload finished. Ready to handle {self.size} concurrent request(s).",
            flush=True,
        )

    def acquire(self) -> LanceT2VV2TPipeline:
        """Block until a pipeline is idle, then remove it from the idle queue and return it."""
        with self._condition:
            while not self._available:
                self._condition.wait()
            return self._available.popleft()

    def release(self, pipeline: LanceT2VV2TPipeline) -> None:
        """Return ``pipeline`` to the idle queue and wake every waiter."""
        with self._condition:
            self._available.append(pipeline)
            # Two different predicates wait on this condition: acquire() waits
            # for "any pipeline idle" and unload_all() waits for "all pipelines
            # idle". A single notify() could be consumed by unload_all() and
            # leave an acquire() waiter asleep forever, so wake them all and
            # let each re-check its own predicate.
            self._condition.notify_all()

    def unload_all(self) -> None:
        """Wait until every pipeline is idle, unload each one, and free cached CUDA memory."""
        print(f"[runtime][{self.model_variant}] Unloading model pool from GPU(s): {self.gpu_ids}", flush=True)
        with self._condition:
            while len(self._available) != len(self.pipelines):
                self._condition.wait()

        # The condition lock is released here; unloading runs without holding it.
        for pipeline in self.pipelines:
            pipeline.unload()

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        print(f"[runtime][{self.model_variant}] Model pool unloaded.", flush=True)

    def generate(
        self,
        task: str,
        prompt: str,
        system_prompt: Optional[str],
        input_video: Optional[str],
        input_image: Optional[str],
        height: int,
        width: int,
        num_frames: int,
        seed: int,
        resolution: str,
        validation_num_timesteps: int,
        validation_timestep_shift: float,
        cfg_text_scale: float,
        enable_frame_interpolation: bool,
    ):
        """Run one request on the next idle pipeline and return that pipeline's result.

        All arguments are forwarded unchanged to the pipeline's ``generate``.
        The pipeline is returned to the pool even when generation raises.
        """
        pipeline = self.acquire()
        try:
            return pipeline.generate(
                task=task,
                prompt=prompt,
                system_prompt=system_prompt,
                input_video=input_video,
                input_image=input_image,
                height=height,
                width=width,
                num_frames=num_frames,
                seed=seed,
                resolution=resolution,
                validation_num_timesteps=validation_num_timesteps,
                validation_timestep_shift=validation_timestep_shift,
                cfg_text_scale=cfg_text_scale,
                enable_frame_interpolation=enable_frame_interpolation,
            )
        finally:
            self.release(pipeline)


# The pipeline pool currently held by this process; stays None until
# get_pipeline_pool() creates one, and is replaced when the model variant changes.
ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
# Held while ACTIVE_PIPELINE_POOL is read or replaced.
ACTIVE_POOL_LOCK = threading.Lock()
# Initial queue settings; both values are shown in the status line built by
# build_status_markdown().
QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
QUEUE_CONCURRENCY_LIMIT = DEFAULT_CONCURRENCY_LIMIT


def get_task_model_variant(task: str) -> str:
    """Return the model variant for ``task``: the image variant for image tasks, else the video variant."""
    if normalize_task(task) in IMAGE_TASKS:
        return MODEL_VARIANT_IMAGE
    return MODEL_VARIANT_VIDEO


def get_env_int(name: str, default: int) -> int:
    """Return environment variable ``name`` parsed as an int.

    ``default`` is returned when the variable is unset or its value cannot be
    parsed as an integer.
    """
    try:
        parsed = int(os.environ.get(name, str(default)))
    except (TypeError, ValueError):
        parsed = default
    return parsed


def ensure_flash_attn_installed() -> None:
    """Make sure the pinned flash-attn wheel is the installed version.

    Returns immediately when the installed ``flash_attn`` distribution already
    reports ``DEFAULT_FLASH_ATTN_VERSION``. Otherwise (different version, or the
    version lookup failed for any reason) the wheel at
    ``DEFAULT_FLASH_ATTN_WHEEL_URL`` is force-reinstalled with pip, without
    dependencies and without the pip cache.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    try:
        from importlib.metadata import version as lookup_version

        installed = lookup_version("flash_attn")
        if installed != DEFAULT_FLASH_ATTN_VERSION:
            print(
                f"[startup] flash-attn {installed} detected; reinstalling {DEFAULT_FLASH_ATTN_VERSION} from wheel.",
                flush=True,
            )
        else:
            print(f"[startup] flash-attn {installed} already installed.", flush=True)
            return
    except Exception:
        # Any lookup failure is treated the same as "package not installed".
        print(
            f"[startup] flash-attn not available; installing {DEFAULT_FLASH_ATTN_VERSION} from wheel.",
            flush=True,
        )

    pip_flags = ["--no-cache-dir", "--no-deps", "--force-reinstall"]
    subprocess.check_call([sys.executable, "-m", "pip", "install", *pip_flags, DEFAULT_FLASH_ATTN_WHEEL_URL])
    print(f"[startup] flash-attn {DEFAULT_FLASH_ATTN_VERSION} installed from wheel.", flush=True)


def get_zerogpu_duration_cap() -> int:
    """Return the upper bound, in seconds, for a ZeroGPU duration request.

    The duration value is a ZeroGPU reservation/timeout hint. Shorter values can
    improve queue priority and reduce wasted quota, but the value must still cover
    model warm-up plus inference. The bound is read from the environment and
    defaults to 300; it is never lower than 1. Override per deployment when needed:
        LANCE_ZEROGPU_MAX_DURATION_SECONDS=300
    """
    configured_seconds = get_env_int("LANCE_ZEROGPU_MAX_DURATION_SECONDS", 300)
    if configured_seconds > 1:
        return configured_seconds
    return 1


def clamp_zerogpu_duration(seconds: int) -> int:
    """Clamp ``seconds`` (converted to int) into the range ``[1, get_zerogpu_duration_cap()]``."""
    requested = int(seconds)
    ceiling = get_zerogpu_duration_cap()
    bounded = requested if requested <= ceiling else ceiling
    return bounded if bounded > 1 else 1


# Snapshot of the duration cap taken once when this module is loaded; later
# changes to the environment variable are not reflected in this constant.
ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()


def is_pipeline_pool_ready_for_variant(model_variant: str) -> bool:
    """Report whether the active pool exists, matches ``model_variant``, and is initialized.

    The check is made while holding ``ACTIVE_POOL_LOCK``.
    """
    wanted_variant = normalize_model_variant(model_variant)
    with ACTIVE_POOL_LOCK:
        pool = ACTIVE_PIPELINE_POOL
        if pool is None:
            return False
        if not pool.model_variant == wanted_variant:
            return False
        return bool(pool.is_initialized)


def is_pipeline_pool_ready_for_task(task: str) -> bool:
    """Report whether the active pool is already initialized for the variant that ``task`` needs."""
    required_variant = get_task_model_variant(task)
    return is_pipeline_pool_ready_for_variant(required_variant)


def get_pipeline_pool(task: str) -> PipelinePool:
    """Return an initialized pipeline pool for the model variant that ``task`` needs.

    The active pool is reused when its variant already matches (initializing it
    first if needed). Otherwise any existing pool is unloaded and a new pool is
    created from the GPU ids in ``LANCE_GPUS`` (default ``DEFAULT_GPUS``) and
    initialized. All of this runs while holding ``ACTIVE_POOL_LOCK``.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    global ACTIVE_PIPELINE_POOL
    if not torch.cuda.is_available():
        raise RuntimeError(
            "Lance inference requires a GPU. The Gradio UI can start on CPU, but generation is disabled "
            "until GPU hardware is attached."
        )

    model_variant = get_task_model_variant(task)
    gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))

    with ACTIVE_POOL_LOCK:
        current_pool = ACTIVE_PIPELINE_POOL
        if current_pool is not None:
            if current_pool.model_variant == model_variant:
                # Same variant: reuse, finishing initialization if it is incomplete.
                if not current_pool.is_initialized:
                    current_pool.initialize_all()
                return current_pool

            # Different variant: drop the old pool before building the new one.
            previous_variant = current_pool.model_variant
            print(
                f"[runtime] Switching Lance model from {previous_variant} to {model_variant}.",
                flush=True,
            )
            current_pool.unload_all()
            ACTIVE_PIPELINE_POOL = None

        fresh_pool = PipelinePool(gpu_ids, model_variant=model_variant)
        ACTIVE_PIPELINE_POOL = fresh_pool
        fresh_pool.initialize_all()
        return fresh_pool


def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
    """Turn a heuristic estimate into the final ZeroGPU duration request.

    A non-positive estimate is replaced by the task's heuristic computed from
    placeholder request values. The estimate is then scaled by 1.15, rounded up,
    increased by 5 seconds, and clamped to the deployment cap.
    """
    task_key = normalize_task(task)
    seconds = float(estimated_seconds)
    if seconds <= 0:
        placeholder_request = {
            "prompt": "",
            "system_prompt": None,
            "input_video": None,
            "input_image": None,
            "height": 0,
            "width": 0,
            "num_frames": 0,
            "seed": 0,
            "resolution": "",
            "validation_num_timesteps": 0,
            "validation_timestep_shift": 0.0,
            "cfg_text_scale": 0.0,
            "enable_frame_interpolation": False,
        }
        seconds = _estimate_zerogpu_duration_seconds(task_key, **placeholder_request)
    padded_seconds = math.ceil(seconds * 1.15) + 5
    return clamp_zerogpu_duration(padded_seconds)


def _estimate_zerogpu_duration_seconds(
    task: str,
    prompt: str,
    system_prompt: Optional[str],
    input_video: Optional[str],
    input_image: Optional[str],
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    enable_frame_interpolation: bool,
) -> int:
    """Return a heuristic GPU-time estimate, in seconds, for one request.

    The signature mirrors ``run_task_gpu``; only ``task``, ``prompt``,
    ``input_video``, ``num_frames``, ``resolution`` and
    ``enable_frame_interpolation`` influence the result. Every task has a lower
    estimate when the matching pipeline pool is already initialized.

    Returns:
        The estimate for the six known tasks, or 160 for any other task value.
    """
    internal_task = normalize_task(task)
    prompt_length = len((prompt or "").strip())
    has_video_input = bool((input_video or "").strip())
    pool_ready = is_pipeline_pool_ready_for_task(internal_task)

    # Image generation / editing / understanding: flat estimates.
    if internal_task == TASK_T2I:
        return 90 if pool_ready else 150

    if internal_task == TASK_IMAGE_EDIT:
        return 100 if pool_ready else 150

    if internal_task == TASK_X2T_IMAGE:
        return 90 if pool_ready else 150

    if internal_task == TASK_X2T_VIDEO:
        return 120 if pool_ready else 200

    if internal_task == TASK_VIDEO_EDIT:
        base = 170 if pool_ready else 300
        # Frames beyond 37 add time, up to a cap that depends on pool readiness.
        base += min(30 if pool_ready else 48, max(0, num_frames - 37) // 3)
        base += 24 if enable_frame_interpolation else 0
        base += 16 if has_video_input else 0
        base += 10 if resolution == "video_480p" else 0
        return base

    if internal_task == TASK_T2V:
        if pool_ready:
            base = 130 if resolution == "video_360p" else 150
            base += min(36, max(0, num_frames - 37) // 3)
            base += 18 if enable_frame_interpolation else 0
            base += min(12, prompt_length // 320)
            return base
        # Pool not initialized yet: larger base and steeper per-frame / per-prompt terms.
        base = 224 if resolution == "video_360p" else 264
        base += min(56, max(0, num_frames - 37) // 2)
        base += 28 if enable_frame_interpolation else 0
        base += min(20, prompt_length // 260)
        return base

    # Unrecognized task: generic fallback.
    return 160


def get_run_task_gpu_duration(
    task: str,
    prompt: str,
    system_prompt: Optional[str],
    input_video: Optional[str],
    input_image: Optional[str],
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    enable_frame_interpolation: bool,
) -> int:
    """Compute the ZeroGPU duration, in seconds, for a ``run_task_gpu`` call.

    Takes the same arguments as ``run_task_gpu``, which passes this function as
    the ``duration`` of its ``spaces.GPU`` decorator. The incoming
    ``enable_frame_interpolation`` value is ignored and treated as ``False``.
    """
    request_fields = dict(
        task=task,
        prompt=prompt,
        system_prompt=system_prompt,
        input_video=input_video,
        input_image=input_image,
        height=height,
        width=width,
        num_frames=num_frames,
        seed=seed,
        resolution=resolution,
        validation_num_timesteps=validation_num_timesteps,
        validation_timestep_shift=validation_timestep_shift,
        cfg_text_scale=cfg_text_scale,
        enable_frame_interpolation=False,
    )
    heuristic_seconds = _estimate_zerogpu_duration_seconds(**request_fields)
    return finalize_zerogpu_duration(heuristic_seconds, task)


def run_task(
    task: str,
    prompt: str,
    system_prompt: Optional[str],
    input_video: Optional[str],
    input_image: Optional[str],
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    enable_frame_interpolation: bool,
):
    """Validate a request, serve it from the recommended-case cache if possible, else run it on GPU.

    Returns:
        On a validation failure, the 4-tuple ``(None, None, "", message)``.
        Otherwise the cached result, or the value returned by ``run_task_gpu``
        (which is also offered to the cache before being returned).
    """
    internal_task = normalize_task(task)
    # The incoming system prompt may carry a recommended-case key; split it off.
    recommended_case_key, system_prompt = unpack_recommended_cache_carrier(system_prompt)
    if not recommended_case_key:
        recommended_case_key = infer_recommended_case_key_from_request(internal_task, prompt, input_video, input_image)

    # Checks run in order and stop at the first failure.
    validation_error = None
    if internal_task in UNDERSTANDING_TASKS and not prompt:
        validation_error = "Please enter a question."
    elif internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
        validation_error = "Please upload an input video."
    elif internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
        validation_error = "Please upload an input image."
    elif height <= 0 or width <= 0:
        validation_error = "Height and width must be greater than 0."
    elif num_frames <= 0:
        validation_error = "The number of frames must be greater than 0."
    if validation_error is not None:
        return None, None, "", validation_error

    num_frames_ui = int(num_frames)
    normalized_resolution = normalize_resolution_for_backend(str(resolution), internal_task)
    aspect_ratio = _infer_aspect_ratio_from_size(internal_task, int(width), int(height), normalized_resolution)

    # Ignore any stale interpolation value from old browser sessions before
    # building the cache signature, because interpolation is disabled in this UI.
    enable_frame_interpolation = False

    request_signature = build_recommended_request_signature(
        task=internal_task,
        prompt=prompt,
        system_prompt=system_prompt,
        input_video=input_video,
        input_image=input_image,
        height=int(height),
        width=int(width),
        num_frames_ui=num_frames_ui,
        seed=int(seed),
        resolution=normalized_resolution,
        validation_num_timesteps=int(validation_num_timesteps),
        validation_timestep_shift=float(validation_timestep_shift),
        cfg_text_scale=float(cfg_text_scale),
        enable_frame_interpolation=enable_frame_interpolation,
    )

    # Identical lookup context is used for both the cache read and the cache write.
    cache_context = {
        "resolution": normalized_resolution,
        "aspect_ratio": aspect_ratio,
        "duration_seconds": num_frames_ui,
        "request_signature": request_signature,
    }

    cached_result = get_recommended_cached_result(recommended_case_key, internal_task, **cache_context)
    if cached_result is not None:
        return cached_result

    # For text-to-video the UI value is converted from seconds into a frame count.
    backend_num_frames = video_seconds_to_num_frames(num_frames_ui) if internal_task == TASK_T2V else num_frames
    result = run_task_gpu(
        task=task,
        prompt=prompt,
        system_prompt=system_prompt,
        input_video=input_video,
        input_image=input_image,
        height=height,
        width=width,
        num_frames=backend_num_frames,
        seed=seed,
        resolution=normalized_resolution,
        validation_num_timesteps=validation_num_timesteps,
        validation_timestep_shift=validation_timestep_shift,
        cfg_text_scale=cfg_text_scale,
        enable_frame_interpolation=enable_frame_interpolation,
    )
    store_recommended_cached_result(recommended_case_key, result, **cache_context)
    return result


@spaces.GPU(size="large", duration=get_run_task_gpu_duration)
def run_task_gpu(
    task: str,
    prompt: str,
    system_prompt: Optional[str],
    input_video: Optional[str],
    input_image: Optional[str],
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    enable_frame_interpolation: bool,
):
    """Run one request on the pipeline pool for ``task`` and return the pool's result.

    The ``spaces.GPU`` decorator receives ``get_run_task_gpu_duration`` as its
    ``duration``. Every argument is forwarded unchanged to
    ``PipelinePool.generate``.
    """
    pool = get_pipeline_pool(task)
    forwarded = {
        "task": task,
        "prompt": prompt,
        "system_prompt": system_prompt,
        "input_video": input_video,
        "input_image": input_image,
        "height": height,
        "width": width,
        "num_frames": num_frames,
        "seed": seed,
        "resolution": resolution,
        "validation_num_timesteps": validation_num_timesteps,
        "validation_timestep_shift": validation_timestep_shift,
        "cfg_text_scale": cfg_text_scale,
        "enable_frame_interpolation": enable_frame_interpolation,
    }
    return pool.generate(**forwarded)


def build_status_markdown() -> str:
    """Return a one-line Markdown status summary.

    GPU ids, slot count and active model come from the active pool, read under
    ``ACTIVE_POOL_LOCK``; with no pool they read ``unknown``, ``0`` and ``none``.
    """
    with ACTIVE_POOL_LOCK:
        pool = ACTIVE_PIPELINE_POOL
        if pool is None:
            active_variant, gpu_text, pipeline_slots = "none", "unknown", 0
        else:
            active_variant = pool.model_variant
            gpu_text = pool.gpu_summary
            pipeline_slots = pool.size

    segments = [
        f"**Status**  GPU: `{gpu_text}`",
        f"Queue concurrency: `{QUEUE_CONCURRENCY_LIMIT}`",
        f"Pipeline slots: `{pipeline_slots}`",
        f"Queue limit: `{QUEUE_MAX_SIZE}`",
        f"Active model: `{active_variant}`",
    ]
    return "  |  ".join(segments)


def build_running_status_markdown() -> str:
    """Return the fixed status text used while a request is running."""
    running_text = "Running..."
    return running_text


def get_logo_data_uri() -> str:
    """Return the logo file as a base64 ``data:image/webp`` URI, or ``""`` if the file is missing."""
    if LANCE_LOGO_PATH.exists():
        logo_b64 = base64.b64encode(LANCE_LOGO_PATH.read_bytes())
        return "data:image/webp;base64," + logo_b64.decode("ascii")
    return ""


def build_header_html() -> str:
    """Return the HTML for the page header: optional logo, title, and four badge links.

    The logo ``<img>`` is left out when ``get_logo_data_uri()`` returns an empty string.
    """
    logo_data_uri = get_logo_data_uri()
    if logo_data_uri:
        logo_html = f'<img class="lance-logo" src="{logo_data_uri}" alt="Lance logo">'
    else:
        logo_html = ""

    header_markup = f"""
    <div class="lance-hero">
        {logo_html}
        <h1 class="lance-title">Lance: Unified Multimodal Modeling by Multi-Task Synergy</h1>
        <div class="lance-badges">
            <a href="{LANCE_HOMEPAGE_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="Homepage" src="https://img.shields.io/badge/Homepage-Lance-2563eb?style=flat&labelColor=475569">
            </a>
            <a href="{LANCE_PAPER_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="Paper" src="https://img.shields.io/badge/Paper-arXiv-2563eb?style=flat&labelColor=475569&logo=arxiv">
            </a>
            <a href="{LANCE_HUGGING_FACE_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="Hugging Face" src="https://img.shields.io/badge/Model-HuggingFace-2563eb?style=flat&labelColor=475569&logo=huggingface">
            </a>
            <a href="{LANCE_GITHUB_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="GitHub" src="https://img.shields.io/badge/Code-GitHub-2563eb?style=flat&labelColor=475569&logo=github">
            </a>
        </div>
    </div>
    """
    return header_markup


def update_task_ui(task: str):
    """Build the UI updates applied when the selected task changes.

    Returns:
        A 28-item tuple: 27 ``gr.update(...)`` objects followed by an empty
        string. The order is positional and must match the outputs wired to
        this callback (the wiring is outside this function).
    """
    internal_task = normalize_task(task)
    is_image_task = internal_task in IMAGE_TASKS
    is_edit_task = internal_task in EDIT_TASKS
    is_understanding_task = internal_task in UNDERSTANDING_TASKS
    is_text_to_visual_task = internal_task in {TASK_T2V, TASK_T2I}
    resolution_choices = get_resolution_choices_for_task(internal_task)
    resolution_value = get_default_resolution_for_task(internal_task)
    aspect_ratio_value = DEFAULT_IMAGE_ASPECT_RATIO if is_image_task else DEFAULT_VIDEO_ASPECT_RATIO
    width_value, height_value = get_size_for_aspect_ratio(internal_task, aspect_ratio_value, resolution_value)
    size_markdown = format_size_markdown(internal_task, width_value, height_value)
    system_prompt_choices = get_understanding_system_prompt_choices(internal_task)

    # Label and placeholder of the main text box depend on the task family.
    if is_text_to_visual_task:
        text_label = "Prompt"
        text_placeholder = "Describe what you want to generate..."
    elif is_edit_task:
        text_label = "Instruction"
        text_placeholder = "Describe the edit you want..."
    else:
        text_label = "Question"
        text_placeholder = "Ask a question about the input..."

    # Output label and its icon are chosen together from the output modality.
    if internal_task in {TASK_T2V, TASK_VIDEO_EDIT}:
        output_label = "Output Video"
        output_icon = "video"
    elif internal_task in {TASK_T2I, TASK_IMAGE_EDIT}:
        output_label = "Output Image"
        output_icon = "image"
    else:
        output_label = "Output Text"
        output_icon = "text"

    show_aspect_ratio = is_text_to_visual_task
    show_input_video = internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
    show_input_image = internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
    show_video_resolution_settings = internal_task == TASK_T2V

    return (
        gr.update(value=build_lance_label_html(text_label, "lance-prompt-label")),
        gr.update(
            label=text_label,
            placeholder=text_placeholder,
            visible=True,
            value="",
        ),
        gr.update(
            choices=system_prompt_choices,
            value=system_prompt_choices[0],
            visible=False,
        ),
        # Switching task pages should always start from a clean input state.
        # Clear both visual input boxes even if one of them stays visible across tasks.
        gr.update(label="Input Video", visible=show_input_video, value=None),
        gr.update(label="Input Image", visible=show_input_image, value=None),
        gr.update(visible=False),
        gr.update(visible=show_aspect_ratio),
        gr.update(visible=False),
        gr.update(visible=internal_task == TASK_T2V),
        gr.update(visible=show_video_resolution_settings),
        gr.update(choices=get_aspect_ratio_choices_for_task(internal_task), value=aspect_ratio_value, visible=show_aspect_ratio),
        gr.update(value=height_value),
        gr.update(value=width_value),
        gr.update(visible=False, value=False),
        gr.update(choices=get_output_resolution_choices_for_task(internal_task, resolution_value), value=size_markdown, visible=False),
        gr.update(visible=internal_task == TASK_T2V, value=DEFAULT_VIDEO_DURATION_SECONDS),
        gr.update(choices=resolution_choices, value=resolution_value, visible=show_video_resolution_settings),
        gr.update(value=build_lance_icon_label_html(output_label, output_icon, "lance-output-label")),
        gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}),
        gr.update(visible=internal_task in {TASK_T2I, TASK_IMAGE_EDIT}),
        gr.update(visible=is_understanding_task, value=""),
        # One visibility toggle per task, in the order T2V, video edit, video
        # understanding, T2I, image edit, image understanding.
        gr.update(visible=internal_task == TASK_T2V),
        gr.update(visible=internal_task == TASK_VIDEO_EDIT),
        gr.update(visible=internal_task == TASK_X2T_VIDEO),
        gr.update(visible=internal_task == TASK_T2I),
        gr.update(visible=internal_task == TASK_IMAGE_EDIT),
        gr.update(visible=internal_task == TASK_X2T_IMAGE),
        "",
    )


def build_demo() -> gr.Blocks:
    """Build the Lance Gradio interface and wire up its event handlers.

    The Blocks app is assembled in this order:

    1. Header HTML and a hidden status Markdown.
    2. The task selector radio.
    3. A two-column main row: the first column holds the prompt, the option
       dropdowns, hidden helper components, the media inputs and the advanced
       parameters; the second column holds the video/image/text outputs and
       the run status.
    4. A ``gr.State`` for the recommended case key, the run button and a
       quota note.
    5. Six "recommended cases" sections (one per task), of which only the
       video-generation one is visible initially.
    6. Event wiring for task changes, aspect-ratio/resolution changes,
       example clicks and the run button.

    Returns:
        The constructed ``gr.Blocks`` instance. It is neither queued nor
        launched here.
    """
    with gr.Blocks(title="Lance", css=APP_CSS, js=APP_JS) as demo:
        gr.HTML(build_header_html())
        # The status Markdown is created but kept hidden.
        gr.Markdown(build_status_markdown(), elem_classes=["lance-status"], visible=False)

        # Task selector. The initial value is the video-generation label, and the
        # component defaults below are the ones for TASK_T2V.
        with gr.Column(elem_classes=["lance-taskbar-wrap"]):
            task = gr.Radio(
                label="Task",
                show_label=False,
                choices=TASK_CHOICES,
                value=TASK_LABEL_VIDEO_GENERATION,
                elem_classes=["task-selector"],
            )

        with gr.Row(elem_classes=["lance-main-row"]):
            # ---- Input column -------------------------------------------------
            with gr.Column(scale=1, elem_classes=["lance-main-column", "lance-input-column"]):
                with gr.Column(elem_classes=["lance-panel", "lance-task-prompt-panel"]):
                    # The visible label is rendered as custom HTML; the Textbox's own
                    # label is suppressed with show_label=False.
                    prompt_label = gr.HTML(build_lance_label_html("Prompt", "lance-prompt-label"), elem_classes=["lance-label-html"])
                    prompt = gr.Textbox(
                        label="Prompt",
                        show_label=False,
                        lines=6,
                        placeholder="Describe the video you want to generate...",
                        elem_classes=["main-prompt-control"],
                    )
                    # Option "chips" shown under the prompt. Each dropdown lives in its
                    # own Group so task.change can toggle the whole row's visibility.
                    with gr.Row(elem_classes=["prompt-options"]):
                        with gr.Group(elem_classes=["prompt-chip", "video-resolution-row"]) as video_resolution_row:
                            resolution = gr.Dropdown(
                                label="Video Resolution",
                                show_label=False,
                                choices=VIDEO_RESOLUTION_DISPLAY_CHOICES,
                                value=DEFAULT_RESOLUTION,
                                allow_custom_value=True,
                                elem_classes=["generation-control"],
                            )
                        with gr.Group(elem_classes=["prompt-chip", "aspect-ratio-row"]) as aspect_ratio_row:
                            aspect_ratio = gr.Dropdown(
                                label="Aspect Ratio",
                                show_label=False,
                                choices=get_aspect_ratio_choices_for_task(TASK_T2V),
                                value=DEFAULT_VIDEO_ASPECT_RATIO,
                                elem_classes=["generation-control"],
                            )
                        with gr.Group(elem_classes=["prompt-chip", "video-duration-row"]) as video_duration_row:
                            # NOTE(review): despite the name, this dropdown is labelled
                            # "Video Duration" and defaults to DEFAULT_VIDEO_DURATION_SECONDS;
                            # the value is passed to run_task as `num_frames` — confirm how
                            # run_task interprets it.
                            num_frames = gr.Dropdown(
                                label="Video Duration",
                                show_label=False,
                                choices=get_video_duration_choices(),
                                value=DEFAULT_VIDEO_DURATION_SECONDS,
                                elem_classes=["generation-control"],
                            )
                        # Hidden, non-interactive dropdown that is only written to by
                        # handlers (task.change, aspect_ratio.change, resolution.change
                        # and the example clicks).
                        with gr.Group(visible=False, elem_classes=["prompt-chip", "output-resolution-row"]) as output_resolution_row:
                            real_size = gr.Dropdown(
                                label="Output Resolution",
                                show_label=False,
                                choices=get_output_resolution_choices_for_task(TASK_T2V),
                                value=format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT),
                                interactive=False,
                                visible=False,
                                allow_custom_value=True,
                                elem_classes=["generation-control"],
                            )

                # Hidden compatibility components for old callbacks; frame interpolation is disabled.
                with gr.Group(visible=False, elem_classes=["frame-interpolation-row", "frame-interpolation-disabled"]) as frame_interpolation_row:
                    enable_frame_interpolation = gr.Checkbox(value=False, visible=False)

                # The system prompt and both media inputs start hidden; task.change
                # includes them in its outputs and decides which ones to show.
                system_prompt = gr.Dropdown(
                    label="System Prompt",
                    choices=get_understanding_system_prompt_choices(TASK_X2T_VIDEO),
                    value=V2T_QA_SYSTEM_PROMPT,
                    visible=False,
                    allow_custom_value=True,
                )
                input_video = gr.Video(label="Input Video", visible=False, elem_classes=["lance-display-frame"])
                input_image = gr.Image(label="Input Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
                # Hidden numeric fields holding the output height/width. They are
                # updated by the handlers below and forwarded to run_task.
                height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
                width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)

                # Sampling parameters, collapsed by default.
                with gr.Accordion("Advanced Parameters", open=False, elem_classes=["lance-advanced-accordion"]):
                    seed = gr.Number(label="Seed (-1 for random seed)", value=DEFAULT_BASIC_SEED, precision=0)
                    validation_num_timesteps = gr.Slider(
                        minimum=1,
                        maximum=50,
                        step=1,
                        value=DEFAULT_TIMESTEPS,
                        label="Validation Num Timesteps",
                    )
                    with gr.Row():
                        validation_timestep_shift = gr.Number(label="Validation Timestep Shift", value=DEFAULT_TIMESTEP_SHIFT)
                        cfg_text_scale = gr.Number(label="CFG Text Scale", value=DEFAULT_CFG_TEXT_SCALE)

            # ---- Output column ------------------------------------------------
            with gr.Column(scale=1, elem_classes=["lance-main-column", "lance-output-column"]):
                with gr.Column(elem_classes=["lance-panel", "lance-output-panel"]):
                    output_label = gr.HTML(
                        build_lance_icon_label_html("Output Video", "video", "lance-output-label"),
                        elem_classes=["lance-label-html"],
                    )
                    # Three output widgets are created up front; only the video one is
                    # visible initially, and task.change toggles their visibility.
                    output_video = gr.Video(label="Output Video", show_label=False, elem_classes=["lance-display-frame", "output-media-control"])
                    output_image = gr.Image(label="Output Image", show_label=False, type="filepath", visible=False, elem_classes=["lance-display-frame", "output-media-control"])
                    output_text = gr.Textbox(label="Output Text", show_label=False, lines=3, visible=False, elem_classes=["lance-display-frame", "output-text-control"])
                status = gr.Markdown("", elem_classes=["lance-run-status"])

        # Per-session state, initially "". Within this function it is only written
        # by task.change (last entry of its outputs); it is not an input of run_task.
        recommended_case_key = gr.State("")

        run_button = gr.Button("🚀 Generate", variant="primary", elem_classes=["lance-run-button"])
        gr.Markdown(
            "**Note**: Video-related features may consume more GPU quota and take longer. Cached recommended cases and image tasks are lighter.",
            elem_classes=["lance-quota-note"],
        )

        def build_prompt_example_table(examples: list[list], media_type: Optional[str] = None):
            """Render one button per example row, optionally with a media preview.

            Row layout as read by this function:
              * ``row[0]``  - prompt text.
              * ``row[-1]`` - cache key, used only if it is a key of
                ``RECOMMENDED_CASE_CACHE`` (otherwise ``""``).
              * ``media_type == "video"``: ``row[1]`` is the preview video and
                ``row[2]`` the input video (falls back to the preview).
              * ``media_type == "image"``: ``row[3]`` is the preview image
                (falls back to ``row[2]``) and ``row[4]`` the input image
                (falls back to the preview).

            Args:
                examples: Example rows in the layout described above.
                media_type: ``"video"``, ``"image"`` or ``None`` for text-only
                    examples (a plain button without a preview cell).

            Returns:
                A list of ``(button, prompt, input_video_path,
                input_image_path, cache_key)`` tuples, one per row. The path
                that does not apply to ``media_type`` is ``None``.
            """
            example_buttons = []
            with gr.Column(elem_classes=["prompt-example-full-table"]):
                for row in examples:
                    example_prompt = str(row[0]) if row else ""
                    # Only keep the trailing cell as a cache key when it is a known key.
                    example_cache_key = str(row[-1]) if row and str(row[-1]) in RECOMMENDED_CASE_CACHE else ""

                    # Both pairs default to None so the tuple appended below always
                    # has a value for the media type that does not apply.
                    preview_video_path = input_video_path = None
                    preview_image_path = input_image_path = None
                    if media_type == "video":
                        preview_video_path = str(row[1]) if len(row) > 1 and row[1] else None
                        input_video_path = str(row[2]) if len(row) > 2 and row[2] else preview_video_path
                    elif media_type == "image":
                        preview_image_path = str(row[3]) if len(row) > 3 and row[3] else (str(row[2]) if len(row) > 2 and row[2] else None)
                        input_image_path = str(row[4]) if len(row) > 4 and row[4] else preview_image_path

                    # Cap the button label at 360 characters (357 + "..."); the full
                    # prompt is still what gets stored in the returned tuple.
                    button_label = example_prompt if len(example_prompt) <= 360 else f"{example_prompt[:357]}..."

                    if media_type in {"video", "image"}:
                        # Two-cell row: prompt button plus an HTML media preview.
                        with gr.Row(elem_classes=["prompt-example-multimodal-row"]):
                            with gr.Column(elem_classes=["prompt-example-prompt-cell"]):
                                example_button = gr.Button(
                                    button_label,
                                    variant="secondary",
                                    elem_classes=["prompt-example-row-button"],
                                )
                            with gr.Column(elem_classes=["prompt-example-media-cell"]):
                                if media_type == "video":
                                    # The input video is passed as the fallback media path.
                                    gr.HTML(
                                        build_example_media_html(preview_video_path, "video", fallback_media_path=input_video_path),
                                        elem_classes=["prompt-example-media-html"],
                                    )
                                else:
                                    gr.HTML(
                                        build_example_media_html(preview_image_path, "image"),
                                        elem_classes=["prompt-example-media-html"],
                                    )
                    else:
                        # Text-only example: a single button without a preview cell.
                        example_button = gr.Button(
                            button_label,
                            variant="secondary",
                            elem_classes=["prompt-example-row-button"],
                        )

                    example_buttons.append((example_button, example_prompt, input_video_path, input_image_path, example_cache_key))
            return example_buttons

        def examples_section(title: str, examples: list[list], media_type: Optional[str] = None, visible: bool = False):
            """Create a titled, optionally hidden section of example buttons.

            Returns:
                ``(group, buttons)`` where ``group`` is the wrapping Column
                (used to toggle the section's visibility) and ``buttons`` is
                the list returned by ``build_prompt_example_table``.
            """
            with gr.Column(visible=visible, elem_classes=["lance-recommended-section"]) as group:
                gr.HTML(build_lance_label_html(title, "lance-section-label"), elem_classes=["lance-label-html"])
                with gr.Group(elem_classes=["example-panel", "prompt-examples"]):
                    buttons = build_prompt_example_table(examples, media_type=media_type)
            return group, buttons

        # One section per task. Only the video-generation section starts visible,
        # matching the task selector's initial value.
        video_generation_examples_group, video_generation_example_buttons = examples_section(
            "Video generation recommended cases", VIDEO_GENERATION_EXAMPLES, visible=True
        )
        video_edit_examples_group, video_edit_example_buttons = examples_section(
            "Video edit recommended cases", VIDEO_EDIT_EXAMPLES, media_type="video"
        )
        video_understanding_examples_group, video_understanding_example_buttons = examples_section(
            "Video understanding recommended cases", VIDEO_UNDERSTANDING_EXAMPLES, media_type="video"
        )
        image_generation_examples_group, image_generation_example_buttons = examples_section(
            "Image generation recommended cases", IMAGE_GENERATION_EXAMPLES
        )
        image_edit_examples_group, image_edit_example_buttons = examples_section(
            "Image edit recommended cases", IMAGE_EDIT_EXAMPLES, media_type="image"
        )
        image_understanding_examples_group, image_understanding_example_buttons = examples_section(
            "Image understanding recommended cases", IMAGE_UNDERSTANDING_EXAMPLES, media_type="image"
        )

        # Switching tasks reconfigures the whole page. Gradio assigns the handler's
        # return values to these components positionally, so this list must stay in
        # the same order (and length) as the tuple returned by update_task_ui.
        task.change(
            fn=update_task_ui,
            inputs=[task],
            outputs=[
                prompt_label,
                prompt,
                system_prompt,
                input_video,
                input_image,
                frame_interpolation_row,
                aspect_ratio_row,
                output_resolution_row,
                video_duration_row,
                video_resolution_row,
                aspect_ratio,
                height,
                width,
                enable_frame_interpolation,
                real_size,
                num_frames,
                resolution,
                output_label,
                output_video,
                output_image,
                output_text,
                video_generation_examples_group,
                video_edit_examples_group,
                video_understanding_examples_group,
                image_generation_examples_group,
                image_edit_examples_group,
                image_understanding_examples_group,
                recommended_case_key,
            ],
        )

        # Changing the aspect ratio recomputes the hidden size fields. The handler
        # bypasses the queue and is not exposed in the API docs.
        aspect_ratio.change(
            fn=update_size_from_aspect_ratio,
            inputs=[task, aspect_ratio, resolution],
            outputs=[height, width, real_size],
            queue=False,
            show_api=False,
        )
        # real_size is hidden and derived from task/resolution/aspect_ratio.
        # Do not attach a .change handler here: dynamic Dropdown choices can briefly
        # contain 360p values while the selected value is 480p (or vice versa),
        # which makes Gradio reject the stale value during preprocessing.
        # NOTE: the output order here (real_size, height, width) differs from the
        # aspect_ratio handler above (height, width, real_size).
        resolution.change(
            fn=update_output_resolution_from_video_profile,
            inputs=[task, aspect_ratio, resolution],
            outputs=[real_size, height, width],
            queue=False,
            show_api=False,
        )

        # Text-only examples (video/image generation): the media paths in each tuple
        # are ignored and the media inputs are not part of the outputs. The handler
        # is built by a factory per button, so each click uses its own prompt/key.
        for example_button, example_prompt, _, _, example_cache_key in video_generation_example_buttons + image_generation_example_buttons:
            example_button.click(
                fn=make_prompt_example_click_handler(example_prompt, example_cache_key),
                inputs=[task],
                outputs=[prompt, system_prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
                queue=False,
                show_api=False,
            )

        # Examples with reference media (edit/understanding tasks): same as above,
        # but the outputs additionally include input_video and input_image.
        for example_button, example_prompt, example_video, example_image, example_cache_key in (
            video_edit_example_buttons
            + video_understanding_example_buttons
            + image_edit_example_buttons
            + image_understanding_example_buttons
        ):
            example_button.click(
                fn=make_media_prompt_example_click_handler(example_prompt, example_video, example_image, example_cache_key),
                inputs=[task],
                outputs=[prompt, input_video, input_image, system_prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
                queue=False,
                show_api=False,
            )

        # Two-step run: first update the status Markdown without going through the
        # queue, then chain the actual (queued) run_task call.
        run_button.click(
            fn=build_running_status_markdown,
            inputs=[],
            outputs=[status],
            queue=False,
            show_api=False,
        ).then(
            fn=run_task,
            inputs=[
                task,
                prompt,
                system_prompt,
                input_video,
                input_image,
                height,
                width,
                num_frames,
                seed,
                resolution,
                validation_num_timesteps,
                validation_timestep_shift,
                cfg_text_scale,
                enable_frame_interpolation,
            ],
            outputs=[output_video, output_image, output_text, status],
            show_progress="minimal",
        )

    return demo


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the Lance Gradio launcher.

    Every option falls back to an environment variable and then to a built-in
    default:

    * ``--server-name``       <- ``GRADIO_SERVER_NAME`` (default ``0.0.0.0``)
    * ``--server-port``       <- ``GRADIO_SERVER_PORT`` (default ``7860``)
    * ``--share``             <- ``GRADIO_SHARE`` flag
    * ``--gpus``              <- ``LANCE_GPUS`` (default ``DEFAULT_GPUS``)
    * ``--queue-size``        <- ``LANCE_QUEUE_SIZE``
    * ``--concurrency-limit`` <- ``LANCE_CONCURRENCY_LIMIT``

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        ValueError: If one of the integer environment variables does not hold
            a valid integer (raised while the defaults are computed).
    """
    # Resolve all environment-derived defaults up front, in option order.
    env = os.environ
    default_host = env.get("GRADIO_SERVER_NAME", "0.0.0.0")
    default_port = int(env.get("GRADIO_SERVER_PORT", "7860"))
    default_share = env_flag("GRADIO_SHARE", False)
    default_gpus = env.get("LANCE_GPUS", DEFAULT_GPUS)
    default_queue_size = int(env.get("LANCE_QUEUE_SIZE", str(DEFAULT_QUEUE_SIZE)))
    default_concurrency = int(env.get("LANCE_CONCURRENCY_LIMIT", str(DEFAULT_CONCURRENCY_LIMIT)))

    cli = argparse.ArgumentParser(description="Lance multimodal Gradio")
    register = cli.add_argument

    register("--server-name", default=default_host)
    register("--server-port", type=int, default=default_port)
    register("--share", action="store_true", default=default_share)
    register(
        "--gpus",
        default=default_gpus,
        help="Comma-separated GPU list, for example: 0,1,2,3,4,5,6",
    )
    register(
        "--queue-size",
        type=int,
        default=default_queue_size,
        help="Maximum number of queued Gradio requests.",
    )
    register(
        "--concurrency-limit",
        type=int,
        default=default_concurrency,
        help="Maximum number of Gradio jobs that may execute concurrently. Use 2 for most GPU Spaces; raise it only when enough GPU memory/pipeline slots are available.",
    )

    return cli.parse_args()


def parse_gpu_ids(gpu_string: str) -> list[int]:
    """Convert a comma-separated GPU list such as ``"0, 1,2"`` into integers.

    Whitespace around each entry is stripped and empty entries (for example
    from a trailing comma) are ignored.

    Args:
        gpu_string: Comma-separated GPU indices.

    Returns:
        The GPU indices in the order they were given.

    Raises:
        ValueError: If an entry is not an integer, or if no entry remains
            after the empty ones are dropped.
    """
    stripped_entries = (entry.strip() for entry in gpu_string.split(","))
    parsed_ids = [int(entry) for entry in stripped_entries if entry]
    if parsed_ids:
        return parsed_ids
    raise ValueError("No valid GPU IDs were parsed.")


def prefetch_model_assets_before_launch() -> None:
    """Prepare model files on CPU/disk before the first ZeroGPU request.

    Downloading model snapshots inside an ``@spaces.GPU`` call would spend the
    first user's GPU reservation on I/O, so the assets are fetched up front
    instead. Nothing in the visible UI changes.

    Environment variables read here:

    * ``LANCE_INSTALL_FLASH_ATTN_ON_STARTUP`` - also attempt the flash-attn
      install when not running on a Space.
    * ``LANCE_PREFETCH_MODEL_ASSETS`` - enable/disable the prefetch; defaults
      to whether the app runs on a Space (set it to ``0`` to skip).
    * ``LANCE_PREFETCH_MODEL_VARIANTS`` - comma-separated variants to
      prefetch (for example ``video`` to prefetch less); defaults to the
      video and image variants.

    The flash-attn install and each variant's prefetch are best-effort: a
    failure is printed and left to be retried lazily during inference.
    """
    install_flash_attn = running_on_space() or env_flag("LANCE_INSTALL_FLASH_ATTN_ON_STARTUP", False)
    if install_flash_attn:
        try:
            ensure_flash_attn_installed()
        except Exception as install_error:
            print(f"[startup] flash-attn startup install failed and will be retried lazily during inference: {install_error}", flush=True)

    prefetch_enabled = env_flag("LANCE_PREFETCH_MODEL_ASSETS", running_on_space())
    if not prefetch_enabled:
        print("[startup] Model asset prefetch disabled.", flush=True)
        return

    default_variants = f"{MODEL_VARIANT_VIDEO},{MODEL_VARIANT_IMAGE}"
    requested_text = os.getenv("LANCE_PREFETCH_MODEL_VARIANTS", default_variants)

    # Normalize the requested names and drop duplicates, keeping first-seen order.
    selected: list[str] = []
    for token in map(str.strip, requested_text.split(",")):
        if token:
            normalized = normalize_model_variant(token)
            if normalized not in selected:
                selected.append(normalized)

    for variant in selected:
        try:
            started_at = time.perf_counter()
            assets_path = ensure_model_assets(variant)
            elapsed = time.perf_counter() - started_at
            ready_message = (
                f"[startup][{variant}] Model assets are ready at {display_path(assets_path)} "
                f"before ZeroGPU inference. elapsed={elapsed:.2f}s"
            )
            print(ready_message, flush=True)
        except Exception as prefetch_error:
            failure_message = (
                f"[startup][{variant}] Model asset prefetch failed and will be retried lazily during inference: {prefetch_error}"
            )
            print(failure_message, flush=True)


if __name__ == "__main__":
    # Script entry point: read the CLI options, publish them to the process
    # environment / module globals, prefetch assets, then build and launch the UI.
    args = parse_args()
    os.environ["LANCE_GPUS"] = args.gpus
    QUEUE_MAX_SIZE = args.queue_size
    # A concurrency limit below 1 is clamped to 1.
    QUEUE_CONCURRENCY_LIMIT = max(1, args.concurrency_limit)

    prefetch_model_assets_before_launch()

    for startup_message in (
        "[startup] Skipping GPU model preload. UI will launch first, and Lance weights will be prefetched on CPU before ZeroGPU inference. If that prefetch fails, inference will fall back to lazy loading.",
        f"[startup] Gradio queue configured with max_size={QUEUE_MAX_SIZE}, default_concurrency_limit={QUEUE_CONCURRENCY_LIMIT}.",
    ):
        print(startup_message, flush=True)

    demo = build_demo()

    # Directories Gradio is allowed to serve files from.
    launch_options = {
        "server_name": args.server_name,
        "server_port": args.server_port,
        "share": args.share,
        "allowed_paths": [str(root.resolve()) for root in (REPO_ROOT, GRADIO_TMP_ROOT)],
        "ssr_mode": False,
    }
    demo.queue(
        max_size=QUEUE_MAX_SIZE,
        default_concurrency_limit=QUEUE_CONCURRENCY_LIMIT,
    ).launch(**launch_options)