Spaces:

bytedance-research
/

Lance

Running on Zero

App Files Files Community

ffy2000 committed 2 days ago

Commit

afd380b

1 Parent(s): a14b598

Vendor RIFE into repo

Browse files

Files changed (8) hide show

app.py +0 -0
app_save.py +0 -0
app_wrong.py +0 -2247
assets/video-understanding/videos/video-understanding-caption-long-01_h264.mp4 +3 -0
assets/video-understanding/videos/video-understanding-caption-short-01_h264.mp4 +3 -0
assets/video-understanding/videos/video-understanding-vqa-01_h264.mp4 +3 -0
config/examples/video_edit_examples/edit_source_car_h264.mp4 +3 -0
config/examples/video_edit_examples/edit_source_woman_h264.mp4 +3 -0

app.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

app_save.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

app_wrong.py DELETED Viewed

@@ -1,2247 +0,0 @@
-from __future__ import annotations
-import argparse
-import base64
-import concurrent.futures
-import gc
-import json
-import os
-import random
-import subprocess
-import threading
-import time
-import traceback
-from collections import deque
-from copy import deepcopy
-from datetime import datetime
-from pathlib import Path
-from typing import Optional
-import gradio as gr
-import torch
-from huggingface_hub import snapshot_download
-from safetensors.torch import load_file
-from transformers import set_seed
-from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
-from common.utils.logging import get_logger
-from common.utils.misc import AutoEncoderParams, tuple_mul
-from config.config_factory import DataArguments, InferenceArguments, ModelArguments
-from data.data_utils import add_special_tokens
-from data.dataset_base import DataConfig, simple_custom_collate
-from data.datasets_custom import ValidationDataset
-from inference_lance import (
-    PROMPT_JSON_FILENAME,
-    apply_inference_defaults,
-    clean_memory,
-    init_from_model_path_if_needed,
-    save_prompt_results,
-    validate_on_fixed_batch,
-)
-from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
-from modeling.qwen2 import Qwen2Tokenizer
-from modeling.qwen2.modeling_qwen2 import Qwen2Config
-from modeling.vae.wan.model import WanVideoVAE
-from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
-# --- Filesystem layout -------------------------------------------------------
-# Directory that contains this file; repo-relative assets are resolved against it.
-REPO_ROOT = Path(__file__).resolve().parent
-# Scratch root for request files and results; override with LANCE_GRADIO_TMP_ROOT.
-GRADIO_TMP_ROOT = Path(os.getenv("LANCE_GRADIO_TMP_ROOT", "/tmp/lance_gradio")).expanduser()
-TMP_INPUT_DIR = GRADIO_TMP_ROOT / "inputs"
-RESULTS_ROOT = GRADIO_TMP_ROOT / "results"
-# Append-only JSONL log shared by all runs (written by save_generation_record).
-GLOBAL_RECORDS_FILE = GRADIO_TMP_ROOT / "generation_records.jsonl"
-# File name of the per-run record stored inside each run directory.
-RUN_RECORD_FILENAME = "generation_record.json"
-# --- Model locations ---------------------------------------------------------
-# Candidate base directories for model assets; see get_model_base_dir().
-LOCAL_MODEL_BASE_DIR = Path("downloads")
-SPACE_MODEL_BASE_DIR = Path("/data/lance_models")
-DEFAULT_MODEL_REPO_ID = "bytedance-research/Lance"
-DEFAULT_MODEL_VARIANT = "video"
-MODEL_VARIANT_VIDEO = "video"
-MODEL_VARIANT_IMAGE = "image"
-# Sub-directory (under the model base dir) holding each variant's weights.
-MODEL_VARIANT_TO_DIR = {
-    MODEL_VARIANT_VIDEO: "Lance_3B_Video",
-    MODEL_VARIANT_IMAGE: "Lance_3B",
-}
-DEFAULT_MODEL_PATH = LOCAL_MODEL_BASE_DIR / MODEL_VARIANT_TO_DIR[MODEL_VARIANT_VIDEO]
-DEFAULT_VIT_TYPE = "qwen_2_5_vl_original"
-# --- Generation defaults -----------------------------------------------------
-DEFAULT_TASK = "t2v"
-DEFAULT_TIMESTEPS = 30
-DEFAULT_TIMESTEP_SHIFT = 3.5
-DEFAULT_CFG_TEXT_SCALE = 4.0
-DEFAULT_RESOLUTION = "video_848x480"
-DEFAULT_IMAGE_RESOLUTION = "image_768x768"
-DEFAULT_BASIC_SEED = 42
-DEFAULT_HEIGHT = 480
-DEFAULT_WIDTH = 848
-DEFAULT_IMAGE_SIZE = 768
-DEFAULT_VIDEO_DURATION_SECONDS = 5
-# Same formula as video_seconds_to_num_frames(): 12 frames per second plus one.
-DEFAULT_NUM_FRAMES = 12 * DEFAULT_VIDEO_DURATION_SECONDS + 1
-DEFAULT_VIDEO_ASPECT_RATIO = "16:9"
-DEFAULT_IMAGE_ASPECT_RATIO = "1:1"
-FRAME_INTERPOLATION_YES = "Yes"
-FRAME_INTERPOLATION_NO = "No"
-DEFAULT_FRAME_INTERPOLATION = FRAME_INTERPOLATION_YES
-ASPECT_RATIO_CHOICES = ["21:9", "16:9", "3:2", "4:3", "1:1", "3:4", "2:3", "9:16", "9:21"]
-# Aspect ratio -> (width, height) used for video tasks.
-VIDEO_ASPECT_RATIO_TO_SIZE = {
-    "21:9": (976, 416),
-    "16:9": (848, 480),
-    "3:2": (784, 528),
-    "4:3": (736, 560),
-    "1:1": (640, 640),
-    "3:4": (560, 736),
-    "2:3": (528, 784),
-    "9:16": (480, 848),
-    "9:21": (416, 976),
-}
-# Aspect ratio -> (width, height) used for image tasks.
-IMAGE_ASPECT_RATIO_TO_SIZE = {
-    "21:9": (1168, 496),
-    "16:9": (1024, 576),
-    "3:2": (944, 624),
-    "4:3": (880, 672),
-    "1:1": (768, 768),
-    "3:4": (672, 880),
-    "2:3": (624, 944),
-    "9:16": (576, 1024),
-    "9:21": (496, 1168),
-}
-# --- Runtime settings --------------------------------------------------------
-DEFAULT_GPUS = "0"
-DEFAULT_QUEUE_SIZE = 32
-USE_KVCACHE = True
-TEXT_TEMPLATE = True
-# Serializes appends to GLOBAL_RECORDS_FILE across threads.
-RECORD_WRITE_LOCK = threading.Lock()
-# --- Project links and branding ----------------------------------------------
-LANCE_HOMEPAGE_URL = "https://lance-project.github.io/"
-LANCE_PAPER_URL = "http://arxiv.org/abs/2605.18678"
-LANCE_HUGGING_FACE_URL = "https://huggingface.co/bytedance-research/Lance"
-LANCE_GITHUB_URL = "https://github.com/bytedance/Lance"
-LANCE_LOGO_PATH = REPO_ROOT / "assets" / "logo" / "lance-logo.webp"
-# Stylesheet text for the app UI. Selectors target Gradio containers plus the
-# custom lance-*, task-selector, example/prompt and generation-* classes.
-# Where this string is applied is not visible in this part of the file.
-APP_CSS = """
-.gradio-container {
-    max-width: 1680px !important;
-    margin-left: auto !important;
-    margin-right: auto !important;
-}
-.contain {
-    max-width: 1680px !important;
-    margin-left: auto !important;
-    margin-right: auto !important;
-}
-.lance-hero {
-    text-align: center;
-    padding: 8px 12px 6px;
-}
-.lance-logo {
-    width: min(160px, 36vw);
-    height: auto;
-    display: block;
-    margin: 0 auto 4px;
-}
-.lance-title {
-    margin: 0 auto 5px;
-    font-size: clamp(20px, 2.4vw, 30px);
-    line-height: 1.08;
-    font-weight: 800;
-    letter-spacing: 0;
-}
-.lance-authors {
-    margin: 0 auto 6px;
-    max-width: 1280px;
-    font-size: 20px;
-    line-height: 1.24;
-    color: var(--body-text-color-subdued);
-}
-.lance-authors a {
-    color: inherit;
-    text-decoration: none;
-}
-.lance-authors a:hover {
-    text-decoration: underline;
-}
-.lance-badges {
-    display: flex;
-    flex-wrap: wrap;
-    justify-content: center;
-    gap: 5px;
-    margin: 4px auto 0;
-}
-.lance-badges a {
-    line-height: 0;
-}
-.lance-badges img {
-    height: 20px;
-    width: auto;
-    display: block;
-}
-.lance-status {
-    max-width: 1180px;
-    margin: 0 auto 18px;
-}
-.task-selector {
-    overflow-x: auto;
-}
-.lance-main-column > label span,
-.lance-main-column > .block-title,
-.lance-main-column > .label-wrap span,
-.lance-main-column > .form > label span,
-.lance-main-column > .form > .block-title,
-.lance-main-column > .form > .label-wrap span {
-    font-size: 20px !important;
-    font-weight: 700 !important;
-}
-.task-selector .wrap {
-    display: grid;
-    grid-template-columns: repeat(3, minmax(220px, 1fr));
-    gap: 8px;
-    min-width: 680px;
-}
-.task-selector label {
-    justify-content: center;
-    min-height: 38px;
-    white-space: nowrap;
-    border-radius: 10px !important;
-}
-.task-selector span {
-    font-size: 20px !important;
-}
-.recommended-title {
-    text-align: center !important;
-    margin: 14px auto 10px !important;
-}
-.recommended-title h3,
-.recommended-title p {
-    text-align: center !important;
-    font-size: 22px !important;
-    font-weight: 800 !important;
-    color: var(--body-text-color) !important;
-}
-.example-panel {
-    margin-top: 14px !important;
-    padding: 10px 12px !important;
-    border-radius: 8px !important;
-    background: rgba(248, 250, 252, 0.72) !important;
-    border: 1px solid var(--border-color-primary) !important;
-}
-.prompt-examples table,
-.prompt-examples th,
-.prompt-examples td {
-    border: 1px solid var(--border-color-primary) !important;
-}
-.prompt-examples table {
-    border-collapse: collapse !important;
-    width: 100% !important;
-}
-.prompt-examples td {
-    border-bottom: 1px solid var(--border-color-primary) !important;
-    padding: 12px !important;
-    vertical-align: top !important;
-}
-.example-panel th,
-.example-panel .block-label,
-.example-panel label span,
-.example-panel .label-wrap span {
-    font-size: 18px !important;
-    font-weight: 700 !important;
-}
-.prompt-dataset {
-    max-height: 420px !important;
-    overflow-y: auto !important;
-    overscroll-behavior: contain !important;
-    scrollbar-gutter: stable !important;
-}
-.prompt-dataset button {
-    height: auto !important;
-    min-height: 48px !important;
-    white-space: normal !important;
-    text-align: left !important;
-    align-items: flex-start !important;
-}
-.prompt-dataset .paginate {
-    display: none !important;
-}
-.prompt-example-proxy {
-    display: none !important;
-}
-.lance-main-row {
-    display: grid !important;
-    grid-template-columns: minmax(0, 1fr) minmax(0, 1fr) !important;
-    gap: 16px !important;
-    align-items: start !important;
-}
-.lance-main-column {
-    min-width: 0 !important;
-    width: 100% !important;
-}
-.lance-display-frame,
-.lance-display-frame > div,
-.lance-display-frame textarea {
-    width: 100% !important;
-}
-.lance-display-frame textarea {
-    min-height: 360px !important;
-}
-.lance-run-button {
-    font-size: 18px !important;
-    font-weight: 800 !important;
-}
-.generation-controls-row {
-    width: 100% !important;
-    max-width: 100% !important;
-    overflow-x: hidden !important;
-}
-.generation-controls-row > .form {
-    display: grid !important;
-    grid-template-columns:
-        minmax(0, 1.25fr)
-        minmax(0, 1.3fr)
-        minmax(0, 1fr)
-        minmax(0, 1.25fr) !important;
-    gap: 12px !important;
-    align-items: start !important;
-    width: 100% !important;
-    max-width: 100% !important;
-    overflow: visible !important;
-}
-.generation-control,
-.generation-control > div,
-.generation-controls-row > .form > div {
-    min-width: 0 !important;
-    max-width: 100% !important;
-}
-.generation-controls-row .generation-control label,
-.generation-controls-row .generation-control label span,
-.generation-controls-row .generation-control .block-label,
-.generation-controls-row .generation-control .block-title,
-.generation-controls-row .generation-control > label,
-.generation-controls-row .generation-control .label-wrap,
-.generation-controls-row .generation-control .label-wrap span {
-    font-size: 22px !important;
-    font-weight: 700 !important;
-    line-height: 1.15 !important;
-    letter-spacing: 0 !important;
-    white-space: normal !important;
-}
-.generation-controls-row .generation-value-control input,
-.generation-controls-row .generation-value-control textarea,
-.generation-controls-row .generation-value-control [data-testid="textbox"],
-.generation-controls-row .generation-dropdown-control input[role="listbox"],
-.generation-controls-row .generation-dropdown-control input.border-none[role="listbox"],
-.generation-controls-row .generation-dropdown-control .secondary-wrap input {
-    font-size: 22px !important;
-    font-weight: 700 !important;
-    line-height: 1.2 !important;
-    letter-spacing: 0 !important;
-    text-align: left !important;
-}
-.generation-controls-row .generation-value-control input,
-.generation-controls-row .generation-value-control textarea,
-.generation-controls-row .generation-dropdown-control input[role="listbox"],
-.generation-controls-row .generation-dropdown-control input.border-none[role="listbox"],
-.generation-controls-row .generation-dropdown-control .secondary-wrap input {
-    min-height: 64px !important;
-    width: 100% !important;
-    box-sizing: border-box !important;
-}
-@media (max-width: 1100px) {
-    .generation-controls-row > .form {
-        grid-template-columns: repeat(2, minmax(0, 1fr)) !important;
-    }
-}
-@media (max-width: 900px) {
-    .lance-main-row {
-        grid-template-columns: minmax(0, 1fr) !important;
-    }
-}
-"""
-# Internal task identifiers.
-TASK_T2V = "t2v"
-TASK_T2I = "t2i"
-TASK_V2T = "v2t"
-TASK_X2T = "x2t"
-TASK_X2T_VIDEO = "x2t_video"
-TASK_X2T_IMAGE = "x2t_image"
-TASK_IMAGE_EDIT = "image_edit"
-TASK_VIDEO_EDIT = "video_edit"
-# Human-readable task labels.
-TASK_LABEL_VIDEO_GENERATION = "Video Generation"
-TASK_LABEL_VIDEO_EDIT = "Video Edit"
-TASK_LABEL_VIDEO_UNDERSTANDING = "Video Understanding"
-TASK_LABEL_IMAGE_GENERATION = "Image Generation"
-TASK_LABEL_IMAGE_EDIT = "Image Edit"
-TASK_LABEL_IMAGE_UNDERSTANDING = "Image Understanding"
-TASK_CHOICES = [
-    TASK_LABEL_VIDEO_GENERATION,
-    TASK_LABEL_VIDEO_EDIT,
-    TASK_LABEL_VIDEO_UNDERSTANDING,
-    TASK_LABEL_IMAGE_GENERATION,
-    TASK_LABEL_IMAGE_EDIT,
-    TASK_LABEL_IMAGE_UNDERSTANDING,
-]
-# Lookup used by normalize_task(): accepts both labels and internal ids.
-# The legacy ids "v2t" and "x2t" both resolve to the video understanding task.
-TASK_LABEL_TO_INTERNAL = {
-    TASK_LABEL_VIDEO_GENERATION: TASK_T2V,
-    TASK_LABEL_VIDEO_EDIT: TASK_VIDEO_EDIT,
-    TASK_LABEL_VIDEO_UNDERSTANDING: TASK_X2T_VIDEO,
-    TASK_LABEL_IMAGE_GENERATION: TASK_T2I,
-    TASK_LABEL_IMAGE_EDIT: TASK_IMAGE_EDIT,
-    TASK_LABEL_IMAGE_UNDERSTANDING: TASK_X2T_IMAGE,
-    TASK_T2V: TASK_T2V,
-    TASK_VIDEO_EDIT: TASK_VIDEO_EDIT,
-    TASK_V2T: TASK_X2T_VIDEO,
-    TASK_X2T: TASK_X2T_VIDEO,
-    TASK_X2T_VIDEO: TASK_X2T_VIDEO,
-    TASK_T2I: TASK_T2I,
-    TASK_IMAGE_EDIT: TASK_IMAGE_EDIT,
-    TASK_X2T_IMAGE: TASK_X2T_IMAGE,
-}
-# Task groupings consulted by the helper functions below.
-GENERATION_TASKS = {TASK_T2V, TASK_T2I, TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
-UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
-IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
-VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
-EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
-VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
-IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
-RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
-# System prompts for the understanding tasks. Only the QA prompts are returned
-# by get_understanding_system_prompt_choices(); the caption prompts are not
-# referenced in the part of the file visible here.
-CAPTION_SYSTEM_PROMPT_TEMPLATE = (
-    "Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background."
-)
-V2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="video")
-I2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="image")
-# NOTE(review): the video prompt has a double space after "video" that the image
-# prompt lacks; confirm whether it must match the prompt used in training before changing it.
-V2T_QA_SYSTEM_PROMPT = "View the video  attentively and provide a suitable answer to the posed question."
-I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
-def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
-    """Build ``(label, value)`` aspect-ratio choices for ``task``.
-
-    The task's default ratio has " (default)" appended to its label; the value
-    is always the bare ratio string. Raises ValueError (from normalize_task)
-    for an unsupported task.
-    """
-    if normalize_task(task) in IMAGE_TASKS:
-        preferred = DEFAULT_IMAGE_ASPECT_RATIO
-    else:
-        preferred = DEFAULT_VIDEO_ASPECT_RATIO
-    choices = []
-    for ratio in ASPECT_RATIO_CHOICES:
-        label = f"{ratio} (default)" if ratio == preferred else ratio
-        choices.append((label, ratio))
-    return choices
-def env_flag(name: str, default: bool) -> bool:
-    """Read environment variable ``name`` as an on/off switch.
-
-    Returns ``default`` when the variable is unset. Otherwise returns True only
-    for "1", "true", "yes" or "on" (case-insensitive, surrounding whitespace
-    ignored); any other value, including the empty string, yields False.
-    """
-    truthy_words = {"1", "true", "yes", "on"}
-    raw = os.getenv(name)
-    if raw is not None:
-        return raw.strip().lower() in truthy_words
-    return default
-def running_on_space() -> bool:
-    """Report whether SPACE_ID or SPACE_HOST is set to a non-empty value."""
-    for marker in ("SPACE_ID", "SPACE_HOST"):
-        if os.getenv(marker):
-            return True
-    return False
-def display_path(path: Path) -> str:
-    """Render ``path`` as a POSIX-style string for messages and env values.
-
-    Absolute paths under the current working directory are shown relative to
-    it; absolute paths elsewhere are returned unchanged. Relative results are
-    prefixed with "./" unless they are exactly "." or already start with "./".
-    """
-    if path.is_absolute():
-        try:
-            shown = path.relative_to(Path.cwd()).as_posix()
-        except ValueError:
-            # Not located under the working directory: keep the absolute form.
-            return path.as_posix()
-    else:
-        shown = path.as_posix()
-    already_marked = shown == "." or shown.startswith("./")
-    return shown if already_marked else f"./{shown}"
-def get_model_base_dir() -> Path:
-    """Pick the directory that holds the model assets.
-
-    Order of preference: the LANCE_MODEL_BASE_DIR environment variable (with
-    ``~`` expanded), then LOCAL_MODEL_BASE_DIR if it exists, then
-    SPACE_MODEL_BASE_DIR when running on a Space, else LOCAL_MODEL_BASE_DIR.
-    """
-    override = os.getenv("LANCE_MODEL_BASE_DIR")
-    if override:
-        return Path(override).expanduser()
-    if LOCAL_MODEL_BASE_DIR.exists() or not running_on_space():
-        return LOCAL_MODEL_BASE_DIR
-    return SPACE_MODEL_BASE_DIR
-def normalize_model_variant(model_variant: Optional[str] = None) -> str:
-    """Map a variant name to MODEL_VARIANT_IMAGE or MODEL_VARIANT_VIDEO.
-
-    A falsy ``model_variant`` falls back to the LANCE_MODEL_VARIANT environment
-    variable, then to DEFAULT_MODEL_VARIANT. "image", "t2i" and "i2t"
-    (case-insensitive, whitespace ignored) select the image variant; every
-    other value selects the video variant.
-    """
-    raw = model_variant or os.getenv("LANCE_MODEL_VARIANT", DEFAULT_MODEL_VARIANT)
-    key = raw.strip().lower()
-    image_aliases = ("image", "t2i", "i2t")
-    return MODEL_VARIANT_IMAGE if key in image_aliases else MODEL_VARIANT_VIDEO
-def get_model_path(model_variant: Optional[str] = None) -> Path:
-    """Resolve the checkpoint directory for ``model_variant``.
-
-    The variant-specific variable (LANCE_IMAGE_MODEL_PATH or
-    LANCE_VIDEO_MODEL_PATH) wins, then LANCE_MODEL_PATH; otherwise the
-    variant's directory under get_model_base_dir() is returned.
-    """
-    variant = normalize_model_variant(model_variant)
-    if variant == MODEL_VARIANT_IMAGE:
-        specific_env_name = "LANCE_IMAGE_MODEL_PATH"
-    else:
-        specific_env_name = "LANCE_VIDEO_MODEL_PATH"
-    for env_name in (specific_env_name, "LANCE_MODEL_PATH"):
-        configured = os.getenv(env_name)
-        if configured:
-            return Path(configured).expanduser()
-    return get_model_base_dir() / MODEL_VARIANT_TO_DIR[variant]
-def get_required_model_asset_paths(model_base_dir: Path, model_path: Path) -> list[Path]:
-    """List the files that must exist before the model can be loaded.
-
-    Two files live under ``model_path`` (config and weights) and two under
-    ``model_base_dir`` (the ViT weights and the VAE checkpoint).
-    """
-    per_model = [model_path / name for name in ("llm_config.json", "model.safetensors")]
-    vit_weights = model_base_dir / "Qwen2.5-VL-ViT" / "vit.safetensors"
-    vae_weights = model_base_dir / "Wan2.2_VAE.pth"
-    return [*per_model, vit_weights, vae_weights]
-def ensure_model_assets(model_variant: Optional[str] = None) -> Path:
-    """Return the model directory for ``model_variant``, downloading assets if allowed.
-
-    Side effect: sets the LANCE_MODEL_BASE_DIR environment variable to the
-    base directory that was selected.
-
-    Raises:
-        FileNotFoundError: required files are missing and automatic download
-            is disabled (LANCE_AUTO_DOWNLOAD; defaults to on only on a Space).
-    """
-    model_base_dir = get_model_base_dir()
-    # Publish the chosen base dir through the environment; get_model_base_dir()
-    # (and therefore get_model_path() below) reads this variable first.
-    os.environ["LANCE_MODEL_BASE_DIR"] = display_path(model_base_dir)
-    model_path = get_model_path(model_variant)
-    required_paths = get_required_model_asset_paths(model_base_dir, model_path)
-    if all(path.exists() for path in required_paths):
-        return model_path
-    # Fallback: only when the base dir is exactly "." and ./downloads exists,
-    # look for a complete asset set under ./downloads instead.
-    downloads_model_base_dir = Path("downloads")
-    if model_base_dir == Path(".") and downloads_model_base_dir.exists():
-        downloads_model_path = downloads_model_base_dir / MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]
-        downloads_required_paths = get_required_model_asset_paths(downloads_model_base_dir, downloads_model_path)
-        if all(path.exists() for path in downloads_required_paths):
-            model_base_dir = downloads_model_base_dir
-            model_path = downloads_model_path
-            # NOTE(review): this assignment is never read before the return below.
-            required_paths = downloads_required_paths
-            os.environ["LANCE_MODEL_BASE_DIR"] = display_path(model_base_dir)
-            return model_path
-    auto_download = env_flag("LANCE_AUTO_DOWNLOAD", running_on_space())
-    if not auto_download:
-        missing = "\n".join(f"- {display_path(path)}" for path in required_paths if not path.exists())
-        raise FileNotFoundError(
-            "Lance model assets are missing. Set LANCE_MODEL_BASE_DIR or enable "
-            f"LANCE_AUTO_DOWNLOAD=1.\nMissing files:\n{missing}"
-        )
-    model_base_dir.mkdir(parents=True, exist_ok=True)
-    repo_id = os.getenv("LANCE_MODEL_REPO_ID", DEFAULT_MODEL_REPO_ID)
-    print(f"[startup] Downloading Lance model assets from {repo_id} to {display_path(model_base_dir)}", flush=True)
-    # NOTE(review): local_dir_use_symlinks and resume_download are reported as
-    # deprecated in recent huggingface_hub releases; confirm against the pinned version.
-    snapshot_path = Path(
-        snapshot_download(
-            repo_id=repo_id,
-            local_dir=str(model_base_dir),
-            local_dir_use_symlinks=False,
-            resume_download=True,
-        )
-    )
-    # If the download landed somewhere other than model_base_dir and the expected
-    # model directory is still absent, repoint the base dir at the snapshot.
-    if snapshot_path != model_base_dir and not model_path.exists():
-        os.environ["LANCE_MODEL_BASE_DIR"] = display_path(snapshot_path)
-        model_path = get_model_path(model_variant)
-    # NOTE(review): the required files are not re-checked after the download.
-    return model_path
-def ensure_dirs() -> None:
-    """Create the temporary input and results directories if they are missing."""
-    for directory in (TMP_INPUT_DIR, RESULTS_ROOT):
-        directory.mkdir(parents=True, exist_ok=True)
-def save_generation_record(record: dict, save_dir: Path) -> None:
-    """Persist ``record`` for one run and append it to the global JSONL log.
-
-    Writes ``save_dir / RUN_RECORD_FILENAME`` as indented JSON, then appends a
-    single-line JSON copy to GLOBAL_RECORDS_FILE while holding
-    RECORD_WRITE_LOCK.
-
-    Args:
-        record: JSON-serializable description of the run.
-        save_dir: Directory of the run; created if it does not exist yet.
-    """
-    ensure_dirs()
-    # ensure_dirs() only creates the shared roots, not the per-run directory,
-    # so create it here; otherwise the write below fails when nothing has
-    # produced output in save_dir yet.
-    save_dir.mkdir(parents=True, exist_ok=True)
-    run_record_path = save_dir / RUN_RECORD_FILENAME
-    with run_record_path.open("w", encoding="utf-8") as f:
-        json.dump(record, f, ensure_ascii=False, indent=2)
-    # The global log is shared between threads; serialize the append.
-    with RECORD_WRITE_LOCK:
-        with GLOBAL_RECORDS_FILE.open("a", encoding="utf-8") as f:
-            f.write(json.dumps(record, ensure_ascii=False) + "\n")
-def normalize_seed(seed: int) -> int:
-    """Return ``seed`` unchanged, or a random value in [0, 2**31 - 1] when it is -1."""
-    if seed != -1:
-        return seed
-    return random.randint(0, 2**31 - 1)
-def normalize_task(task: str) -> str:
-    """Translate a task label or identifier into its internal task id.
-
-    A falsy ``task`` is treated as TASK_LABEL_VIDEO_GENERATION. The stripped
-    value is looked up in TASK_LABEL_TO_INTERNAL as given, then lower-cased.
-
-    Raises:
-        ValueError: the value does not map to a generation or understanding
-            task; the message names the value that was supplied.
-    """
-    task_key = (task or TASK_LABEL_VIDEO_GENERATION).strip()
-    internal_task = TASK_LABEL_TO_INTERNAL.get(task_key)
-    if internal_task is None:
-        internal_task = TASK_LABEL_TO_INTERNAL.get(task_key.lower())
-    if internal_task not in GENERATION_TASKS | UNDERSTANDING_TASKS:
-        # Report what the caller passed in. The lookup result is kept in a
-        # separate variable so it cannot overwrite the offending value.
-        raise ValueError(f"Unsupported task type: {task_key}")
-    return internal_task
-def normalize_resolution_for_backend(resolution: str, task: str) -> str:
-    """Choose the resolution name to use for ``task``.
-
-    Image tasks get DEFAULT_IMAGE_RESOLUTION and video tasks get
-    DEFAULT_RESOLUTION regardless of ``resolution``; only a task in neither
-    group would fall through to ``str(resolution)``.
-    """
-    internal_task = normalize_task(task)
-    fixed_by_group = (
-        (IMAGE_TASKS, DEFAULT_IMAGE_RESOLUTION),
-        (VIDEO_TASKS, DEFAULT_RESOLUTION),
-    )
-    for task_group, fixed_resolution in fixed_by_group:
-        if internal_task in task_group:
-            return fixed_resolution
-    return str(resolution)
-def get_default_aspect_ratio(task: str) -> str:
-    """Return the default aspect ratio: the image default for image tasks, else the video default."""
-    if normalize_task(task) in IMAGE_TASKS:
-        return DEFAULT_IMAGE_ASPECT_RATIO
-    return DEFAULT_VIDEO_ASPECT_RATIO
-def get_size_for_aspect_ratio(task: str, aspect_ratio: str) -> tuple[int, int]:
-    """Look up the ``(width, height)`` for ``aspect_ratio`` under ``task``.
-
-    A ratio that is not in ASPECT_RATIO_CHOICES is replaced by the task's
-    default ratio. Image tasks use the image size table, all others the video
-    table.
-    """
-    internal_task = normalize_task(task)
-    if aspect_ratio not in ASPECT_RATIO_CHOICES:
-        aspect_ratio = get_default_aspect_ratio(internal_task)
-    if internal_task in IMAGE_TASKS:
-        return IMAGE_ASPECT_RATIO_TO_SIZE[aspect_ratio]
-    return VIDEO_ASPECT_RATIO_TO_SIZE[aspect_ratio]
-def format_size_markdown(task: str, width: int, height: int) -> str:
-    """Format the output size as "W x H"; understanding tasks get an empty string."""
-    hide_size = normalize_task(task) in UNDERSTANDING_TASKS
-    return "" if hide_size else f"{width} x {height}"
-def normalize_frame_interpolation(value) -> bool:
-    """Coerce a frame-interpolation setting to a bool.
-
-    Booleans pass through. Any other value is stringified (falsy values become
-    the empty string) and is True only for "1", "true", "yes", "on" or "open",
-    ignoring case and surrounding whitespace.
-    """
-    if isinstance(value, bool):
-        return value
-    enabled_words = {"1", "true", "yes", "on", "open"}
-    text = str(value) if value else ""
-    return text.strip().lower() in enabled_words
-def video_seconds_to_num_frames(seconds: int) -> int:
-    """Convert a duration to a frame count: 12 frames per second plus one.
-
-    ``seconds`` is truncated with ``int()`` and clamped to the range 1..10.
-    """
-    whole_seconds = int(seconds)
-    if whole_seconds < 1:
-        whole_seconds = 1
-    elif whole_seconds > 10:
-        whole_seconds = 10
-    return 12 * whole_seconds + 1
-def update_size_from_aspect_ratio(task: str, aspect_ratio: str):
-    """Return ``(height, width, size_text)`` for the chosen aspect ratio.
-
-    Note the order: height comes before width in the returned tuple.
-    """
-    size = get_size_for_aspect_ratio(task, aspect_ratio)
-    width, height = size
-    size_text = format_size_markdown(task, width, height)
-    return height, width, size_text
-def reset_generation_defaults_for_task(task: str):
-    """Return the default generation settings for ``task``.
-
-    The tuple is ``(aspect_ratio, height, width, num_frames, resolution,
-    size_text)``. The fourth item is DEFAULT_VIDEO_DURATION_SECONDS for the
-    text-to-video task and 1 for every other task.
-    NOTE(review): that item holds a duration in seconds for t2v even though
-    the slot is named num_frames; confirm what the receiving component expects.
-    """
-    internal_task = normalize_task(task)
-    is_image_task = internal_task in IMAGE_TASKS
-    aspect_ratio = get_default_aspect_ratio(internal_task)
-    width, height = get_size_for_aspect_ratio(internal_task, aspect_ratio)
-    if internal_task == TASK_T2V:
-        num_frames = DEFAULT_VIDEO_DURATION_SECONDS
-    else:
-        num_frames = 1
-    if is_image_task:
-        resolution = DEFAULT_IMAGE_RESOLUTION
-    else:
-        resolution = DEFAULT_RESOLUTION
-    size_text = format_size_markdown(internal_task, width, height)
-    return aspect_ratio, height, width, num_frames, resolution, size_text
-def apply_prompt_example(task: str, evt: gr.SelectData):
-    """Turn a selection event into ``(prompt_text, *task_defaults)``.
-
-    The prompt is the first element of ``evt.row_value`` when that is a
-    non-empty list, else the first element of ``evt.value`` when that is a
-    non-empty list, else ``str(evt.value)`` unless it is None (then "").
-    The remaining items come from reset_generation_defaults_for_task(task).
-    """
-    prompt_text = ""
-    for attribute in ("row_value", "value"):
-        candidate = getattr(evt, attribute)
-        if isinstance(candidate, list) and candidate:
-            prompt_text = str(candidate[0])
-            break
-    else:
-        # Neither attribute held a non-empty list; use the raw value if any.
-        if evt.value is not None:
-            prompt_text = str(evt.value)
-    return (prompt_text, *reset_generation_defaults_for_task(task))
-def get_understanding_system_prompt_choices(task: str) -> list[str]:
-    """Return a one-item list with the QA system prompt: image prompt for the image understanding task, else video."""
-    if normalize_task(task) == TASK_X2T_IMAGE:
-        qa_prompt = I2T_QA_SYSTEM_PROMPT
-    else:
-        qa_prompt = V2T_QA_SYSTEM_PROMPT
-    return [qa_prompt]
-def normalize_understanding_system_prompt(task: str, system_prompt: Optional[str]) -> str:
-    """Return the system prompt to use for an understanding ``task``.
-
-    ``system_prompt`` is accepted but not consulted: the result is always the
-    first entry of get_understanding_system_prompt_choices(task).
-    """
-    choices = get_understanding_system_prompt_choices(task)
-    return choices[0]
-def create_request_json(
-    task: str,
-    prompt: str,
-    input_video: Optional[str],
-    input_image: Optional[str],
-    system_prompt: Optional[str] = None,
-) -> Path:
-    """Write the request payload for ``task`` to a JSON file and return its path.
-
-    ``task`` must already be an internal task id. Generation tasks map an
-    output file name to the prompt; edit and understanding tasks produce an
-    interleaved record under the key "000000". The file is created in
-    TMP_INPUT_DIR and named after the task plus a microsecond timestamp.
-
-    Raises:
-        ValueError: the task needs an input video/image that was not given,
-            or the task id is not supported.
-    """
-    ensure_dirs()
-    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-    request_path = TMP_INPUT_DIR / f"{task}_{stamp}.json"
-    if task == TASK_T2V or task == TASK_T2I:
-        output_name = "000000.mp4" if task == TASK_T2V else "000000.png"
-        payload = {output_name: prompt}
-    elif task == TASK_VIDEO_EDIT or task == TASK_IMAGE_EDIT:
-        if task == TASK_VIDEO_EDIT:
-            media_kind, source = "video", input_video
-            missing_message = "The video edit task requires an input video."
-        else:
-            media_kind, source = "image", input_image
-            missing_message = "The image edit task requires an input image."
-        if not source:
-            raise ValueError(missing_message)
-        # The source media appears twice: once as input, once as the target slot.
-        payload = {
-            "000000": {
-                "interleave_array": [prompt, source, source],
-                "element_dtype_array": ["text", media_kind, media_kind],
-                "istarget_in_interleave": [0, 0, 1],
-            }
-        }
-    elif task == TASK_X2T_VIDEO or task == TASK_X2T_IMAGE:
-        if task == TASK_X2T_VIDEO:
-            media_kind, source = "video", input_video
-            missing_message = "The video understanding task requires an input video."
-        else:
-            media_kind, source = "image", input_image
-            missing_message = "The image understanding task requires an input image."
-        if not source:
-            raise ValueError(missing_message)
-        system_prompt = normalize_understanding_system_prompt(task, system_prompt)
-        # The text element is [system prompt, question, empty answer placeholder].
-        payload = {
-            "000000": {
-                "interleave_array": [source, [system_prompt, prompt, ""]],
-                "element_dtype_array": [media_kind, "text"],
-                "istarget_in_interleave": [0, 1],
-            }
-        }
-    else:
-        raise ValueError(f"Unsupported task type: {task}")
-    with request_path.open("w", encoding="utf-8") as handle:
-        json.dump(payload, handle, ensure_ascii=False, indent=2)
-    return request_path
-def resolve_example_path(path: str) -> str:
-    """Resolve an example media path to an absolute path when it can be found.
-
-    Absolute paths are returned as given. Relative paths are tried under
-    REPO_ROOT first, then relative to the working directory; if neither
-    exists the input string is returned unchanged.
-    """
-    candidate = Path(path)
-    if candidate.is_absolute():
-        return str(candidate)
-    for location in (REPO_ROOT / candidate, candidate):
-        if location.exists():
-            return str(location.resolve())
-    return path
-def resolve_browser_video_example_path(path: str) -> str:
-    """Resolve an example video path, preferring its ``_h264`` sibling file.
-
-    The sibling is the same file name with "_h264" appended to the stem. Both
-    the sibling and the original are looked up as-is when absolute, otherwise
-    under REPO_ROOT. If neither exists the result of resolve_example_path()
-    is returned.
-    """
-    original = Path(path)
-    h264_variant = original.with_name(f"{original.stem}_h264{original.suffix}")
-    for option in (h264_variant, original):
-        located = option if option.is_absolute() else REPO_ROOT / option
-        if located.exists():
-            return str(located.resolve())
-    return resolve_example_path(path)
-def load_json_examples(relative_path: str) -> dict:
-    """Parse the UTF-8 JSON file at ``REPO_ROOT / relative_path`` and return its content."""
-    examples_file = REPO_ROOT / relative_path
-    raw_text = examples_file.read_text(encoding="utf-8")
-    return json.loads(raw_text)
-# One-line summaries keyed by example output file name. They are passed to
-# make_generation_examples() as ``summaries``, which does not currently use them.
-T2V_EXAMPLE_SUMMARIES = {
-    "000000.mp4": "Red panda surfing on a bright seaside wave.",
-    "000002.mp4": "Panda cub skateboarding in a creative loft.",
-    "000004.mp4": "Young woman shaping clay in a sunlit pottery workshop.",
-    "000005.mp4": "Panda boxing a robot in a luxurious palace ring.",
-    "000008.mp4": "Fantasy pastel horse stepping through a glowing cloud valley.",
-}
-def make_generation_examples(
-    task_label: str,
-    relative_path: str,
-    limit: int,
-    image_task: bool,
-    selected_keys: Optional[list[str]] = None,
-    summaries: Optional[dict[str, str]] = None,
-) -> list[list]:
-    """Load prompt examples as single-column rows ``[[prompt], ...]``.
-
-    With ``selected_keys`` the rows follow that key order, keys missing from
-    the file are skipped and ``limit`` is not applied. Without it the first
-    ``limit`` entries of the file are used. ``task_label``, ``image_task`` and
-    ``summaries`` are accepted but not used.
-    """
-    data = load_json_examples(relative_path)
-    if selected_keys:
-        prompts = [data[key] for key in selected_keys if key in data]
-    else:
-        prompts = list(data.values())[:limit]
-    return [[prompt] for prompt in prompts]
-def make_edit_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
-    """Load edit examples as rows ``[prompt, video_path, image_path]``.
-
-    Only the column matching ``media_type`` ("video" or "image") carries the
-    resolved media path; the other is None. At most ``limit`` samples are
-    read. ``task_label`` is accepted but not used.
-    """
-    data = load_json_examples(relative_path)
-    wants_video = media_type == "video"
-    wants_image = media_type == "image"
-    rows = []
-    for sample in list(data.values())[:limit]:
-        interleave = sample["interleave_array"]
-        # Element 0 is the instruction text, element 1 the source media path.
-        prompt = interleave[0]
-        media_path = resolve_example_path(interleave[1])
-        video_cell = media_path if wants_video else None
-        image_cell = media_path if wants_image else None
-        rows.append([prompt, video_cell, image_cell])
-    return rows
-def make_understanding_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
-    """Load understanding examples as rows ``[question, video_path, image_path]``.
-
-    Video paths are resolved with resolve_browser_video_example_path(), other
-    media with resolve_example_path(). The question is the second item of the
-    sample's text element when that element is a list with more than one
-    item, else "". ``task_label`` is accepted but not used.
-    """
-    data = load_json_examples(relative_path)
-    wants_video = media_type == "video"
-    wants_image = media_type == "image"
-    resolver = resolve_browser_video_example_path if wants_video else resolve_example_path
-    rows = []
-    for sample in list(data.values())[:limit]:
-        interleave = sample["interleave_array"]
-        media_path = resolver(interleave[0])
-        text_payload = interleave[1]
-        has_question = isinstance(text_payload, list) and len(text_payload) > 1
-        question = text_payload[1] if has_question else ""
-        video_cell = media_path if wants_video else None
-        image_cell = media_path if wants_image else None
-        rows.append([question, video_cell, image_cell])
-    return rows
-def make_understanding_system_prompt_map(relative_path: str, task: str) -> dict[str, str]:
-    """Map each example question to the system prompt used for ``task``.
-
-    Samples whose text element is not a list of at least two items are
-    skipped. The value comes from normalize_understanding_system_prompt(),
-    which ignores the prompt stored in the example.
-    """
-    data = load_json_examples(relative_path)
-    prompt_by_question = {}
-    for sample in data.values():
-        text_payload = sample["interleave_array"][1]
-        if isinstance(text_payload, list) and len(text_payload) >= 2:
-            stored_prompt, question = text_payload[0], text_payload[1]
-            prompt_by_question[question] = normalize_understanding_system_prompt(task, stored_prompt)
-    return prompt_by_question
-# Example tables, built at import time by reading JSON files under REPO_ROOT.
-# A missing or malformed example file therefore fails the import of this module.
-# Because selected_keys is given, make_generation_examples() ignores limit here.
-VIDEO_GENERATION_EXAMPLES = make_generation_examples(
-    TASK_LABEL_VIDEO_GENERATION,
-    "config/examples/t2v_example.json",
-    limit=6,
-    image_task=False,
-    # Alternative key selection (disabled):
-    #selected_keys=["000000.mp4", "000002.mp4", "000005.mp4", "000004.mp4", "000008.mp4"],
-    selected_keys=["000004.mp4", "000002.mp4", "000000.mp4", "000005.mp4", "000008.mp4", "000007.mp4"],
-    summaries=T2V_EXAMPLE_SUMMARIES,
-)
-VIDEO_EDIT_EXAMPLES = make_edit_examples(
-    TASK_LABEL_VIDEO_EDIT,
-    "config/examples/video_edit_example.json",
-    limit=3,
-    media_type="video",
-)
-VIDEO_UNDERSTANDING_EXAMPLES = make_understanding_examples(
-    TASK_LABEL_VIDEO_UNDERSTANDING,
-    "config/examples/x2t_video_example.json",
-    limit=3,
-    media_type="video",
-)
-# Question -> system prompt, derived from the same file as the examples above.
-VIDEO_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
-    "config/examples/x2t_video_example.json",
-    TASK_X2T_VIDEO,
-)
-IMAGE_GENERATION_EXAMPLES = make_generation_examples(
-    TASK_LABEL_IMAGE_GENERATION,
-    "config/examples/t2i_example.json",
-    limit=5,
-    image_task=True,
-    selected_keys=["000000.png", "000003.png", "000006.png", "000008.png", "000009.png"],
-)
-IMAGE_EDIT_EXAMPLES = make_edit_examples(
-    TASK_LABEL_IMAGE_EDIT,
-    "config/examples/image_edit_example.json",
-    limit=5,
-    media_type="image",
-)
-IMAGE_UNDERSTANDING_EXAMPLES = make_understanding_examples(
-    TASK_LABEL_IMAGE_UNDERSTANDING,
-    "config/examples/x2t_image_example.json",
-    limit=3,
-    media_type="image",
-)
-# Question -> system prompt, derived from the same file as the examples above.
-IMAGE_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
-    "config/examples/x2t_image_example.json",
-    TASK_X2T_IMAGE,
-)
-def build_save_dir(task: str) -> Path:
-    """Compose a per-run results path ``<task>_<YYYYmmdd_HHMMSS>_<mmm>`` under RESULTS_ROOT.
-
-    The shared directories are created first; the returned run directory
-    itself is not created here.
-    """
-    ensure_dirs()
-    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    # The millisecond suffix comes from a second clock read (time.time()).
-    millis = int(time.time() * 1000) % 1000
-    return RESULTS_ROOT / f"{task}_{stamp}_{millis:03d}"
-def find_generated_video(save_dir: Path) -> Optional[Path]:
-    """Return the most recently modified ``*.mp4`` directly inside ``save_dir``, or None if there is none."""
-    def modified_at(candidate: Path) -> float:
-        return candidate.stat().st_mtime
-    return max(save_dir.glob("*.mp4"), key=modified_at, default=None)
-def find_generated_image(save_dir: Path) -> Optional[Path]:
-    """Return the most recently modified ``*.png`` directly inside ``save_dir``, or None if there is none."""
-    def modified_at(candidate: Path) -> float:
-        return candidate.stat().st_mtime
-    return max(save_dir.glob("*.png"), key=modified_at, default=None)
-def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tuple[Path, str]:
-    """Run RIFE's ``inference_video.py`` on ``video_path`` in a subprocess.
-
-    The script is taken from ``REPO_ROOT / "RIFE"`` and executed with
-    CUDA_VISIBLE_DEVICES set to ``device_id`` and the video's directory as the
-    working directory.
-
-    Args:
-        video_path: Video to interpolate.
-        device_id: GPU index exposed to the subprocess.
-        exp: Value passed to RIFE's ``--exp``; the output file name carries
-            the factor ``2 ** exp``.
-
-    Returns:
-        ``(output_path, log)`` where ``log`` holds the command, elapsed time
-        and the captured stdout/stderr of the subprocess.
-
-    Raises:
-        FileNotFoundError: the RIFE script is missing, or RIFE exited
-            successfully but did not produce the expected output file.
-        RuntimeError: RIFE exited with a non-zero status.
-    """
-    import shlex
-    import sys
-    rife_dir = REPO_ROOT / "RIFE"
-    rife_script = rife_dir / "inference_video.py"
-    if not rife_script.exists():
-        raise FileNotFoundError(f"RIFE inference script not found: {rife_script}")
-    output_path = video_path.with_name(f"{video_path.stem}_rife_{2 ** exp}x{video_path.suffix}")
-    env = os.environ.copy()
-    env["CUDA_VISIBLE_DEVICES"] = str(device_id)
-    # Use the interpreter that runs this app so the subprocess sees the same
-    # installed packages; a bare "python3" may resolve to a different one.
-    interpreter = sys.executable or "python3"
-    command = [
-        interpreter,
-        str(rife_script),
-        "--exp",
-        str(exp),
-        "--video",
-        str(video_path),
-        "--output",
-        str(output_path),
-        "--model",
-        str(rife_dir / "train_log"),
-    ]
-    # shlex.join quotes arguments, so the logged command can be copied into a shell.
-    command_text = f"command=CUDA_VISIBLE_DEVICES={device_id} {shlex.join(command)}"
-    rife_start = time.perf_counter()
-    try:
-        completed = subprocess.run(
-            command,
-            cwd=str(video_path.parent),
-            env=env,
-            check=True,
-            capture_output=True,
-            text=True,
-        )
-    except subprocess.CalledProcessError as exc:
-        raise RuntimeError(
-            "\n".join(
-                [
-                    f"RIFE failed with exit code {exc.returncode}.",
-                    command_text,
-                    exc.stdout.strip() if exc.stdout else "",
-                    exc.stderr.strip() if exc.stderr else "",
-                ]
-            ).strip()
-        ) from exc
-    if not output_path.exists():
-        raise FileNotFoundError(f"RIFE completed but output video was not found: {output_path}")
-    elapsed = time.perf_counter() - rife_start
-    log = "\n".join(
-        [
-            "[rife] Frame interpolation finished.",
-            command_text,
-            f"elapsed={elapsed:.2f}s",
-            f"output={output_path}",
-            completed.stdout.strip(),
-            completed.stderr.strip(),
-        ]
-    ).strip()
-    return output_path, log
-def extract_text_result(save_dir: Path) -> str:
-    """Return the first value stored in the prompt-result JSON of *save_dir* as text.
-    A missing file or an empty JSON document yields ``""``. A string value is returned
-    unchanged; any other value is re-serialised with ``json.dumps(..., ensure_ascii=False)``.
-    """
-    result_file = save_dir / PROMPT_JSON_FILENAME
-    if not result_file.exists():
-        return ""
-    payload = json.loads(result_file.read_text(encoding="utf-8"))
-    if not payload:
-        return ""
-    head = next(iter(payload.values()))
-    if isinstance(head, str):
-        return head
-    return json.dumps(head, ensure_ascii=False)
-class LanceT2VV2TPipeline:
-    def __init__(self, device_id: int, model_variant: str = MODEL_VARIANT_VIDEO) -> None:
-        """Record the target GPU and model variant; nothing is loaded until ``initialize()`` runs."""
-        # Identity of this pipeline instance.
-        self.device = device_id
-        self.model_variant = normalize_model_variant(model_variant)
-        self.logger = get_logger(f"lance_{self.model_variant}_gpu{device_id}")
-        # One lock guards loading/unloading, the other serialises generation requests.
-        self._init_lock = threading.Lock()
-        self._generate_lock = threading.Lock()
-        self.initialized = False
-        # Model components, filled in by initialize().
-        self.model: Optional[Lance] = None
-        self.vae_model: Optional[WanVideoVAE] = None
-        self.vae_config: Optional[AutoEncoderParams] = None
-        # Tokenizer state, filled in by initialize().
-        self.tokenizer: Optional[Qwen2Tokenizer] = None
-        self.new_token_ids: Optional[dict] = None
-        self.image_token_id: Optional[int] = None
-        # Baseline argument objects that each request deep-copies before overriding.
-        self.base_model_args: Optional[ModelArguments] = None
-        self.base_data_args: Optional[DataArguments] = None
-        self.base_inference_args: Optional[InferenceArguments] = None
-    def _log_stage(self, stage_name: str, start_time: float, extra: str = "") -> None:
-        """Print how long a startup stage took, measured from *start_time* (a ``time.perf_counter()`` value)."""
-        duration = time.perf_counter() - start_time
-        message = f"[startup][gpu:{self.device}] {stage_name} done in {duration:.2f}s"
-        if extra:
-            message += f" | {extra}"
-        print(message, flush=True)
-    def _build_base_model_args(self) -> ModelArguments:
-        """Return the ``ModelArguments`` used as the baseline for this pipeline's model variant."""
-        settings = dict(
-            model_path=str(get_model_path(self.model_variant)),
-            vit_type=DEFAULT_VIT_TYPE,
-            llm_qk_norm=True,
-            llm_qk_norm_und=True,
-            llm_qk_norm_gen=True,
-            tie_word_embeddings=False,
-            max_num_frames=121,
-            max_latent_size=64,
-            latent_patch_size=[1, 1, 1],
-        )
-        return ModelArguments(**settings)
-    def _build_base_inference_args(self) -> InferenceArguments:
-        """Return the baseline ``InferenceArguments`` built from the module-level defaults."""
-        settings = dict(
-            validation_num_timesteps=DEFAULT_TIMESTEPS,
-            validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
-            copy_init_moe=True,
-            visual_und=True,
-            visual_gen=True,
-            vae_model_type="wan",
-            apply_qwen_2_5_vl_pos_emb=True,
-            apply_chat_template=False,
-            cfg_type=0,
-            validation_data_seed=42,
-            video_height=DEFAULT_HEIGHT,
-            video_width=DEFAULT_WIDTH,
-            num_frames=DEFAULT_NUM_FRAMES,
-            task=DEFAULT_TASK,
-            save_path_gen=str(RESULTS_ROOT),
-            resolution=DEFAULT_RESOLUTION,
-            text_template=TEXT_TEMPLATE,
-            use_KVcache=USE_KVCACHE,
-        )
-        return InferenceArguments(**settings)
-    def initialize(self) -> None:
-        """Load the Lance model, VAE and tokenizer onto this pipeline's GPU (runs once).
-        The whole body runs under ``_init_lock`` and returns immediately when
-        ``self.initialized`` is already set, so concurrent or repeated calls are safe.
-        Raises:
-            RuntimeError: CUDA is unavailable or ``self.device`` is not a valid GPU index.
-            ValueError: ``model_args.vit_type`` is not one of the supported values.
-        """
-        with self._init_lock:
-            if self.initialized:
-                return
-            ensure_dirs()
-            resolved_model_path = ensure_model_assets(self.model_variant)
-            print(
-                f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
-                flush=True,
-            )
-            if not torch.cuda.is_available():
-                raise RuntimeError("CUDA is unavailable. Lance T2V/V2T Gradio requires a GPU environment.")
-            if self.device >= torch.cuda.device_count():
-                raise RuntimeError(
-                    f"GPU {self.device} is unavailable. Detected {torch.cuda.device_count()} GPU(s)."
-                )
-            torch.cuda.set_device(self.device)
-            # Build the baseline argument objects; generate() deep-copies them for each request.
-            model_args = self._build_base_model_args()
-            data_args = DataArguments()
-            inference_args = self._build_base_inference_args()
-            apply_inference_defaults(model_args, data_args, inference_args)
-            inference_args.validation_noise_seed = inference_args.validation_data_seed
-            self.base_model_args = model_args
-            self.base_data_args = data_args
-            self.base_inference_args = inference_args
-            set_seed(inference_args.global_seed)
-            stage_start = time.perf_counter()
-            print(
-                f"[startup][gpu:{self.device}] Loading LLM config: {Path(model_args.model_path) / 'llm_config.json'}",
-                flush=True,
-            )
-            llm_config: Qwen2Config = Qwen2Config.from_json_file(str(Path(model_args.model_path) / "llm_config.json"))
-            self._log_stage("LLM config load", stage_start)
-            # Copy model/inference switches onto the LLM config before the model is constructed from it.
-            llm_config.layer_module = model_args.layer_module
-            llm_config.qk_norm = model_args.llm_qk_norm
-            llm_config.qk_norm_und = model_args.llm_qk_norm_und
-            llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
-            llm_config.tie_word_embeddings = model_args.tie_word_embeddings
-            llm_config.freeze_und = inference_args.freeze_und
-            llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb
-            stage_start = time.perf_counter()
-            print(f"[startup][gpu:{self.device}] Initializing LLM weights: {model_args.model_path}", flush=True)
-            # The LLM is constructed from its config only at this point.
-            # NOTE(review): checkpoint weights are presumably loaded by init_from_model_path_if_needed() below - confirm.
-            language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
-            self._log_stage("LLM weight init", stage_start)
-            vit_model = None
-            vit_config = None
-            if inference_args.visual_und:
-                if model_args.vit_type not in ("qwen2_5_vl", "qwen_2_5_vl_original"):
-                    raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
-                stage_start = time.perf_counter()
-                print(f"[startup][gpu:{self.device}] Loading VIT config: {model_args.vit_path}", flush=True)
-                vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
-                self._log_stage("VIT config load", stage_start)
-                stage_start = time.perf_counter()
-                print(
-                    f"[startup][gpu:{self.device}] Loading VIT weights: {Path(model_args.vit_path) / 'vit.safetensors'}",
-                    flush=True,
-                )
-                vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
-                vit_weights = load_file(str(Path(model_args.vit_path) / "vit.safetensors"))
-                vit_model.load_state_dict(vit_weights, strict=True)
-                self._log_stage("VIT weight load", stage_start)
-                clean_memory(vit_weights)
-            if inference_args.visual_gen:
-                stage_start = time.perf_counter()
-                print(f"[startup][gpu:{self.device}] Initializing VAE", flush=True)
-                vae_model = WanVideoVAE()
-                # Deep copy so later changes to the stored config cannot touch the VAE's own config object.
-                vae_config = deepcopy(vae_model.vae_config)
-                self._log_stage("VAE init", stage_start)
-            else:
-                vae_model = None
-                vae_config = None
-            config = LanceConfig(
-                visual_gen=inference_args.visual_gen,
-                visual_und=inference_args.visual_und,
-                llm_config=llm_config,
-                vit_config=vit_config if inference_args.visual_und else None,
-                vae_config=vae_config if inference_args.visual_gen else None,
-                latent_patch_size=model_args.latent_patch_size,
-                max_num_frames=model_args.max_num_frames,
-                max_latent_size=model_args.max_latent_size,
-                vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
-                connector_act=model_args.connector_act,
-                interpolate_pos=model_args.interpolate_pos,
-                timestep_shift=inference_args.timestep_shift,
-            )
-            model: Lance = Lance(
-                language_model=language_model,
-                vit_model=vit_model if inference_args.visual_und else None,
-                vit_type=model_args.vit_type,
-                config=config,
-                training_args=inference_args,
-            )
-            stage_start = time.perf_counter()
-            print(f"[startup][gpu:{self.device}] Moving Lance model to GPU {self.device}", flush=True)
-            model = model.to(self.device)
-            self._log_stage("Lance model move to GPU", stage_start)
-            stage_start = time.perf_counter()
-            print(f"[startup][gpu:{self.device}] Loading tokenizer: {model_args.model_path}", flush=True)
-            tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
-            tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
-            self._log_stage("tokenizer load and special token init", stage_start, extra=f"num_new_tokens={num_new_tokens}")
-            if inference_args.copy_init_moe:
-                language_model.init_moe()
-            init_from_model_path_if_needed(model, model_args)
-            # Grow the embedding table and keep both config copies in sync when special tokens were added.
-            if num_new_tokens > 0:
-                model.language_model.resize_token_embeddings(len(tokenizer))
-                model.config.llm_config.vocab_size = len(tokenizer)
-                model.language_model.config.vocab_size = len(tokenizer)
-            if model_args.vit_type.lower() == "qwen2_5_vl":
-                from common.model.hacks import hack_qwen2_5_vl_config
-                language_model = hack_qwen2_5_vl_config(language_model)
-            # NOTE(review): the value stored as image_token_id is read from `video_token_id` - confirm this is intended.
-            image_token_id = language_model.config.video_token_id
-            new_token_ids.update({"image_token_id": image_token_id})
-            model.update_tokenizer(tokenizer=tokenizer)
-            if model_args.tie_word_embeddings:
-                model.language_model.untie_lm_head()
-                model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
-                model_args.tie_word_embeddings = False
-                llm_config.tie_word_embeddings = False
-            else:
-                # Sanity check: with untied embeddings the input and output matrices must not share storage.
-                assert (
-                    model.language_model.get_input_embeddings().weight.data.data_ptr()
-                    != model.language_model.get_output_embeddings().weight.data.data_ptr()
-                ), "tie_word_embeddings conflict"
-            # Final placement: same device as before, now cast to bfloat16.
-            model = model.to(device=self.device, dtype=torch.bfloat16)
-            model.eval()
-            if vae_model is not None and hasattr(vae_model, "eval"):
-                vae_model.eval()
-            # Publish the loaded components; `initialized` is flipped last, after every field is set.
-            self.model = model
-            self.vae_model = vae_model
-            self.vae_config = vae_config
-            self.tokenizer = tokenizer
-            self.new_token_ids = new_token_ids
-            self.image_token_id = image_token_id
-            self.initialized = True
-            print(
-                f"[startup][gpu:{self.device}][{self.model_variant}] Lance multimodal Gradio model loaded and ready for reuse.",
-                flush=True,
-            )
-    def unload(self) -> None:
-        """Move the loaded modules to CPU, drop every cached component and release cached CUDA memory.
-        Runs under ``_init_lock``; afterwards ``initialized`` is ``False`` so the next
-        ``initialize()`` call reloads everything.
-        """
-        with self._init_lock:
-            if self.model is not None:
-                self.model.cpu()
-            if self.vae_model is not None and hasattr(self.vae_model, "vae"):
-                inner_vae = self.vae_model.vae
-                if hasattr(inner_vae, "model"):
-                    inner_vae.model.cpu()
-            # Drop every reference this object holds so the garbage collector can reclaim the modules.
-            for attr_name in (
-                "model",
-                "vae_model",
-                "vae_config",
-                "tokenizer",
-                "new_token_ids",
-                "image_token_id",
-                "base_model_args",
-                "base_data_args",
-                "base_inference_args",
-            ):
-                setattr(self, attr_name, None)
-            self.initialized = False
-            gc.collect()
-            if not torch.cuda.is_available():
-                return
-            with torch.cuda.device(self.device):
-                torch.cuda.empty_cache()
-                torch.cuda.ipc_collect()
-    def _build_request_batch(
-        self,
-        prompt_file: Path,
-        model_args: ModelArguments,
-        data_args: DataArguments,
-        inference_args: InferenceArguments,
-    ):
-        """Build the single-sample batch for one request from *prompt_file*.
-        A ``DataConfig`` is loaded from the prompt file, overridden with the request's
-        model and inference settings, and used to build a ``ValidationDataset``; its
-        first item is collated with ``simple_custom_collate`` and returned.
-        """
-        assert self.tokenizer is not None
-        assert self.new_token_ids is not None
-        assert self.vae_config is not None
-        prompt_path = str(prompt_file)
-        dataset_config = DataConfig.from_yaml(prompt_path)
-        if inference_args.visual_und:
-            for field in ("vit_patch_size", "vit_patch_size_temporal", "vit_max_num_patch_per_side"):
-                setattr(dataset_config, field, getattr(model_args, field))
-        if inference_args.visual_gen:
-            # Total downsample per axis = latent patch size x VAE stride (temporal, spatial, spatial).
-            vae_strides = (
-                self.vae_config.downsample_temporal,
-                self.vae_config.downsample_spatial,
-                self.vae_config.downsample_spatial,
-            )
-            vae_downsample = tuple_mul(tuple(model_args.latent_patch_size), vae_strides)
-            dataset_config.latent_patch_size = model_args.latent_patch_size
-            dataset_config.vae_downsample = vae_downsample
-            dataset_config.max_latent_size = model_args.max_latent_size
-            dataset_config.max_num_frames = model_args.max_num_frames
-        for field in ("text_cond_dropout_prob", "vae_cond_dropout_prob", "vit_cond_dropout_prob"):
-            setattr(dataset_config, field, getattr(model_args, field))
-        # (dataset_config attribute, inference_args attribute) pairs, applied in this order.
-        for target, source in (
-            ("num_frames", "num_frames"),
-            ("H", "video_height"),
-            ("W", "video_width"),
-            ("task", "task"),
-            ("resolution", "resolution"),
-            ("text_template", "text_template"),
-        ):
-            setattr(dataset_config, target, getattr(inference_args, source))
-        val_dataset = ValidationDataset(
-            jsonl_path=prompt_path,
-            tokenizer=self.tokenizer,
-            data_args=data_args,
-            model_args=model_args,
-            training_args=inference_args,
-            new_token_ids=self.new_token_ids,
-            dataset_config=dataset_config,
-            local_rank=0,
-            world_size=1,
-        )
-        first_sample = val_dataset[0]
-        return simple_custom_collate([first_sample])
-    def generate(
-        self,
-        task: str,
-        prompt: str,
-        system_prompt: Optional[str],
-        input_video: Optional[str],
-        input_image: Optional[str],
-        height: int,
-        width: int,
-        num_frames: int,
-        seed: int,
-        resolution: str,
-        validation_num_timesteps: int,
-        validation_timestep_shift: float,
-        cfg_text_scale: float,
-        enable_frame_interpolation: bool,
-    ):
-        """Run one request end to end on this pipeline's GPU.
-        Returns a 5-tuple ``(video_path, image_path, text_result, status, logs)``.
-        Only the slot matching the task is filled; the unused path slots are ``None``
-        and an unused text slot is ``""``.
-        - Input validation failures return early with the message in ``status``.
-        - Exceptions raised inside the ``try`` block are caught: a "failed" record is
-          saved and the traceback is returned in ``logs``.
-        - The request setup between taking ``_generate_lock`` and the ``try`` (seed
-          normalisation, request JSON, save dir, argument copies) is NOT covered by
-          that handler, so errors there propagate to the caller.
-        """
-        # Lazy, idempotent model load; it happens before validation, so even an invalid first request pays for it.
-        self.initialize()
-        internal_task = normalize_task(task)
-        prompt = (prompt or "").strip()
-        input_video = str(input_video).strip() if input_video else ""
-        input_image = str(input_image).strip() if input_image else ""
-        # Cheap input validation: each failure returns a user-facing message in the status slot.
-        if internal_task in GENERATION_TASKS and not prompt:
-            return None, None, "", "Please enter a prompt.", ""
-        if internal_task in UNDERSTANDING_TASKS and not prompt:
-            return None, None, "", "Please enter a question.", ""
-        if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
-            return None, None, "", "Please upload an input video.", ""
-        if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
-            return None, None, "", "Please upload an input image.", ""
-        if height <= 0 or width <= 0:
-            return None, None, "", "Height and width must be greater than 0.", ""
-        if num_frames <= 0:
-            return None, None, "", "The number of frames must be greater than 0.", ""
-        assert self.model is not None
-        assert self.tokenizer is not None
-        assert self.new_token_ids is not None
-        assert self.image_token_id is not None
-        assert self.base_model_args is not None
-        assert self.base_data_args is not None
-        assert self.base_inference_args is not None
-        active_model_path = self.base_model_args.model_path
-        # Only one generation at a time per pipeline instance.
-        with self._generate_lock:
-            torch.cuda.set_device(self.device)
-            actual_seed = normalize_seed(int(seed))
-            prompt_file = create_request_json(
-                task=internal_task,
-                prompt=prompt,
-                input_video=input_video,
-                input_image=input_image,
-                system_prompt=system_prompt,
-            )
-            save_dir = build_save_dir(internal_task)
-            save_dir.mkdir(parents=True, exist_ok=True)
-            request_started_at = datetime.now().isoformat(timespec="seconds")
-            # Per-request copies: the shared baseline argument objects are never mutated.
-            request_model_args = deepcopy(self.base_model_args)
-            request_model_args.cfg_text_scale = float(cfg_text_scale)
-            request_data_args = deepcopy(self.base_data_args)
-            request_data_args.val_dataset_config_file = str(prompt_file)
-            request_inference_args = deepcopy(self.base_inference_args)
-            request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
-            request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
-            request_inference_args.validation_data_seed = actual_seed
-            request_inference_args.validation_noise_seed = actual_seed
-            request_inference_args.video_height = int(height)
-            request_inference_args.video_width = int(width)
-            request_inference_args.num_frames = int(num_frames)
-            # The UI-facing resolution is kept for logs; the backend receives the normalised value.
-            display_resolution = str(resolution)
-            backend_resolution = normalize_resolution_for_backend(display_resolution, internal_task)
-            request_inference_args.resolution = backend_resolution
-            request_inference_args.save_path_gen = str(save_dir)
-            request_inference_args.task = internal_task
-            request_inference_args.text_template = TEXT_TEMPLATE
-            # Fresh dict for this request; it is handed to save_prompt_results() after inference.
-            request_inference_args.prompt_data_dict = {}
-            try:
-                print(
-                    "[lance_gradio_t2v_v2t] Start generation "
-                    f"| task={internal_task} | gpu={self.device} | seed={actual_seed} | "
-                    f"size={height}x{width} | frames={num_frames} | resolution={display_resolution}",
-                    flush=True,
-                )
-                val_data_cpu = self._build_request_batch(
-                    prompt_file=prompt_file,
-                    model_args=request_model_args,
-                    data_args=request_data_args,
-                    inference_args=request_inference_args,
-                )
-                generate_start = time.perf_counter()
-                validate_on_fixed_batch(
-                    fsdp_model=self.model,
-                    vae_model=self.vae_model,
-                    tokenizer=self.tokenizer,
-                    val_data_cpu=val_data_cpu,
-                    training_args=request_inference_args,
-                    model_args=request_model_args,
-                    inference_args=request_inference_args,
-                    new_token_ids=self.new_token_ids,
-                    image_token_id=self.image_token_id,
-                    device=self.device,
-                    save_source_video=False,
-                    save_path_gen=request_inference_args.save_path_gen,
-                    save_path_gt="",
-                )
-                # `elapsed` covers only the inference call above, not result saving or RIFE.
-                elapsed = time.perf_counter() - generate_start
-                save_prompt_results(request_inference_args.prompt_data_dict, request_inference_args.save_path_gen, self.logger)
-                clean_memory()
-                video_path = find_generated_video(save_dir) if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} else None
-                original_video_path = video_path
-                rife_log = ""
-                rife_error = ""
-                frame_interpolation_enabled = normalize_frame_interpolation(enable_frame_interpolation) and internal_task in {TASK_T2V, TASK_VIDEO_EDIT}
-                # Frame interpolation is best-effort: on failure the original video is still returned
-                # and the traceback is kept in `rife_error`.
-                if frame_interpolation_enabled and video_path is not None:
-                    try:
-                        clean_memory()
-                        print(
-                            "[rife] Start frame interpolation "
-                            f"| task={internal_task} | gpu={self.device} | input={video_path}",
-                            flush=True,
-                        )
-                        video_path, rife_log = run_rife_interpolation(video_path, self.device, exp=1)
-                    except Exception:
-                        rife_error = traceback.format_exc()
-                        print(rife_error, flush=True)
-                image_path = find_generated_image(save_dir) if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} else None
-                text_result = extract_text_result(save_dir) if internal_task in UNDERSTANDING_TASKS else ""
-                record = {
-                    "request_started_at": request_started_at,
-                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
-                    "status": "success",
-                    "task": internal_task,
-                    "model_variant": self.model_variant,
-                    "model_path": active_model_path,
-                    "gpu": self.device,
-                    "prompt": prompt,
-                    "system_prompt": normalize_understanding_system_prompt(internal_task, system_prompt)
-                    if internal_task in UNDERSTANDING_TASKS
-                    else "",
-                    "input_video": input_video,
-                    "input_image": input_image,
-                    "seed": actual_seed,
-                    "height": int(height),
-                    "width": int(width),
-                    "num_frames": int(num_frames),
-                    "resolution": display_resolution,
-                    "backend_resolution": backend_resolution,
-                    "validation_num_timesteps": int(validation_num_timesteps),
-                    "validation_timestep_shift": float(validation_timestep_shift),
-                    "cfg_text_scale": float(cfg_text_scale),
-                    "frame_interpolation": frame_interpolation_enabled,
-                    "elapsed_seconds": round(elapsed, 3),
-                    "prompt_file": str(prompt_file),
-                    "output_dir": str(save_dir),
-                    "original_video_path": str(original_video_path) if original_video_path is not None else "",
-                    "video_path": str(video_path) if video_path is not None else "",
-                    "image_path": str(image_path) if image_path is not None else "",
-                    "text_result": text_result,
-                    "rife_error": rife_error,
-                }
-                # Downgrade the status when inference finished but the expected artifact is missing.
-                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} and video_path is None:
-                    record["status"] = "completed_without_video"
-                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} and image_path is None:
-                    record["status"] = "completed_without_image"
-                if internal_task in UNDERSTANDING_TASKS and not text_result:
-                    record["status"] = "completed_without_text"
-                save_generation_record(record, save_dir)
-                logs = "\n".join(
-                    [
-                        "[lance_gradio_t2v_v2t] Inference finished in-process.",
-                        f"task={internal_task}",
-                        f"model_variant={self.model_variant}",
-                        f"model_path={active_model_path}",
-                        f"gpu={self.device}",
-                        f"seed={actual_seed}",
-                        f"height={height}",
-                        f"width={width}",
-                        f"num_frames={num_frames}",
-                        f"resolution={display_resolution}",
-                        f"backend_resolution={backend_resolution}",
-                        f"validation_num_timesteps={validation_num_timesteps}",
-                        f"validation_timestep_shift={validation_timestep_shift}",
-                        f"cfg_text_scale={cfg_text_scale}",
-                        f"frame_interpolation={frame_interpolation_enabled}",
-                        f"original_video_path={original_video_path or ''}",
-                        f"rife_error={rife_error.strip() if rife_error else ''}",
-                        f"elapsed={elapsed:.2f}s",
-                        f"output_dir={save_dir}",
-                        rife_log,
-                    ]
-                )
-                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT}:
-                    if video_path is None:
-                        status = (
-                            "Inference completed, but no output video was found.\n\n"
-                            f"- Task: `{internal_task}`\n"
-                            f"- Model: `{self.model_variant}`\n"
-                            f"- Model path: `{active_model_path}`\n"
-                            f"- GPU: `{self.device}`\n"
-                            f"- Actual seed: `{actual_seed}`\n"
-                            f"- Output directory: `{save_dir}`"
-                        )
-                        return None, None, "", status, logs
-                    # The detailed success summary below is disabled; an empty status is returned on success.
-                    # status = (
-                    #     "Inference completed.\n\n"
-                    #     f"- Task: `{internal_task}`\n"
-                    #     f"- Model: `{self.model_variant}`\n"
-                    #     f"- Model path: `{active_model_path}`\n"
-                    #     f"- GPU: `{self.device}`\n"
-                    #     f"- Actual seed: `{actual_seed}`\n"
-                    #     f"- Output directory: `{save_dir}`\n"
-                    #     f"- Result file: `{video_path}`"
-                    # )
-                    status = ""
-                    return str(video_path), None, "", status, logs
-                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT}:
-                    if image_path is None:
-                        status = (
-                            "Inference completed, but no output image was found.\n\n"
-                            f"- Task: `{internal_task}`\n"
-                            f"- Model: `{self.model_variant}`\n"
-                            f"- Model path: `{active_model_path}`\n"
-                            f"- GPU: `{self.device}`\n"
-                            f"- Actual seed: `{actual_seed}`\n"
-                            f"- Output directory: `{save_dir}`"
-                        )
-                        return None, None, "", status, logs
-                    # The detailed success summary below is disabled; an empty status is returned on success.
-                    # status = (
-                    #     "Inference completed.\n\n"
-                    #     f"- Task: `{internal_task}`\n"
-                    #     f"- Model: `{self.model_variant}`\n"
-                    #     f"- Model path: `{active_model_path}`\n"
-                    #     f"- GPU: `{self.device}`\n"
-                    #     f"- Actual seed: `{actual_seed}`\n"
-                    #     f"- Output directory: `{save_dir}`\n"
-                    #     f"- Result file: `{image_path}`"
-                    # )
-                    status = ""
-                    return None, str(image_path), "", status, logs
-                # Remaining case: neither video nor image branch matched, so the text result is returned.
-                # status = (
-                #     "Understanding completed.\n\n"
-                #     f"- Task: `{task}`\n"
-                #     f"- Model: `{self.model_variant}`\n"
-                #     f"- Model path: `{active_model_path}`\n"
-                #     f"- GPU: `{self.device}`\n"
-                #     f"- Actual seed: `{actual_seed}`\n"
-                #     f"- Output directory: `{save_dir}`"
-                # )
-                status = ""
-                return None, None, text_result, status, logs
-            except Exception:
-                # Top-level boundary for the request: log, persist a "failed" record, and report to the caller.
-                error_trace = traceback.format_exc()
-                print(error_trace, flush=True)
-                # NOTE(review): unlike the success record, this one has no "system_prompt",
-                # "frame_interpolation", "elapsed_seconds", "original_video_path" or "rife_error" keys.
-                record = {
-                    "request_started_at": request_started_at,
-                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
-                    "status": "failed",
-                    "task": internal_task,
-                    "model_variant": self.model_variant,
-                    "model_path": active_model_path,
-                    "gpu": self.device,
-                    "prompt": prompt,
-                    "input_video": input_video,
-                    "input_image": input_image,
-                    "seed": actual_seed,
-                    "height": int(height),
-                    "width": int(width),
-                    "num_frames": int(num_frames),
-                    "resolution": display_resolution,
-                    "backend_resolution": backend_resolution,
-                    "validation_num_timesteps": int(validation_num_timesteps),
-                    "validation_timestep_shift": float(validation_timestep_shift),
-                    "cfg_text_scale": float(cfg_text_scale),
-                    "prompt_file": str(prompt_file),
-                    "output_dir": str(save_dir),
-                    "video_path": "",
-                    "image_path": "",
-                    "text_result": "",
-                    "error": error_trace,
-                }
-                save_generation_record(record, save_dir)
-                status = (
-                    "Inference failed.\n\n"
-                    f"- Task: `{internal_task}`\n"
-                    f"- Model: `{self.model_variant}`\n"
-                    f"- Model path: `{active_model_path}`\n"
-                    f"- GPU: `{self.device}`\n"
-                    f"- Actual seed: `{actual_seed}`\n"
-                    f"- Resolution: `{display_resolution}`\n"
-                    f"- Output directory: `{save_dir}`"
-                )
-                return None, None, "", status, error_trace
-class PipelinePool:
-    def __init__(self, gpu_ids: list[int], model_variant: str = MODEL_VARIANT_VIDEO) -> None:
-        """Create one (not yet initialised) pipeline per GPU id and mark all of them as idle.
-        Raises:
-            ValueError: *gpu_ids* is empty.
-        """
-        if not gpu_ids:
-            raise ValueError("At least one GPU must be configured.")
-        self.gpu_ids = gpu_ids
-        self.model_variant = normalize_model_variant(model_variant)
-        workers = []
-        for gpu_id in gpu_ids:
-            workers.append(LanceT2VV2TPipeline(device_id=gpu_id, model_variant=self.model_variant))
-        self.pipelines = workers
-        # Idle pipelines are handed out in FIFO order; the condition guards the queue.
-        self._available = deque(self.pipelines)
-        self._condition = threading.Condition()
-    @property
-    def size(self) -> int:
-        return len(self.pipelines)
-    @property
-    def gpu_summary(self) -> str:
-        return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
-    def initialize_all(self) -> None:
-        print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
-        exceptions: list[Exception] = []
-        with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
-            futures = {
-                executor.submit(pipeline.initialize): pipeline.device for pipeline in self.pipelines
-            }
-            for future in concurrent.futures.as_completed(futures):
-                gpu_id = futures[future]
-                try:
-                    future.result()
-                except Exception as exc:
-                    print(f"[startup][gpu:{gpu_id}][{self.model_variant}] Preload failed: {exc}", flush=True)
-                    exceptions.append(exc)
-        if exceptions:
-            raise RuntimeError(
-                f"{self.model_variant} preload failed on {len(exceptions)} GPU(s). Please check the terminal logs."
-            ) from exceptions[0]
-        print(
-            f"[startup][{self.model_variant}] GPU preload finished. Ready to handle {self.size} concurrent request(s).",
-            flush=True,
-        )
-    def acquire(self) -> LanceT2VV2TPipeline:
-        with self._condition:
-            while not self._available:
-                self._condition.wait()
-            return self._available.popleft()
-    def release(self, pipeline: LanceT2VV2TPipeline) -> None:
-        with self._condition:
-            self._available.append(pipeline)
-            self._condition.notify()
-    def unload_all(self) -> None:
-        print(f"[runtime][{self.model_variant}] Unloading model pool from GPU(s): {self.gpu_ids}", flush=True)
-        with self._condition:
-            while len(self._available) != len(self.pipelines):
-                self._condition.wait()
-        for pipeline in self.pipelines:
-            pipeline.unload()
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
-        print(f"[runtime][{self.model_variant}] Model pool unloaded.", flush=True)
-    def generate(
-        self,
-        task: str,
-        prompt: str,
-        system_prompt: Optional[str],
-        input_video: Optional[str],
-        input_image: Optional[str],
-        height: int,
-        width: int,
-        num_frames: int,
-        seed: int,
-        resolution: str,
-        validation_num_timesteps: int,
-        validation_timestep_shift: float,
-        cfg_text_scale: float,
-        enable_frame_interpolation: bool,
-    ):
-        pipeline = self.acquire()
-        try:
-            with get_gpu_runtime_lock(pipeline.device):
-                return pipeline.generate(
-                    task=task,
-                    prompt=prompt,
-                    system_prompt=system_prompt,
-                    input_video=input_video,
-                    input_image=input_image,
-                    height=height,
-                    width=width,
-                    num_frames=num_frames,
-                    seed=seed,
-                    resolution=resolution,
-                    validation_num_timesteps=validation_num_timesteps,
-                    validation_timestep_shift=validation_timestep_shift,
-                    cfg_text_scale=cfg_text_scale,
-                    enable_frame_interpolation=enable_frame_interpolation,
-                )
-        finally:
-            self.release(pipeline)
# Loaded pools keyed by normalized model variant; filled by get_pipeline_pool()
# and preload_pipeline_pools().
ACTIVE_PIPELINE_POOLS: dict[str, PipelinePool] = {}
# Held by get_pipeline_pool() while it looks up or creates a pool.
ACTIVE_POOL_LOCK = threading.Lock()
# One lock per GPU device id, created lazily by get_gpu_runtime_lock().
GPU_RUNTIME_LOCKS: dict[int, threading.Lock] = {}
# Guards lookups and insertions on GPU_RUNTIME_LOCKS.
GPU_RUNTIME_LOCKS_LOCK = threading.Lock()
# Queue limit reported by build_status_markdown(); reassigned from --queue-size
# when the file runs as a script.
QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
# Model variants handed to preload_pipeline_pools() at script startup.
PRELOAD_MODEL_VARIANTS = [MODEL_VARIANT_VIDEO, MODEL_VARIANT_IMAGE]
def get_gpu_runtime_lock(device_id: int) -> threading.Lock:
    """Return the lock registered for ``device_id``, creating it on first request."""
    with GPU_RUNTIME_LOCKS_LOCK:
        existing = GPU_RUNTIME_LOCKS.get(device_id)
        if existing is not None:
            return existing
        # First request for this device: register a fresh lock under the
        # registry guard so concurrent callers end up sharing one object.
        created = threading.Lock()
        GPU_RUNTIME_LOCKS[device_id] = created
        return created
def get_task_model_variant(task: str) -> str:
    """Return the model variant that serves ``task``.

    Tasks whose normalized name is in ``IMAGE_TASKS`` map to
    ``MODEL_VARIANT_IMAGE``; every other task maps to ``MODEL_VARIANT_VIDEO``.
    """
    if normalize_task(task) in IMAGE_TASKS:
        return MODEL_VARIANT_IMAGE
    return MODEL_VARIANT_VIDEO
def get_pipeline_pool(task: str) -> PipelinePool:
    """Return the pool for the model variant that serves ``task``, loading it if needed.

    The lookup and any load run while holding ``ACTIVE_POOL_LOCK``. A new pool
    is built on the GPUs listed in the ``LANCE_GPUS`` environment variable
    (falling back to ``DEFAULT_GPUS``) and registered in
    ``ACTIVE_PIPELINE_POOLS``; already loaded pools are left in place.
    """
    variant = get_task_model_variant(task)
    with ACTIVE_POOL_LOCK:
        pool = ACTIVE_PIPELINE_POOLS.get(variant)
        if pool is None:
            configured_gpus = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
            print(
                f"[runtime] Loading Lance {variant} model pool without unloading existing pools.",
                flush=True,
            )
            pool = PipelinePool(configured_gpus, model_variant=variant)
            pool.initialize_all()
            ACTIVE_PIPELINE_POOLS[variant] = pool
        return pool
def preload_pipeline_pools(gpu_ids: list[int], model_variants: list[str]) -> None:
    """Create, initialize and register a pool for each variant not yet loaded.

    Variants are normalized first; those already present in
    ``ACTIVE_PIPELINE_POOLS`` are skipped.
    """
    for requested_variant in model_variants:
        variant = normalize_model_variant(requested_variant)
        if variant not in ACTIVE_PIPELINE_POOLS:
            model_path = ensure_model_assets(variant)
            print(
                f"[startup][{variant}] Using Lance model path: {model_path}",
                flush=True,
            )
            pool = PipelinePool(gpu_ids, model_variant=variant)
            pool.initialize_all()
            ACTIVE_PIPELINE_POOLS[variant] = pool
def run_task(
    task: str,
    prompt: str,
    system_prompt: Optional[str],
    input_video: Optional[str],
    input_image: Optional[str],
    height: int,
    width: int,
    num_frames: int,
    seed: int,
    resolution: str,
    validation_num_timesteps: int,
    validation_timestep_shift: float,
    cfg_text_scale: float,
    enable_frame_interpolation: bool,
):
    """Run one request through the pipeline pool that serves ``task``.

    When the normalized task is ``TASK_T2V``, ``num_frames`` is first passed
    through ``video_seconds_to_num_frames``. All other arguments are forwarded
    unchanged, and the result of ``PipelinePool.generate`` is returned as is.
    """
    if normalize_task(task) == TASK_T2V:
        frame_count = video_seconds_to_num_frames(num_frames)
    else:
        frame_count = num_frames
    pool = get_pipeline_pool(task)
    forwarded = dict(
        task=task,
        prompt=prompt,
        system_prompt=system_prompt,
        input_video=input_video,
        input_image=input_image,
        height=height,
        width=width,
        num_frames=frame_count,
        seed=seed,
        resolution=resolution,
        validation_num_timesteps=validation_num_timesteps,
        validation_timestep_shift=validation_timestep_shift,
        cfg_text_scale=cfg_text_scale,
        enable_frame_interpolation=enable_frame_interpolation,
    )
    return pool.generate(**forwarded)
def build_status_markdown() -> str:
    """Return a one-line Markdown summary of GPUs, concurrency, queue limit and loaded models.

    With no pools loaded the line reports GPU ``unknown``, concurrency ``1``
    and models ``none``. Otherwise it reports the sorted union of GPU ids over
    all loaded pools and uses the size of that union as the concurrency.
    """
    if ACTIVE_PIPELINE_POOLS:
        loaded_variants = ",".join(sorted(ACTIVE_PIPELINE_POOLS))
        distinct_gpus = set()
        for pool in ACTIVE_PIPELINE_POOLS.values():
            distinct_gpus.update(pool.gpu_ids)
        ordered_gpus = sorted(distinct_gpus)
        gpu_text = ",".join(map(str, ordered_gpus))
        concurrency = len(ordered_gpus)
    else:
        gpu_text = "unknown"
        concurrency = 1
        loaded_variants = "none"
    return (
        f"**Status**  GPU: `{gpu_text}`  |  Max concurrency: `{concurrency}`  |  "
        f"Queue limit: `{QUEUE_MAX_SIZE}`  |  Loaded models: `{loaded_variants}`  |  "
        f"Switch mode: `dual resident`"
    )
def get_logo_data_uri() -> str:
    """Return the file at ``LANCE_LOGO_PATH`` as a base64 ``data:image/webp`` URI.

    Returns an empty string when the file does not exist.
    """
    if LANCE_LOGO_PATH.exists():
        logo_bytes = LANCE_LOGO_PATH.read_bytes()
        payload = base64.b64encode(logo_bytes).decode("ascii")
        return f"data:image/webp;base64,{payload}"
    return ""
def build_header_html() -> str:
    """Return the HTML for the page header: optional logo, title, authors and link badges.

    The logo ``<img>`` is included only when ``get_logo_data_uri()`` returns a
    non-empty URI.
    """
    logo_data_uri = get_logo_data_uri()
    if logo_data_uri:
        logo_html = f'<img class="lance-logo" src="{logo_data_uri}" alt="Lance logo">'
    else:
        logo_html = ""
    return f"""
    <div class="lance-hero">
        {logo_html}
        <h1 class="lance-title">Lance: Unified Multimodal Modeling by Multi-Task Synergy</h1>
        <div class="lance-authors">
            <strong>
                <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" target="_blank">Fengyi Fu</a><sup>*</sup>,
                <a href="https://corleone-huang.github.io/" target="_blank">Mengqi Huang</a><sup>*,✉</sup>,
                <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" target="_blank">Shaojin Wu</a><sup>*</sup>,
                Yunsheng Jiang<sup>*</sup>,
                Yufei Huo,
                <a href="https://guojianzhu.com/" target="_blank">Jianzhu Guo</a><sup>✉,§</sup>
            </strong><br>
            Hao Li, Yinghang Song, Fei Ding, Qian He, Zheren Fu, Zhendong Mao, Yongdong Zhang<br>
            <em>ByteDance</em>
        </div>
        <div class="lance-badges">
            <a href="{LANCE_HOMEPAGE_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="Homepage" src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat">
            </a>
            <a href="{LANCE_PAPER_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="Paper" src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv">
            </a>
            <a href="{LANCE_HUGGING_FACE_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="Hugging Face" src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface">
            </a>
            <a href="{LANCE_GITHUB_URL}" target="_blank" rel="noopener noreferrer">
                <img alt="GitHub" src="https://img.shields.io/badge/Code-GitHub-536af5?color=536af5&logo=github">
            </a>
        </div>
    </div>
    """
def update_task_ui(task: str):
    """Return the 20 ``gr.update`` objects that reconfigure the UI for ``task``.

    The tuple is positional: its order must match the ``outputs`` list of the
    ``task.change`` event registered in ``build_demo``.
    """
    internal_task = normalize_task(task)
    image_mode = internal_task in IMAGE_TASKS
    video_mode = internal_task in VIDEO_TASKS
    editing = internal_task in EDIT_TASKS
    understanding = internal_task in UNDERSTANDING_TASKS
    generating = internal_task in GENERATION_TASKS
    needs_media_input = editing or understanding
    shows_size_controls = generating or editing
    is_t2v = internal_task == TASK_T2V

    if image_mode:
        resolution_choices = IMAGE_RESOLUTION_CHOICES
        resolution_value = DEFAULT_IMAGE_RESOLUTION
        aspect_ratio_value = DEFAULT_IMAGE_ASPECT_RATIO
    else:
        resolution_choices = VIDEO_RESOLUTION_CHOICES
        resolution_value = DEFAULT_RESOLUTION
        aspect_ratio_value = DEFAULT_VIDEO_ASPECT_RATIO

    width_value, height_value = get_size_for_aspect_ratio(internal_task, aspect_ratio_value)
    size_markdown = format_size_markdown(internal_task, width_value, height_value)
    system_prompt_choices = get_understanding_system_prompt_choices(internal_task)

    # Wording of the main text box depends on the task family.
    if generating:
        text_label, text_placeholder = "Prompt", "Describe what you want to generate..."
    elif editing:
        text_label, text_placeholder = "Instruction", "Describe the edit you want..."
    else:
        text_label, text_placeholder = "Question", "Ask a question about the input..."

    # One visibility toggle per example group, in the order the groups appear
    # in the event's outputs list.
    example_group_updates = [
        gr.update(visible=internal_task == group_task)
        for group_task in (
            TASK_T2V,
            TASK_VIDEO_EDIT,
            TASK_X2T_VIDEO,
            TASK_T2I,
            TASK_IMAGE_EDIT,
            TASK_X2T_IMAGE,
        )
    ]

    return (
        # prompt
        gr.update(label=text_label, placeholder=text_placeholder, visible=True),
        # system_prompt (kept hidden)
        gr.update(choices=system_prompt_choices, value=system_prompt_choices[0], visible=False),
        # input_video / input_image
        gr.update(label="Input Video", visible=needs_media_input and video_mode),
        gr.update(label="Input Image", visible=needs_media_input and image_mode),
        # aspect_ratio, height, width, real_size
        gr.update(value=aspect_ratio_value, visible=shows_size_controls),
        gr.update(value=height_value),
        gr.update(value=width_value),
        gr.update(value=size_markdown, visible=shows_size_controls),
        # num_frames
        gr.update(visible=is_t2v, value=DEFAULT_VIDEO_DURATION_SECONDS if is_t2v else 1),
        # enable_frame_interpolation
        gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}, value=DEFAULT_FRAME_INTERPOLATION),
        # resolution (kept hidden)
        gr.update(choices=resolution_choices, value=resolution_value, visible=False),
        # output_video, output_image, output_text
        gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}),
        gr.update(visible=internal_task in {TASK_T2I, TASK_IMAGE_EDIT}),
        gr.update(visible=understanding, value=""),
        *example_group_updates,
    )
def keep_example_clicks_from_changing_visibility(*examples_components) -> None:
    """Remove the ``"visible"`` key from each component's stored dataset props.

    For every argument, the props are read from
    ``argument.dataset.component_props``. Arguments without a ``dataset``
    attribute, or whose props are missing or empty, are left untouched.
    """
    for component in examples_components:
        backing_dataset = getattr(component, "dataset", None)
        stored_props = getattr(backing_dataset, "component_props", None)
        for props in stored_props or ():
            props.pop("visible", None)
def build_demo() -> gr.Blocks:
    """Build the Gradio ``Blocks`` UI, wire its events, and return it.

    The left column holds the task selector and all inputs, the right column
    the outputs, status and logs. Below them sit the run button and one
    example group per task; only the video-generation group starts visible.
    """
    with gr.Blocks(title="Lance", css=APP_CSS) as demo:
        gr.HTML(build_header_html())
        # The status line is rendered once at build time and kept hidden.
        gr.Markdown(build_status_markdown(), elem_classes=["lance-status"], visible=False)
        with gr.Row(elem_classes=["lance-main-row"]):
            # Left column: task selection and inputs.
            with gr.Column(scale=1, elem_classes=["lance-main-column"]):
                task = gr.Radio(
                    label="Task",
                    choices=TASK_CHOICES,
                    value=TASK_LABEL_VIDEO_GENERATION,
                    elem_classes=["task-selector"],
                )
                prompt = gr.Textbox(
                    label="Prompt",
                    lines=6,
                    placeholder="Describe the video you want to generate...",
                )
                # Hidden control: its value is still sent to run_task.
                system_prompt = gr.Dropdown(
                    label="System Prompt",
                    choices=get_understanding_system_prompt_choices(TASK_X2T_VIDEO),
                    value=V2T_QA_SYSTEM_PROMPT,
                    visible=False,
                )
                input_video = gr.Video(label="Input Video", visible=False, elem_classes=["lance-display-frame"])
                input_image = gr.Image(label="Input Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
                with gr.Row(elem_classes=["generation-controls-row"]):
                    enable_frame_interpolation = gr.Dropdown(
                        label="Frame Interpolation",
                        choices=[FRAME_INTERPOLATION_YES, FRAME_INTERPOLATION_NO],
                        value=DEFAULT_FRAME_INTERPOLATION,
                        elem_classes=["generation-control", "generation-dropdown-control"],
                        min_width=0,
                    )
                    seed = gr.Number(
                        label="Seed (-1 for random seed)",
                        value=DEFAULT_BASIC_SEED,
                        precision=0,
                        elem_classes=["generation-control", "generation-value-control"],
                        min_width=0,
                        # info="-1 for random seed",
                    )
                    aspect_ratio = gr.Dropdown(
                        label="Aspect Ratio",
                        # choices=ASPECT_RATIO_CHOICES, # original version; does not show whether a choice is the default
                        choices=get_aspect_ratio_choices_for_task(TASK_T2V),
                        value=DEFAULT_VIDEO_ASPECT_RATIO,
                        elem_classes=["generation-control", "generation-dropdown-control"],
                        min_width=0,
                    )
                    # real_size = gr.Markdown(format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT))
                    # Read-only display of the size derived from the aspect ratio.
                    real_size = gr.Textbox(
                        label="Output Resolution",
                        value=format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT),
                        interactive=False,
                        elem_classes=["generation-control", "generation-value-control"],
                        min_width=0,
                    )
                # Hidden state: resolution, height and width are never shown
                # but are passed to run_task and updated by event handlers.
                resolution = gr.Dropdown(
                    label="Resolution",
                    choices=RESOLUTION_CHOICES,
                    value=DEFAULT_RESOLUTION,
                    visible=False,
                )
                height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
                width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
                # The slider is labelled in seconds; run_task converts the value
                # with video_seconds_to_num_frames for the T2V task.
                num_frames = gr.Slider(
                    minimum=1,
                    maximum=10,
                    step=1,
                    value=DEFAULT_VIDEO_DURATION_SECONDS,
                    label="Video Duration (seconds)",
                )
                # seed = gr.Number(
                #     label="Seed",
                #     value=DEFAULT_BASIC_SEED,
                #     precision=0,
                #     info="-1 means using a random seed each time",
                # )
                with gr.Accordion("Advanced Parameters", open=False):
                    validation_num_timesteps = gr.Slider(
                        minimum=1,
                        maximum=100,
                        step=1,
                        value=DEFAULT_TIMESTEPS,
                        label="Validation Num Timesteps",
                    )
                    with gr.Row():
                        validation_timestep_shift = gr.Number(
                            label="Validation Timestep Shift",
                            value=DEFAULT_TIMESTEP_SHIFT,
                        )
                        cfg_text_scale = gr.Number(
                            label="CFG Text Scale",
                            value=DEFAULT_CFG_TEXT_SCALE,
                        )
                # Components that the gr.Examples blocks below fill when an
                # example row is clicked.
                generation_example_inputs = [
                    prompt,
                    input_video,
                    input_image,
                ]
            # Right column: outputs, status and logs.
            with gr.Column(scale=1, elem_classes=["lance-main-column"]):
                output_video = gr.Video(label="Output Video", elem_classes=["lance-display-frame"])
                output_image = gr.Image(label="Output Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
                output_text = gr.Textbox(label="Output Text", lines=8, visible=False, elem_classes=["lance-display-frame"])
                status = gr.Markdown("WAITING TO RUN.")
                logs = gr.Textbox(label="Run Logs", lines=22, max_lines=30)
        run_button = gr.Button("🚀 Generate", variant="primary", elem_classes=["lance-run-button"])
        # One example group per task; update_task_ui toggles which is visible.
        with gr.Group(visible=True, elem_classes=["prompt-examples", "example-panel"]) as video_generation_examples_group:
            gr.Markdown("### Video generation recommended cases", elem_classes=["recommended-title"])
            video_generation_examples = gr.Dataset(
                samples=VIDEO_GENERATION_EXAMPLES,
                components=[gr.Textbox(label="Prompt", visible=False)],
                headers=["Prompt"],
                show_label=False,
                type="values",
                layout="table",
                samples_per_page=len(VIDEO_GENERATION_EXAMPLES),
                elem_classes=["prompt-dataset"],
            )
        with gr.Group(visible=False, elem_classes=["example-panel"]) as video_edit_examples_group:
            gr.Markdown("### Video edit recommended cases", elem_classes=["recommended-title"])
            video_edit_examples = gr.Examples(
                examples=VIDEO_EDIT_EXAMPLES,
                inputs=generation_example_inputs,
                label="",
                examples_per_page=3,
                cache_examples=False,
                preprocess=False,
                postprocess=False,
            )
        with gr.Group(visible=False, elem_classes=["example-panel"]) as video_understanding_examples_group:
            gr.Markdown("### Video understanding recommended cases", elem_classes=["recommended-title"])
            video_understanding_examples = gr.Examples(
                examples=VIDEO_UNDERSTANDING_EXAMPLES,
                inputs=generation_example_inputs,
                label="",
                examples_per_page=4,
                cache_examples=False,
                preprocess=False,
                postprocess=False,
            )
        with gr.Group(visible=False, elem_classes=["prompt-examples", "example-panel"]) as image_generation_examples_group:
            gr.Markdown("### Image generation recommended cases", elem_classes=["recommended-title"])
            image_generation_examples = gr.Dataset(
                samples=IMAGE_GENERATION_EXAMPLES,
                components=[gr.Textbox(label="Prompt", visible=False)],
                headers=["Prompt"],
                show_label=False,
                type="values",
                layout="table",
                samples_per_page=len(IMAGE_GENERATION_EXAMPLES),
                elem_classes=["prompt-dataset"],
            )
        with gr.Group(visible=False, elem_classes=["example-panel"]) as image_edit_examples_group:
            gr.Markdown("### Image edit recommended cases", elem_classes=["recommended-title"])
            image_edit_examples = gr.Examples(
                examples=IMAGE_EDIT_EXAMPLES,
                inputs=generation_example_inputs,
                label="",
                examples_per_page=5,
                cache_examples=False,
                preprocess=False,
                postprocess=False,
            )
        with gr.Group(visible=False, elem_classes=["example-panel"]) as image_understanding_examples_group:
            gr.Markdown("### Image understanding recommended cases", elem_classes=["recommended-title"])
            image_understanding_examples = gr.Examples(
                examples=IMAGE_UNDERSTANDING_EXAMPLES,
                inputs=generation_example_inputs,
                label="",
                examples_per_page=4,
                cache_examples=False,
                preprocess=False,
                postprocess=False,
            )
        # NOTE(review): the helper reads `.dataset.component_props`; the two
        # gr.Dataset objects are passed directly, so it presumably skips them
        # unless gr.Dataset exposes a `dataset` attribute - confirm.
        keep_example_clicks_from_changing_visibility(
            video_generation_examples,
            video_edit_examples,
            video_understanding_examples,
            image_generation_examples,
            image_edit_examples,
            image_understanding_examples,
        )
        # The outputs order must match the tuple returned by update_task_ui.
        task.change(
            fn=update_task_ui,
            inputs=[task],
            outputs=[
                prompt,
                system_prompt,
                input_video,
                input_image,
                aspect_ratio,
                height,
                width,
                real_size,
                num_frames,
                enable_frame_interpolation,
                resolution,
                output_video,
                output_image,
                output_text,
                video_generation_examples_group,
                video_edit_examples_group,
                video_understanding_examples_group,
                image_generation_examples_group,
                image_edit_examples_group,
                image_understanding_examples_group,
            ],
        )
        aspect_ratio.change(
            fn=update_size_from_aspect_ratio,
            inputs=[task, aspect_ratio],
            outputs=[height, width, real_size],
            queue=False,
            show_api=False,
        )
        # After a gr.Examples row has loaded its inputs, run
        # reset_generation_defaults_for_task for the current task.
        for examples_component in (video_edit_examples, video_understanding_examples, image_edit_examples, image_understanding_examples):
            examples_component.load_input_event.then(
                fn=reset_generation_defaults_for_task,
                inputs=[task],
                outputs=[aspect_ratio, height, width, num_frames, resolution, real_size],
                queue=False,
                show_api=False,
            )
        # The prompt-only gr.Dataset tables are handled through their select event.
        video_generation_examples.select(
            fn=apply_prompt_example,
            inputs=[task],
            outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
            queue=False,
            show_api=False,
        )
        image_generation_examples.select(
            fn=apply_prompt_example,
            inputs=[task],
            outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
            queue=False,
            show_api=False,
        )
        # The inputs order must match run_task's positional parameters.
        run_button.click(
            fn=run_task,
            inputs=[
                task,
                prompt,
                system_prompt,
                input_video,
                input_image,
                height,
                width,
                num_frames,
                seed,
                resolution,
                validation_num_timesteps,
                validation_timestep_shift,
                cfg_text_scale,
                enable_frame_interpolation,
            ],
            outputs=[output_video, output_image, output_text, status, logs],
        )
    return demo
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the Gradio app.

    Each default comes from an environment variable when it is set:
    ``GRADIO_SERVER_NAME``, ``GRADIO_SERVER_PORT``, ``GRADIO_SHARE``,
    ``LANCE_GPUS`` and ``LANCE_QUEUE_SIZE``.
    """
    cli = argparse.ArgumentParser(description="Lance multimodal Gradio")

    host_default = os.getenv("GRADIO_SERVER_NAME", "0.0.0.0")
    cli.add_argument("--server-name", default=host_default)

    port_default = int(os.getenv("GRADIO_SERVER_PORT", "7860"))
    cli.add_argument("--server-port", type=int, default=port_default)

    share_default = env_flag("GRADIO_SHARE", False)
    cli.add_argument("--share", action="store_true", default=share_default)

    gpus_default = os.getenv("LANCE_GPUS", DEFAULT_GPUS)
    cli.add_argument(
        "--gpus",
        default=gpus_default,
        help="Comma-separated GPU list, for example: 0,1,2,3,4,5,6",
    )

    queue_default = int(os.getenv("LANCE_QUEUE_SIZE", str(DEFAULT_QUEUE_SIZE)))
    cli.add_argument(
        "--queue-size",
        type=int,
        default=queue_default,
        help="Maximum number of queued Gradio requests.",
    )
    return cli.parse_args()
def parse_gpu_ids(gpu_string: str) -> list[int]:
    """Parse a comma-separated GPU list such as ``"0,1,2"`` into device ids.

    Whitespace around entries is ignored and empty entries (for example from
    a trailing comma) are skipped. Order and duplicates are preserved.

    Raises:
        ValueError: If an entry is not a non-negative integer, or if no entry
            remains after skipping the empty ones.
    """
    gpu_ids: list[int] = []
    for raw_item in gpu_string.split(","):
        item = raw_item.strip()
        if not item:
            continue
        try:
            gpu_id = int(item)
        except ValueError:
            # Name the offending entry instead of surfacing int()'s generic
            # "invalid literal" message.
            raise ValueError(
                f"Invalid GPU ID {item!r} in GPU list {gpu_string!r}: expected an integer."
            ) from None
        if gpu_id < 0:
            # Reject negative ids here rather than letting them reach pipeline
            # construction.
            raise ValueError(
                f"Invalid GPU ID {item!r} in GPU list {gpu_string!r}: must be non-negative."
            )
        gpu_ids.append(gpu_id)
    if not gpu_ids:
        raise ValueError("No valid GPU IDs were parsed.")
    return gpu_ids
if __name__ == "__main__":
    # Script entry point: read options, preload the model pools, then serve.
    args = parse_args()
    # Export the GPU choice so get_pipeline_pool() reads the same list later.
    os.environ["LANCE_GPUS"] = args.gpus
    QUEUE_MAX_SIZE = args.queue_size
    gpu_ids = parse_gpu_ids(args.gpus)
    preload_pipeline_pools(gpu_ids, PRELOAD_MODEL_VARIANTS)
    # Allow one concurrent request per configured GPU, and never fewer than one.
    default_concurrency_limit = max(1, len(gpu_ids))
    demo = build_demo()
    queue_options = {
        "max_size": args.queue_size,
        "default_concurrency_limit": default_concurrency_limit,
    }
    launch_options = {
        "server_name": args.server_name,
        "server_port": args.server_port,
        "share": args.share,
    }
    demo.queue(**queue_options).launch(**launch_options)

assets/video-understanding/videos/video-understanding-caption-long-01_h264.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7387de84940c96d7ed5e50cd0ee78de3e1b5062903466cb0861f497bd95efc52
+size 679220

assets/video-understanding/videos/video-understanding-caption-short-01_h264.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:62a8ca1c0f50dc0ba08ed33814031494b7f6eac9fc889f241b1a52789cff8eed
+size 381609

assets/video-understanding/videos/video-understanding-vqa-01_h264.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b7185e2b75fa656f45b439a01064ee0ac411057449079da4d36fd08306f2dad
+size 284350

config/examples/video_edit_examples/edit_source_car_h264.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e436c2954f3a19be39248ba48c7b98edffbe0f3b7eeaeb3c44d8168e722d433d
+size 220126

config/examples/video_edit_examples/edit_source_woman_h264.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e665d11d1b6b0a45aa44cb930fdc4ea125f67ea692b0882e5fa3e9b282b1b4ba
+size 56974