Spaces:

bytedance-research
/

Lance

Running on Zero

App Files Files Community

ffy2000 committed 5 days ago

Commit

90ce156

0 Parent(s):

Upload Space with Xet-managed assets

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +22 -0
.gitattributes +6 -0
.gitignore +34 -0
Dockerfile +36 -0
LICENSE +201 -0
README.md +670 -0
README_zh.md +660 -0
RIFE +1 -0
SECURITY.md +11 -0
SPACE_DEPLOYMENT.md +76 -0
app.py +0 -0
app_save.py +2064 -0
app_wrong.py +2247 -0
assets/image-understanding/cases/image-understanding-case-02.png +3 -0
assets/image-understanding/cases/image-understanding-case-05.png +3 -0
assets/image-understanding/cases/image-understanding-case-06.png +3 -0
assets/logo/lance-logo.webp +3 -0
assets/video-understanding/videos/video-understanding-caption-long-01.mp4 +3 -0
assets/video-understanding/videos/video-understanding-caption-short-01.mp4 +3 -0
assets/video-understanding/videos/video-understanding-vqa-01.mp4 +3 -0
benchmarks/image_gen/DPG/DPG.jsonl +0 -0
benchmarks/image_gen/DPG/README.md +57 -0
benchmarks/image_gen/DPG/README_zh.md +57 -0
benchmarks/image_gen/DPG/sample_DPG.py +509 -0
benchmarks/image_gen/DPG/sample_DPG.sh +113 -0
benchmarks/image_gen/GEdit/GEdit_en.json +0 -0
benchmarks/image_gen/GEdit/README.md +68 -0
benchmarks/image_gen/GEdit/README_zh.md +67 -0
benchmarks/image_gen/GEdit/sample_GEdit.py +425 -0
benchmarks/image_gen/GEdit/sample_GEdit.sh +106 -0
benchmarks/image_gen/GenEVAL/GenEVAL.jsonl +0 -0
benchmarks/image_gen/GenEVAL/README.md +73 -0
benchmarks/image_gen/GenEVAL/README_zh.md +73 -0
benchmarks/image_gen/GenEVAL/sample_GenEVAL.py +463 -0
benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh +110 -0
benchmarks/sample_env.sh +107 -0
benchmarks/video_gen/Vbench/README.md +72 -0
benchmarks/video_gen/Vbench/README_zh.md +72 -0
benchmarks/video_gen/Vbench/Vbench_recaption.jsonl +0 -0
benchmarks/video_gen/Vbench/sample_vbench.py +559 -0
benchmarks/video_gen/Vbench/sample_vbench.sh +127 -0
benchmarks/video_gen/Vbench/temporal_flickering_prompts.json +77 -0
common/__init__.py +16 -0
common/model/__init__.py +20 -0
common/model/checks.py +14 -0
common/model/hacks.py +54 -0
common/utils/__init__.py +55 -0
common/utils/distributed.py +62 -0
common/utils/logging.py +44 -0
common/utils/misc.py +40 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,22 @@

+.git
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.so
+.venv/
+venv/
+env/
+ENV/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+downloads/
+results/
+tmps/
+*.log
+.DS_Store

.gitattributes ADDED Viewed

	@@ -0,0 +1,6 @@

+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,34 @@

+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.so
+.Python
+.python-version
+.venv/
+venv/
+env/
+ENV/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.coverage
+.coverage.*
+htmlcov/
+build/
+dist/
+*.egg-info/
+.eggs/
+.ipynb_checkpoints/
+.DS_Store
+# custom ignore
+results/
+downloads/
+tmps/
+*.log

Dockerfile ADDED Viewed

	@@ -0,0 +1,36 @@

+FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860 \
+    LANCE_AUTO_DOWNLOAD=1 \
+    LANCE_MODEL_BASE_DIR=/data/lance_models \
+    LANCE_GRADIO_TMP_ROOT=/tmp/lance_gradio
+WORKDIR /app
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    git \
+    libgl1 \
+    libglib2.0-0 \
+    libsndfile1 \
+    libsm6 \
+    libxext6 \
+    ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+COPY requirements.txt /app/requirements.txt
+RUN python -m pip install --upgrade pip setuptools wheel \
+    && grep -v '^flash-attn==' requirements.txt > /tmp/requirements-no-flash-attn.txt \
+    && python -m pip install -r /tmp/requirements-no-flash-attn.txt \
+    && python -m pip install flash-attn==2.6.3 --no-build-isolation
+COPY . /app
+EXPOSE 7860
+CMD ["python", "app.py"]

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,670 @@

+---
+title: Lance
+emoji: 🎬
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+app_port: 7860
+suggested_hardware: l40s
+---
+<div align="center">
+  <img src="assets/logo/lance-logo.webp" alt="Lance logo" width="300">
+  <h1 align="center"><sup>Lance: Unified Multimodal Modeling by Multi-Task Synergy</sup></h1>
+  <p>
+    <strong>
+    <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Fengyi Fu</a><sup>*</sup>,
+    <a href="https://corleone-huang.github.io/" style="text-decoration: none; color: inherit;">Mengqi Huang</a><sup>*,✉</sup>,
+    <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Shaojin Wu</a><sup>*</sup>,
+    Yunsheng Jiang<sup>*</sup>,
+    Yufei Huo,
+    <a href="https://guojianzhu.com/" style="text-decoration: none; color: inherit;">Jianzhu Guo</a><sup>✉,§</sup>
+    </strong><br>
+    Hao Li,
+    Yinghang Song,
+    Fei Ding,
+    Qian He,
+    Zheren Fu,
+    Zhendong Mao,
+    Yongdong Zhang
+    <br>
+    <em>ByteDance</em>
+    <br>
+    <sup>*</sup> Equal contribution &nbsp;&nbsp; <sup>✉</sup> Corresponding authors &nbsp;&nbsp; <sup>§</sup> Project lead
+  </p>
+  <p>
+    <a href="https://lance-project.github.io/" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat" alt="Homepage"></a>
+    <a href="http://arxiv.org/abs/2605.18678" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv" alt="arXiv"></a>
+    <a href="https://huggingface.co/bytedance-research/Lance" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface" alt="Model"></a>
+    <br>
+    English | <a href="./README_zh.md"><ins>简体中文</ins></a>
+  </p>
+</div>
+## 🌟 Highlights
+**Lance** is a 3B native unified multimodal model that supports **image and video understanding, generation, and editing** within a single framework.
+- **Efficient at 3B scale.** With only **3B active parameters**, Lance delivers strong performance across image generation, image editing, and video generation benchmarks.
+- **Trained from scratch.** Lance is built with a staged multi-task recipe, and its transformer backbone is trained entirely from scratch (the ViT and VAE encoders are the only exceptions) within a **128-A100-GPU** budget.
+<div align="center">
+  <img src="assets/benchmarks/benchmark-overview.png" alt="Lance benchmark overview across image generation, image editing, video generation, and video understanding" width="980">
+</div>
+## 🎨 Demo
+### Text-to-Video
+<table align="center">
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-01.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-02.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-03.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-04.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-05.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-06.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-07.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-08.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+### Video Editing
+<table align="center">
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-01.mp4"><img src="assets/video-editing/previews/video-editing-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-02.mp4"><img src="assets/video-editing/previews/video-editing-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-03.mp4"><img src="assets/video-editing/previews/video-editing-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-04.mp4"><img src="assets/video-editing/previews/video-editing-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-05.mp4"><img src="assets/video-editing/previews/video-editing-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-06.mp4"><img src="assets/video-editing/previews/video-editing-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-07.mp4"><img src="assets/video-editing/previews/video-editing-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-08.mp4"><img src="assets/video-editing/previews/video-editing-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+### Multi-turn Consistency Editing
+<div align="center">
+  <a href="assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4">
+    <img src="assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif" width="100%">
+  </a>
+</div>
+### Intelligent Video Generation
+<table align="center">
+  <tr>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-01.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-02.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-03.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-04.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+</table>
+### Video Understanding
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-01.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> How many times did the person launch objects on the table? Options: (A) 3 (B) 2 (C) 4</p>
+        <p><strong>Response:</strong> (A) 3</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-02.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-02.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> The person makes sets of repeated actions. How many distinct repeated actions did the person do? Options: (A) 2 (B) 3 (C) 4</p>
+        <p><strong>Response:</strong> (A) 2</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-03.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-03.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> In which direction does the purple sphere move in the video? Options: (A) Down and to the right. (B) Up and to the left. (C) Up and to the right. (D) The object is stationary.</p>
+        <p><strong>Response:</strong> (A) Down and to the right.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-04.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-04.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> What is the unrealistic phenomenon displayed in the video? Options: (A) The man can manipulate time via phone. (B) Man grabs an object through a phone screen. (C) Chocolate transforms into different objects. (D) Visible means of propulsion enables flight.</p>
+        <p><strong>Response:</strong> (B) Man grabs an object through a phone screen.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-short-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-short-01.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> Offer a succinct account of the culinary process shown in this video.</p>
+        <p><strong>Response:</strong> Add tomato puree and mix it well with chicken pieces.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-long-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-long-01.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> Provide a detailed description of the given video, capturing its key moments.</p>
+        <p><strong>Response:</strong> In a sunlit meadow, a small tortoiseshell butterfly rests on a purple flower. A bee, with black and yellow stripes, lands on the same flower. The butterfly flaps its wings gently, while the bee busies itself, collecting nectar. The flower sways slightly in the breeze. The butterfly then takes off, and the bee follows, both heading to the next flower. The scene is a vivid display of insect interaction in a natural setting, with the colors of the butterfly and the bee contrasting against the green background of the meadow. The video captures this peaceful moment in a short 6-second duration.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+### Text-to-Image Generation
+<div align="center">
+  <img src="assets/text-to-image/text-to-image-overview.webp" alt="Lance text-to-image generation examples" width="100%">
+</div>
+### Image Editing
+<div align="center">
+  <img src="assets/image-editing/image-editing-overview.webp" alt="Lance image editing examples" width="100%">
+</div>
+### Image Understanding
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-01.png" alt="Pie chart reasoning case" width="100%">
+        <p><strong>Question:</strong> Is the largest segment greater than sum of all the other segments?</p>
+        <p><strong>Response:</strong> Yes, the largest segment in the pie chart is the blue one, which is the most populous segment.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-02.png" alt="Survey chart about border security priorities" width="100%">
+        <p><strong>Question:</strong> What percentage of respondents want better border security?</p>
+        <p><strong>Response:</strong> 29%</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-03.png" alt="License plate recognition case" width="100%">
+        <p><strong>Question:</strong> What is the license plate number of the car?</p>
+        <p><strong>Response:</strong> The license plate number of the car is BX62 BFY.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-04.png" alt="Market research chart reasoning case" width="100%">
+        <p><strong>Question:</strong> According to the data from the proprietary market research, how much amount was spent on the promotional meetings and events during 1998?</p>
+        <p><strong>Response:</strong> According to the data from the proprietary market research, the total amount spent on the promotional meetings and events during 1998 was approximately $1.3 billion.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-05.png" alt="Colosseum description case" width="100%">
+        <p><strong>Question:</strong> What is the appearance of the Colosseum in Rome, Italy?</p>
+        <p><strong>Response:</strong> The Colosseum in Rome, Italy is a magnificent ancient amphitheater made of stone and concrete. It is characterized by its large, arched openings and its semi-circular shape. The Colosseum has been partially restored, and its original color has faded, but the architectural details, like the columns and the semi-circular arches, are still visible. The lighting in the image highlights the grandeur and historical significance of the Colosseum, making it stand out against the backdrop of the evening sky.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-06.png" alt="Total solar eclipse case" width="100%">
+        <p><strong>Question:</strong> How does a total solar eclipse look like from Earth?</p>
+        <p><strong>Response:</strong> A total solar eclipse looks like a dark circle with a bright, white outer edge. This is a result of the Earth's rotation and the Earth's shadow, which partially obscures the sun's light. The bright white edge of the eclipse is caused by the sun's high energy and its interaction with the Earth's atmosphere, while the dark part of the eclipse is due to the Earth's shadow and the surrounding air currents. The solar eclipse's shape, with its bright white edge and dark center, is similar to the shape of a full moon or a dark disk. It is a natural phenomenon that occurs in the atmosphere of the Earth and is an important part of the solar system.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+## 🚀 Installation
+### Recommended Environment
+- **Software:** Python 3.10+, CUDA 12.4+ (required)
+- **Hardware:** A GPU with at least 40GB VRAM is required for inference
+### Installation Steps
+```bash
+bash ./setup_env.sh
+```
+### Download Model Weights
+Please download all necessary model checkpoints from [Lance-3B on Hugging Face](https://huggingface.co/bytedance-research/Lance) and place them in the `downloads/` directory.
+## 📚 Usage
+### Inference
+We provide a unified command-line interface for all generation / editing / understanding tasks:
+#### Option 1: Configure and Run the Unified Script
+```bash
+bash inference_lance.sh
+```
+- Before running, please configure the inference parameters at the top of `inference_lance.sh`.
+- **Supported tasks:** `t2i`, `t2v`, `image_edit`, `video_edit`, `x2t_image`, and `x2t_video`. You can modify `TASK_DEFAULT_CONFIGS` in `inference_lance.py` to customize the default data samples for each task.
+- **Note:** For all tasks, we recommend following the `prompt` format used in the provided examples when writing input prompts, as this typically leads to better generation quality.
+#### Option 2: Run Task-Specific Commands
+We provide task-specific one-click commands for different generation, editing, and understanding tasks.
+##### Text-to-Video Generation
+```bash
+bash inference_lance.sh \
+  --TASK_NAME t2v \
+  --MODEL_PATH downloads/Lance_3B_Video \
+  --RESOLUTION video_480p \
+  --NUM_FRAMES 121 \
+  --VIDEO_HEIGHT 480 \
+  --VIDEO_WIDTH 848 \
+  --SAVE_PATH_GEN results/t2v
+```
+##### Text-to-Image Generation
+```bash
+bash inference_lance.sh \
+  --TASK_NAME t2i \
+  --MODEL_PATH downloads/Lance_3B \
+  --RESOLUTION image_768res \
+  --VIDEO_HEIGHT 768 \
+  --VIDEO_WIDTH 768 \
+  --SAVE_PATH_GEN results/t2i
+```
+##### Video Editing
+```bash
+bash inference_lance.sh \
+  --TASK_NAME video_edit \
+  --MODEL_PATH downloads/Lance_3B_Video \
+  --RESOLUTION video_480p \
+  --SAVE_PATH_GEN results/video_edit
+```
+##### Image Editing
+```bash
+bash inference_lance.sh \
+  --TASK_NAME image_edit \
+  --MODEL_PATH downloads/Lance_3B \
+  --RESOLUTION image_768res \
+  --SAVE_PATH_GEN results/image_edit
+```
+##### Video Understanding
+```bash
+bash inference_lance.sh \
+  --TASK_NAME x2t_video \
+  --MODEL_PATH downloads/Lance_3B_Video \
+  --RESOLUTION video_480p \
+  --NUM_FRAMES 50 \
+  --SAVE_PATH_GEN results/x2t_video
+```
+##### Image Understanding
+```bash
+bash inference_lance.sh \
+  --TASK_NAME x2t_image \
+  --MODEL_PATH downloads/Lance_3B \
+  --RESOLUTION image_768res \
+  --SAVE_PATH_GEN results/x2t_image
+```
+#### Available Tasks
+| Task Name              | Description                                      | Example JSON                                 |
+|------------------------|--------------------------------------------------|----------------------------------------------|
+| `t2v`                  | Text-to-Video generation                         | `config/examples/t2v_example.json`           |
+| `t2i`                  | Text-to-Image generation                         | `config/examples/t2i_example.json`           |
+| `image_edit`           | Image editing                                    | `config/examples/image_edit_example.json`    |
+| `video_edit`           | Video editing                                    | `config/examples/video_edit_example.json`    |
+| `x2t_image`            | Image understanding            | `config/examples/x2t_image_example.json`    |
+| `x2t_video`            | Video understanding            | `config/examples/x2t_video_example.json`    |
+For understanding examples:
+- `config/examples/x2t_image_example.json`: image understanding examples for visual question answering and image-based reasoning.
+- `config/examples/x2t_video_example.json`: video understanding examples for video question answering and video captioning.
+#### Parameters
+You can configure the following hyperparameters at the top of the `inference_lance.sh` script:
+| Parameter | Default Value | Description |
+| --- | --- | --- |
+| `MODEL_PATH` | `"downloads/Lance_3B"` | Path to the downloaded Lance model weights (`Lance_3B` or `Lance_3B_Video`). |
+| `NUM_GPUS` | `1` | Number of GPUs to use for inference. |
+| `VALIDATION_NUM_TIMESTEPS` | `30` | Number of denoising steps (e.g., 30 or 50). |
+| `VALIDATION_TIMESTEP_SHIFT` | `3.5` | Timestep shift parameter for flow matching scheduling. |
+| `CFG_TEXT_SCALE` | `4.0` | Classifier-Free Guidance (CFG) scale for text conditioning. |
+| `VALIDATION_DATA_SEED` | `42` | Random seed for generation reproducibility. |
+| `NUM_FRAMES` | `50` | Number of frames for video generation (Max: 121). *Unused for image tasks.* |
+| `VIDEO_HEIGHT` / `VIDEO_WIDTH`| `768` | Spatial resolution. *Unused for editing tasks (determined by input image/video).* |
+| `RESOLUTION` | `"video_480p"` | Base resolution preset (`image_768res` or `video_480p`). |
+### Gradio
+```bash
+python lance_gradio_t2v_v2t.py --gpus 0 --server-port 7860
+```
+### Benchmarks
+#### DPG-Bench Evaluation
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Models</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Global</th>
+      <th align="center">Entity</th>
+      <th align="center">Attribute</th>
+      <th align="center">Relation</th>
+      <th align="center">Other</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="8"><i>Generation-only Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">83.27</td><td align="center">82.43</td><td align="center">80.91</td><td align="center">86.76</td><td align="center">80.41</td><td align="center">74.65</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">90.97</td><td align="center">89.61</td><td align="center">88.39</td><td align="center">90.58</td><td align="center">89.83</td><td align="center">83.50</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">87.90</td><td align="center">91.01</td><td align="center">88.83</td><td align="center">80.70</td><td align="center">88.68</td><td align="center">84.08</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">74.35</td><td align="center">90.00</td><td align="center">88.96</td><td align="center">90.87</td><td align="center">88.33</td><td align="center">83.84</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">91.32</td><td align="center">91.56</td><td align="center">92.02</td><td align="center">94.31</td><td align="center">92.73</td><td align="center">88.32</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="8"><i>Unified Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center">86.90</td><td align="center">88.90</td><td align="center">89.40</td><td align="center">89.32</td><td align="center">89.48</td><td align="center">84.19</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center">88.81</td><td align="center">88.83</td><td align="center">90.18</td><td align="center">89.37</td><td align="center">90.27</td><td align="center">83.57</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center">89.00</td><td align="center"><b>91.78</b></td><td align="center">89.96</td><td align="center">91.81</td><td align="center"><b>91.64</b></td><td align="center">86.14</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">88.94</td><td align="center">90.37</td><td align="center"><u>91.29</u></td><td align="center">90.82</td><td align="center">88.67</td><td align="center">85.07</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>90.39</u></td><td align="center">90.78</td><td align="center">90.68</td><td align="center">90.29</td><td align="center">88.77</td><td align="center">85.18</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>90.42</b></td><td align="center"><u>91.68</u></td><td align="center">90.94</td><td align="center"><u>91.87</u></td><td align="center"><u>90.73</u></td><td align="center"><b>86.76</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center">89.50</td><td align="center">91.40</td><td align="center"><b>92.07</b></td><td align="center">91.91</td><td align="center">88.81</td><td align="center"><u>86.54</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>83.89</b></td><td align="center"><b>91.07</b></td><td align="center"><b>89.36</b></td><td align="center"><b>93.38</b></td><td align="center"><b>80.80</b></td><td align="center"><b>84.67</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+<p align="center"><em><sup>†</sup> indicates methods that use LLM rewriters for prompt rewriting before generation.</em></p>
+#### GenEval Evaluation
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Models</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">1-Obj.</th>
+      <th align="center">2-Obj.</th>
+      <th align="center">Count</th>
+      <th align="center">Colors</th>
+      <th align="center">Position</th>
+      <th align="center">Attr.</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="9"><i>Generation-only Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">0.98</td><td align="center">0.74</td><td align="center">0.39</td><td align="center">0.85</td><td align="center">0.15</td><td align="center">0.23</td><td align="center">0.55</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">0.96</td><td align="center">0.87</td><td align="center">0.47</td><td align="center">0.83</td><td align="center">0.43</td><td align="center">0.45</td><td align="center">0.67</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">0.99</td><td align="center">0.94</td><td align="center">0.72</td><td align="center">0.89</td><td align="center">0.33</td><td align="center">0.60</td><td align="center">0.74</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">0.98</td><td align="center">0.93</td><td align="center">0.75</td><td align="center">0.93</td><td align="center">0.68</td><td align="center">0.65</td><td align="center">0.82</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">0.99</td><td align="center">0.92</td><td align="center">0.89</td><td align="center">0.88</td><td align="center">0.76</td><td align="center">0.77</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="9"><i>Unified Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center">0.89</td><td align="center">0.59</td><td align="center">0.90</td><td align="center">0.79</td><td align="center">0.66</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center"><b>1.00</b></td><td align="center">0.95</td><td align="center">0.64</td><td align="center">0.88</td><td align="center">0.55</td><td align="center">0.76</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center">0.87</td><td align="center">0.58</td><td align="center">0.92</td><td align="center">0.52</td><td align="center">0.62</td><td align="center">0.76</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">0.98</td><td align="center">0.95</td><td align="center"><b>0.84</b></td><td align="center"><u>0.95</u></td><td align="center">0.78</td><td align="center">0.77</td><td align="center">0.88</td>
+    </tr>
+    <tr>
+      <td align="left">Mogao</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center"><u>0.83</u></td><td align="center">0.93</td><td align="center">0.84</td><td align="center">0.80</td><td align="center"><u>0.89</u></td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>0.99</u></td><td align="center">0.94</td><td align="center">0.74</td><td align="center">0.91</td><td align="center">0.77</td><td align="center">0.74</td><td align="center">0.85</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center">0.81</td><td align="center">0.91</td><td align="center"><b>0.88</b></td><td align="center"><b>0.83</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center"><u>0.96</u></td><td align="center">0.80</td><td align="center">0.91</td><td align="center">0.84</td><td align="center">0.76</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>1.00</b></td><td align="center"><b>0.94</b></td><td align="center"><b>0.84</b></td><td align="center"><b>0.97</b></td><td align="center"><b>0.87</b></td><td align="center"><b>0.81</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+<p align="center"><em><sup>†</sup> indicates methods that use LLM rewriters for prompt rewriting before generation.</em></p>
+#### GEdit-Bench Evaluation
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Models</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">BC</th>
+      <th align="center">CA</th>
+      <th align="center">MM</th>
+      <th align="center">MC</th>
+      <th align="center">PB</th>
+      <th align="center">ST</th>
+      <th align="center">SA</th>
+      <th align="center">SR</th>
+      <th align="center">SRp</th>
+      <th align="center">TM</th>
+      <th align="center">TT</th>
+      <th align="center">Avg/G_O</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="14"><i>Generation-only Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Gemini 2.0</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">6.32</td>
+    </tr>
+    <tr>
+      <td align="left">GPT Image 1</td><td align="center">-</td><td align="center">6.96</td><td align="center">6.85</td><td align="center">7.10</td><td align="center">5.41</td><td align="center">6.74</td><td align="center">7.44</td><td align="center">7.51</td><td align="center">8.73</td><td align="center">8.55</td><td align="center">8.45</td><td align="center">8.69</td><td align="center">7.49</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image-Edit</td><td align="center">20B</td><td align="center">8.23</td><td align="center">8.30</td><td align="center">7.33</td><td align="center">8.05</td><td align="center">7.49</td><td align="center">6.74</td><td align="center">8.57</td><td align="center">8.09</td><td align="center">8.29</td><td align="center">8.48</td><td align="center">8.50</td><td align="center">8.01</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="14"><i>Unified Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Lumina-DiMOO</td><td align="center">8B</td><td align="center">3.43</td><td align="center">4.27</td><td align="center">3.08</td><td align="center">2.77</td><td align="center">4.74</td><td align="center">5.19</td><td align="center">4.44</td><td align="center">3.80</td><td align="center">4.38</td><td align="center">2.68</td><td align="center">4.20</td><td align="center">3.91</td>
+    </tr>
+    <tr>
+      <td align="left">Ovis-U1</td><td align="center">1.2B</td><td align="center"><u>7.49</u></td><td align="center">6.88</td><td align="center">6.21</td><td align="center">4.79</td><td align="center">5.98</td><td align="center"><u>6.46</u></td><td align="center">7.49</td><td align="center"><u>7.25</u></td><td align="center"><u>7.27</u></td><td align="center">4.48</td><td align="center">6.31</td><td align="center">6.42</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL</td><td align="center">7B</td><td align="center">7.32</td><td align="center">6.91</td><td align="center">6.38</td><td align="center">4.75</td><td align="center">4.57</td><td align="center">6.15</td><td align="center"><b>7.90</b></td><td align="center">7.16</td><td align="center">7.02</td><td align="center"><u>7.32</u></td><td align="center">6.22</td><td align="center">6.52</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center">7.08</td><td align="center">7.05</td><td align="center">6.38</td><td align="center"><u>7.02</u></td><td align="center"><u>6.03</u></td><td align="center">6.27</td><td align="center">7.13</td><td align="center">6.55</td><td align="center">6.33</td><td align="center">6.59</td><td align="center"><u>6.85</u></td><td align="center">6.66</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U (w/ CoT)</td><td align="center">1.7B</td><td align="center">7.05</td><td align="center"><b>7.87</b></td><td align="center"><u>6.50</u></td><td align="center">6.99</td><td align="center">5.77</td><td align="center">6.10</td><td align="center">7.33</td><td align="center">7.16</td><td align="center">7.12</td><td align="center"><b>7.36</b></td><td align="center">6.46</td><td align="center"><u>6.88</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>7.73</b></td><td align="center"><u>7.74</u></td><td align="center"><b>7.28</b></td><td align="center"><b>7.83</b></td><td align="center"><b>7.50</b></td><td align="center"><b>7.03</b></td><td align="center"><u>7.64</u></td><td align="center"><b>7.85</b></td><td align="center"><b>7.71</b></td><td align="center">4.46</td><td align="center"><b>7.57</b></td><td align="center"><b>7.30</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+#### VBench Evaluation (Video Generation)
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Type</th>
+      <th align="left">Model</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Total Score ↑</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" rowspan="12"><i>Gen. Only</i></td>
+      <td align="left">ModelScope</td><td align="center">1.7B</td><td align="center">75.75</td>
+    </tr>
+    <tr>
+      <td align="left">LaVie</td><td align="center">3B</td><td align="center">77.08</td>
+    </tr>
+    <tr>
+      <td align="left">Show-1</td><td align="center">6B</td><td align="center">78.93</td>
+    </tr>
+    <tr>
+      <td align="left">AnimateDiff-V2</td><td align="center">-</td><td align="center">80.27</td>
+    </tr>
+    <tr>
+      <td align="left">VideoCrafter-2.0</td><td align="center">-</td><td align="center">80.44</td>
+    </tr>
+    <tr>
+      <td align="left">CogVideoX</td><td align="center">5B</td><td align="center">81.61</td>
+    </tr>
+    <tr>
+      <td align="left">Kling</td><td align="center">-</td><td align="center">81.85</td>
+    </tr>
+    <tr>
+      <td align="left">Open-Sora-2.0</td><td align="center">-</td><td align="center">81.71</td>
+    </tr>
+    <tr>
+      <td align="left">Gen-3</td><td align="center">-</td><td align="center">82.32</td>
+    </tr>
+    <tr>
+      <td align="left">Step-Video-T2V</td><td align="center">30B</td><td align="center">81.83</td>
+    </tr>
+    <tr>
+      <td align="left">Hunyuan Video</td><td align="center">-</td><td align="center">83.43</td>
+    </tr>
+    <tr>
+      <td align="left">Wan2.1-T2V</td><td align="center">14B</td><td align="center">83.69</td>
+    </tr>
+    <tr>
+      <td align="center" rowspan="6"><i>Unified</i></td>
+      <td align="left">HaploOmni</td><td align="center">7B</td><td align="center">78.10</td>
+    </tr>
+    <tr>
+      <td align="left">Emu3</td><td align="center">8B</td><td align="center">80.96</td>
+    </tr>
+    <tr>
+      <td align="left">VILA-U</td><td align="center">7B</td><td align="center">74.01</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">2B</td><td align="center">81.34</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">1.5B</td><td align="center"><u>84.06</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>85.11</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+#### Running Benchmarks
+Ready-to-run benchmark scripts are provided under `benchmarks/`:
+| Benchmark              | Modality | Script                                                        |
+|------------------------|----------|---------------------------------------------------------------|
+| GenEVAL (image gen)    | Image    | `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`              |
+| DPG (image gen)        | Image    | `benchmarks/image_gen/DPG/sample_DPG.sh`                      |
+| GEdit (image edit)     | Image    | `benchmarks/image_gen/GEdit/sample_GEdit.sh`                  |
+| VBench (video gen)     | Video    | `benchmarks/video_gen/Vbench/sample_vbench.sh`                |
+## 📄 License
+Copyright 2025 Bytedance Ltd. and/or its affiliates.
+## 🙏 Acknowledgements
+We would like to thank the contributors of [BAGEL](https://github.com/ByteDance-Seed/bagel), [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), and [Wan2.2](https://github.com/Wan-Video/Wan2.2) for their open research and contributions.
+## 💖 Citation
+If you find **Lance** useful for your project or research, please consider giving this repo a 🌟 and citing our work using the following BibTeX:
+```bibtex
+@misc{fu2026lanceunifiedmultimodalmodeling,
+      title         = {Lance: Unified Multimodal Modeling by Multi-Task Synergy},
+      author        = {Fengyi Fu and Mengqi Huang and Shaojin Wu and Yunsheng Jiang and Yufei Huo and Hao Li and Yinghang Song and Fei Ding and Jianzhu Guo and Qian He and Zheren Fu and Zhendong Mao and Yongdong Zhang},
+      year          = {2026},
+      eprint        = {2605.18678},
+      archivePrefix = {arXiv},
+      primaryClass  = {cs.CV},
+      url           = {https://arxiv.org/abs/2605.18678},
+}
+```
+## 📞 Contact
+For questions, issues, or collaborations, please contact [Mengqi Huang](https://corleone-huang.github.io/) and [Jianzhu Guo](https://guojianzhu.com/).

README_zh.md ADDED Viewed

	@@ -0,0 +1,660 @@

+<div align="center">
+  <img src="assets/logo/lance-logo.webp" alt="Lance logo" width="300">
+  <h1 align="center"><sup>Lance: Unified Multimodal Modeling by Multi-Task Synergy</sup></h1>
+  <p>
+    <strong>
+    <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Fengyi Fu</a><sup>*</sup>,
+    <a href="https://corleone-huang.github.io/" style="text-decoration: none; color: inherit;">Mengqi Huang</a><sup>*,✉</sup>,
+    <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Shaojin Wu</a><sup>*</sup>,
+    Yunsheng Jiang<sup>*</sup>,
+    Yufei Huo,
+    <a href="https://guojianzhu.com/" style="text-decoration: none; color: inherit;">Jianzhu Guo</a><sup>✉,§</sup>
+    </strong><br>
+    Hao Li,
+    Yinghang Song,
+    Fei Ding,
+    Qian He,
+    Zheren Fu,
+    Zhendong Mao,
+    Yongdong Zhang
+    <br>
+    <em>ByteDance</em>
+    <br>
+    <sup>*</sup> 共同一作 &nbsp;&nbsp; <sup>✉</sup> 通讯作者 &nbsp;&nbsp; <sup>§</sup> 项目负责人
+  </p>
+  <p>
+    <a href="https://lance-project.github.io/" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat" alt="Homepage"></a>
+    <a href="https://arxiv.org/abs/2605.18678" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv" alt="arXiv"></a>
+    <a href="https://huggingface.co/bytedance-research/Lance" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface" alt="Model"></a>
+    <br>
+    <a href="./README.md"><ins>English</ins></a> | 简体中文
+  </p>
+</div>
+## 🌟 亮点
+**Lance** 是一个3B参数、原生统一的多模态模型，在单一框架下同时支持 **图像与视频的理解、生成和编辑**。
+- **3B 规模高效强大。** 仅使用 **3B active parameters**，Lance 即可在图像生成、图像编辑和视频生成等基准上取得强劲表现。
+- **从零训练。** Lance 采用分阶段多任务训练配方，在 **128 张 A100 GPU** 的预算内从零完成训练。
+<div align="center">
+  <img src="assets/benchmarks/benchmark-overview.png" alt="Lance benchmark overview across image generation, image editing, video generation, and video understanding" width="980">
+</div>
+## 🎨 演示
+### 文生视频
+<table align="center">
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-01.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-02.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-03.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-04.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-05.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-06.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-07.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-08.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+### 视频编辑
+<table align="center">
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-01.mp4"><img src="assets/video-editing/previews/video-editing-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-02.mp4"><img src="assets/video-editing/previews/video-editing-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-03.mp4"><img src="assets/video-editing/previews/video-editing-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-04.mp4"><img src="assets/video-editing/previews/video-editing-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-05.mp4"><img src="assets/video-editing/previews/video-editing-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-06.mp4"><img src="assets/video-editing/previews/video-editing-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-07.mp4"><img src="assets/video-editing/previews/video-editing-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-08.mp4"><img src="assets/video-editing/previews/video-editing-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+### 多轮一致性编辑
+<div align="center">
+  <a href="assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4">
+    <img src="assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif" width="100%">
+  </a>
+</div>
+### 智能视频生成
+<table align="center">
+  <tr>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-01.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-02.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-03.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-04.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+</table>
+### 视频理解
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-01.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> How many times did the person launch objects on the table? Options: (A) 3 (B) 2 (C) 4</p>
+        <p><strong>Response:</strong> (A) 3</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-02.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-02.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> The person makes sets of repeated actions. How many distinct repeated actions did the person do? Options: (A) 2 (B) 3 (C) 4</p>
+        <p><strong>Response:</strong> (A) 2</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-03.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-03.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> In which direction does the purple sphere move in the video? Options: (A) Down and to the right. (B) Up and to the left. (C) Up and to the right. (D) The object is stationary.</p>
+        <p><strong>Response:</strong> (A) Down and to the right.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-04.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-04.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> What is the unrealistic phenomenon displayed in the video? Options: (A) The man can manipulate time via phone. (B) Man grabs an object through a phone screen. (C) Chocolate transforms into different objects. (D) Visible means of propulsion enables flight.</p>
+        <p><strong>Response:</strong> (B) Man grabs an object through a phone screen.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-short-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-short-01.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> Offer a succinct account of the culinary process shown in this video.</p>
+        <p><strong>Response:</strong> Add tomato puree and mix it well with chicken pieces.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-long-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-long-01.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> Provide a detailed description of the given video, capturing its key moments.</p>
+        <p><strong>Response:</strong> In a sunlit meadow, a small tortoiseshell butterfly rests on a purple flower. A bee, with black and yellow stripes, lands on the same flower. The butterfly flaps its wings gently, while the bee busies itself, collecting nectar. The flower sways slightly in the breeze. The butterfly then takes off, and the bee follows, both heading to the next flower. The scene is a vivid display of insect interaction in a natural setting, with the colors of the butterfly and the bee contrasting against the green background of the meadow. The video captures this peaceful moment in a short 6-second duration.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+### 文生图
+<div align="center">
+  <img src="assets/text-to-image/text-to-image-overview.webp" alt="Lance text-to-image generation examples" width="100%">
+</div>
+### 图像编辑
+<div align="center">
+  <img src="assets/image-editing/image-editing-overview.webp" alt="Lance image editing examples" width="100%">
+</div>
+### 图像理解
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-01.png" alt="Pie chart reasoning case" width="100%">
+        <p><strong>问题：</strong> Is the largest segment greater than sum of all the other segments?</p>
+        <p><strong>回答：</strong> Yes, the largest segment in the pie chart is the blue one, which is the most populous segment.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-02.png" alt="Survey chart about border security priorities" width="100%">
+        <p><strong>问题：</strong> What percentage of respondents want better border security?</p>
+        <p><strong>回答：</strong> 29%</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-03.png" alt="License plate recognition case" width="100%">
+        <p><strong>问题：</strong> What is the license plate number of the car?</p>
+        <p><strong>回答：</strong> The license plate number of the car is BX62 BFY.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-04.png" alt="Market research chart reasoning case" width="100%">
+        <p><strong>问题：</strong> According to the data from the proprietary market research, how much amount was spent on the promotional meetings and events during 1998?</p>
+        <p><strong>回答：</strong> According to the data from the proprietary market research, the total amount spent on the promotional meetings and events during 1998 was approximately $1.3 billion.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-05.png" alt="Colosseum description case" width="100%">
+        <p><strong>问题：</strong> What is the appearance of the Colosseum in Rome, Italy?</p>
+        <p><strong>回答：</strong> The Colosseum in Rome, Italy is a magnificent ancient amphitheater made of stone and concrete. It is characterized by its large, arched openings and its semi-circular shape. The Colosseum has been partially restored, and its original color has faded, but the architectural details, like the columns and the semi-circular arches, are still visible. The lighting in the image highlights the grandeur and historical significance of the Colosseum, making it stand out against the backdrop of the evening sky.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-06.png" alt="Total solar eclipse case" width="100%">
+        <p><strong>问题：</strong> How does a total solar eclipse look like from Earth?</p>
+        <p><strong>回答：</strong> A total solar eclipse looks like a dark circle with a bright, white outer edge. This is a result of the Earth's rotation and the Earth's shadow, which partially obscures the sun's light. The bright white edge of the eclipse is caused by the sun's high energy and its interaction with the Earth's atmosphere, while the dark part of the eclipse is due to the Earth's shadow and the surrounding air currents. The solar eclipse's shape, with its bright white edge and dark center, is similar to the shape of a full moon or a dark disk. It is a natural phenomenon that occurs in the atmosphere of the Earth and is an important part of the solar system.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+## 🚀 安装
+### 推荐环境
+- **软件环境：** Python 3.10+，CUDA 12.4+（必需）
+- **硬件环境：** 推理至少需要一张显存不低于 40GB 的 GPU
+### 安装步骤
+```bash
+bash ./setup_env.sh
+```
+### 下载模型权重
+请从 [Hugging Face 上的 Lance-3B](https://huggingface.co/bytedance-research/Lance) 下载所需的全部模型权重，并放置到 `downloads/` 目录下。
+## 📚 使用方法
+### 推理
+Lance 为生成、编辑和理解任务提供了统一的命令行入口：
+#### 方式一：配置并运行统一推理脚本
+```bash
+bash inference_lance.sh
+```
+- 运行前，请先在 `inference_lance.sh` 顶部配置推理参数。
+- **支持任务：** `t2i`、`t2v`、`image_edit`、`video_edit`、`x2t_image` 和 `x2t_video`。你也可以在 `inference_lance.py` 中修改 `TASK_DEFAULT_CONFIGS`，自定义每个任务默认使用的数据样例。
+- **注意：** 对于所有任务，建议在编写输入 prompt 时参考提供示例中的 `prompt` 格式，这通常有助于获得更好的生成效果。
+#### 方式二：运行任务专用一键脚本
+我们提供了面向不同生成、编辑和理解任务的一键启动命令，便于快速运行指定任务类型。
+##### 文本-视频生成
+```bash
+bash inference_lance.sh \
+  --TASK_NAME t2v \
+  --MODEL_PATH downloads/Lance_3B_Video \
+  --RESOLUTION video_480p \
+  --NUM_FRAMES 121 \
+  --VIDEO_HEIGHT 480 \
+  --VIDEO_WIDTH 848 \
+  --SAVE_PATH_GEN results/t2v
+```
+##### 文本-图像生成
+```bash
+bash inference_lance.sh \
+  --TASK_NAME t2i \
+  --MODEL_PATH downloads/Lance_3B \
+  --RESOLUTION image_768res \
+  --VIDEO_HEIGHT 768 \
+  --VIDEO_WIDTH 768 \
+  --SAVE_PATH_GEN results/t2i
+```
+##### 视频编辑
+```bash
+bash inference_lance.sh \
+  --TASK_NAME video_edit \
+  --MODEL_PATH downloads/Lance_3B_Video \
+  --RESOLUTION video_480p \
+  --SAVE_PATH_GEN results/video_edit
+```
+##### 图像编辑
+```bash
+bash inference_lance.sh \
+  --TASK_NAME image_edit \
+  --MODEL_PATH downloads/Lance_3B \
+  --RESOLUTION image_768res \
+  --SAVE_PATH_GEN results/image_edit
+```
+##### 视频理解
+```bash
+bash inference_lance.sh \
+  --TASK_NAME x2t_video \
+  --MODEL_PATH downloads/Lance_3B_Video \
+  --RESOLUTION video_480p \
+  --NUM_FRAMES 50 \
+  --SAVE_PATH_GEN results/x2t_video
+```
+##### 图像理解
+```bash
+bash inference_lance.sh \
+  --TASK_NAME x2t_image \
+  --MODEL_PATH downloads/Lance_3B \
+  --RESOLUTION image_768res \
+  --SAVE_PATH_GEN results/x2t_image
+```
+#### 可用任务
+| 任务名 | 说明 | 示例 JSON |
+|------------------------|--------------------------------------------------|----------------------------------------------|
+| `t2v` | 文生视频 | `config/examples/t2v_example.json` |
+| `t2i` | 文生图 | `config/examples/t2i_example.json` |
+| `image_edit` | 图像编辑 | `config/examples/image_edit_example.json` |
+| `video_edit` | 视频编辑 | `config/examples/video_edit_example.json` |
+| `x2t_image` | 图像理解 | `config/examples/x2t_image_example.json` |
+| `x2t_video` | 视频理解 | `config/examples/x2t_video_example.json` |
+关于理解任务的示例文件：
+- `config/examples/x2t_image_example.json`：用于图像理解示例，包括视觉问答和基于图像的推理。
+- `config/examples/x2t_video_example.json`：用于视频理解示例，包括视频问答和视频描述。
+#### 参数说明
+你可以在 `inference_lance.sh` 顶部配置以下超参数：
+| 参数 | 默认值 | 说明 |
+| --- | --- | --- |
+| `MODEL_PATH` | `"downloads/Lance_3B"` | 下载后的 Lance 模型权重路径（如 `Lance_3B` 或 `Lance_3B_Video`）。 |
+| `NUM_GPUS` | `1` | 用于推理的 GPU 数量。 |
+| `VALIDATION_NUM_TIMESTEPS` | `30` | 去噪步数（例如 30 或 50）。 |
+| `VALIDATION_TIMESTEP_SHIFT` | `3.5` | Flow matching 调度中的 timestep shift 参数。 |
+| `CFG_TEXT_SCALE` | `4.0` | 文本条件的 CFG（Classifier-Free Guidance）系数。 |
+| `VALIDATION_DATA_SEED` | `42` | 用于复现实验的随机种子。 |
+| `NUM_FRAMES` | `50` | 视频生成帧数（最大 121）。*图像任务不使用该参数。* |
+| `VIDEO_HEIGHT` / `VIDEO_WIDTH`| `768` | 空间分辨率。*编辑任务不使用该参数（由输入图像/视频决定）。* |
+| `RESOLUTION` | `"video_480p"` | 基础分辨率预设（如 `image_768res` 或 `video_480p`）。 |
+### Gradio
+```bash
+python lance_gradio_t2v_v2t.py --gpus 0 --server-port 7860
+```
+### 基准评测
+#### DPG-Bench 评测
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Global</th>
+      <th align="center">Entity</th>
+      <th align="center">Attribute</th>
+      <th align="center">Relation</th>
+      <th align="center">Other</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="8"><i>仅生成模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">83.27</td><td align="center">82.43</td><td align="center">80.91</td><td align="center">86.76</td><td align="center">80.41</td><td align="center">74.65</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">90.97</td><td align="center">89.61</td><td align="center">88.39</td><td align="center">90.58</td><td align="center">89.83</td><td align="center">83.50</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">87.90</td><td align="center">91.01</td><td align="center">88.83</td><td align="center">80.70</td><td align="center">88.68</td><td align="center">84.08</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">74.35</td><td align="center">90.00</td><td align="center">88.96</td><td align="center">90.87</td><td align="center">88.33</td><td align="center">83.84</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">91.32</td><td align="center">91.56</td><td align="center">92.02</td><td align="center">94.31</td><td align="center">92.73</td><td align="center">88.32</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="8"><i>统一模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center">86.90</td><td align="center">88.90</td><td align="center">89.40</td><td align="center">89.32</td><td align="center">89.48</td><td align="center">84.19</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center">88.81</td><td align="center">88.83</td><td align="center">90.18</td><td align="center">89.37</td><td align="center">90.27</td><td align="center">83.57</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center">89.00</td><td align="center"><b>91.78</b></td><td align="center">89.96</td><td align="center">91.81</td><td align="center"><b>91.64</b></td><td align="center">86.14</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">88.94</td><td align="center">90.37</td><td align="center"><u>91.29</u></td><td align="center">90.82</td><td align="center">88.67</td><td align="center">85.07</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>90.39</u></td><td align="center">90.78</td><td align="center">90.68</td><td align="center">90.29</td><td align="center">88.77</td><td align="center">85.18</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>90.42</b></td><td align="center"><u>91.68</u></td><td align="center">90.94</td><td align="center"><u>91.87</u></td><td align="center"><u>90.73</u></td><td align="center"><b>86.76</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center">89.50</td><td align="center">91.40</td><td align="center"><b>92.07</b></td><td align="center">91.91</td><td align="center">88.81</td><td align="center"><u>86.54</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>83.89</b></td><td align="center"><b>91.07</b></td><td align="center"><b>89.36</b></td><td align="center"><b>93.38</b></td><td align="center"><b>80.80</b></td><td align="center"><b>84.67</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+<p align="center"><em><sup>†</sup> 表示该方法在生成前使用 LLM rewriter 进行提示词改写。</em></p>
+#### GenEval 评测
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">1-Obj.</th>
+      <th align="center">2-Obj.</th>
+      <th align="center">Count</th>
+      <th align="center">Colors</th>
+      <th align="center">Position</th>
+      <th align="center">Attr.</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="9"><i>仅生成模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">0.98</td><td align="center">0.74</td><td align="center">0.39</td><td align="center">0.85</td><td align="center">0.15</td><td align="center">0.23</td><td align="center">0.55</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">0.96</td><td align="center">0.87</td><td align="center">0.47</td><td align="center">0.83</td><td align="center">0.43</td><td align="center">0.45</td><td align="center">0.67</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">0.99</td><td align="center">0.94</td><td align="center">0.72</td><td align="center">0.89</td><td align="center">0.33</td><td align="center">0.60</td><td align="center">0.74</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">0.98</td><td align="center">0.93</td><td align="center">0.75</td><td align="center">0.93</td><td align="center">0.68</td><td align="center">0.65</td><td align="center">0.82</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">0.99</td><td align="center">0.92</td><td align="center">0.89</td><td align="center">0.88</td><td align="center">0.76</td><td align="center">0.77</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="9"><i>统一模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center">0.89</td><td align="center">0.59</td><td align="center">0.90</td><td align="center">0.79</td><td align="center">0.66</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center"><b>1.00</b></td><td align="center">0.95</td><td align="center">0.64</td><td align="center">0.88</td><td align="center">0.55</td><td align="center">0.76</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center">0.87</td><td align="center">0.58</td><td align="center">0.92</td><td align="center">0.52</td><td align="center">0.62</td><td align="center">0.76</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">0.98</td><td align="center">0.95</td><td align="center"><b>0.84</b></td><td align="center"><u>0.95</u></td><td align="center">0.78</td><td align="center">0.77</td><td align="center">0.88</td>
+    </tr>
+    <tr>
+      <td align="left">Mogao</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center"><u>0.83</u></td><td align="center">0.93</td><td align="center">0.84</td><td align="center">0.80</td><td align="center"><u>0.89</u></td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>0.99</u></td><td align="center">0.94</td><td align="center">0.74</td><td align="center">0.91</td><td align="center">0.77</td><td align="center">0.74</td><td align="center">0.85</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center">0.81</td><td align="center">0.91</td><td align="center"><b>0.88</b></td><td align="center"><b>0.83</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center"><u>0.96</u></td><td align="center">0.80</td><td align="center">0.91</td><td align="center">0.84</td><td align="center">0.76</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>1.00</b></td><td align="center"><b>0.94</b></td><td align="center"><b>0.84</b></td><td align="center"><b>0.97</b></td><td align="center"><b>0.87</b></td><td align="center"><b>0.81</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+<p align="center"><em><sup>†</sup> 表示该方法在生成前使用 LLM rewriter 进行提示词改写。</em></p>
+#### GEdit-Bench 评测
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">BC</th>
+      <th align="center">CA</th>
+      <th align="center">MM</th>
+      <th align="center">MC</th>
+      <th align="center">PB</th>
+      <th align="center">ST</th>
+      <th align="center">SA</th>
+      <th align="center">SR</th>
+      <th align="center">SRp</th>
+      <th align="center">TM</th>
+      <th align="center">TT</th>
+      <th align="center">Avg/G_O</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="14"><i>仅生成模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Gemini 2.0</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">6.32</td>
+    </tr>
+    <tr>
+      <td align="left">GPT Image 1</td><td align="center">-</td><td align="center">6.96</td><td align="center">6.85</td><td align="center">7.10</td><td align="center">5.41</td><td align="center">6.74</td><td align="center">7.44</td><td align="center">7.51</td><td align="center">8.73</td><td align="center">8.55</td><td align="center">8.45</td><td align="center">8.69</td><td align="center">7.49</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image-Edit</td><td align="center">20B</td><td align="center">8.23</td><td align="center">8.30</td><td align="center">7.33</td><td align="center">8.05</td><td align="center">7.49</td><td align="center">6.74</td><td align="center">8.57</td><td align="center">8.09</td><td align="center">8.29</td><td align="center">8.48</td><td align="center">8.50</td><td align="center">8.01</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="14"><i>统一模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Lumina-DiMOO</td><td align="center">8B</td><td align="center">3.43</td><td align="center">4.27</td><td align="center">3.08</td><td align="center">2.77</td><td align="center">4.74</td><td align="center">5.19</td><td align="center">4.44</td><td align="center">3.80</td><td align="center">4.38</td><td align="center">2.68</td><td align="center">4.20</td><td align="center">3.91</td>
+    </tr>
+    <tr>
+      <td align="left">Ovis-U1</td><td align="center">1.2B</td><td align="center"><u>7.49</u></td><td align="center">6.88</td><td align="center">6.21</td><td align="center">4.79</td><td align="center">5.98</td><td align="center"><u>6.46</u></td><td align="center">7.49</td><td align="center"><u>7.25</u></td><td align="center"><u>7.27</u></td><td align="center">4.48</td><td align="center">6.31</td><td align="center">6.42</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL</td><td align="center">7B</td><td align="center">7.32</td><td align="center">6.91</td><td align="center">6.38</td><td align="center">4.75</td><td align="center">4.57</td><td align="center">6.15</td><td align="center"><b>7.90</b></td><td align="center">7.16</td><td align="center">7.02</td><td align="center"><u>7.32</u></td><td align="center">6.22</td><td align="center">6.52</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center">7.08</td><td align="center">7.05</td><td align="center">6.38</td><td align="center"><u>7.02</u></td><td align="center"><u>6.03</u></td><td align="center">6.27</td><td align="center">7.13</td><td align="center">6.55</td><td align="center">6.33</td><td align="center">6.59</td><td align="center"><u>6.85</u></td><td align="center">6.66</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U (w/ CoT)</td><td align="center">1.7B</td><td align="center">7.05</td><td align="center"><b>7.87</b></td><td align="center"><u>6.50</u></td><td align="center">6.99</td><td align="center">5.77</td><td align="center">6.10</td><td align="center">7.33</td><td align="center">7.16</td><td align="center">7.12</td><td align="center"><b>7.36</b></td><td align="center">6.46</td><td align="center"><u>6.88</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>7.73</b></td><td align="center"><u>7.74</u></td><td align="center"><b>7.28</b></td><td align="center"><b>7.83</b></td><td align="center"><b>7.50</b></td><td align="center"><b>7.03</b></td><td align="center"><u>7.64</u></td><td align="center"><b>7.85</b></td><td align="center"><b>7.71</b></td><td align="center">4.46</td><td align="center"><b>7.57</b></td><td align="center"><b>7.30</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+#### VBench 评测（视频生成）
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">类型</th>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Total Score ↑</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" rowspan="12"><i>仅生成模型</i></td>
+      <td align="left">ModelScope</td><td align="center">1.7B</td><td align="center">75.75</td>
+    </tr>
+    <tr>
+      <td align="left">LaVie</td><td align="center">3B</td><td align="center">77.08</td>
+    </tr>
+    <tr>
+      <td align="left">Show-1</td><td align="center">6B</td><td align="center">78.93</td>
+    </tr>
+    <tr>
+      <td align="left">AnimateDiff-V2</td><td align="center">-</td><td align="center">80.27</td>
+    </tr>
+    <tr>
+      <td align="left">VideoCrafter-2.0</td><td align="center">-</td><td align="center">80.44</td>
+    </tr>
+    <tr>
+      <td align="left">CogVideoX</td><td align="center">5B</td><td align="center">81.61</td>
+    </tr>
+    <tr>
+      <td align="left">Kling</td><td align="center">-</td><td align="center">81.85</td>
+    </tr>
+    <tr>
+      <td align="left">Open-Sora-2.0</td><td align="center">-</td><td align="center">81.71</td>
+    </tr>
+    <tr>
+      <td align="left">Gen-3</td><td align="center">-</td><td align="center">82.32</td>
+    </tr>
+    <tr>
+      <td align="left">Step-Video-T2V</td><td align="center">30B</td><td align="center">81.83</td>
+    </tr>
+    <tr>
+      <td align="left">Hunyuan Video</td><td align="center">-</td><td align="center">83.43</td>
+    </tr>
+    <tr>
+      <td align="left">Wan2.1-T2V</td><td align="center">14B</td><td align="center">83.69</td>
+    </tr>
+    <tr>
+      <td align="center" rowspan="6"><i>统一模型</i></td>
+      <td align="left">HaploOmni</td><td align="center">7B</td><td align="center">78.10</td>
+    </tr>
+    <tr>
+      <td align="left">Emu3</td><td align="center">8B</td><td align="center">80.96</td>
+    </tr>
+    <tr>
+      <td align="left">VILA-U</td><td align="center">7B</td><td align="center">74.01</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">2B</td><td align="center">81.34</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">1.5B</td><td align="center"><u>84.06</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>85.11</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+#### 运行基准评测
+`benchmarks/` 目录下提供了可直接运行的基准评测脚本：
+| 基准 | 模态 | 脚本 |
+|------------------------|----------|---------------------------------------------------------------|
+| GenEVAL（图像生成） | 图像 | `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` |
+| DPG（图像生成） | 图像 | `benchmarks/image_gen/DPG/sample_DPG.sh` |
+| GEdit（图像编辑） | 图像 | `benchmarks/image_gen/GEdit/sample_GEdit.sh` |
+| VBench（视频生成） | 视频 | `benchmarks/video_gen/Vbench/sample_vbench.sh` |
+## 📄 许可证
+Copyright 2025 Bytedance Ltd. and/or its affiliates.
+## 🙏 致谢
+我们感谢 [BAGEL](https://github.com/ByteDance-Seed/bagel)、[Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) 和 [Wan2.2](https://github.com/Wan-Video/Wan2.2) 的贡献者，感谢他们开放的研究与社区贡献。
+## 💖 引用
+如果 **Lance** 对您的项目或研究有帮助，欢迎 🌟 本仓库，并使用以下 BibTeX 引用我们的工作：
+```bibtex
+@misc{fu2026lanceunifiedmultimodalmodeling,
+      title         = {Lance: Unified Multimodal Modeling by Multi-Task Synergy},
+      author        = {Fengyi Fu and Mengqi Huang and Shaojin Wu and Yunsheng Jiang and Yufei Huo and Hao Li and Yinghang Song and Fei Ding and Jianzhu Guo and Qian He and Zheren Fu and Zhendong Mao and Yongdong Zhang},
+      year          = {2026},
+      eprint        = {2605.18678},
+      archivePrefix = {arXiv},
+      primaryClass  = {cs.CV},
+      url           = {https://arxiv.org/abs/2605.18678},
+}
+```
+## 📞 联系方式
+如有问题、反馈或合作需求，请联系 [Mengqi Huang](https://corleone-huang.github.io/) 和 [Jianzhu Guo](https://guojianzhu.com/)。

RIFE ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 5d8adbdd40e12c2c8f91930eff838aebe561c086

SECURITY.md ADDED Viewed

	@@ -0,0 +1,11 @@

+# Security and privacy
+If you discover a potential security issue in this project, or believe you may have found one, please notify the ByteDance security team through our [security center](https://security.bytedance.com/src) or [vulnerability reporting email](mailto:src@bytedance.com). Please **do not** create public GitHub Issues.
+We will assess the vulnerability based on the Common Vulnerability Scoring System (CVSS 3.1). The security team will keep you updated on key progress and may request further information or guidance from you. You are welcome to contact us via the email or website mentioned above to ask questions or discuss disclosure matters.
+To protect the security of our customers, ByteDance requests that you do not publish or share information regarding the vulnerability in any public forum, nor publish or share data involving users, until the vulnerability has been remediated and our users have been notified. Please understand that the time required for remediation depends on the severity of the vulnerability and the scope of the impact.
+Individuals, companies, and security teams may wish to publish security advisories on their own websites or other forums. Please contact us via the email or website mentioned above prior to publication to discuss the information that can be disclosed and to coordinate the disclosure timeline.
+# Bug Bounty Reward
+See the [bug bounty reward policy](https://bytedance.larkoffice.com/docx/ZstQd7bbooDctqxBCAmcFasOngd) for details. If you have any questions about the rules, please contact [https://src.bytedance.com/home](https://src.bytedance.com/home) for consultation.

SPACE_DEPLOYMENT.md ADDED Viewed

	@@ -0,0 +1,76 @@

+# Hugging Face Space Deployment
+This repository is prepared for a Docker-based Hugging Face Space.
+## Runtime
+- Space SDK: Docker
+- Public port: `7860`
+- Entrypoint: `python app.py`
+- Recommended hardware: GPU, preferably `l40s` or stronger
+## Model Assets
+The app first checks local model files under `LANCE_MODEL_BASE_DIR`.
+Default behavior:
+- Local checkout with `downloads/`: use `./downloads`
+- Hugging Face Space without local assets: download from `bytedance-research/Lance` into `/data/lance_models`
+- Video tasks preload `Lance_3B_Video` at startup.
+- Image tasks unload the active video model first, then load `Lance_3B`.
+- Switching back to a video task unloads `Lance_3B`, then reloads `Lance_3B_Video`.
+Useful environment variables:
+- `LANCE_MODEL_REPO_ID`: Hugging Face model repo to download from. Default: `bytedance-research/Lance`
+- `LANCE_MODEL_BASE_DIR`: directory containing `Lance_3B_Video`, `Lance_3B`, `Qwen2.5-VL-ViT`, and `Wan2.2_VAE.pth`
+- `LANCE_VIDEO_MODEL_PATH`: explicit video model directory override
+- `LANCE_IMAGE_MODEL_PATH`: explicit image model directory override
+- `LANCE_MODEL_PATH`: legacy explicit model directory override used for both task families if the family-specific override is unset
+- `LANCE_MODEL_VARIANT`: `video` or `image`; default is `video`
+- `LANCE_AUTO_DOWNLOAD`: set to `1` to download missing assets from the Hub
+- `LANCE_GPUS`: comma-separated GPU IDs, for example `0` or `0,1`
+- `LANCE_QUEUE_SIZE`: Gradio queue size
+- `LANCE_GRADIO_TMP_ROOT`: output and temporary file directory
+Expected model layout:
+```text
+${LANCE_MODEL_BASE_DIR}/
+  Lance_3B_Video/
+    llm_config.json
+    model.safetensors
+    tokenizer.json
+    ...
+  Lance_3B/
+    llm_config.json
+    model.safetensors
+    tokenizer.json
+    ...
+  Qwen2.5-VL-ViT/
+    config.json
+    vit.safetensors
+  Wan2.2_VAE.pth
+```
+## Local Docker Check
+```bash
+docker build -t lance-space .
+docker run --gpus all -p 7860:7860 \
+  -e LANCE_MODEL_BASE_DIR=/models/lance \
+  -v /path/to/lance/downloads:/models/lance \
+  lance-space
+```
+Open `http://localhost:7860`.
+## Files Not Uploaded
+The Space build excludes generated or heavyweight local files through `.dockerignore`:
+- `downloads/`
+- `results/`
+- `tmps/`
+- Python cache files

app.py ADDED Viewed

The diff for this file is too large to render. See raw diff

app_save.py ADDED Viewed

	@@ -0,0 +1,2064 @@

+from __future__ import annotations
+import argparse
+import base64
+import concurrent.futures
+import gc
+import json
+import os
+import random
+import subprocess
+import threading
+import time
+import traceback
+from collections import deque
+from copy import deepcopy
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+import gradio as gr
+import torch
+from huggingface_hub import snapshot_download
+from safetensors.torch import load_file
+from transformers import set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from common.utils.logging import get_logger
+from common.utils.misc import AutoEncoderParams, tuple_mul
+from config.config_factory import DataArguments, InferenceArguments, ModelArguments
+from data.data_utils import add_special_tokens
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.datasets_custom import ValidationDataset
+from inference_lance import (
+    PROMPT_JSON_FILENAME,
+    apply_inference_defaults,
+    clean_memory,
+    init_from_model_path_if_needed,
+    save_prompt_results,
+    validate_on_fixed_batch,
+)
+from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
# --- Filesystem layout -------------------------------------------------------
# Directory containing this file; example assets and the RIFE checkout are
# looked up relative to it.
REPO_ROOT = Path(__file__).resolve().parent
# Scratch root for request JSON files and results; LANCE_GRADIO_TMP_ROOT
# overrides the default location.
GRADIO_TMP_ROOT = Path(os.getenv("LANCE_GRADIO_TMP_ROOT", "/tmp/lance_gradio")).expanduser()
TMP_INPUT_DIR = GRADIO_TMP_ROOT / "inputs"
RESULTS_ROOT = GRADIO_TMP_ROOT / "results"
# Append-only JSONL log of every generation record, plus the name of the
# per-run copy written into each run directory.
GLOBAL_RECORDS_FILE = GRADIO_TMP_ROOT / "generation_records.jsonl"
RUN_RECORD_FILENAME = "generation_record.json"

# --- Model locations ---------------------------------------------------------
LOCAL_MODEL_BASE_DIR = Path("downloads")
SPACE_MODEL_BASE_DIR = Path("/data/lance_models")
DEFAULT_MODEL_REPO_ID = "bytedance-research/Lance"
MODEL_VARIANT_VIDEO = "video"
MODEL_VARIANT_IMAGE = "image"
# Derived from the variant constant so the two can never drift apart.
DEFAULT_MODEL_VARIANT = MODEL_VARIANT_VIDEO
# Checkpoint sub-directory (under the model base dir) for each variant.
MODEL_VARIANT_TO_DIR = {
    MODEL_VARIANT_VIDEO: "Lance_3B_Video",
    MODEL_VARIANT_IMAGE: "Lance_3B",
}
DEFAULT_MODEL_PATH = LOCAL_MODEL_BASE_DIR / MODEL_VARIANT_TO_DIR[DEFAULT_MODEL_VARIANT]
DEFAULT_VIT_TYPE = "qwen_2_5_vl_original"

# --- Inference defaults ------------------------------------------------------
# Same value as TASK_T2V, which is defined further down the module.
DEFAULT_TASK = "t2v"
DEFAULT_TIMESTEPS = 30
DEFAULT_TIMESTEP_SHIFT = 3.5
DEFAULT_CFG_TEXT_SCALE = 4.0
DEFAULT_RESOLUTION = "video_848x480"
DEFAULT_IMAGE_RESOLUTION = "image_768x768"
DEFAULT_BASIC_SEED = 42
DEFAULT_IMAGE_SIZE = 768
DEFAULT_NUM_FRAMES = 101
DEFAULT_VIDEO_ASPECT_RATIO = "16:9"
DEFAULT_IMAGE_ASPECT_RATIO = "1:1"
DEFAULT_FRAME_INTERPOLATION = True

# --- Aspect ratio -> (width, height) -----------------------------------------
VIDEO_ASPECT_RATIO_TO_SIZE = {
    "21:9": (976, 416),
    "16:9": (848, 480),
    "3:2": (784, 528),
    "4:3": (736, 560),
    "1:1": (640, 640),
    "3:4": (560, 736),
    "2:3": (528, 784),
    "9:16": (480, 848),
    "9:21": (416, 976),
}
IMAGE_ASPECT_RATIO_TO_SIZE = {
    "21:9": (1168, 496),
    "16:9": (1024, 576),
    "3:2": (944, 624),
    "4:3": (880, 672),
    "1:1": (768, 768),
    "3:4": (672, 880),
    "2:3": (624, 944),
    "9:16": (576, 1024),
    "9:21": (496, 1168),
}
# The selectable ratios are exactly the keys of the size tables (both tables
# share the same keys), so a listed ratio can never miss a size entry.
ASPECT_RATIO_CHOICES = list(VIDEO_ASPECT_RATIO_TO_SIZE)
# Default video frame size is the entry for the default video aspect ratio.
DEFAULT_WIDTH, DEFAULT_HEIGHT = VIDEO_ASPECT_RATIO_TO_SIZE[DEFAULT_VIDEO_ASPECT_RATIO]

# --- Runtime switches --------------------------------------------------------
DEFAULT_GPUS = "0"
DEFAULT_QUEUE_SIZE = 32
USE_KVCACHE = True
TEXT_TEMPLATE = True
# Serializes appends to GLOBAL_RECORDS_FILE across threads.
RECORD_WRITE_LOCK = threading.Lock()

# --- Project links and branding ----------------------------------------------
LANCE_HOMEPAGE_URL = "https://lance-project.github.io/"
# Linked over HTTPS like the other project links.
LANCE_PAPER_URL = "https://arxiv.org/abs/2605.18678"
LANCE_HUGGING_FACE_URL = "https://huggingface.co/bytedance-research/Lance"
LANCE_GITHUB_URL = "https://github.com/bytedance/Lance"
LANCE_LOGO_PATH = REPO_ROOT / "assets" / "logo" / "lance-logo.webp"
# Custom stylesheet for the app page. It caps the page width, styles the
# header (logo, title, authors, badges), lays the task selector out as a
# three-column grid, draws borders on the prompt-example table, hides the
# ".prompt-example-proxy" element, and arranges the main row as two equal
# columns that collapse to a single column below 900px.
# NOTE(review): presumably passed to the Gradio Blocks `css` argument; the
# place where it is consumed is outside this chunk.
APP_CSS = """
.gradio-container {
    max-width: 1680px !important;
    margin-left: auto !important;
    margin-right: auto !important;
}
.contain {
    max-width: 1680px !important;
    margin-left: auto !important;
    margin-right: auto !important;
}
.lance-hero {
    text-align: center;
    padding: 8px 12px 6px;
}
.lance-logo {
    width: min(160px, 36vw);
    height: auto;
    display: block;
    margin: 0 auto 4px;
}
.lance-title {
    margin: 0 auto 5px;
    font-size: clamp(20px, 2.4vw, 30px);
    line-height: 1.08;
    font-weight: 800;
    letter-spacing: 0;
}
.lance-authors {
    margin: 0 auto 6px;
    max-width: 980px;
    font-size: 13px;
    line-height: 1.35;
    color: var(--body-text-color-subdued);
}
.lance-authors a {
    color: inherit;
    text-decoration: none;
}
.lance-authors a:hover {
    text-decoration: underline;
}
.lance-badges {
    display: flex;
    flex-wrap: wrap;
    justify-content: center;
    gap: 5px;
    margin: 4px auto 0;
}
.lance-badges a {
    line-height: 0;
}
.lance-badges img {
    height: 20px;
    width: auto;
    display: block;
}
.lance-status {
    max-width: 1180px;
    margin: 0 auto 18px;
}
.task-selector {
    overflow-x: auto;
}
.task-selector .wrap {
    display: grid;
    grid-template-columns: repeat(3, minmax(220px, 1fr));
    gap: 8px;
    min-width: 680px;
}
.task-selector label {
    justify-content: center;
    min-height: 38px;
    white-space: nowrap;
}
.prompt-examples table,
.prompt-examples th,
.prompt-examples td {
    border: 1px solid var(--border-color-primary) !important;
}
.prompt-examples table {
    border-collapse: collapse !important;
    width: 100% !important;
}
.prompt-examples td {
    border-bottom: 1px solid var(--border-color-primary) !important;
    padding: 12px !important;
    vertical-align: top !important;
}
.prompt-example-proxy {
    display: none !important;
}
.lance-main-row {
    display: grid !important;
    grid-template-columns: minmax(0, 1fr) minmax(0, 1fr) !important;
    gap: 16px !important;
    align-items: start !important;
}
.lance-main-column {
    min-width: 0 !important;
    width: 100% !important;
}
.lance-display-frame,
.lance-display-frame > div,
.lance-display-frame textarea {
    width: 100% !important;
}
.lance-display-frame textarea {
    min-height: 360px !important;
}
@media (max-width: 900px) {
    .lance-main-row {
        grid-template-columns: minmax(0, 1fr) !important;
    }
}
"""
# Internal task identifiers. "v2t" and "x2t" are accepted as aliases of
# "x2t_video" through TASK_LABEL_TO_INTERNAL below.
TASK_T2V = "t2v"
TASK_T2I = "t2i"
TASK_V2T = "v2t"
TASK_X2T = "x2t"
TASK_X2T_VIDEO = "x2t_video"
TASK_X2T_IMAGE = "x2t_image"
TASK_IMAGE_EDIT = "image_edit"
TASK_VIDEO_EDIT = "video_edit"

# Human-readable task labels.
TASK_LABEL_VIDEO_GENERATION = "Video Generation"
TASK_LABEL_VIDEO_EDIT = "Video Edit"
TASK_LABEL_VIDEO_UNDERSTANDING = "Video Understanding"
TASK_LABEL_IMAGE_GENERATION = "Image Generation"
TASK_LABEL_IMAGE_EDIT = "Image Edit"
TASK_LABEL_IMAGE_UNDERSTANDING = "Image Understanding"

# Labels in display order: the three video tasks, then the three image tasks.
TASK_CHOICES = [
    TASK_LABEL_VIDEO_GENERATION,
    TASK_LABEL_VIDEO_EDIT,
    TASK_LABEL_VIDEO_UNDERSTANDING,
    TASK_LABEL_IMAGE_GENERATION,
    TASK_LABEL_IMAGE_EDIT,
    TASK_LABEL_IMAGE_UNDERSTANDING,
]

# Lookup used by normalize_task(): accepts a label, an internal identifier,
# or one of the aliases, and yields the canonical internal identifier.
TASK_LABEL_TO_INTERNAL = {
    # Labels
    TASK_LABEL_VIDEO_GENERATION: TASK_T2V,
    TASK_LABEL_VIDEO_EDIT: TASK_VIDEO_EDIT,
    TASK_LABEL_VIDEO_UNDERSTANDING: TASK_X2T_VIDEO,
    TASK_LABEL_IMAGE_GENERATION: TASK_T2I,
    TASK_LABEL_IMAGE_EDIT: TASK_IMAGE_EDIT,
    TASK_LABEL_IMAGE_UNDERSTANDING: TASK_X2T_IMAGE,
    # Internal identifiers and aliases
    TASK_T2V: TASK_T2V,
    TASK_VIDEO_EDIT: TASK_VIDEO_EDIT,
    TASK_V2T: TASK_X2T_VIDEO,
    TASK_X2T: TASK_X2T_VIDEO,
    TASK_X2T_VIDEO: TASK_X2T_VIDEO,
    TASK_T2I: TASK_T2I,
    TASK_IMAGE_EDIT: TASK_IMAGE_EDIT,
    TASK_X2T_IMAGE: TASK_X2T_IMAGE,
}

# Task groupings consulted by the helpers below.
EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
GENERATION_TASKS = {TASK_T2V, TASK_T2I} | EDIT_TASKS
UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}

# Backend resolution keys; each media type currently offers a single choice.
VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
RESOLUTION_CHOICES = [*VIDEO_RESOLUTION_CHOICES, *IMAGE_RESOLUTION_CHOICES]

# System prompts for the understanding tasks.
# NOTE(review): V2T_QA_SYSTEM_PROMPT contains a double space after "video";
# it is kept verbatim because the string is sent to the model as-is. Confirm
# whether the spacing is intentional before normalizing it.
CAPTION_SYSTEM_PROMPT_TEMPLATE = (
    "Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background."
)
V2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="video")
I2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="image")
V2T_QA_SYSTEM_PROMPT = "View the video  attentively and provide a suitable answer to the posed question."
I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
    """Return ``(label, value)`` aspect-ratio pairs for ``task``.

    Every ratio in ``ASPECT_RATIO_CHOICES`` is listed in order. The label of
    the task's default ratio gets a `` (default)`` suffix; all other labels
    equal their value.

    Raises:
        ValueError: If ``task`` is not a supported task (from ``normalize_task``).
    """
    recommended = get_default_aspect_ratio(task)
    choices = []
    for ratio in ASPECT_RATIO_CHOICES:
        label = f"{ratio} (default)" if ratio == recommended else ratio
        choices.append((label, ratio))
    return choices
def env_flag(name: str, default: bool) -> bool:
    """Read a boolean switch from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        ``True`` when the value, stripped and lower-cased, is one of
        ``1``, ``true``, ``yes`` or ``on``; ``False`` for any other non-blank
        value; ``default`` when the variable is missing or contains only
        whitespace.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    normalized = raw.strip().lower()
    if not normalized:
        # A blank value counts as "not configured", matching how the other
        # environment readers in this module treat empty strings.
        return default
    return normalized in {"1", "true", "yes", "on"}
def running_on_space() -> bool:
    """Return True when ``SPACE_ID`` or ``SPACE_HOST`` is set to a non-empty value."""
    return any(os.getenv(variable) for variable in ("SPACE_ID", "SPACE_HOST"))
def display_path(path: Path) -> str:
    """Format ``path`` as a POSIX-style string for messages and env values.

    An absolute path under the current working directory is shown relative to
    it; an absolute path elsewhere is returned unchanged. Any relative result
    other than ``.`` is prefixed with ``./``.
    """
    if path.is_absolute():
        try:
            shown = path.relative_to(Path.cwd()).as_posix()
        except ValueError:
            # Not located under the working directory: keep it absolute.
            return path.as_posix()
    else:
        shown = path.as_posix()
    already_anchored = shown == "." or shown.startswith("./")
    return shown if already_anchored else f"./{shown}"
def get_model_base_dir() -> Path:
    """Pick the directory that holds the model assets.

    Order of precedence: a non-empty ``LANCE_MODEL_BASE_DIR`` (with ``~``
    expanded); the local base directory when it exists; the Space base
    directory when running on a Space; otherwise the local base directory.
    """
    override = os.getenv("LANCE_MODEL_BASE_DIR")
    if override:
        return Path(override).expanduser()
    if not LOCAL_MODEL_BASE_DIR.exists() and running_on_space():
        return SPACE_MODEL_BASE_DIR
    return LOCAL_MODEL_BASE_DIR
def normalize_model_variant(model_variant: Optional[str] = None) -> str:
    """Map a variant name to ``MODEL_VARIANT_IMAGE`` or ``MODEL_VARIANT_VIDEO``.

    When ``model_variant`` is empty or ``None`` the value comes from the
    ``LANCE_MODEL_VARIANT`` environment variable, falling back to
    ``DEFAULT_MODEL_VARIANT``. Matching ignores case and surrounding
    whitespace; ``image``, ``t2i`` and ``i2t`` select the image variant and
    every other value selects the video variant.
    """
    requested = model_variant or os.getenv("LANCE_MODEL_VARIANT", DEFAULT_MODEL_VARIANT)
    image_aliases = {"image", "t2i", "i2t"}
    if requested.strip().lower() in image_aliases:
        return MODEL_VARIANT_IMAGE
    return MODEL_VARIANT_VIDEO
def get_model_path(model_variant: Optional[str] = None) -> Path:
    """Resolve the checkpoint directory for a model variant.

    The first non-empty setting wins: the variant-specific variable
    (``LANCE_IMAGE_MODEL_PATH`` or ``LANCE_VIDEO_MODEL_PATH``), then the
    generic ``LANCE_MODEL_PATH``. Without either, the path is the variant's
    sub-directory from ``MODEL_VARIANT_TO_DIR`` under the model base
    directory.
    """
    variant = normalize_model_variant(model_variant)
    if variant == MODEL_VARIANT_IMAGE:
        specific_env = "LANCE_IMAGE_MODEL_PATH"
    else:
        specific_env = "LANCE_VIDEO_MODEL_PATH"
    # Checked lazily in priority order; later variables are only read when
    # the earlier ones are empty.
    for env_name in (specific_env, "LANCE_MODEL_PATH"):
        override = os.getenv(env_name)
        if override:
            return Path(override).expanduser()
    return get_model_base_dir() / MODEL_VARIANT_TO_DIR[variant]
def get_required_model_asset_paths(model_base_dir: Path, model_path: Path) -> list[Path]:
    """List the files that must exist before the model can be loaded.

    Returns, in order: the LLM config and weights inside ``model_path``, then
    the ViT weights and the VAE checkpoint inside ``model_base_dir``.
    """
    checkpoint_files = [model_path / name for name in ("llm_config.json", "model.safetensors")]
    shared_files = [
        model_base_dir / "Qwen2.5-VL-ViT" / "vit.safetensors",
        model_base_dir / "Wan2.2_VAE.pth",
    ]
    return checkpoint_files + shared_files
def ensure_model_assets(model_variant: Optional[str] = None) -> Path:
    """Make sure the model files for ``model_variant`` are available.

    Steps:
        1. Resolve the base directory and checkpoint path and return at once
           when every required file is present.
        2. If the base directory is ``.`` and a complete copy sits under the
           local model directory, switch to that copy.
        3. Otherwise download the repository named by ``LANCE_MODEL_REPO_ID``
           (default ``DEFAULT_MODEL_REPO_ID``) into the base directory, but
           only when ``LANCE_AUTO_DOWNLOAD`` is enabled (enabled by default on
           a Space).

    Side effects:
        Sets ``LANCE_MODEL_BASE_DIR`` in ``os.environ`` to the directory that
        was finally chosen, which is what ``get_model_base_dir()`` reads first.

    Returns:
        The checkpoint directory for the requested variant.

    Raises:
        FileNotFoundError: If files are missing and auto-download is disabled.
    """
    model_base_dir = get_model_base_dir()
    os.environ["LANCE_MODEL_BASE_DIR"] = display_path(model_base_dir)
    model_path = get_model_path(model_variant)
    required_paths = get_required_model_asset_paths(model_base_dir, model_path)
    if all(path.exists() for path in required_paths):
        return model_path

    # Fallback: base dir configured as "." while a full copy exists under the
    # local model directory.
    if model_base_dir == Path(".") and LOCAL_MODEL_BASE_DIR.exists():
        local_model_path = LOCAL_MODEL_BASE_DIR / MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]
        local_required_paths = get_required_model_asset_paths(LOCAL_MODEL_BASE_DIR, local_model_path)
        if all(path.exists() for path in local_required_paths):
            os.environ["LANCE_MODEL_BASE_DIR"] = display_path(LOCAL_MODEL_BASE_DIR)
            return local_model_path

    auto_download = env_flag("LANCE_AUTO_DOWNLOAD", running_on_space())
    if not auto_download:
        missing = "\n".join(f"- {display_path(path)}" for path in required_paths if not path.exists())
        raise FileNotFoundError(
            "Lance model assets are missing. Set LANCE_MODEL_BASE_DIR or enable "
            f"LANCE_AUTO_DOWNLOAD=1.\nMissing files:\n{missing}"
        )

    model_base_dir.mkdir(parents=True, exist_ok=True)
    repo_id = os.getenv("LANCE_MODEL_REPO_ID", DEFAULT_MODEL_REPO_ID)
    print(f"[startup] Downloading Lance model assets from {repo_id} to {display_path(model_base_dir)}", flush=True)
    snapshot_path = Path(
        snapshot_download(
            repo_id=repo_id,
            local_dir=str(model_base_dir),
            local_dir_use_symlinks=False,
            resume_download=True,
        )
    )
    final_base_dir = model_base_dir
    if snapshot_path != model_base_dir and not model_path.exists():
        # The download landed somewhere else: point the base dir at it and
        # resolve the checkpoint path again.
        os.environ["LANCE_MODEL_BASE_DIR"] = display_path(snapshot_path)
        model_path = get_model_path(model_variant)
        final_base_dir = snapshot_path

    # Report files that are still absent instead of returning silently. This
    # only warns, so a setup that loads fine without one of them keeps working.
    still_missing = [
        path for path in get_required_model_asset_paths(final_base_dir, model_path) if not path.exists()
    ]
    if still_missing:
        listing = "\n".join(f"- {display_path(path)}" for path in still_missing)
        print(f"[startup] Warning: model assets still missing after download:\n{listing}", flush=True)
    return model_path
def ensure_dirs() -> None:
    """Create the request-input and results directories if they do not exist."""
    for directory in (TMP_INPUT_DIR, RESULTS_ROOT):
        directory.mkdir(parents=True, exist_ok=True)
def save_generation_record(record: dict, save_dir: Path) -> None:
    """Persist ``record`` for one run and append it to the global log.

    The record is written as indented JSON to ``RUN_RECORD_FILENAME`` inside
    ``save_dir``, then appended as a single JSON line to
    ``GLOBAL_RECORDS_FILE``. The append happens under ``RECORD_WRITE_LOCK``.
    """
    ensure_dirs()
    with (save_dir / RUN_RECORD_FILENAME).open("w", encoding="utf-8") as run_file:
        json.dump(record, run_file, ensure_ascii=False, indent=2)
    # Only the shared log needs the lock; the per-run file above is private
    # to this run directory.
    with RECORD_WRITE_LOCK, GLOBAL_RECORDS_FILE.open("a", encoding="utf-8") as log_file:
        log_file.write(json.dumps(record, ensure_ascii=False) + "\n")
def normalize_seed(seed: int) -> int:
    """Return ``seed`` unchanged, or a random seed in ``[0, 2**31 - 1]`` when it is ``-1``."""
    if seed != -1:
        return seed
    return random.randint(0, 2**31 - 1)
def normalize_task(task: str) -> str:
    """Translate a task label, identifier or alias into its internal identifier.

    Args:
        task: A key of ``TASK_LABEL_TO_INTERNAL``. Surrounding whitespace is
            ignored, and a lower-cased form is tried when the exact text is
            not found. An empty or ``None`` value selects video generation.

    Returns:
        The internal task identifier.

    Raises:
        ValueError: If the input does not name a supported task. The message
            quotes the (stripped) input that was rejected.
    """
    task_key = (task or TASK_LABEL_VIDEO_GENERATION).strip()
    internal = TASK_LABEL_TO_INTERNAL.get(task_key)
    if internal is None:
        internal = TASK_LABEL_TO_INTERNAL.get(task_key.lower())
    if internal not in GENERATION_TASKS | UNDERSTANDING_TASKS:
        # Report what the caller passed, not the failed lookup result.
        raise ValueError(f"Unsupported task type: {task_key}")
    return internal
def normalize_resolution_for_backend(resolution: str, task: str) -> str:
    """Return the backend resolution key for ``task``.

    Image tasks map to ``DEFAULT_IMAGE_RESOLUTION`` and video tasks to
    ``DEFAULT_RESOLUTION``. ``resolution`` is returned (as ``str``) only for
    a task that belongs to neither group.

    Raises:
        ValueError: If ``task`` is not a supported task.
    """
    internal_task = normalize_task(task)
    backend_keys = (
        (IMAGE_TASKS, DEFAULT_IMAGE_RESOLUTION),
        (VIDEO_TASKS, DEFAULT_RESOLUTION),
    )
    for task_group, backend_resolution in backend_keys:
        if internal_task in task_group:
            return backend_resolution
    return str(resolution)
def get_default_aspect_ratio(task: str) -> str:
    """Return the default aspect ratio for ``task``: the image default for image tasks, else the video default."""
    if normalize_task(task) in IMAGE_TASKS:
        return DEFAULT_IMAGE_ASPECT_RATIO
    return DEFAULT_VIDEO_ASPECT_RATIO
def get_size_for_aspect_ratio(task: str, aspect_ratio: str) -> tuple[int, int]:
    """Look up the ``(width, height)`` for ``aspect_ratio`` under ``task``.

    A ratio that is not in ``ASPECT_RATIO_CHOICES`` is replaced by the task's
    default ratio. Image tasks read the image size table; all other tasks
    read the video size table.
    """
    internal_task = normalize_task(task)
    if aspect_ratio not in ASPECT_RATIO_CHOICES:
        aspect_ratio = get_default_aspect_ratio(internal_task)
    if internal_task in IMAGE_TASKS:
        return IMAGE_ASPECT_RATIO_TO_SIZE[aspect_ratio]
    return VIDEO_ASPECT_RATIO_TO_SIZE[aspect_ratio]
def format_size_markdown(task: str, width: int, height: int) -> str:
    """Return ``"<width> x <height>"``, or an empty string for understanding tasks."""
    return "" if normalize_task(task) in UNDERSTANDING_TASKS else f"{width} x {height}"
def update_size_from_aspect_ratio(task: str, aspect_ratio: str):
    """Return ``(height, width, size_text)`` for the chosen aspect ratio.

    Note the order: height comes before width in the returned tuple.
    """
    size = get_size_for_aspect_ratio(task, aspect_ratio)
    width, height = size
    size_text = format_size_markdown(task, width, height)
    return height, width, size_text
def reset_generation_defaults_for_task(task: str):
    """Return the default generation settings for ``task``.

    Returns:
        ``(aspect_ratio, height, width, num_frames, resolution, size_text)``.
        ``num_frames`` is ``DEFAULT_NUM_FRAMES`` for text-to-video and ``1``
        for every other task.
    """
    internal_task = normalize_task(task)
    is_image_task = internal_task in IMAGE_TASKS
    aspect_ratio = get_default_aspect_ratio(internal_task)
    width, height = get_size_for_aspect_ratio(internal_task, aspect_ratio)
    if internal_task == TASK_T2V:
        num_frames = DEFAULT_NUM_FRAMES
    else:
        num_frames = 1
    resolution = DEFAULT_IMAGE_RESOLUTION if is_image_task else DEFAULT_RESOLUTION
    size_text = format_size_markdown(internal_task, width, height)
    return aspect_ratio, height, width, num_frames, resolution, size_text
def apply_prompt_example(task: str, evt: gr.SelectData):
    """Turn a selected example into a prompt plus the task's default settings.

    The prompt is the first cell of the selected row when ``evt.row_value``
    is a non-empty list, otherwise ``evt.value`` when it is not ``None``,
    otherwise an empty string.

    Returns:
        A tuple starting with the prompt text, followed by the values from
        ``reset_generation_defaults_for_task(task)``.
    """
    row = evt.row_value
    if isinstance(row, list) and row:
        prompt_text = str(row[0])
    elif evt.value is not None:
        prompt_text = str(evt.value)
    else:
        prompt_text = ""
    return (prompt_text, *reset_generation_defaults_for_task(task))
def get_understanding_system_prompt_choices(task: str) -> list[str]:
    """Return the one-element list of system prompts for ``task``.

    Image understanding gets the image QA prompt; every other supported task
    gets the video QA prompt.
    """
    is_image_understanding = normalize_task(task) == TASK_X2T_IMAGE
    return [I2T_QA_SYSTEM_PROMPT if is_image_understanding else V2T_QA_SYSTEM_PROMPT]
def normalize_understanding_system_prompt(task: str, system_prompt: Optional[str]) -> str:
    """Return the system prompt used for ``task``.

    ``system_prompt`` is accepted but not consulted: the result is always
    the first entry of ``get_understanding_system_prompt_choices(task)``.
    """
    choices = get_understanding_system_prompt_choices(task)
    return choices[0]
def create_request_json(
    task: str,
    prompt: str,
    input_video: Optional[str],
    input_image: Optional[str],
    system_prompt: Optional[str] = None,
) -> Path:
    """Write the request JSON for one job and return its path.

    The file is created in ``TMP_INPUT_DIR`` as ``<task>_<timestamp>.json``.
    ``task`` must already be an internal task identifier.

    Payload shapes:
        * text-to-video / text-to-image: ``{"000000.mp4" | "000000.png": prompt}``
        * edit tasks: one ``"000000"`` sample interleaving the prompt, the
          input media and the input media again, with the last marked as target
        * understanding tasks: one ``"000000"`` sample holding the input media
          followed by ``[system_prompt, prompt, ""]``, with the text marked as
          target

    Raises:
        ValueError: If a required input video/image is missing, or ``task``
            is not one of the supported identifiers.
    """
    ensure_dirs()
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    prompt_file = TMP_INPUT_DIR / f"{task}_{stamp}.json"

    # Tasks that need only a prompt, keyed to the output name they produce.
    text_only_outputs = {TASK_T2V: "000000.mp4", TASK_T2I: "000000.png"}
    # Tasks that consume an upload: (media type, uploaded path, error text).
    media_requirements = {
        TASK_VIDEO_EDIT: ("video", input_video, "The video edit task requires an input video."),
        TASK_IMAGE_EDIT: ("image", input_image, "The image edit task requires an input image."),
        TASK_X2T_VIDEO: ("video", input_video, "The video understanding task requires an input video."),
        TASK_X2T_IMAGE: ("image", input_image, "The image understanding task requires an input image."),
    }

    if task in text_only_outputs:
        payload = {text_only_outputs[task]: prompt}
    elif task in media_requirements:
        media_type, media_path, missing_message = media_requirements[task]
        if not media_path:
            raise ValueError(missing_message)
        if task in (TASK_VIDEO_EDIT, TASK_IMAGE_EDIT):
            sample = {
                "interleave_array": [prompt, media_path, media_path],
                "element_dtype_array": ["text", media_type, media_type],
                "istarget_in_interleave": [0, 0, 1],
            }
        else:
            system_prompt = normalize_understanding_system_prompt(task, system_prompt)
            sample = {
                "interleave_array": [media_path, [system_prompt, prompt, ""]],
                "element_dtype_array": [media_type, "text"],
                "istarget_in_interleave": [0, 1],
            }
        payload = {"000000": sample}
    else:
        raise ValueError(f"Unsupported task type: {task}")

    prompt_file.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    return prompt_file
def resolve_example_path(path: str) -> str:
    """Resolve an example asset path to a usable string.

    An absolute path is returned as given. A relative path is resolved
    against ``REPO_ROOT`` first and the current working directory second; if
    it exists in neither place the original string is returned unchanged.
    """
    candidate = Path(path)
    if candidate.is_absolute():
        return str(candidate)
    for option in (REPO_ROOT / candidate, candidate):
        if option.exists():
            return str(option.resolve())
    return path
def resolve_browser_video_example_path(path: str) -> str:
    """Resolve a video example, preferring a ``<stem>_h264<suffix>`` sibling.

    The ``_h264`` sibling is tried first, then the path itself. Relative
    paths are looked up under ``REPO_ROOT``; absolute paths are checked
    directly. When neither file exists the result comes from
    ``resolve_example_path(path)``.
    """
    original = Path(path)
    h264_sibling = original.with_name(f"{original.stem}_h264{original.suffix}")
    for option in (h264_sibling, original):
        located = option if option.is_absolute() else REPO_ROOT / option
        if located.exists():
            return str(located.resolve())
    return resolve_example_path(path)
def load_json_examples(relative_path: str) -> dict:
    """Parse the UTF-8 JSON file at ``relative_path`` (relative to ``REPO_ROOT``)."""
    example_file = REPO_ROOT / relative_path
    return json.loads(example_file.read_text(encoding="utf-8"))
# One-line summaries keyed by text-to-video example output name. Passed as
# `summaries=` when VIDEO_GENERATION_EXAMPLES is built.
# NOTE(review): make_generation_examples() does not currently read its
# `summaries` argument, and the selected key "000007.mp4" has no entry here.
T2V_EXAMPLE_SUMMARIES = {
    "000000.mp4": "Red panda surfing on a bright seaside wave.",
    "000002.mp4": "Panda cub skateboarding in a creative loft.",
    "000004.mp4": "Young woman shaping clay in a sunlit pottery workshop.",
    "000005.mp4": "Panda boxing a robot in a luxurious palace ring.",
    "000008.mp4": "Fantasy pastel horse stepping through a glowing cloud valley.",
}
def make_generation_examples(
    task_label: str,
    relative_path: str,
    limit: int,
    image_task: bool,
    selected_keys: Optional[list[str]] = None,
    summaries: Optional[dict[str, str]] = None,
) -> list[list]:
    """Build single-column example rows (``[[prompt], ...]``) from a JSON file.

    With a non-empty ``selected_keys``, the rows follow that key order and
    keys absent from the file are skipped (``limit`` is not applied).
    Otherwise the first ``limit`` entries of the file are used.

    ``task_label``, ``image_task`` and ``summaries`` are accepted but not
    used by this function.
    """
    data = load_json_examples(relative_path)
    if selected_keys:
        prompts = [data[key] for key in selected_keys if key in data]
    else:
        prompts = list(data.values())[:limit]
    return [[prompt] for prompt in prompts]
def make_edit_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build ``[prompt, video, image]`` example rows for an edit task.

    Reads the first ``limit`` samples of the JSON file. Each sample's
    ``interleave_array`` supplies the prompt (index 0) and the source media
    path (index 1). The resolved media path fills the column that matches
    ``media_type`` (``"video"`` or ``"image"``); the other column is ``None``.
    ``task_label`` is accepted but not used.
    """
    wants_video = media_type == "video"
    wants_image = media_type == "image"
    rows = []
    for sample in list(load_json_examples(relative_path).values())[:limit]:
        interleave = sample["interleave_array"]
        prompt = interleave[0]
        media_path = resolve_example_path(interleave[1])
        rows.append([
            prompt,
            media_path if wants_video else None,
            media_path if wants_image else None,
        ])
    return rows
def make_understanding_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build ``[question, video, image]`` example rows for an understanding task.

    Reads the first ``limit`` samples of the JSON file. The media path is
    element 0 of ``interleave_array``; videos are resolved with
    ``resolve_browser_video_example_path`` and everything else with
    ``resolve_example_path``. The question is element 1 of the text payload
    when that payload is a list with more than one item, otherwise an empty
    string. ``task_label`` is accepted but not used.
    """
    wants_video = media_type == "video"
    wants_image = media_type == "image"
    resolve_media = resolve_browser_video_example_path if wants_video else resolve_example_path
    rows = []
    for sample in list(load_json_examples(relative_path).values())[:limit]:
        interleave = sample["interleave_array"]
        media_path = resolve_media(interleave[0])
        text_payload = interleave[1]
        has_question = isinstance(text_payload, list) and len(text_payload) > 1
        question = text_payload[1] if has_question else ""
        rows.append([
            question,
            media_path if wants_video else None,
            media_path if wants_image else None,
        ])
    return rows
def make_understanding_system_prompt_map(relative_path: str, task: str) -> dict[str, str]:
    """Map each example question to the system prompt used for ``task``.

    Samples whose text payload (element 1 of ``interleave_array``) is not a
    list of at least two items are skipped. The key is the question (payload
    element 1); the value comes from ``normalize_understanding_system_prompt``.
    """
    prompt_by_question = {}
    for sample in load_json_examples(relative_path).values():
        text_payload = sample["interleave_array"][1]
        if isinstance(text_payload, list) and len(text_payload) >= 2:
            stored_prompt, question = text_payload[0], text_payload[1]
            prompt_by_question[question] = normalize_understanding_system_prompt(task, stored_prompt)
    return prompt_by_question
# Example rows and system-prompt lookups, built once at import time from the
# JSON files under config/examples (resolved relative to REPO_ROOT).
VIDEO_GENERATION_EXAMPLES = make_generation_examples(
    task_label=TASK_LABEL_VIDEO_GENERATION,
    relative_path="config/examples/t2v_example.json",
    limit=6,
    image_task=False,
    selected_keys=["000004.mp4", "000005.mp4", "000002.mp4", "000000.mp4", "000008.mp4", "000007.mp4"],
    summaries=T2V_EXAMPLE_SUMMARIES,
)
VIDEO_EDIT_EXAMPLES = make_edit_examples(
    task_label=TASK_LABEL_VIDEO_EDIT,
    relative_path="config/examples/video_edit_example.json",
    limit=3,
    media_type="video",
)
VIDEO_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    task_label=TASK_LABEL_VIDEO_UNDERSTANDING,
    relative_path="config/examples/x2t_video_example.json",
    limit=3,
    media_type="video",
)
VIDEO_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    relative_path="config/examples/x2t_video_example.json",
    task=TASK_X2T_VIDEO,
)
IMAGE_GENERATION_EXAMPLES = make_generation_examples(
    task_label=TASK_LABEL_IMAGE_GENERATION,
    relative_path="config/examples/t2i_example.json",
    limit=5,
    image_task=True,
    selected_keys=["000000.png", "000003.png", "000006.png", "000008.png", "000009.png"],
)
IMAGE_EDIT_EXAMPLES = make_edit_examples(
    task_label=TASK_LABEL_IMAGE_EDIT,
    relative_path="config/examples/image_edit_example.json",
    limit=5,
    media_type="image",
)
IMAGE_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    task_label=TASK_LABEL_IMAGE_UNDERSTANDING,
    relative_path="config/examples/x2t_image_example.json",
    limit=3,
    media_type="image",
)
IMAGE_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    relative_path="config/examples/x2t_image_example.json",
    task=TASK_X2T_IMAGE,
)
def build_save_dir(task: str) -> Path:
    """Return a timestamped results directory path for ``task``.

    The name is ``<task>_<YYYYmmdd_HHMMSS>_<mmm>`` under ``RESULTS_ROOT``.
    The second and the millisecond suffix come from one clock reading, so
    they always describe the same instant. The run directory itself is not
    created here; only the shared roots are ensured.
    """
    ensure_dirs()
    now = datetime.now()
    millis = now.microsecond // 1000
    return RESULTS_ROOT / f"{task}_{now:%Y%m%d_%H%M%S}_{millis:03d}"
def find_generated_video(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.mp4`` directly inside ``save_dir``, or ``None`` if there is none."""
    return max(save_dir.glob("*.mp4"), key=lambda candidate: candidate.stat().st_mtime, default=None)
def find_generated_image(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.png`` directly inside ``save_dir``, or ``None`` if there is none."""
    return max(save_dir.glob("*.png"), key=lambda candidate: candidate.stat().st_mtime, default=None)
def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tuple[Path, str]:
    """Run the bundled RIFE script on ``video_path`` in a subprocess.

    Args:
        video_path: Video to interpolate. It is made absolute before use.
        device_id: Value exported to the child as ``CUDA_VISIBLE_DEVICES``.
        exp: Passed to the script as ``--exp``; the output file is named
            ``<stem>_rife_<2**exp>x<suffix>`` next to the input.

    Returns:
        ``(output_path, log)`` where ``log`` holds the command line, the
        elapsed time, the output path and the child's stdout/stderr.

    Raises:
        FileNotFoundError: If the RIFE script is missing, or the script
            finished without producing the expected output file.
        RuntimeError: If the script exits with a non-zero status; the message
            includes the command and the captured output.
    """
    import sys  # local import: only needed to locate the running interpreter

    rife_dir = REPO_ROOT / "RIFE"
    rife_script = rife_dir / "inference_video.py"
    if not rife_script.exists():
        raise FileNotFoundError(f"RIFE inference script not found: {rife_script}")

    # The child runs with cwd set to the video's directory, so a relative
    # path would be looked up against the wrong directory there.
    video_path = video_path.resolve()
    output_path = video_path.with_name(f"{video_path.stem}_rife_{2 ** exp}x{video_path.suffix}")

    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = str(device_id)
    command = [
        # Use the interpreter running this app so the child sees the same
        # installed packages; fall back to PATH lookup if it is unknown.
        sys.executable or "python3",
        str(rife_script),
        "--exp",
        str(exp),
        "--video",
        str(video_path),
        "--output",
        str(output_path),
        "--model",
        str(rife_dir / "train_log"),
    ]
    command_line = f"command=CUDA_VISIBLE_DEVICES={device_id} {' '.join(command)}"

    rife_start = time.perf_counter()
    try:
        completed = subprocess.run(
            command,
            cwd=str(video_path.parent),
            env=env,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        raise RuntimeError(
            "\n".join(
                [
                    f"RIFE failed with exit code {exc.returncode}.",
                    command_line,
                    exc.stdout.strip() if exc.stdout else "",
                    exc.stderr.strip() if exc.stderr else "",
                ]
            ).strip()
        ) from exc
    if not output_path.exists():
        raise FileNotFoundError(f"RIFE completed but output video was not found: {output_path}")

    elapsed = time.perf_counter() - rife_start
    log = "\n".join(
        [
            "[rife] Frame interpolation finished.",
            command_line,
            f"elapsed={elapsed:.2f}s",
            f"output={output_path}",
            completed.stdout.strip(),
            completed.stderr.strip(),
        ]
    ).strip()
    return output_path, log
def extract_text_result(save_dir: Path) -> str:
    """Return the first value stored in the prompt-result JSON of ``save_dir``.

    Yields an empty string when the file is missing or holds an empty
    object. A string value is returned as-is; any other value is serialized
    back to JSON text.
    """
    result_file = save_dir / PROMPT_JSON_FILENAME
    if not result_file.exists():
        return ""
    data = json.loads(result_file.read_text(encoding="utf-8"))
    if not data:
        return ""
    first_value = next(iter(data.values()))
    if isinstance(first_value, str):
        return first_value
    return json.dumps(first_value, ensure_ascii=False)
+class LanceT2VV2TPipeline:
+    def __init__(self, device_id: int, model_variant: str = MODEL_VARIANT_VIDEO) -> None:
+        self._init_lock = threading.Lock()
+        self._generate_lock = threading.Lock()
+        self.initialized = False
+        self.device = device_id
+        self.model_variant = normalize_model_variant(model_variant)
+        self.logger = get_logger(f"lance_{self.model_variant}_gpu{device_id}")
+        self.model: Optional[Lance] = None
+        self.vae_model: Optional[WanVideoVAE] = None
+        self.vae_config: Optional[AutoEncoderParams] = None
+        self.tokenizer: Optional[Qwen2Tokenizer] = None
+        self.new_token_ids: Optional[dict] = None
+        self.image_token_id: Optional[int] = None
+        self.base_model_args: Optional[ModelArguments] = None
+        self.base_data_args: Optional[DataArguments] = None
+        self.base_inference_args: Optional[InferenceArguments] = None
+    def _log_stage(self, stage_name: str, start_time: float, extra: str = "") -> None:
+        elapsed = time.perf_counter() - start_time
+        suffix = f" | {extra}" if extra else ""
+        print(f"[startup][gpu:{self.device}] {stage_name} done in {elapsed:.2f}s{suffix}", flush=True)
+    def _build_base_model_args(self) -> ModelArguments:
+        model_path = str(get_model_path(self.model_variant))
+        return ModelArguments(
+            model_path=model_path,
+            vit_type=DEFAULT_VIT_TYPE,
+            llm_qk_norm=True,
+            llm_qk_norm_und=True,
+            llm_qk_norm_gen=True,
+            tie_word_embeddings=False,
+            max_num_frames=121,
+            max_latent_size=64,
+            latent_patch_size=[1, 1, 1],
+        )
+    def _build_base_inference_args(self) -> InferenceArguments:
+        return InferenceArguments(
+            validation_num_timesteps=DEFAULT_TIMESTEPS,
+            validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
+            copy_init_moe=True,
+            visual_und=True,
+            visual_gen=True,
+            vae_model_type="wan",
+            apply_qwen_2_5_vl_pos_emb=True,
+            apply_chat_template=False,
+            cfg_type=0,
+            validation_data_seed=42,
+            video_height=DEFAULT_HEIGHT,
+            video_width=DEFAULT_WIDTH,
+            num_frames=DEFAULT_NUM_FRAMES,
+            task=DEFAULT_TASK,
+            save_path_gen=str(RESULTS_ROOT),
+            resolution=DEFAULT_RESOLUTION,
+            text_template=TEXT_TEMPLATE,
+            use_KVcache=USE_KVCACHE,
+        )
+    def initialize(self) -> None:
+        with self._init_lock:
+            if self.initialized:
+                return
+            ensure_dirs()
+            resolved_model_path = ensure_model_assets(self.model_variant)
+            print(
+                f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
+                flush=True,
+            )
+            if not torch.cuda.is_available():
+                raise RuntimeError("CUDA is unavailable. Lance T2V/V2T Gradio requires a GPU environment.")
+            if self.device >= torch.cuda.device_count():
+                raise RuntimeError(
+                    f"GPU {self.device} is unavailable. Detected {torch.cuda.device_count()} GPU(s)."
+                )
+            torch.cuda.set_device(self.device)
+            model_args = self._build_base_model_args()
+            data_args = DataArguments()
+            inference_args = self._build_base_inference_args()
+            apply_inference_defaults(model_args, data_args, inference_args)
+            inference_args.validation_noise_seed = inference_args.validation_data_seed
+            self.base_model_args = model_args
+            self.base_data_args = data_args
+            self.base_inference_args = inference_args
+            set_seed(inference_args.global_seed)
+            stage_start = time.perf_counter()
+            print(
+                f"[startup][gpu:{self.device}] Loading LLM config: {Path(model_args.model_path) / 'llm_config.json'}",
+                flush=True,
+            )
+            llm_config: Qwen2Config = Qwen2Config.from_json_file(str(Path(model_args.model_path) / "llm_config.json"))
+            self._log_stage("LLM config load", stage_start)
+            llm_config.layer_module = model_args.layer_module
+            llm_config.qk_norm = model_args.llm_qk_norm
+            llm_config.qk_norm_und = model_args.llm_qk_norm_und
+            llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+            llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+            llm_config.freeze_und = inference_args.freeze_und
+            llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb
+            stage_start = time.perf_counter()
+            print(f"[startup][gpu:{self.device}] Initializing LLM weights: {model_args.model_path}", flush=True)
+            language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+            self._log_stage("LLM weight init", stage_start)
+            vit_model = None
+            vit_config = None
+            if inference_args.visual_und:
+                if model_args.vit_type not in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+                    raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+                stage_start = time.perf_counter()
+                print(f"[startup][gpu:{self.device}] Loading VIT config: {model_args.vit_path}", flush=True)
+                vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+                self._log_stage("VIT config load", stage_start)
+                stage_start = time.perf_counter()
+                print(
+                    f"[startup][gpu:{self.device}] Loading VIT weights: {Path(model_args.vit_path) / 'vit.safetensors'}",
+                    flush=True,
+                )
+                vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+                vit_weights = load_file(str(Path(model_args.vit_path) / "vit.safetensors"))
+                vit_model.load_state_dict(vit_weights, strict=True)
+                self._log_stage("VIT weight load", stage_start)
+                clean_memory(vit_weights)
+            if inference_args.visual_gen:
+                stage_start = time.perf_counter()
+                print(f"[startup][gpu:{self.device}] Initializing VAE", flush=True)
+                vae_model = WanVideoVAE()
+                vae_config = deepcopy(vae_model.vae_config)
+                self._log_stage("VAE init", stage_start)
+            else:
+                vae_model = None
+                vae_config = None
+            config = LanceConfig(
+                visual_gen=inference_args.visual_gen,
+                visual_und=inference_args.visual_und,
+                llm_config=llm_config,
+                vit_config=vit_config if inference_args.visual_und else None,
+                vae_config=vae_config if inference_args.visual_gen else None,
+                latent_patch_size=model_args.latent_patch_size,
+                max_num_frames=model_args.max_num_frames,
+                max_latent_size=model_args.max_latent_size,
+                vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+                connector_act=model_args.connector_act,
+                interpolate_pos=model_args.interpolate_pos,
+                timestep_shift=inference_args.timestep_shift,
+            )
+            model: Lance = Lance(
+                language_model=language_model,
+                vit_model=vit_model if inference_args.visual_und else None,
+                vit_type=model_args.vit_type,
+                config=config,
+                training_args=inference_args,
+            )
+            stage_start = time.perf_counter()
+            print(f"[startup][gpu:{self.device}] Moving Lance model to GPU {self.device}", flush=True)
+            model = model.to(self.device)
+            self._log_stage("Lance model move to GPU", stage_start)
+            stage_start = time.perf_counter()
+            print(f"[startup][gpu:{self.device}] Loading tokenizer: {model_args.model_path}", flush=True)
+            tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+            tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+            self._log_stage("tokenizer load and special token init", stage_start, extra=f"num_new_tokens={num_new_tokens}")
+            if inference_args.copy_init_moe:
+                language_model.init_moe()
+            init_from_model_path_if_needed(model, model_args)
+            if num_new_tokens > 0:
+                model.language_model.resize_token_embeddings(len(tokenizer))
+                model.config.llm_config.vocab_size = len(tokenizer)
+                model.language_model.config.vocab_size = len(tokenizer)
+            if model_args.vit_type.lower() == "qwen2_5_vl":
+                from common.model.hacks import hack_qwen2_5_vl_config
+                language_model = hack_qwen2_5_vl_config(language_model)
+            image_token_id = language_model.config.video_token_id
+            new_token_ids.update({"image_token_id": image_token_id})
+            model.update_tokenizer(tokenizer=tokenizer)
+            if model_args.tie_word_embeddings:
+                model.language_model.untie_lm_head()
+                model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+                model_args.tie_word_embeddings = False
+                llm_config.tie_word_embeddings = False
+            else:
+                assert (
+                    model.language_model.get_input_embeddings().weight.data.data_ptr()
+                    != model.language_model.get_output_embeddings().weight.data.data_ptr()
+                ), "tie_word_embeddings conflict"
+            model = model.to(device=self.device, dtype=torch.bfloat16)
+            model.eval()
+            if vae_model is not None and hasattr(vae_model, "eval"):
+                vae_model.eval()
+            self.model = model
+            self.vae_model = vae_model
+            self.vae_config = vae_config
+            self.tokenizer = tokenizer
+            self.new_token_ids = new_token_ids
+            self.image_token_id = image_token_id
+            self.initialized = True
+            print(
+                f"[startup][gpu:{self.device}][{self.model_variant}] Lance multimodal Gradio model loaded and ready for reuse.",
+                flush=True,
+            )
+    def unload(self) -> None:
+        with self._init_lock:
+            if self.model is not None:
+                self.model.cpu()
+            if self.vae_model is not None and hasattr(self.vae_model, "vae"):
+                vae_inner = self.vae_model.vae
+                if hasattr(vae_inner, "model"):
+                    vae_inner.model.cpu()
+            self.model = None
+            self.vae_model = None
+            self.vae_config = None
+            self.tokenizer = None
+            self.new_token_ids = None
+            self.image_token_id = None
+            self.base_model_args = None
+            self.base_data_args = None
+            self.base_inference_args = None
+            self.initialized = False
+            gc.collect()
+            if torch.cuda.is_available():
+                with torch.cuda.device(self.device):
+                    torch.cuda.empty_cache()
+                    torch.cuda.ipc_collect()
+    def _build_request_batch(
+        self,
+        prompt_file: Path,
+        model_args: ModelArguments,
+        data_args: DataArguments,
+        inference_args: InferenceArguments,
+    ):
+        assert self.tokenizer is not None
+        assert self.new_token_ids is not None
+        assert self.vae_config is not None
+        dataset_config = DataConfig.from_yaml(str(prompt_file))
+        if inference_args.visual_und:
+            dataset_config.vit_patch_size = model_args.vit_patch_size
+            dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+            dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+        if inference_args.visual_gen:
+            vae_downsample = tuple_mul(
+                tuple(model_args.latent_patch_size),
+                (
+                    self.vae_config.downsample_temporal,
+                    self.vae_config.downsample_spatial,
+                    self.vae_config.downsample_spatial,
+                ),
+            )
+            dataset_config.latent_patch_size = model_args.latent_patch_size
+            dataset_config.vae_downsample = vae_downsample
+            dataset_config.max_latent_size = model_args.max_latent_size
+            dataset_config.max_num_frames = model_args.max_num_frames
+        dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+        dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+        dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+        dataset_config.num_frames = inference_args.num_frames
+        dataset_config.H = inference_args.video_height
+        dataset_config.W = inference_args.video_width
+        dataset_config.task = inference_args.task
+        dataset_config.resolution = inference_args.resolution
+        dataset_config.text_template = inference_args.text_template
+        val_dataset = ValidationDataset(
+            jsonl_path=str(prompt_file),
+            tokenizer=self.tokenizer,
+            data_args=data_args,
+            model_args=model_args,
+            training_args=inference_args,
+            new_token_ids=self.new_token_ids,
+            dataset_config=dataset_config,
+            local_rank=0,
+            world_size=1,
+        )
+        return simple_custom_collate([val_dataset[0]])
+    def generate(
+        self,
+        task: str,
+        prompt: str,
+        system_prompt: Optional[str],
+        input_video: Optional[str],
+        input_image: Optional[str],
+        height: int,
+        width: int,
+        num_frames: int,
+        seed: int,
+        resolution: str,
+        validation_num_timesteps: int,
+        validation_timestep_shift: float,
+        cfg_text_scale: float,
+        enable_frame_interpolation: bool,
+    ):
+        """Run one request on this pipeline and return its outputs.
+
+        Calls ``initialize()`` first, validates the inputs, then, while holding
+        ``_generate_lock``, writes a request file, builds the batch, runs
+        ``validate_on_fixed_batch`` and collects the produced video, image or
+        text from the save directory. A record of the request is written with
+        ``save_generation_record`` on both success and failure.
+
+        Returns:
+            A 5-tuple ``(video_path, image_path, text_result, status, logs)``.
+            Validation failures return ``(None, None, "", <message>, "")``.
+            Exceptions raised inside the ``try`` block are not re-raised; the
+            status describes the failure and ``logs`` holds the traceback.
+        """
+        self.initialize()
+        internal_task = normalize_task(task)
+        prompt = (prompt or "").strip()
+        input_video = str(input_video).strip() if input_video else ""
+        input_image = str(input_image).strip() if input_image else ""
+        # Reject incomplete requests early; the message goes into the status slot.
+        if internal_task in GENERATION_TASKS and not prompt:
+            return None, None, "", "Please enter a prompt.", ""
+        if internal_task in UNDERSTANDING_TASKS and not prompt:
+            return None, None, "", "Please enter a question.", ""
+        if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
+            return None, None, "", "Please upload an input video.", ""
+        if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
+            return None, None, "", "Please upload an input image.", ""
+        if height <= 0 or width <= 0:
+            return None, None, "", "Height and width must be greater than 0.", ""
+        if num_frames <= 0:
+            return None, None, "", "The number of frames must be greater than 0.", ""
+        assert self.model is not None
+        assert self.tokenizer is not None
+        assert self.new_token_ids is not None
+        assert self.image_token_id is not None
+        assert self.base_model_args is not None
+        assert self.base_data_args is not None
+        assert self.base_inference_args is not None
+        active_model_path = self.base_model_args.model_path
+        # Only one request at a time runs on this pipeline instance.
+        with self._generate_lock:
+            torch.cuda.set_device(self.device)
+            actual_seed = normalize_seed(int(seed))
+            prompt_file = create_request_json(
+                task=internal_task,
+                prompt=prompt,
+                input_video=input_video,
+                input_image=input_image,
+                system_prompt=system_prompt,
+            )
+            save_dir = build_save_dir(internal_task)
+            save_dir.mkdir(parents=True, exist_ok=True)
+            request_started_at = datetime.now().isoformat(timespec="seconds")
+            # Work on deep copies so per-request overrides never touch the base arguments.
+            request_model_args = deepcopy(self.base_model_args)
+            request_model_args.cfg_text_scale = float(cfg_text_scale)
+            request_data_args = deepcopy(self.base_data_args)
+            request_data_args.val_dataset_config_file = str(prompt_file)
+            request_inference_args = deepcopy(self.base_inference_args)
+            request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
+            request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
+            # The same seed is used for both the data seed and the noise seed.
+            request_inference_args.validation_data_seed = actual_seed
+            request_inference_args.validation_noise_seed = actual_seed
+            request_inference_args.video_height = int(height)
+            request_inference_args.video_width = int(width)
+            request_inference_args.num_frames = int(num_frames)
+            # display_resolution is what the caller passed; backend_resolution is what the model receives.
+            display_resolution = str(resolution)
+            backend_resolution = normalize_resolution_for_backend(display_resolution, internal_task)
+            request_inference_args.resolution = backend_resolution
+            request_inference_args.save_path_gen = str(save_dir)
+            request_inference_args.task = internal_task
+            request_inference_args.text_template = TEXT_TEMPLATE
+            request_inference_args.prompt_data_dict = {}
+            try:
+                print(
+                    "[lance_gradio_t2v_v2t] Start generation "
+                    f"| task={internal_task} | gpu={self.device} | seed={actual_seed} | "
+                    f"size={height}x{width} | frames={num_frames} | resolution={display_resolution}",
+                    flush=True,
+                )
+                val_data_cpu = self._build_request_batch(
+                    prompt_file=prompt_file,
+                    model_args=request_model_args,
+                    data_args=request_data_args,
+                    inference_args=request_inference_args,
+                )
+                # elapsed covers only the validate_on_fixed_batch call.
+                generate_start = time.perf_counter()
+                validate_on_fixed_batch(
+                    fsdp_model=self.model,
+                    vae_model=self.vae_model,
+                    tokenizer=self.tokenizer,
+                    val_data_cpu=val_data_cpu,
+                    training_args=request_inference_args,
+                    model_args=request_model_args,
+                    inference_args=request_inference_args,
+                    new_token_ids=self.new_token_ids,
+                    image_token_id=self.image_token_id,
+                    device=self.device,
+                    save_source_video=False,
+                    save_path_gen=request_inference_args.save_path_gen,
+                    save_path_gt="",
+                )
+                elapsed = time.perf_counter() - generate_start
+                save_prompt_results(request_inference_args.prompt_data_dict, request_inference_args.save_path_gen, self.logger)
+                clean_memory()
+                video_path = find_generated_video(save_dir) if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} else None
+                # Remember the pre-interpolation path; video_path may be replaced by the RIFE output below.
+                original_video_path = video_path
+                rife_log = ""
+                rife_error = ""
+                frame_interpolation_enabled = bool(enable_frame_interpolation) and internal_task in {TASK_T2V, TASK_VIDEO_EDIT}
+                if frame_interpolation_enabled and video_path is not None:
+                    # Interpolation is best-effort: on failure the traceback is kept in
+                    # rife_error and video_path still points at the original video.
+                    try:
+                        clean_memory()
+                        print(
+                            "[rife] Start frame interpolation "
+                            f"| task={internal_task} | gpu={self.device} | input={video_path}",
+                            flush=True,
+                        )
+                        video_path, rife_log = run_rife_interpolation(video_path, self.device, exp=1)
+                    except Exception:
+                        rife_error = traceback.format_exc()
+                        print(rife_error, flush=True)
+                image_path = find_generated_image(save_dir) if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} else None
+                text_result = extract_text_result(save_dir) if internal_task in UNDERSTANDING_TASKS else ""
+                record = {
+                    "request_started_at": request_started_at,
+                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
+                    "status": "success",
+                    "task": internal_task,
+                    "model_variant": self.model_variant,
+                    "model_path": active_model_path,
+                    "gpu": self.device,
+                    "prompt": prompt,
+                    "system_prompt": normalize_understanding_system_prompt(internal_task, system_prompt)
+                    if internal_task in UNDERSTANDING_TASKS
+                    else "",
+                    "input_video": input_video,
+                    "input_image": input_image,
+                    "seed": actual_seed,
+                    "height": int(height),
+                    "width": int(width),
+                    "num_frames": int(num_frames),
+                    "resolution": display_resolution,
+                    "backend_resolution": backend_resolution,
+                    "validation_num_timesteps": int(validation_num_timesteps),
+                    "validation_timestep_shift": float(validation_timestep_shift),
+                    "cfg_text_scale": float(cfg_text_scale),
+                    "frame_interpolation": frame_interpolation_enabled,
+                    "elapsed_seconds": round(elapsed, 3),
+                    "prompt_file": str(prompt_file),
+                    "output_dir": str(save_dir),
+                    "original_video_path": str(original_video_path) if original_video_path is not None else "",
+                    "video_path": str(video_path) if video_path is not None else "",
+                    "image_path": str(image_path) if image_path is not None else "",
+                    "text_result": text_result,
+                    "rife_error": rife_error,
+                }
+                # Downgrade the recorded status when the expected artifact for the task is missing.
+                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} and video_path is None:
+                    record["status"] = "completed_without_video"
+                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} and image_path is None:
+                    record["status"] = "completed_without_image"
+                if internal_task in UNDERSTANDING_TASKS and not text_result:
+                    record["status"] = "completed_without_text"
+                save_generation_record(record, save_dir)
+                logs = "\n".join(
+                    [
+                        "[lance_gradio_t2v_v2t] Inference finished in-process.",
+                        f"task={internal_task}",
+                        f"model_variant={self.model_variant}",
+                        f"model_path={active_model_path}",
+                        f"gpu={self.device}",
+                        f"seed={actual_seed}",
+                        f"height={height}",
+                        f"width={width}",
+                        f"num_frames={num_frames}",
+                        f"resolution={display_resolution}",
+                        f"backend_resolution={backend_resolution}",
+                        f"validation_num_timesteps={validation_num_timesteps}",
+                        f"validation_timestep_shift={validation_timestep_shift}",
+                        f"cfg_text_scale={cfg_text_scale}",
+                        f"frame_interpolation={frame_interpolation_enabled}",
+                        f"original_video_path={original_video_path or ''}",
+                        f"rife_error={rife_error.strip() if rife_error else ''}",
+                        f"elapsed={elapsed:.2f}s",
+                        f"output_dir={save_dir}",
+                        rife_log,
+                    ]
+                )
+                # Video tasks: return the video path in the first slot.
+                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT}:
+                    if video_path is None:
+                        status = (
+                            "Inference completed, but no output video was found.\n\n"
+                            f"- Task: `{internal_task}`\n"
+                            f"- Model: `{self.model_variant}`\n"
+                            f"- Model path: `{active_model_path}`\n"
+                            f"- GPU: `{self.device}`\n"
+                            f"- Actual seed: `{actual_seed}`\n"
+                            f"- Output directory: `{save_dir}`"
+                        )
+                        return None, None, "", status, logs
+                    # The detailed success status below is disabled; an empty status is returned instead.
+                    # status = (
+                    #     "Inference completed.\n\n"
+                    #     f"- Task: `{internal_task}`\n"
+                    #     f"- Model: `{self.model_variant}`\n"
+                    #     f"- Model path: `{active_model_path}`\n"
+                    #     f"- GPU: `{self.device}`\n"
+                    #     f"- Actual seed: `{actual_seed}`\n"
+                    #     f"- Output directory: `{save_dir}`\n"
+                    #     f"- Result file: `{video_path}`"
+                    # )
+                    status = ""
+                    return str(video_path), None, "", status, logs
+                # Image tasks: return the image path in the second slot.
+                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT}:
+                    if image_path is None:
+                        status = (
+                            "Inference completed, but no output image was found.\n\n"
+                            f"- Task: `{internal_task}`\n"
+                            f"- Model: `{self.model_variant}`\n"
+                            f"- Model path: `{active_model_path}`\n"
+                            f"- GPU: `{self.device}`\n"
+                            f"- Actual seed: `{actual_seed}`\n"
+                            f"- Output directory: `{save_dir}`"
+                        )
+                        return None, None, "", status, logs
+                    # The detailed success status below is disabled; an empty status is returned instead.
+                    # status = (
+                    #     "Inference completed.\n\n"
+                    #     f"- Task: `{internal_task}`\n"
+                    #     f"- Model: `{self.model_variant}`\n"
+                    #     f"- Model path: `{active_model_path}`\n"
+                    #     f"- GPU: `{self.device}`\n"
+                    #     f"- Actual seed: `{actual_seed}`\n"
+                    #     f"- Output directory: `{save_dir}`\n"
+                    #     f"- Result file: `{image_path}`"
+                    # )
+                    status = ""
+                    return None, str(image_path), "", status, logs
+                # Remaining tasks: return the extracted text in the third slot.
+                # The detailed success status below is disabled; an empty status is returned instead.
+                # status = (
+                #     "Understanding completed.\n\n"
+                #     f"- Task: `{task}`\n"
+                #     f"- Model: `{self.model_variant}`\n"
+                #     f"- Model path: `{active_model_path}`\n"
+                #     f"- GPU: `{self.device}`\n"
+                #     f"- Actual seed: `{actual_seed}`\n"
+                #     f"- Output directory: `{save_dir}`"
+                # )
+                status = ""
+                return None, None, text_result, status, logs
+            except Exception:
+                # Failures are not re-raised: they are printed, saved as a "failed"
+                # record, and reported through the status and logs slots.
+                error_trace = traceback.format_exc()
+                print(error_trace, flush=True)
+                record = {
+                    "request_started_at": request_started_at,
+                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
+                    "status": "failed",
+                    "task": internal_task,
+                    "model_variant": self.model_variant,
+                    "model_path": active_model_path,
+                    "gpu": self.device,
+                    "prompt": prompt,
+                    "input_video": input_video,
+                    "input_image": input_image,
+                    "seed": actual_seed,
+                    "height": int(height),
+                    "width": int(width),
+                    "num_frames": int(num_frames),
+                    "resolution": display_resolution,
+                    "backend_resolution": backend_resolution,
+                    "validation_num_timesteps": int(validation_num_timesteps),
+                    "validation_timestep_shift": float(validation_timestep_shift),
+                    "cfg_text_scale": float(cfg_text_scale),
+                    "prompt_file": str(prompt_file),
+                    "output_dir": str(save_dir),
+                    "video_path": "",
+                    "image_path": "",
+                    "text_result": "",
+                    "error": error_trace,
+                }
+                save_generation_record(record, save_dir)
+                status = (
+                    "Inference failed.\n\n"
+                    f"- Task: `{internal_task}`\n"
+                    f"- Model: `{self.model_variant}`\n"
+                    f"- Model path: `{active_model_path}`\n"
+                    f"- GPU: `{self.device}`\n"
+                    f"- Actual seed: `{actual_seed}`\n"
+                    f"- Resolution: `{display_resolution}`\n"
+                    f"- Output directory: `{save_dir}`"
+                )
+                return None, None, "", status, error_trace
+class PipelinePool:
+    def __init__(self, gpu_ids: list[int], model_variant: str = MODEL_VARIANT_VIDEO) -> None:
+        if not gpu_ids:
+            raise ValueError("At least one GPU must be configured.")
+        self.gpu_ids = gpu_ids
+        self.model_variant = normalize_model_variant(model_variant)
+        self.pipelines = [
+            LanceT2VV2TPipeline(device_id=gpu_id, model_variant=self.model_variant)
+            for gpu_id in gpu_ids
+        ]
+        self._available = deque(self.pipelines)
+        self._condition = threading.Condition()
+    @property
+    def size(self) -> int:
+        return len(self.pipelines)
+    @property
+    def gpu_summary(self) -> str:
+        return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
+    def initialize_all(self) -> None:
+        print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
+        exceptions: list[Exception] = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
+            futures = {
+                executor.submit(pipeline.initialize): pipeline.device for pipeline in self.pipelines
+            }
+            for future in concurrent.futures.as_completed(futures):
+                gpu_id = futures[future]
+                try:
+                    future.result()
+                except Exception as exc:
+                    print(f"[startup][gpu:{gpu_id}][{self.model_variant}] Preload failed: {exc}", flush=True)
+                    exceptions.append(exc)
+        if exceptions:
+            raise RuntimeError(
+                f"{self.model_variant} preload failed on {len(exceptions)} GPU(s). Please check the terminal logs."
+            ) from exceptions[0]
+        print(
+            f"[startup][{self.model_variant}] GPU preload finished. Ready to handle {self.size} concurrent request(s).",
+            flush=True,
+        )
+    def acquire(self) -> LanceT2VV2TPipeline:
+        with self._condition:
+            while not self._available:
+                self._condition.wait()
+            return self._available.popleft()
+    def release(self, pipeline: LanceT2VV2TPipeline) -> None:
+        with self._condition:
+            self._available.append(pipeline)
+            self._condition.notify()
+    def unload_all(self) -> None:
+        print(f"[runtime][{self.model_variant}] Unloading model pool from GPU(s): {self.gpu_ids}", flush=True)
+        with self._condition:
+            while len(self._available) != len(self.pipelines):
+                self._condition.wait()
+        for pipeline in self.pipelines:
+            pipeline.unload()
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        print(f"[runtime][{self.model_variant}] Model pool unloaded.", flush=True)
+    def generate(
+        self,
+        task: str,
+        prompt: str,
+        system_prompt: Optional[str],
+        input_video: Optional[str],
+        input_image: Optional[str],
+        height: int,
+        width: int,
+        num_frames: int,
+        seed: int,
+        resolution: str,
+        validation_num_timesteps: int,
+        validation_timestep_shift: float,
+        cfg_text_scale: float,
+        enable_frame_interpolation: bool,
+    ):
+        pipeline = self.acquire()
+        try:
+            return pipeline.generate(
+                task=task,
+                prompt=prompt,
+                system_prompt=system_prompt,
+                input_video=input_video,
+                input_image=input_image,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                seed=seed,
+                resolution=resolution,
+                validation_num_timesteps=validation_num_timesteps,
+                validation_timestep_shift=validation_timestep_shift,
+                cfg_text_scale=cfg_text_scale,
+                enable_frame_interpolation=enable_frame_interpolation,
+            )
+        finally:
+            self.release(pipeline)
+# The pool whose model variant is currently loaded; assigned by get_pipeline_pool().
+ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
+# Held by get_pipeline_pool() while it checks, unloads, or replaces ACTIVE_PIPELINE_POOL.
+ACTIVE_POOL_LOCK = threading.Lock()
+# Queue limit reported by build_status_markdown().
+QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
+def get_task_model_variant(task: str) -> str:
+    internal_task = normalize_task(task)
+    return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
+def get_pipeline_pool(task: str) -> PipelinePool:
+    global ACTIVE_PIPELINE_POOL
+    model_variant = get_task_model_variant(task)
+    with ACTIVE_POOL_LOCK:
+        if ACTIVE_PIPELINE_POOL is not None and ACTIVE_PIPELINE_POOL.model_variant == model_variant:
+            return ACTIVE_PIPELINE_POOL
+        gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
+        if ACTIVE_PIPELINE_POOL is not None:
+            previous_variant = ACTIVE_PIPELINE_POOL.model_variant
+            print(
+                f"[runtime] Switching Lance model from {previous_variant} to {model_variant}.",
+                flush=True,
+            )
+            ACTIVE_PIPELINE_POOL.unload_all()
+            ACTIVE_PIPELINE_POOL = None
+        ACTIVE_PIPELINE_POOL = PipelinePool(gpu_ids, model_variant=model_variant)
+        ACTIVE_PIPELINE_POOL.initialize_all()
+        return ACTIVE_PIPELINE_POOL
+def run_task(
+    task: str,
+    prompt: str,
+    system_prompt: Optional[str],
+    input_video: Optional[str],
+    input_image: Optional[str],
+    height: int,
+    width: int,
+    num_frames: int,
+    seed: int,
+    resolution: str,
+    validation_num_timesteps: int,
+    validation_timestep_shift: float,
+    cfg_text_scale: float,
+    enable_frame_interpolation: bool,
+):
+    pipeline_pool = get_pipeline_pool(task)
+    return pipeline_pool.generate(
+        task=task,
+        prompt=prompt,
+        system_prompt=system_prompt,
+        input_video=input_video,
+        input_image=input_image,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        seed=seed,
+        resolution=resolution,
+        validation_num_timesteps=validation_num_timesteps,
+        validation_timestep_shift=validation_timestep_shift,
+        cfg_text_scale=cfg_text_scale,
+        enable_frame_interpolation=enable_frame_interpolation,
+    )
+def build_status_markdown() -> str:
+    gpu_text = "unknown"
+    concurrency = 1
+    active_variant = "none"
+    if ACTIVE_PIPELINE_POOL is not None:
+        active_variant = ACTIVE_PIPELINE_POOL.model_variant
+        gpu_text = ACTIVE_PIPELINE_POOL.gpu_summary
+        concurrency = ACTIVE_PIPELINE_POOL.size
+    return (
+        f"**Status**  GPU: `{gpu_text}`  |  Max concurrency: `{concurrency}`  |  "
+        f"Queue limit: `{QUEUE_MAX_SIZE}`  |  Active model: `{active_variant}`  |  "
+        f"Switch mode: `unload then load`"
+    )
+def get_logo_data_uri() -> str:
+    if not LANCE_LOGO_PATH.exists():
+        return ""
+    encoded_logo = base64.b64encode(LANCE_LOGO_PATH.read_bytes()).decode("ascii")
+    return f"data:image/webp;base64,{encoded_logo}"
+def build_header_html() -> str:
+    logo_data_uri = get_logo_data_uri()
+    logo_html = (
+        f'<img class="lance-logo" src="{logo_data_uri}" alt="Lance logo">'
+        if logo_data_uri
+        else ""
+    )
+    return f"""
+    <div class="lance-hero">
+        {logo_html}
+        <h1 class="lance-title">Lance: Unified Multimodal Modeling by Multi-Task Synergy</h1>
+        <div class="lance-authors">
+            <strong>
+                <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" target="_blank">Fengyi Fu</a><sup>*</sup>,
+                <a href="https://corleone-huang.github.io/" target="_blank">Mengqi Huang</a><sup>*,✉</sup>,
+                <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" target="_blank">Shaojin Wu</a><sup>*</sup>,
+                Yunsheng Jiang<sup>*</sup>,
+                Yufei Huo,
+                <a href="https://guojianzhu.com/" target="_blank">Jianzhu Guo</a><sup>✉,§</sup>
+            </strong><br>
+            Hao Li, Yinghang Song, Fei Ding, Qian He, Zheren Fu, Zhendong Mao, Yongdong Zhang<br>
+            <em>ByteDance</em>
+        </div>
+        <div class="lance-badges">
+            <a href="{LANCE_HOMEPAGE_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="Homepage" src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat">
+            </a>
+            <a href="{LANCE_PAPER_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="Paper" src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv">
+            </a>
+            <a href="{LANCE_HUGGING_FACE_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="Hugging Face" src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface">
+            </a>
+            <a href="{LANCE_GITHUB_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="GitHub" src="https://img.shields.io/badge/Code-GitHub-536af5?color=536af5&logo=github">
+            </a>
+        </div>
+    </div>
+    """
+def update_task_ui(task: str):
+    internal_task = normalize_task(task)
+    is_image_task = internal_task in IMAGE_TASKS
+    is_video_task = internal_task in VIDEO_TASKS
+    is_edit_task = internal_task in EDIT_TASKS
+    is_understanding_task = internal_task in UNDERSTANDING_TASKS
+    is_generation_task = internal_task in GENERATION_TASKS
+    show_media_input = is_edit_task or is_understanding_task
+    resolution_choices = IMAGE_RESOLUTION_CHOICES if is_image_task else VIDEO_RESOLUTION_CHOICES
+    resolution_value = DEFAULT_IMAGE_RESOLUTION if is_image_task else DEFAULT_RESOLUTION
+    aspect_ratio_value = DEFAULT_IMAGE_ASPECT_RATIO if is_image_task else DEFAULT_VIDEO_ASPECT_RATIO
+    width_value, height_value = get_size_for_aspect_ratio(internal_task, aspect_ratio_value)
+    size_markdown = format_size_markdown(internal_task, width_value, height_value)
+    system_prompt_choices = get_understanding_system_prompt_choices(internal_task)
+    if is_generation_task:
+        text_label = "Prompt"
+        text_placeholder = "Describe what you want to generate..."
+    elif is_edit_task:
+        text_label = "Instruction"
+        text_placeholder = "Describe the edit you want..."
+    else:
+        text_label = "Question"
+        text_placeholder = "Ask a question about the input..."
+    return (
+        gr.update(
+            label=text_label,
+            placeholder=text_placeholder,
+            visible=True,
+        ),
+        gr.update(
+            choices=system_prompt_choices,
+            value=system_prompt_choices[0],
+            visible=False,
+        ),
+        gr.update(label="Input Video", visible=show_media_input and is_video_task),
+        gr.update(label="Input Image", visible=show_media_input and is_image_task),
+        gr.update(value=aspect_ratio_value, visible=is_generation_task or is_edit_task),
+        gr.update(value=height_value),
+        gr.update(value=width_value),
+        gr.update(value=size_markdown, visible=is_generation_task or is_edit_task),
+        gr.update(visible=internal_task == TASK_T2V, value=DEFAULT_NUM_FRAMES),
+        gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}, value=DEFAULT_FRAME_INTERPOLATION),
+        gr.update(choices=resolution_choices, value=resolution_value, visible=False),
+        gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}),
+        gr.update(visible=internal_task in {TASK_T2I, TASK_IMAGE_EDIT}),
+        gr.update(visible=is_understanding_task, value=""),
+        gr.update(visible=internal_task == TASK_T2V),
+        gr.update(visible=internal_task == TASK_VIDEO_EDIT),
+        gr.update(visible=internal_task == TASK_X2T_VIDEO),
+        gr.update(visible=internal_task == TASK_T2I),
+        gr.update(visible=internal_task == TASK_IMAGE_EDIT),
+        gr.update(visible=internal_task == TASK_X2T_IMAGE),
+    )
+def keep_example_clicks_from_changing_visibility(*examples_components) -> None:
+    for examples_component in examples_components:
+        dataset = getattr(examples_component, "dataset", None)
+        component_props = getattr(dataset, "component_props", None)
+        if not component_props:
+            continue
+        for props in component_props:
+            props.pop("visible", None)
+def build_demo() -> gr.Blocks:
+    """Assemble the Gradio ``Blocks`` app: layout, example tables and event wiring.
+
+    The left column holds the task selector and all inputs, the right column
+    the outputs, status line and run logs. Below the RUN button there is one
+    example group per task; only the group of the active task is visible.
+
+    Returns:
+        The constructed (not yet launched) ``gr.Blocks`` instance.
+    """
+    with gr.Blocks(title="Lance", css=APP_CSS) as demo:
+        gr.HTML(build_header_html())
+        # Status markdown is created but hidden.
+        gr.Markdown(build_status_markdown(), elem_classes=["lance-status"], visible=False)
+        with gr.Row(elem_classes=["lance-main-row"]):
+            # ---- Left column: task selection and inputs ----
+            with gr.Column(scale=1, elem_classes=["lance-main-column"]):
+                task = gr.Radio(
+                    label="Task",
+                    choices=TASK_CHOICES,
+                    value=TASK_LABEL_VIDEO_GENERATION,
+                    elem_classes=["task-selector"],
+                )
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    lines=6,
+                    placeholder="Describe the video you want to generate...",
+                )
+                # Hidden dropdown; its value is still passed to run_task.
+                system_prompt = gr.Dropdown(
+                    label="System Prompt",
+                    choices=get_understanding_system_prompt_choices(TASK_X2T_VIDEO),
+                    value=V2T_QA_SYSTEM_PROMPT,
+                    visible=False,
+                )
+                # Media inputs start hidden; update_task_ui toggles them per task.
+                input_video = gr.Video(label="Input Video", visible=False, elem_classes=["lance-display-frame"])
+                input_image = gr.Image(label="Input Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
+                with gr.Row():
+                    seed = gr.Number(
+                        label="Seed (-1 for random seed)",
+                        value=DEFAULT_BASIC_SEED,
+                        precision=0,
+                        # info="-1 for random seed",
+                    )
+                    aspect_ratio = gr.Dropdown(
+                        label="Aspect Ratio",
+                        # choices=ASPECT_RATIO_CHOICES,  # original version: does not mark which ratio is the default
+                        choices=get_aspect_ratio_choices_for_task(TASK_T2V),
+                        value=DEFAULT_VIDEO_ASPECT_RATIO,
+                    )
+                    # real_size = gr.Markdown(format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT))
+                    # Read-only display of the output size derived from the aspect ratio.
+                    real_size = gr.Textbox(
+                        label="Output Resolution",
+                        value=format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT),
+                        interactive=False,
+                    )
+                    enable_frame_interpolation = gr.Checkbox(
+                        label="Frame Interpolation",
+                        value=DEFAULT_FRAME_INTERPOLATION,
+                    )
+                # Hidden state-carrying controls: resolution, height and width are
+                # never shown but are updated by callbacks and passed to run_task.
+                resolution = gr.Dropdown(
+                    label="Resolution",
+                    choices=RESOLUTION_CHOICES,
+                    value=DEFAULT_RESOLUTION,
+                    visible=False,
+                )
+                height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
+                width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
+                num_frames = gr.Slider(
+                    minimum=1,
+                    maximum=121,
+                    step=1,
+                    value=DEFAULT_NUM_FRAMES,
+                    label="Output Frames",
+                )
+                # seed = gr.Number(
+                #     label="Seed",
+                #     value=DEFAULT_BASIC_SEED,
+                #     precision=0,
+                #     info="-1 means using a random seed each time",
+                # )
+                with gr.Accordion("Advanced Parameters", open=False):
+                    validation_num_timesteps = gr.Slider(
+                        minimum=1,
+                        maximum=50,
+                        step=1,
+                        value=DEFAULT_TIMESTEPS,
+                        label="Validation Num Timesteps",
+                    )
+                    with gr.Row():
+                        validation_timestep_shift = gr.Number(
+                            label="Validation Timestep Shift",
+                            value=DEFAULT_TIMESTEP_SHIFT,
+                        )
+                        cfg_text_scale = gr.Number(
+                            label="CFG Text Scale",
+                            value=DEFAULT_CFG_TEXT_SCALE,
+                        )
+                # Components filled in by the gr.Examples tables below.
+                generation_example_inputs = [
+                    prompt,
+                    input_video,
+                    input_image,
+                ]
+            # ---- Right column: outputs, status and logs ----
+            with gr.Column(scale=1, elem_classes=["lance-main-column"]):
+                output_video = gr.Video(label="Output Video", elem_classes=["lance-display-frame"])
+                output_image = gr.Image(label="Output Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
+                output_text = gr.Textbox(label="Output Text", lines=8, visible=False, elem_classes=["lance-display-frame"])
+                status = gr.Markdown("WAITING TO RUN.")
+                logs = gr.Textbox(label="Run Logs", lines=22, max_lines=30)
+        run_button = gr.Button("RUN", variant="primary")
+        # ---- Example groups: one per task. Only the video-generation group is
+        # visible initially, matching the default task. The two generation
+        # tasks use a prompt-only Dataframe; the others use gr.Examples. ----
+        with gr.Group(visible=True, elem_classes=["prompt-examples"]) as video_generation_examples_group:
+            gr.Markdown("### Video generation recommended cases")
+            video_generation_examples = gr.Dataframe(
+                value=VIDEO_GENERATION_EXAMPLES,
+                headers=["Prompt"],
+                datatype=["str"],
+                interactive=False,
+                show_row_numbers=False,
+                wrap=True,
+                line_breaks=True,
+                row_count=(len(VIDEO_GENERATION_EXAMPLES), "fixed"),
+                col_count=(1, "fixed"),
+                max_height=420,
+                elem_classes=["prompt-table"],
+            )
+        with gr.Group(visible=False) as video_edit_examples_group:
+            gr.Markdown("### Video edit recommended cases")
+            video_edit_examples = gr.Examples(
+                examples=VIDEO_EDIT_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=3,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        with gr.Group(visible=False) as video_understanding_examples_group:
+            gr.Markdown("### Video understanding recommended cases")
+            video_understanding_examples = gr.Examples(
+                examples=VIDEO_UNDERSTANDING_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=4,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        with gr.Group(visible=False, elem_classes=["prompt-examples"]) as image_generation_examples_group:
+            gr.Markdown("### Image generation recommended cases")
+            image_generation_examples = gr.Dataframe(
+                value=IMAGE_GENERATION_EXAMPLES,
+                headers=["Prompt"],
+                datatype=["str"],
+                interactive=False,
+                show_row_numbers=False,
+                wrap=True,
+                line_breaks=True,
+                row_count=(len(IMAGE_GENERATION_EXAMPLES), "fixed"),
+                col_count=(1, "fixed"),
+                max_height=420,
+                elem_classes=["prompt-table"],
+            )
+        with gr.Group(visible=False) as image_edit_examples_group:
+            gr.Markdown("### Image edit recommended cases")
+            image_edit_examples = gr.Examples(
+                examples=IMAGE_EDIT_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=5,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        with gr.Group(visible=False) as image_understanding_examples_group:
+            gr.Markdown("### Image understanding recommended cases")
+            image_understanding_examples = gr.Examples(
+                examples=IMAGE_UNDERSTANDING_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=4,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        # The helper only touches objects exposing ``dataset.component_props``;
+        # the two Dataframe components are passed too and are skipped by it if
+        # they lack that attribute.
+        keep_example_clicks_from_changing_visibility(
+            video_generation_examples,
+            video_edit_examples,
+            video_understanding_examples,
+            image_generation_examples,
+            image_edit_examples,
+            image_understanding_examples,
+        )
+        # ---- Event wiring ----
+        # The outputs order below must match the tuple returned by update_task_ui.
+        task.change(
+            fn=update_task_ui,
+            inputs=[task],
+            outputs=[
+                prompt,
+                system_prompt,
+                input_video,
+                input_image,
+                aspect_ratio,
+                height,
+                width,
+                real_size,
+                num_frames,
+                enable_frame_interpolation,
+                resolution,
+                output_video,
+                output_image,
+                output_text,
+                video_generation_examples_group,
+                video_edit_examples_group,
+                video_understanding_examples_group,
+                image_generation_examples_group,
+                image_edit_examples_group,
+                image_understanding_examples_group,
+            ],
+        )
+        # Recompute the hidden height/width and the displayed size on ratio change.
+        aspect_ratio.change(
+            fn=update_size_from_aspect_ratio,
+            inputs=[task, aspect_ratio],
+            outputs=[height, width, real_size],
+            queue=False,
+            show_api=False,
+        )
+        # After a gr.Examples row has been loaded into the inputs, reset the
+        # size-related controls to the defaults of the current task.
+        for examples_component in (video_edit_examples, video_understanding_examples, image_edit_examples, image_understanding_examples):
+            examples_component.load_input_event.then(
+                fn=reset_generation_defaults_for_task,
+                inputs=[task],
+                outputs=[aspect_ratio, height, width, num_frames, resolution, real_size],
+                queue=False,
+                show_api=False,
+            )
+        # Selecting a row in a prompt table runs apply_prompt_example, which
+        # writes the prompt and the size-related controls listed in outputs.
+        video_generation_examples.select(
+            fn=apply_prompt_example,
+            inputs=[task],
+            outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
+            queue=False,
+            show_api=False,
+        )
+        image_generation_examples.select(
+            fn=apply_prompt_example,
+            inputs=[task],
+            outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
+            queue=False,
+            show_api=False,
+        )
+        # RUN goes through the queue (no queue=False here), unlike the
+        # callbacks above.
+        run_button.click(
+            fn=run_task,
+            inputs=[
+                task,
+                prompt,
+                system_prompt,
+                input_video,
+                input_image,
+                height,
+                width,
+                num_frames,
+                seed,
+                resolution,
+                validation_num_timesteps,
+                validation_timestep_shift,
+                cfg_text_scale,
+                enable_frame_interpolation,
+            ],
+            outputs=[output_video, output_image, output_text, status, logs],
+        )
+    return demo
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Lance multimodal Gradio")
+    parser.add_argument("--server-name", default=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"))
+    parser.add_argument("--server-port", type=int, default=int(os.getenv("GRADIO_SERVER_PORT", "7860")))
+    parser.add_argument("--share", action="store_true", default=env_flag("GRADIO_SHARE", False))
+    parser.add_argument(
+        "--gpus",
+        default=os.getenv("LANCE_GPUS", DEFAULT_GPUS),
+        help="Comma-separated GPU list, for example: 0,1,2,3,4,5,6",
+    )
+    parser.add_argument(
+        "--queue-size",
+        type=int,
+        default=int(os.getenv("LANCE_QUEUE_SIZE", str(DEFAULT_QUEUE_SIZE))),
+        help="Maximum number of queued Gradio requests.",
+    )
+    return parser.parse_args()
+def parse_gpu_ids(gpu_string: str) -> list[int]:
+    gpu_ids: list[int] = []
+    for item in gpu_string.split(","):
+        item = item.strip()
+        if not item:
+            continue
+        gpu_ids.append(int(item))
+    if not gpu_ids:
+        raise ValueError("No valid GPU IDs were parsed.")
+    return gpu_ids
+# Script entry point: parse options, resolve the model assets, build one
+# pipeline pool over the requested GPUs, then launch the Gradio app.
+if __name__ == "__main__":
+    args = parse_args()
+    # Mirror the chosen GPU list into the environment.
+    os.environ["LANCE_GPUS"] = args.gpus
+    # Resolve the video-variant model path before any pipeline is created.
+    resolved_model_path = ensure_model_assets(MODEL_VARIANT_VIDEO)
+    print(f"[startup] Using Lance model path: {resolved_model_path}", flush=True)
+    # Module-level names (this block runs at module scope).
+    QUEUE_MAX_SIZE = args.queue_size
+    gpu_ids = parse_gpu_ids(args.gpus)
+    ACTIVE_PIPELINE_POOL = PipelinePool(gpu_ids, model_variant=MODEL_VARIANT_VIDEO)
+    ACTIVE_PIPELINE_POOL.initialize_all()
+    demo = build_demo()
+    # Concurrency is capped at the pool size.
+    demo.queue(
+        max_size=args.queue_size,
+        default_concurrency_limit=ACTIVE_PIPELINE_POOL.size,
+    ).launch(
+        server_name=args.server_name,
+        server_port=args.server_port,
+        share=args.share,
+    )

app_wrong.py ADDED Viewed

	@@ -0,0 +1,2247 @@

+from __future__ import annotations
+import argparse
+import base64
+import concurrent.futures
+import gc
+import json
+import os
+import random
+import subprocess
+import threading
+import time
+import traceback
+from collections import deque
+from copy import deepcopy
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+import gradio as gr
+import torch
+from huggingface_hub import snapshot_download
+from safetensors.torch import load_file
+from transformers import set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from common.utils.logging import get_logger
+from common.utils.misc import AutoEncoderParams, tuple_mul
+from config.config_factory import DataArguments, InferenceArguments, ModelArguments
+from data.data_utils import add_special_tokens
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.datasets_custom import ValidationDataset
+from inference_lance import (
+    PROMPT_JSON_FILENAME,
+    apply_inference_defaults,
+    clean_memory,
+    init_from_model_path_if_needed,
+    save_prompt_results,
+    validate_on_fixed_batch,
+)
+from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+# ---- Filesystem locations ----
+# Directory that contains this file.
+REPO_ROOT = Path(__file__).resolve().parent
+# Scratch root for the app; override with LANCE_GRADIO_TMP_ROOT.
+GRADIO_TMP_ROOT = Path(os.getenv("LANCE_GRADIO_TMP_ROOT", "/tmp/lance_gradio")).expanduser()
+TMP_INPUT_DIR = GRADIO_TMP_ROOT / "inputs"
+RESULTS_ROOT = GRADIO_TMP_ROOT / "results"
+GLOBAL_RECORDS_FILE = GRADIO_TMP_ROOT / "generation_records.jsonl"
+RUN_RECORD_FILENAME = "generation_record.json"
+# Candidate model base directories: a relative "downloads" folder and a
+# fixed /data location (see get_model_base_dir for how one is chosen).
+LOCAL_MODEL_BASE_DIR = Path("downloads")
+SPACE_MODEL_BASE_DIR = Path("/data/lance_models")
+# ---- Model selection ----
+DEFAULT_MODEL_REPO_ID = "bytedance-research/Lance"
+DEFAULT_MODEL_VARIANT = "video"
+MODEL_VARIANT_VIDEO = "video"
+MODEL_VARIANT_IMAGE = "image"
+# Model variant -> checkpoint directory name under the model base directory.
+MODEL_VARIANT_TO_DIR = {
+    MODEL_VARIANT_VIDEO: "Lance_3B_Video",
+    MODEL_VARIANT_IMAGE: "Lance_3B",
+}
+DEFAULT_MODEL_PATH = LOCAL_MODEL_BASE_DIR / MODEL_VARIANT_TO_DIR[MODEL_VARIANT_VIDEO]
+DEFAULT_VIT_TYPE = "qwen_2_5_vl_original"
+# ---- Inference / UI defaults ----
+DEFAULT_TASK = "t2v"
+DEFAULT_TIMESTEPS = 30
+DEFAULT_TIMESTEP_SHIFT = 3.5
+DEFAULT_CFG_TEXT_SCALE = 4.0
+DEFAULT_RESOLUTION = "video_848x480"
+DEFAULT_IMAGE_RESOLUTION = "image_768x768"
+DEFAULT_BASIC_SEED = 42
+DEFAULT_HEIGHT = 480
+DEFAULT_WIDTH = 848
+DEFAULT_IMAGE_SIZE = 768
+DEFAULT_VIDEO_DURATION_SECONDS = 5
+# 12 * 5 + 1 = 61 frames; presumably 12 frames per second of video plus one
+# extra frame -- TODO confirm the intended frame rate.
+DEFAULT_NUM_FRAMES = 12 * DEFAULT_VIDEO_DURATION_SECONDS + 1
+DEFAULT_VIDEO_ASPECT_RATIO = "16:9"
+DEFAULT_IMAGE_ASPECT_RATIO = "1:1"
+FRAME_INTERPOLATION_YES = "Yes"
+FRAME_INTERPOLATION_NO = "No"
+DEFAULT_FRAME_INTERPOLATION = FRAME_INTERPOLATION_YES
+# Aspect ratios offered in the UI, from widest to tallest.
+ASPECT_RATIO_CHOICES = ["21:9", "16:9", "3:2", "4:3", "1:1", "3:4", "2:3", "9:16", "9:21"]
+# Aspect ratio -> (width, height) in pixels for video tasks; the "16:9" entry
+# equals (DEFAULT_WIDTH, DEFAULT_HEIGHT).
+VIDEO_ASPECT_RATIO_TO_SIZE = {
+    "21:9": (976, 416),
+    "16:9": (848, 480),
+    "3:2": (784, 528),
+    "4:3": (736, 560),
+    "1:1": (640, 640),
+    "3:4": (560, 736),
+    "2:3": (528, 784),
+    "9:16": (480, 848),
+    "9:21": (416, 976),
+}
+# Aspect ratio -> (width, height) in pixels for image tasks; the "1:1" entry
+# is DEFAULT_IMAGE_SIZE squared.
+IMAGE_ASPECT_RATIO_TO_SIZE = {
+    "21:9": (1168, 496),
+    "16:9": (1024, 576),
+    "3:2": (944, 624),
+    "4:3": (880, 672),
+    "1:1": (768, 768),
+    "3:4": (672, 880),
+    "2:3": (624, 944),
+    "9:16": (576, 1024),
+    "9:21": (496, 1168),
+}
+# ---- Runtime defaults ----
+DEFAULT_GPUS = "0"
+DEFAULT_QUEUE_SIZE = 32
+USE_KVCACHE = True
+TEXT_TEMPLATE = True
+# Lock object; by its name, meant to serialize writes of generation records.
+RECORD_WRITE_LOCK = threading.Lock()
+# ---- Project links and assets ----
+LANCE_HOMEPAGE_URL = "https://lance-project.github.io/"
+# NOTE(review): this is the only link using plain http -- confirm whether
+# https was intended.
+LANCE_PAPER_URL = "http://arxiv.org/abs/2605.18678"
+LANCE_HUGGING_FACE_URL = "https://huggingface.co/bytedance-research/Lance"
+LANCE_GITHUB_URL = "https://github.com/bytedance/Lance"
+LANCE_LOGO_PATH = REPO_ROOT / "assets" / "logo" / "lance-logo.webp"
+# Custom stylesheet for the Gradio page. Rules are grouped roughly as: page
+# width, header ("lance-hero", logo, title, authors, badges), task selector,
+# example panels and prompt tables, the two-column main layout, and the
+# generation-controls row, followed by two responsive breakpoints
+# (1100px and 900px).
+APP_CSS = """
+.gradio-container {
+    max-width: 1680px !important;
+    margin-left: auto !important;
+    margin-right: auto !important;
+}
+.contain {
+    max-width: 1680px !important;
+    margin-left: auto !important;
+    margin-right: auto !important;
+}
+.lance-hero {
+    text-align: center;
+    padding: 8px 12px 6px;
+}
+.lance-logo {
+    width: min(160px, 36vw);
+    height: auto;
+    display: block;
+    margin: 0 auto 4px;
+}
+.lance-title {
+    margin: 0 auto 5px;
+    font-size: clamp(20px, 2.4vw, 30px);
+    line-height: 1.08;
+    font-weight: 800;
+    letter-spacing: 0;
+}
+.lance-authors {
+    margin: 0 auto 6px;
+    max-width: 1280px;
+    font-size: 20px;
+    line-height: 1.24;
+    color: var(--body-text-color-subdued);
+}
+.lance-authors a {
+    color: inherit;
+    text-decoration: none;
+}
+.lance-authors a:hover {
+    text-decoration: underline;
+}
+.lance-badges {
+    display: flex;
+    flex-wrap: wrap;
+    justify-content: center;
+    gap: 5px;
+    margin: 4px auto 0;
+}
+.lance-badges a {
+    line-height: 0;
+}
+.lance-badges img {
+    height: 20px;
+    width: auto;
+    display: block;
+}
+.lance-status {
+    max-width: 1180px;
+    margin: 0 auto 18px;
+}
+.task-selector {
+    overflow-x: auto;
+}
+.lance-main-column > label span,
+.lance-main-column > .block-title,
+.lance-main-column > .label-wrap span,
+.lance-main-column > .form > label span,
+.lance-main-column > .form > .block-title,
+.lance-main-column > .form > .label-wrap span {
+    font-size: 20px !important;
+    font-weight: 700 !important;
+}
+.task-selector .wrap {
+    display: grid;
+    grid-template-columns: repeat(3, minmax(220px, 1fr));
+    gap: 8px;
+    min-width: 680px;
+}
+.task-selector label {
+    justify-content: center;
+    min-height: 38px;
+    white-space: nowrap;
+    border-radius: 10px !important;
+}
+.task-selector span {
+    font-size: 20px !important;
+}
+.recommended-title {
+    text-align: center !important;
+    margin: 14px auto 10px !important;
+}
+.recommended-title h3,
+.recommended-title p {
+    text-align: center !important;
+    font-size: 22px !important;
+    font-weight: 800 !important;
+    color: var(--body-text-color) !important;
+}
+.example-panel {
+    margin-top: 14px !important;
+    padding: 10px 12px !important;
+    border-radius: 8px !important;
+    background: rgba(248, 250, 252, 0.72) !important;
+    border: 1px solid var(--border-color-primary) !important;
+}
+.prompt-examples table,
+.prompt-examples th,
+.prompt-examples td {
+    border: 1px solid var(--border-color-primary) !important;
+}
+.prompt-examples table {
+    border-collapse: collapse !important;
+    width: 100% !important;
+}
+.prompt-examples td {
+    border-bottom: 1px solid var(--border-color-primary) !important;
+    padding: 12px !important;
+    vertical-align: top !important;
+}
+.example-panel th,
+.example-panel .block-label,
+.example-panel label span,
+.example-panel .label-wrap span {
+    font-size: 18px !important;
+    font-weight: 700 !important;
+}
+.prompt-dataset {
+    max-height: 420px !important;
+    overflow-y: auto !important;
+    overscroll-behavior: contain !important;
+    scrollbar-gutter: stable !important;
+}
+.prompt-dataset button {
+    height: auto !important;
+    min-height: 48px !important;
+    white-space: normal !important;
+    text-align: left !important;
+    align-items: flex-start !important;
+}
+.prompt-dataset .paginate {
+    display: none !important;
+}
+.prompt-example-proxy {
+    display: none !important;
+}
+.lance-main-row {
+    display: grid !important;
+    grid-template-columns: minmax(0, 1fr) minmax(0, 1fr) !important;
+    gap: 16px !important;
+    align-items: start !important;
+}
+.lance-main-column {
+    min-width: 0 !important;
+    width: 100% !important;
+}
+.lance-display-frame,
+.lance-display-frame > div,
+.lance-display-frame textarea {
+    width: 100% !important;
+}
+.lance-display-frame textarea {
+    min-height: 360px !important;
+}
+.lance-run-button {
+    font-size: 18px !important;
+    font-weight: 800 !important;
+}
+.generation-controls-row {
+    width: 100% !important;
+    max-width: 100% !important;
+    overflow-x: hidden !important;
+}
+.generation-controls-row > .form {
+    display: grid !important;
+    grid-template-columns:
+        minmax(0, 1.25fr)
+        minmax(0, 1.3fr)
+        minmax(0, 1fr)
+        minmax(0, 1.25fr) !important;
+    gap: 12px !important;
+    align-items: start !important;
+    width: 100% !important;
+    max-width: 100% !important;
+    overflow: visible !important;
+}
+.generation-control,
+.generation-control > div,
+.generation-controls-row > .form > div {
+    min-width: 0 !important;
+    max-width: 100% !important;
+}
+.generation-controls-row .generation-control label,
+.generation-controls-row .generation-control label span,
+.generation-controls-row .generation-control .block-label,
+.generation-controls-row .generation-control .block-title,
+.generation-controls-row .generation-control > label,
+.generation-controls-row .generation-control .label-wrap,
+.generation-controls-row .generation-control .label-wrap span {
+    font-size: 22px !important;
+    font-weight: 700 !important;
+    line-height: 1.15 !important;
+    letter-spacing: 0 !important;
+    white-space: normal !important;
+}
+.generation-controls-row .generation-value-control input,
+.generation-controls-row .generation-value-control textarea,
+.generation-controls-row .generation-value-control [data-testid="textbox"],
+.generation-controls-row .generation-dropdown-control input[role="listbox"],
+.generation-controls-row .generation-dropdown-control input.border-none[role="listbox"],
+.generation-controls-row .generation-dropdown-control .secondary-wrap input {
+    font-size: 22px !important;
+    font-weight: 700 !important;
+    line-height: 1.2 !important;
+    letter-spacing: 0 !important;
+    text-align: left !important;
+}
+.generation-controls-row .generation-value-control input,
+.generation-controls-row .generation-value-control textarea,
+.generation-controls-row .generation-dropdown-control input[role="listbox"],
+.generation-controls-row .generation-dropdown-control input.border-none[role="listbox"],
+.generation-controls-row .generation-dropdown-control .secondary-wrap input {
+    min-height: 64px !important;
+    width: 100% !important;
+    box-sizing: border-box !important;
+}
+@media (max-width: 1100px) {
+    .generation-controls-row > .form {
+        grid-template-columns: repeat(2, minmax(0, 1fr)) !important;
+    }
+}
+@media (max-width: 900px) {
+    .lance-main-row {
+        grid-template-columns: minmax(0, 1fr) !important;
+    }
+}
+"""
+# ---- Internal task identifiers ----
+TASK_T2V = "t2v"
+TASK_T2I = "t2i"
+# TASK_V2T and TASK_X2T are accepted as aliases: TASK_LABEL_TO_INTERNAL maps
+# both to TASK_X2T_VIDEO.
+TASK_V2T = "v2t"
+TASK_X2T = "x2t"
+TASK_X2T_VIDEO = "x2t_video"
+TASK_X2T_IMAGE = "x2t_image"
+TASK_IMAGE_EDIT = "image_edit"
+TASK_VIDEO_EDIT = "video_edit"
+# ---- Human-readable task labels shown in the UI ----
+TASK_LABEL_VIDEO_GENERATION = "Video Generation"
+TASK_LABEL_VIDEO_EDIT = "Video Edit"
+TASK_LABEL_VIDEO_UNDERSTANDING = "Video Understanding"
+TASK_LABEL_IMAGE_GENERATION = "Image Generation"
+TASK_LABEL_IMAGE_EDIT = "Image Edit"
+TASK_LABEL_IMAGE_UNDERSTANDING = "Image Understanding"
+# Label order: three video tasks, then three image tasks.
+TASK_CHOICES = [
+    TASK_LABEL_VIDEO_GENERATION,
+    TASK_LABEL_VIDEO_EDIT,
+    TASK_LABEL_VIDEO_UNDERSTANDING,
+    TASK_LABEL_IMAGE_GENERATION,
+    TASK_LABEL_IMAGE_EDIT,
+    TASK_LABEL_IMAGE_UNDERSTANDING,
+]
+# Maps UI labels AND internal ids (including the legacy aliases) to the
+# canonical internal id, so either form can be looked up.
+TASK_LABEL_TO_INTERNAL = {
+    TASK_LABEL_VIDEO_GENERATION: TASK_T2V,
+    TASK_LABEL_VIDEO_EDIT: TASK_VIDEO_EDIT,
+    TASK_LABEL_VIDEO_UNDERSTANDING: TASK_X2T_VIDEO,
+    TASK_LABEL_IMAGE_GENERATION: TASK_T2I,
+    TASK_LABEL_IMAGE_EDIT: TASK_IMAGE_EDIT,
+    TASK_LABEL_IMAGE_UNDERSTANDING: TASK_X2T_IMAGE,
+    TASK_T2V: TASK_T2V,
+    TASK_VIDEO_EDIT: TASK_VIDEO_EDIT,
+    TASK_V2T: TASK_X2T_VIDEO,
+    TASK_X2T: TASK_X2T_VIDEO,
+    TASK_X2T_VIDEO: TASK_X2T_VIDEO,
+    TASK_T2I: TASK_T2I,
+    TASK_IMAGE_EDIT: TASK_IMAGE_EDIT,
+    TASK_X2T_IMAGE: TASK_X2T_IMAGE,
+}
+# ---- Task groupings ----
+# Note that GENERATION_TASKS includes both edit tasks, i.e. EDIT_TASKS is a
+# subset of GENERATION_TASKS; code that tests "generation" before "edit" will
+# never reach the edit branch.
+GENERATION_TASKS = {TASK_T2V, TASK_T2I, TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
+UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
+IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
+VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
+EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
+# ---- Resolution presets (one per modality) ----
+VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
+IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
+RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
+# ---- System prompts for the understanding tasks ----
+CAPTION_SYSTEM_PROMPT_TEMPLATE = (
+    "Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background."
+)
+V2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="video")
+I2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="image")
+# NOTE(review): the video QA prompt has two spaces after "video", unlike the
+# image variant. Confirm whether this must match the prompt used at training
+# time before normalizing it.
+V2T_QA_SYSTEM_PROMPT = "View the video  attentively and provide a suitable answer to the posed question."
+I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
+def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
+    """Get Aspect Ratio choices with default/recommended marker for the given task."""
+    internal_task = normalize_task(task)
+    default_ratio = DEFAULT_IMAGE_ASPECT_RATIO if internal_task in IMAGE_TASKS else DEFAULT_VIDEO_ASPECT_RATIO
+    return [
+        (f"{ratio} (default)" if ratio == default_ratio else ratio, ratio)
+        for ratio in ASPECT_RATIO_CHOICES
+    ]
+def env_flag(name: str, default: bool) -> bool:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return value.strip().lower() in {"1", "true", "yes", "on"}
+def running_on_space() -> bool:
+    return bool(os.getenv("SPACE_ID") or os.getenv("SPACE_HOST"))
+def display_path(path: Path) -> str:
+    path_text = path.as_posix()
+    if path.is_absolute():
+        try:
+            path_text = path.relative_to(Path.cwd()).as_posix()
+        except ValueError:
+            return path_text
+    if path_text == "." or path_text.startswith("./"):
+        return path_text
+    return f"./{path_text}"
+def get_model_base_dir() -> Path:
+    configured = os.getenv("LANCE_MODEL_BASE_DIR")
+    if configured:
+        return Path(configured).expanduser()
+    if LOCAL_MODEL_BASE_DIR.exists():
+        return LOCAL_MODEL_BASE_DIR
+    return SPACE_MODEL_BASE_DIR if running_on_space() else LOCAL_MODEL_BASE_DIR
+def normalize_model_variant(model_variant: Optional[str] = None) -> str:
+    variant = (model_variant or os.getenv("LANCE_MODEL_VARIANT", DEFAULT_MODEL_VARIANT)).strip().lower()
+    if variant in {"image", "t2i", "i2t"}:
+        return MODEL_VARIANT_IMAGE
+    return MODEL_VARIANT_VIDEO
def get_model_path(model_variant: Optional[str] = None) -> Path:
    """Resolve the checkpoint directory for the given model variant.

    The first non-empty source wins: the variant-specific environment variable
    (LANCE_IMAGE_MODEL_PATH or LANCE_VIDEO_MODEL_PATH), then LANCE_MODEL_PATH,
    then the variant's directory name under the model base directory.
    """
    variant = normalize_model_variant(model_variant)
    if variant == MODEL_VARIANT_IMAGE:
        specific_env = "LANCE_IMAGE_MODEL_PATH"
    else:
        specific_env = "LANCE_VIDEO_MODEL_PATH"
    for env_name in (specific_env, "LANCE_MODEL_PATH"):
        override = os.getenv(env_name)
        if override:
            return Path(override).expanduser()
    dir_name = MODEL_VARIANT_TO_DIR[variant]
    return get_model_base_dir() / dir_name
def get_required_model_asset_paths(model_base_dir: Path, model_path: Path) -> list[Path]:
    """List the four files that must exist before the model can be loaded.

    Two files live inside ``model_path`` (the LLM config and weights) and two
    are shared under ``model_base_dir`` (the ViT weights and the VAE file).
    """
    per_variant = [model_path / filename for filename in ("llm_config.json", "model.safetensors")]
    shared = [
        model_base_dir / "Qwen2.5-VL-ViT" / "vit.safetensors",
        model_base_dir / "Wan2.2_VAE.pth",
    ]
    return per_variant + shared
def ensure_model_assets(model_variant: Optional[str] = None) -> Path:
    """Make sure the model files are on disk and return the checkpoint directory.

    Steps, in order:
      1. Resolve the base directory, export it as LANCE_MODEL_BASE_DIR and
         return the model path if every required file already exists.
      2. If the base directory is "." and a local "downloads" directory holds a
         complete set of files, switch to it and return its model path.
      3. Otherwise download the repository snapshot into the base directory,
         but only when LANCE_AUTO_DOWNLOAD is enabled (default: enabled on a
         Space, disabled elsewhere).

    Raises:
        FileNotFoundError: files are missing and automatic download is off.
    """
    base_dir = get_model_base_dir()
    # Exported so that later get_model_base_dir() calls resolve to the same place.
    os.environ["LANCE_MODEL_BASE_DIR"] = display_path(base_dir)
    resolved_path = get_model_path(model_variant)
    needed = get_required_model_asset_paths(base_dir, resolved_path)
    if all(asset.exists() for asset in needed):
        return resolved_path

    fallback_base = Path("downloads")
    if base_dir == Path(".") and fallback_base.exists():
        fallback_path = fallback_base / MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]
        fallback_needed = get_required_model_asset_paths(fallback_base, fallback_path)
        if all(asset.exists() for asset in fallback_needed):
            os.environ["LANCE_MODEL_BASE_DIR"] = display_path(fallback_base)
            return fallback_path

    if not env_flag("LANCE_AUTO_DOWNLOAD", running_on_space()):
        missing = "\n".join(f"- {display_path(asset)}" for asset in needed if not asset.exists())
        raise FileNotFoundError(
            "Lance model assets are missing. Set LANCE_MODEL_BASE_DIR or enable "
            f"LANCE_AUTO_DOWNLOAD=1.\nMissing files:\n{missing}"
        )

    base_dir.mkdir(parents=True, exist_ok=True)
    repo_id = os.getenv("LANCE_MODEL_REPO_ID", DEFAULT_MODEL_REPO_ID)
    print(f"[startup] Downloading Lance model assets from {repo_id} to {display_path(base_dir)}", flush=True)
    # NOTE(review): local_dir_use_symlinks and resume_download appear to be
    # deprecated in recent huggingface_hub releases — confirm against the
    # version pinned for this app before upgrading.
    downloaded_to = snapshot_download(
        repo_id=repo_id,
        local_dir=str(base_dir),
        local_dir_use_symlinks=False,
        resume_download=True,
    )
    snapshot_path = Path(downloaded_to)
    if snapshot_path != base_dir and not resolved_path.exists():
        # The snapshot landed somewhere else: re-point the base dir and re-resolve.
        os.environ["LANCE_MODEL_BASE_DIR"] = display_path(snapshot_path)
        resolved_path = get_model_path(model_variant)
    return resolved_path
def ensure_dirs() -> None:
    """Create TMP_INPUT_DIR and RESULTS_ROOT (with parents) if they are missing."""
    for directory in (TMP_INPUT_DIR, RESULTS_ROOT):
        directory.mkdir(parents=True, exist_ok=True)
def save_generation_record(record: dict, save_dir: Path) -> None:
    """Persist one run record twice: per-run JSON and the shared JSONL log.

    Args:
        record: JSON-serializable description of the run.
        save_dir: Directory of this run; receives RUN_RECORD_FILENAME
            (pretty-printed, overwritten).

    The same record is also appended as a single line to GLOBAL_RECORDS_FILE
    while holding RECORD_WRITE_LOCK.
    """
    ensure_dirs()
    per_run_file = save_dir / RUN_RECORD_FILENAME
    with per_run_file.open("w", encoding="utf-8") as handle:
        json.dump(record, handle, ensure_ascii=False, indent=2)
    with RECORD_WRITE_LOCK, GLOBAL_RECORDS_FILE.open("a", encoding="utf-8") as handle:
        handle.write(f"{json.dumps(record, ensure_ascii=False)}\n")
def normalize_seed(seed: int) -> int:
    """Return ``seed`` unchanged, or a random value in [0, 2**31 - 1] when it is -1."""
    if seed != -1:
        return seed
    return random.randint(0, 2**31 - 1)
def normalize_task(task: str) -> str:
    """Translate a task label into its internal task identifier.

    The label is stripped of surrounding whitespace and looked up in
    TASK_LABEL_TO_INTERNAL, first verbatim and then lower-cased. An empty or
    None ``task`` falls back to TASK_LABEL_VIDEO_GENERATION.

    Args:
        task: Task label to normalize.

    Returns:
        The internal task identifier.

    Raises:
        ValueError: the label does not map to a known generation or
            understanding task. The message names the label that was looked up.
    """
    task_key = (task or TASK_LABEL_VIDEO_GENERATION).strip()
    internal_task = TASK_LABEL_TO_INTERNAL.get(task_key, TASK_LABEL_TO_INTERNAL.get(task_key.lower(), ""))
    if internal_task not in GENERATION_TASKS | UNDERSTANDING_TASKS:
        # Report the label the caller supplied; the lookup result is "" for
        # unknown labels and would make the message useless.
        raise ValueError(f"Unsupported task type: {task_key}")
    return internal_task
def normalize_resolution_for_backend(resolution: str, task: str) -> str:
    """Pick the resolution string for ``task``.

    Image tasks always get DEFAULT_IMAGE_RESOLUTION and video tasks always get
    DEFAULT_RESOLUTION; only for other tasks is ``resolution`` (as a string)
    passed through.
    """
    canonical = normalize_task(task)
    fixed_by_group = (
        (IMAGE_TASKS, DEFAULT_IMAGE_RESOLUTION),
        (VIDEO_TASKS, DEFAULT_RESOLUTION),
    )
    for task_group, fixed_resolution in fixed_by_group:
        if canonical in task_group:
            return fixed_resolution
    return str(resolution)
def get_default_aspect_ratio(task: str) -> str:
    """Return the default aspect ratio: the image default for image tasks, else the video default."""
    if normalize_task(task) in IMAGE_TASKS:
        return DEFAULT_IMAGE_ASPECT_RATIO
    return DEFAULT_VIDEO_ASPECT_RATIO
def get_size_for_aspect_ratio(task: str, aspect_ratio: str) -> tuple[int, int]:
    """Look up the size configured for ``aspect_ratio`` under ``task``.

    An aspect ratio that is not in ASPECT_RATIO_CHOICES is replaced by the
    task's default. Image tasks read IMAGE_ASPECT_RATIO_TO_SIZE; all other
    tasks read VIDEO_ASPECT_RATIO_TO_SIZE.
    """
    canonical = normalize_task(task)
    if aspect_ratio not in ASPECT_RATIO_CHOICES:
        aspect_ratio = get_default_aspect_ratio(canonical)
    if canonical in IMAGE_TASKS:
        return IMAGE_ASPECT_RATIO_TO_SIZE[aspect_ratio]
    return VIDEO_ASPECT_RATIO_TO_SIZE[aspect_ratio]
def format_size_markdown(task: str, width: int, height: int) -> str:
    """Return "<width> x <height>", or an empty string for understanding tasks."""
    hide_size = normalize_task(task) in UNDERSTANDING_TASKS
    return "" if hide_size else f"{width} x {height}"
def normalize_frame_interpolation(value) -> bool:
    """Coerce a UI value into a boolean frame-interpolation switch.

    Booleans are returned as-is. Anything else is converted to text (falsy
    values become the empty string) and is True only when the trimmed,
    lower-cased text is "1", "true", "yes", "on" or "open".
    """
    if not isinstance(value, bool):
        text = str(value) if value else ""
        return text.strip().lower() in ("1", "true", "yes", "on", "open")
    return value
def video_seconds_to_num_frames(seconds: int) -> int:
    """Convert a duration in seconds to a frame count of ``12 * seconds + 1``.

    The duration is truncated to an integer and clamped to the range 1..10.
    """
    clamped = int(seconds)
    if clamped < 1:
        clamped = 1
    elif clamped > 10:
        clamped = 10
    return clamped * 12 + 1
def update_size_from_aspect_ratio(task: str, aspect_ratio: str):
    """Return ``(height, width, size_label)`` for the chosen aspect ratio.

    Note the output order: height comes before width, while the label is
    formatted as width x height.
    """
    size = get_size_for_aspect_ratio(task, aspect_ratio)
    width, height = size
    size_label = format_size_markdown(task, width, height)
    return height, width, size_label
def reset_generation_defaults_for_task(task: str):
    """Compute the default generation settings for ``task``.

    Returns:
        A 6-tuple ``(aspect_ratio, height, width, num_frames, resolution,
        size_label)``. The fourth element is DEFAULT_VIDEO_DURATION_SECONDS for
        the text-to-video task and 1 for every other task.
    """
    canonical = normalize_task(task)
    default_ratio = get_default_aspect_ratio(canonical)
    width, height = get_size_for_aspect_ratio(canonical, default_ratio)
    if canonical in IMAGE_TASKS:
        resolution = DEFAULT_IMAGE_RESOLUTION
    else:
        resolution = DEFAULT_RESOLUTION
    if canonical == TASK_T2V:
        num_frames = DEFAULT_VIDEO_DURATION_SECONDS
    else:
        num_frames = 1
    size_label = format_size_markdown(canonical, width, height)
    return default_ratio, height, width, num_frames, resolution, size_label
def apply_prompt_example(task: str, evt: gr.SelectData):
    """Turn a selection event on the examples table into prompt text plus task defaults.

    The prompt is the first cell of the selected row when ``evt.row_value`` is
    a non-empty list; otherwise it is derived from ``evt.value`` (its first
    element if it is a non-empty list, or the value itself if not None).
    With none of these available the prompt is the empty string.

    Returns:
        ``(prompt_text, *reset_generation_defaults_for_task(task))``.
    """
    selected_row = evt.row_value
    if isinstance(selected_row, list) and selected_row:
        chosen = selected_row[0]
    else:
        cell = evt.value
        if isinstance(cell, list) and cell:
            chosen = cell[0]
        elif cell is not None:
            chosen = cell
        else:
            chosen = ""
    prompt_text = str(chosen)
    return (prompt_text, *reset_generation_defaults_for_task(task))
def get_understanding_system_prompt_choices(task: str) -> list[str]:
    """Return the single-item list of system prompts offered for ``task``.

    The image-understanding task gets the image QA prompt; every other task
    gets the video QA prompt.
    """
    is_image_task = normalize_task(task) == TASK_X2T_IMAGE
    return [I2T_QA_SYSTEM_PROMPT if is_image_task else V2T_QA_SYSTEM_PROMPT]
def normalize_understanding_system_prompt(task: str, system_prompt: Optional[str]) -> str:
    """Return the system prompt to use for ``task``.

    The ``system_prompt`` argument is accepted but not consulted: the result is
    always the first (and only) choice configured for the task.
    """
    choices = get_understanding_system_prompt_choices(task)
    return choices[0]
def create_request_json(
    task: str,
    prompt: str,
    input_video: Optional[str],
    input_image: Optional[str],
    system_prompt: Optional[str] = None,
) -> Path:
    """Write the request payload for one task to a JSON file in TMP_INPUT_DIR.

    Args:
        task: Internal task identifier (compared against the TASK_* constants).
        prompt: Generation prompt, edit instruction or question.
        input_video: Path of the input video for video edit / understanding.
        input_image: Path of the input image for image edit / understanding.
        system_prompt: Passed to normalize_understanding_system_prompt for the
            understanding tasks.

    Returns:
        Path of the written file, named ``<task>_<timestamp>.json``.

    Raises:
        ValueError: a required input video/image is missing, or ``task`` is
            not one of the supported tasks.
    """
    ensure_dirs()
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    prompt_file = TMP_INPUT_DIR / f"{task}_{stamp}.json"

    def _wrap(items, dtypes, targets):
        # Shared envelope of every interleaved (non text-only) request.
        return {
            "000000": {
                "interleave_array": items,
                "element_dtype_array": dtypes,
                "istarget_in_interleave": targets,
            }
        }

    def _edit_payload(media, media_kind):
        # Instruction, source media, then the same media again as the target slot.
        return _wrap([prompt, media, media], ["text", media_kind, media_kind], [0, 0, 1])

    def _understanding_payload(media, media_kind):
        resolved_prompt = normalize_understanding_system_prompt(task, system_prompt)
        return _wrap([media, [resolved_prompt, prompt, ""]], [media_kind, "text"], [0, 1])

    if task == TASK_T2V:
        payload = {"000000.mp4": prompt}
    elif task == TASK_T2I:
        payload = {"000000.png": prompt}
    elif task == TASK_VIDEO_EDIT:
        if not input_video:
            raise ValueError("The video edit task requires an input video.")
        payload = _edit_payload(input_video, "video")
    elif task == TASK_IMAGE_EDIT:
        if not input_image:
            raise ValueError("The image edit task requires an input image.")
        payload = _edit_payload(input_image, "image")
    elif task == TASK_X2T_VIDEO:
        if not input_video:
            raise ValueError("The video understanding task requires an input video.")
        payload = _understanding_payload(input_video, "video")
    elif task == TASK_X2T_IMAGE:
        if not input_image:
            raise ValueError("The image understanding task requires an input image.")
        payload = _understanding_payload(input_image, "image")
    else:
        raise ValueError(f"Unsupported task type: {task}")

    with prompt_file.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    return prompt_file
def resolve_example_path(path: str) -> str:
    """Resolve an example asset path to an existing file when possible.

    Absolute paths are returned as given. Relative paths are tried under
    REPO_ROOT first and then relative to the working directory; the first
    existing candidate is returned fully resolved. If neither exists the
    original string is returned untouched.
    """
    relative = Path(path)
    if relative.is_absolute():
        return str(relative)
    for option in (REPO_ROOT / relative, relative):
        if option.exists():
            return str(option.resolve())
    return path
def resolve_browser_video_example_path(path: str) -> str:
    """Resolve a video example path, preferring a sibling ``<stem>_h264`` file.

    The ``_h264`` variant is checked first, then the original path. Relative
    candidates are looked up under REPO_ROOT, absolute ones as-is. The first
    existing candidate is returned fully resolved; if none exists the result
    of resolve_example_path(path) is returned.
    """
    original = Path(path)
    h264_variant = original.with_name(f"{original.stem}_h264{original.suffix}")
    for option in (h264_variant, original):
        located = option if option.is_absolute() else REPO_ROOT / option
        if located.exists():
            return str(located.resolve())
    return resolve_example_path(path)
def load_json_examples(relative_path: str) -> dict:
    """Parse and return the UTF-8 JSON file at ``REPO_ROOT / relative_path``."""
    example_file = REPO_ROOT / relative_path
    return json.loads(example_file.read_text(encoding="utf-8"))
# One-line human-readable summaries of text-to-video example prompts, keyed by
# the example's output file name. Handed to make_generation_examples() as
# `summaries`, which accepts the argument but does not currently read it.
T2V_EXAMPLE_SUMMARIES = {
    "000000.mp4": "Red panda surfing on a bright seaside wave.",
    "000002.mp4": "Panda cub skateboarding in a creative loft.",
    "000004.mp4": "Young woman shaping clay in a sunlit pottery workshop.",
    "000005.mp4": "Panda boxing a robot in a luxurious palace ring.",
    "000008.mp4": "Fantasy pastel horse stepping through a glowing cloud valley.",
}
def make_generation_examples(
    task_label: str,
    relative_path: str,
    limit: int,
    image_task: bool,
    selected_keys: Optional[list[str]] = None,
    summaries: Optional[dict[str, str]] = None,
) -> list[list]:
    """Build single-column example rows (``[prompt]``) from a prompt JSON file.

    Args:
        task_label: Accepted for call-site symmetry; not used here.
        relative_path: JSON file (relative to the repo root) mapping output
            names to prompts.
        limit: Maximum number of rows when ``selected_keys`` is not given.
        image_task: Accepted but not used here.
        selected_keys: If non-empty, exactly these keys are used, in this
            order; keys absent from the file are skipped and ``limit`` is
            ignored.
        summaries: Accepted but not used here.

    Returns:
        One ``[prompt]`` list per selected example.
    """
    data = load_json_examples(relative_path)
    if selected_keys:
        prompts = [data[key] for key in selected_keys if key in data]
    else:
        prompts = list(data.values())[:limit]
    return [[prompt] for prompt in prompts]
def make_edit_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build ``[prompt, video_path, image_path]`` rows for the edit examples.

    The first ``limit`` samples of the JSON file are used. Element 0 of each
    sample's ``interleave_array`` is the prompt and element 1 the media path.
    The media path fills the video column when ``media_type`` is "video" or
    the image column when it is "image"; the other column is None.
    ``task_label`` is accepted but not used.
    """
    samples = list(load_json_examples(relative_path).values())[:limit]

    def _row(sample):
        sequence = sample["interleave_array"]
        prompt_text = sequence[0]
        media_path = resolve_example_path(sequence[1])
        video_cell = media_path if media_type == "video" else None
        image_cell = media_path if media_type == "image" else None
        return [prompt_text, video_cell, image_cell]

    return [_row(sample) for sample in samples]
def make_understanding_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build ``[question, video_path, image_path]`` rows for the understanding examples.

    The first ``limit`` samples of the JSON file are used. Element 0 of each
    sample's ``interleave_array`` is the media path; element 1 is the text
    payload, whose second item is the question when the payload is a list
    with more than one item (otherwise the question is empty). Video paths go
    through resolve_browser_video_example_path, others through
    resolve_example_path. ``task_label`` is accepted but not used.
    """
    wants_video = media_type == "video"
    wants_image = media_type == "image"
    resolver = resolve_browser_video_example_path if wants_video else resolve_example_path

    rows = []
    for sample in list(load_json_examples(relative_path).values())[:limit]:
        sequence = sample["interleave_array"]
        media_path = resolver(sequence[0])
        text_payload = sequence[1]
        if isinstance(text_payload, list) and len(text_payload) > 1:
            question = text_payload[1]
        else:
            question = ""
        rows.append([
            question,
            media_path if wants_video else None,
            media_path if wants_image else None,
        ])
    return rows
def make_understanding_system_prompt_map(relative_path: str, task: str) -> dict[str, str]:
    """Map each example question to the system prompt used for ``task``.

    Samples whose text payload (element 1 of ``interleave_array``) is not a
    list with at least two items are skipped. For the rest, the second item
    (the question) becomes the key and the normalized system prompt the value.
    """
    prompts_by_question = {}
    for sample in load_json_examples(relative_path).values():
        text_payload = sample["interleave_array"][1]
        if isinstance(text_payload, list) and len(text_payload) >= 2:
            prompts_by_question[text_payload[1]] = normalize_understanding_system_prompt(task, text_payload[0])
    return prompts_by_question
# ---------------------------------------------------------------------------
# Example tables, built once at import time from the JSON files under
# config/examples/. Loading happens here, so a missing or malformed example
# file fails at module import.
# ---------------------------------------------------------------------------

# Video generation: prompt-only rows. Because selected_keys is given, it fixes
# both the set and the order of examples and `limit` is not applied; keys that
# are absent from the JSON file are skipped.
VIDEO_GENERATION_EXAMPLES = make_generation_examples(
    TASK_LABEL_VIDEO_GENERATION,
    "config/examples/t2v_example.json",
    limit=6,
    image_task=False,
    selected_keys=["000004.mp4", "000002.mp4", "000000.mp4", "000005.mp4", "000008.mp4", "000007.mp4"],
    summaries=T2V_EXAMPLE_SUMMARIES,
)
# Video editing: rows of [prompt, video_path, None].
VIDEO_EDIT_EXAMPLES = make_edit_examples(
    TASK_LABEL_VIDEO_EDIT,
    "config/examples/video_edit_example.json",
    limit=3,
    media_type="video",
)
# Video understanding: rows of [question, video_path, None].
VIDEO_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    TASK_LABEL_VIDEO_UNDERSTANDING,
    "config/examples/x2t_video_example.json",
    limit=3,
    media_type="video",
)
# Question -> system prompt lookup for the video understanding examples
# (covers every sample in the file, not only the first three).
VIDEO_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    "config/examples/x2t_video_example.json",
    TASK_X2T_VIDEO,
)
# Image generation: prompt-only rows; selected_keys again overrides `limit`.
IMAGE_GENERATION_EXAMPLES = make_generation_examples(
    TASK_LABEL_IMAGE_GENERATION,
    "config/examples/t2i_example.json",
    limit=5,
    image_task=True,
    selected_keys=["000000.png", "000003.png", "000006.png", "000008.png", "000009.png"],
)
# Image editing: rows of [prompt, None, image_path].
IMAGE_EDIT_EXAMPLES = make_edit_examples(
    TASK_LABEL_IMAGE_EDIT,
    "config/examples/image_edit_example.json",
    limit=5,
    media_type="image",
)
# Image understanding: rows of [question, None, image_path].
IMAGE_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    TASK_LABEL_IMAGE_UNDERSTANDING,
    "config/examples/x2t_image_example.json",
    limit=3,
    media_type="image",
)
# Question -> system prompt lookup for the image understanding examples.
IMAGE_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    "config/examples/x2t_image_example.json",
    TASK_X2T_IMAGE,
)
def build_save_dir(task: str) -> Path:
    """Return a fresh, timestamped results directory path for ``task``.

    The name has the form ``<task>_<YYYYmmdd>_<HHMMSS>_<mmm>`` where ``mmm`` is
    the millisecond part of the same instant. Only the path is built here;
    the directory itself is not created (RESULTS_ROOT is, via ensure_dirs()).

    Args:
        task: Task identifier used as the directory-name prefix.

    Returns:
        Path under RESULTS_ROOT for this run's outputs.
    """
    ensure_dirs()
    # Read the clock once: taking the seconds and the milliseconds from two
    # separate clock reads can pair a second with the milliseconds of the
    # next one when the call straddles a second boundary.
    now = datetime.now()
    stamp = now.strftime("%Y%m%d_%H%M%S")
    millis = now.microsecond // 1000
    return RESULTS_ROOT / f"{task}_{stamp}_{millis:03d}"
def find_generated_video(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.mp4`` directly in ``save_dir``, or None if there is none."""
    return max(
        save_dir.glob("*.mp4"),
        key=lambda candidate: candidate.stat().st_mtime,
        default=None,
    )
def find_generated_image(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.png`` directly in ``save_dir``, or None if there is none."""
    return max(
        save_dir.glob("*.png"),
        key=lambda candidate: candidate.stat().st_mtime,
        default=None,
    )
def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tuple[Path, str]:
    """Run the bundled RIFE script on ``video_path`` and return the new video plus a log.

    The script ``RIFE/inference_video.py`` under the repo root is executed with
    ``python3`` in the video's directory, with CUDA_VISIBLE_DEVICES set to
    ``device_id``. The output is written next to the input as
    ``<stem>_rife_<2**exp>x<suffix>``.

    Args:
        video_path: Video to interpolate.
        device_id: GPU index exposed to the subprocess.
        exp: Value forwarded as ``--exp``; also determines the output name.

    Returns:
        ``(output_path, log_text)`` where the log contains the command, the
        elapsed time and the subprocess output.

    Raises:
        FileNotFoundError: the RIFE script is missing, or the run finished
            without producing the expected output file.
        RuntimeError: the subprocess exited with a non-zero status.
    """
    rife_dir = REPO_ROOT / "RIFE"
    rife_script = rife_dir / "inference_video.py"
    if not rife_script.exists():
        raise FileNotFoundError(f"RIFE inference script not found: {rife_script}")

    output_path = video_path.with_name(f"{video_path.stem}_rife_{2 ** exp}x{video_path.suffix}")
    command = ["python3", str(rife_script)]
    command += ["--exp", str(exp)]
    command += ["--video", str(video_path)]
    command += ["--output", str(output_path)]
    command += ["--model", str(rife_dir / "train_log")]
    # Same command description is used in both the error and the success log.
    command_line = f"command=CUDA_VISIBLE_DEVICES={device_id} {' '.join(command)}"

    child_env = dict(os.environ)
    child_env["CUDA_VISIBLE_DEVICES"] = str(device_id)

    started = time.perf_counter()
    try:
        completed = subprocess.run(
            command,
            cwd=str(video_path.parent),
            env=child_env,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        failure_lines = [
            f"RIFE failed with exit code {exc.returncode}.",
            command_line,
            exc.stdout.strip() if exc.stdout else "",
            exc.stderr.strip() if exc.stderr else "",
        ]
        raise RuntimeError("\n".join(failure_lines).strip()) from exc

    if not output_path.exists():
        raise FileNotFoundError(f"RIFE completed but output video was not found: {output_path}")

    elapsed = time.perf_counter() - started
    success_lines = [
        "[rife] Frame interpolation finished.",
        command_line,
        f"elapsed={elapsed:.2f}s",
        f"output={output_path}",
        completed.stdout.strip(),
        completed.stderr.strip(),
    ]
    return output_path, "\n".join(success_lines).strip()
def extract_text_result(save_dir: Path) -> str:
    """Read the text answer stored in ``save_dir / PROMPT_JSON_FILENAME``.

    Returns the empty string when the file is missing or holds an empty
    object. Otherwise the first value of the JSON object is returned: as-is
    if it is a string, or re-serialized as JSON text if it is anything else.
    """
    result_file = save_dir / PROMPT_JSON_FILENAME
    if not result_file.exists():
        return ""
    data = json.loads(result_file.read_text(encoding="utf-8"))
    if not data:
        return ""
    first_value = next(iter(data.values()))
    if isinstance(first_value, str):
        return first_value
    return json.dumps(first_value, ensure_ascii=False)
+class LanceT2VV2TPipeline:
+    def __init__(self, device_id: int, model_variant: str = MODEL_VARIANT_VIDEO) -> None:
+        self._init_lock = threading.Lock()
+        self._generate_lock = threading.Lock()
+        self.initialized = False
+        self.device = device_id
+        self.model_variant = normalize_model_variant(model_variant)
+        self.logger = get_logger(f"lance_{self.model_variant}_gpu{device_id}")
+        self.model: Optional[Lance] = None
+        self.vae_model: Optional[WanVideoVAE] = None
+        self.vae_config: Optional[AutoEncoderParams] = None
+        self.tokenizer: Optional[Qwen2Tokenizer] = None
+        self.new_token_ids: Optional[dict] = None
+        self.image_token_id: Optional[int] = None
+        self.base_model_args: Optional[ModelArguments] = None
+        self.base_data_args: Optional[DataArguments] = None
+        self.base_inference_args: Optional[InferenceArguments] = None
+    def _log_stage(self, stage_name: str, start_time: float, extra: str = "") -> None:
+        elapsed = time.perf_counter() - start_time
+        suffix = f" | {extra}" if extra else ""
+        print(f"[startup][gpu:{self.device}] {stage_name} done in {elapsed:.2f}s{suffix}", flush=True)
+    def _build_base_model_args(self) -> ModelArguments:
+        model_path = str(get_model_path(self.model_variant))
+        return ModelArguments(
+            model_path=model_path,
+            vit_type=DEFAULT_VIT_TYPE,
+            llm_qk_norm=True,
+            llm_qk_norm_und=True,
+            llm_qk_norm_gen=True,
+            tie_word_embeddings=False,
+            max_num_frames=121,
+            max_latent_size=64,
+            latent_patch_size=[1, 1, 1],
+        )
+    def _build_base_inference_args(self) -> InferenceArguments:
+        return InferenceArguments(
+            validation_num_timesteps=DEFAULT_TIMESTEPS,
+            validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
+            copy_init_moe=True,
+            visual_und=True,
+            visual_gen=True,
+            vae_model_type="wan",
+            apply_qwen_2_5_vl_pos_emb=True,
+            apply_chat_template=False,
+            cfg_type=0,
+            validation_data_seed=42,
+            video_height=DEFAULT_HEIGHT,
+            video_width=DEFAULT_WIDTH,
+            num_frames=DEFAULT_NUM_FRAMES,
+            task=DEFAULT_TASK,
+            save_path_gen=str(RESULTS_ROOT),
+            resolution=DEFAULT_RESOLUTION,
+            text_template=TEXT_TEMPLATE,
+            use_KVcache=USE_KVCACHE,
+        )
    def initialize(self) -> None:
        """Load the full model stack onto this pipeline's GPU, once.

        The whole method runs under ``_init_lock``; if the pipeline is already
        initialized it returns immediately. On success the model, VAE,
        tokenizer, token-id table and baseline argument objects are stored on
        the instance and ``initialized`` is set to True.

        Raises:
            RuntimeError: CUDA is unavailable or ``self.device`` is not a valid
                device index.
            ValueError: the configured ``vit_type`` is not supported.
            FileNotFoundError: propagated from ensure_model_assets() when model
                files are missing and automatic download is disabled.
        """
        with self._init_lock:
            if self.initialized:
                return
            ensure_dirs()
            resolved_model_path = ensure_model_assets(self.model_variant)
            print(
                f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
                flush=True,
            )
            # --- Device checks -------------------------------------------------
            if not torch.cuda.is_available():
                raise RuntimeError("CUDA is unavailable. Lance T2V/V2T Gradio requires a GPU environment.")
            if self.device >= torch.cuda.device_count():
                raise RuntimeError(
                    f"GPU {self.device} is unavailable. Detected {torch.cuda.device_count()} GPU(s)."
                )
            torch.cuda.set_device(self.device)
            # --- Baseline arguments --------------------------------------------
            model_args = self._build_base_model_args()
            data_args = DataArguments()
            inference_args = self._build_base_inference_args()
            apply_inference_defaults(model_args, data_args, inference_args)
            # The noise seed mirrors the data seed.
            inference_args.validation_noise_seed = inference_args.validation_data_seed
            self.base_model_args = model_args
            self.base_data_args = data_args
            self.base_inference_args = inference_args
            set_seed(inference_args.global_seed)
            # --- Language model -------------------------------------------------
            stage_start = time.perf_counter()
            print(
                f"[startup][gpu:{self.device}] Loading LLM config: {Path(model_args.model_path) / 'llm_config.json'}",
                flush=True,
            )
            llm_config: Qwen2Config = Qwen2Config.from_json_file(str(Path(model_args.model_path) / "llm_config.json"))
            self._log_stage("LLM config load", stage_start)
            # Copy the run-time switches from the argument objects onto the LLM config.
            llm_config.layer_module = model_args.layer_module
            llm_config.qk_norm = model_args.llm_qk_norm
            llm_config.qk_norm_und = model_args.llm_qk_norm_und
            llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
            llm_config.tie_word_embeddings = model_args.tie_word_embeddings
            llm_config.freeze_und = inference_args.freeze_und
            llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb
            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Initializing LLM weights: {model_args.model_path}", flush=True)
            # Built from the config only; presumably the checkpoint weights are
            # loaded later by init_from_model_path_if_needed() — TODO confirm.
            language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
            self._log_stage("LLM weight init", stage_start)
            # --- Vision encoder (only when visual understanding is enabled) -----
            vit_model = None
            vit_config = None
            if inference_args.visual_und:
                if model_args.vit_type not in ("qwen2_5_vl", "qwen_2_5_vl_original"):
                    raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
                stage_start = time.perf_counter()
                print(f"[startup][gpu:{self.device}] Loading VIT config: {model_args.vit_path}", flush=True)
                vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
                self._log_stage("VIT config load", stage_start)
                stage_start = time.perf_counter()
                print(
                    f"[startup][gpu:{self.device}] Loading VIT weights: {Path(model_args.vit_path) / 'vit.safetensors'}",
                    flush=True,
                )
                vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
                vit_weights = load_file(str(Path(model_args.vit_path) / "vit.safetensors"))
                vit_model.load_state_dict(vit_weights, strict=True)
                self._log_stage("VIT weight load", stage_start)
                # The state dict is no longer needed once copied into the module.
                clean_memory(vit_weights)
            # --- VAE (only when visual generation is enabled) -------------------
            if inference_args.visual_gen:
                stage_start = time.perf_counter()
                print(f"[startup][gpu:{self.device}] Initializing VAE", flush=True)
                vae_model = WanVideoVAE()
                # Keep an independent copy of the VAE config for later use.
                vae_config = deepcopy(vae_model.vae_config)
                self._log_stage("VAE init", stage_start)
            else:
                vae_model = None
                vae_config = None
            # --- Assemble the Lance model ---------------------------------------
            config = LanceConfig(
                visual_gen=inference_args.visual_gen,
                visual_und=inference_args.visual_und,
                llm_config=llm_config,
                vit_config=vit_config if inference_args.visual_und else None,
                vae_config=vae_config if inference_args.visual_gen else None,
                latent_patch_size=model_args.latent_patch_size,
                max_num_frames=model_args.max_num_frames,
                max_latent_size=model_args.max_latent_size,
                vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
                connector_act=model_args.connector_act,
                interpolate_pos=model_args.interpolate_pos,
                timestep_shift=inference_args.timestep_shift,
            )
            model: Lance = Lance(
                language_model=language_model,
                vit_model=vit_model if inference_args.visual_und else None,
                vit_type=model_args.vit_type,
                config=config,
                training_args=inference_args,
            )
            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Moving Lance model to GPU {self.device}", flush=True)
            model = model.to(self.device)
            self._log_stage("Lance model move to GPU", stage_start)
            # --- Tokenizer and special tokens -----------------------------------
            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Loading tokenizer: {model_args.model_path}", flush=True)
            tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
            tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
            self._log_stage("tokenizer load and special token init", stage_start, extra=f"num_new_tokens={num_new_tokens}")
            if inference_args.copy_init_moe:
                language_model.init_moe()
            init_from_model_path_if_needed(model, model_args)
            if num_new_tokens > 0:
                # Grow the embedding table and keep every vocab_size field in sync.
                model.language_model.resize_token_embeddings(len(tokenizer))
                model.config.llm_config.vocab_size = len(tokenizer)
                model.language_model.config.vocab_size = len(tokenizer)
            if model_args.vit_type.lower() == "qwen2_5_vl":
                from common.model.hacks import hack_qwen2_5_vl_config
                language_model = hack_qwen2_5_vl_config(language_model)
            # NOTE(review): the image token id is read from `video_token_id` —
            # confirm this is intentional.
            image_token_id = language_model.config.video_token_id
            new_token_ids.update({"image_token_id": image_token_id})
            model.update_tokenizer(tokenizer=tokenizer)
            # --- Input/output embedding tying -----------------------------------
            if model_args.tie_word_embeddings:
                model.language_model.untie_lm_head()
                model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
                model_args.tie_word_embeddings = False
                llm_config.tie_word_embeddings = False
            else:
                # Untied configuration: input and output embeddings must not
                # share the same storage.
                assert (
                    model.language_model.get_input_embeddings().weight.data.data_ptr()
                    != model.language_model.get_output_embeddings().weight.data.data_ptr()
                ), "tie_word_embeddings conflict"
            # --- Finalize: bfloat16 on the target device, eval mode --------------
            model = model.to(device=self.device, dtype=torch.bfloat16)
            model.eval()
            if vae_model is not None and hasattr(vae_model, "eval"):
                vae_model.eval()
            # Publish everything on the instance only after loading succeeded.
            self.model = model
            self.vae_model = vae_model
            self.vae_config = vae_config
            self.tokenizer = tokenizer
            self.new_token_ids = new_token_ids
            self.image_token_id = image_token_id
            self.initialized = True
            print(
                f"[startup][gpu:{self.device}][{self.model_variant}] Lance multimodal Gradio model loaded and ready for reuse.",
                flush=True,
            )
+    def unload(self) -> None:
+        with self._init_lock:
+            if self.model is not None:
+                self.model.cpu()
+            if self.vae_model is not None and hasattr(self.vae_model, "vae"):
+                vae_inner = self.vae_model.vae
+                if hasattr(vae_inner, "model"):
+                    vae_inner.model.cpu()
+            self.model = None
+            self.vae_model = None
+            self.vae_config = None
+            self.tokenizer = None
+            self.new_token_ids = None
+            self.image_token_id = None
+            self.base_model_args = None
+            self.base_data_args = None
+            self.base_inference_args = None
+            self.initialized = False
+            gc.collect()
+            if torch.cuda.is_available():
+                with torch.cuda.device(self.device):
+                    torch.cuda.empty_cache()
+                    torch.cuda.ipc_collect()
+    def _build_request_batch(
+        self,
+        prompt_file: Path,
+        model_args: ModelArguments,
+        data_args: DataArguments,
+        inference_args: InferenceArguments,
+    ):
+        assert self.tokenizer is not None
+        assert self.new_token_ids is not None
+        assert self.vae_config is not None
+        dataset_config = DataConfig.from_yaml(str(prompt_file))
+        if inference_args.visual_und:
+            dataset_config.vit_patch_size = model_args.vit_patch_size
+            dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+            dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+        if inference_args.visual_gen:
+            vae_downsample = tuple_mul(
+                tuple(model_args.latent_patch_size),
+                (
+                    self.vae_config.downsample_temporal,
+                    self.vae_config.downsample_spatial,
+                    self.vae_config.downsample_spatial,
+                ),
+            )
+            dataset_config.latent_patch_size = model_args.latent_patch_size
+            dataset_config.vae_downsample = vae_downsample
+            dataset_config.max_latent_size = model_args.max_latent_size
+            dataset_config.max_num_frames = model_args.max_num_frames
+        dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+        dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+        dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+        dataset_config.num_frames = inference_args.num_frames
+        dataset_config.H = inference_args.video_height
+        dataset_config.W = inference_args.video_width
+        dataset_config.task = inference_args.task
+        dataset_config.resolution = inference_args.resolution
+        dataset_config.text_template = inference_args.text_template
+        val_dataset = ValidationDataset(
+            jsonl_path=str(prompt_file),
+            tokenizer=self.tokenizer,
+            data_args=data_args,
+            model_args=model_args,
+            training_args=inference_args,
+            new_token_ids=self.new_token_ids,
+            dataset_config=dataset_config,
+            local_rank=0,
+            world_size=1,
+        )
+        return simple_custom_collate([val_dataset[0]])
+    def generate(
+        self,
+        task: str,
+        prompt: str,
+        system_prompt: Optional[str],
+        input_video: Optional[str],
+        input_image: Optional[str],
+        height: int,
+        width: int,
+        num_frames: int,
+        seed: int,
+        resolution: str,
+        validation_num_timesteps: int,
+        validation_timestep_shift: float,
+        cfg_text_scale: float,
+        enable_frame_interpolation: bool,
+    ):
+        """Run one request on this pipeline and return its outputs.
+
+        The inputs are validated first; a validation failure returns early
+        with the message in the status slot and no inference is run.
+        Otherwise the request executes while holding ``self._generate_lock``:
+        a request file and an output directory are created, per-request deep
+        copies of the base argument objects are configured,
+        ``validate_on_fixed_batch`` is called, and the produced artifacts are
+        looked up in the output directory. A record of the request is written
+        with ``save_generation_record`` on both the success and failure paths.
+
+        Args:
+            task: Task identifier; converted with ``normalize_task``.
+            prompt: Prompt / question text. Surrounding whitespace is
+                stripped; it must be non-empty for generation and
+                understanding tasks.
+            system_prompt: Forwarded to ``create_request_json``; stored in the
+                success record only for understanding tasks.
+            input_video: Required for ``TASK_VIDEO_EDIT`` and
+                ``TASK_X2T_VIDEO``.
+            input_image: Required for ``TASK_IMAGE_EDIT`` and
+                ``TASK_X2T_IMAGE``.
+            height: Must be greater than 0.
+            width: Must be greater than 0.
+            num_frames: Must be greater than 0.
+            seed: Passed through ``normalize_seed``; the result is used for
+                both the data seed and the noise seed.
+            resolution: Display resolution; mapped to the backend value with
+                ``normalize_resolution_for_backend``.
+            validation_num_timesteps: Copied onto the per-request inference
+                arguments as an ``int``.
+            validation_timestep_shift: Copied onto the per-request inference
+                arguments as a ``float``.
+            cfg_text_scale: Copied onto the per-request model arguments as a
+                ``float``.
+            enable_frame_interpolation: When enabled (after
+                ``normalize_frame_interpolation``) and the task is
+                ``TASK_T2V`` or ``TASK_VIDEO_EDIT``, the generated video is
+                passed through ``run_rife_interpolation``. A failure there is
+                captured in ``rife_error`` and does not fail the request.
+
+        Returns:
+            A 5-tuple ``(video_path, image_path, text_result, status, logs)``.
+            Unused slots are ``None`` (paths) or ``""`` (text). On an
+            exception inside the inference section, ``status`` describes the
+            failure and ``logs`` holds the formatted traceback.
+        """
+        self.initialize()
+        internal_task = normalize_task(task)
+        # Normalise optional inputs to plain (possibly empty) strings.
+        prompt = (prompt or "").strip()
+        input_video = str(input_video).strip() if input_video else ""
+        input_image = str(input_image).strip() if input_image else ""
+        # Input validation: each failure returns the same 5-tuple shape as a
+        # real result, with the message in the status slot.
+        if internal_task in GENERATION_TASKS and not prompt:
+            return None, None, "", "Please enter a prompt.", ""
+        if internal_task in UNDERSTANDING_TASKS and not prompt:
+            return None, None, "", "Please enter a question.", ""
+        if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
+            return None, None, "", "Please upload an input video.", ""
+        if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
+            return None, None, "", "Please upload an input image.", ""
+        if height <= 0 or width <= 0:
+            return None, None, "", "Height and width must be greater than 0.", ""
+        if num_frames <= 0:
+            return None, None, "", "The number of frames must be greater than 0.", ""
+        # Presumably populated by initialize() above - TODO confirm.
+        assert self.model is not None
+        assert self.tokenizer is not None
+        assert self.new_token_ids is not None
+        assert self.image_token_id is not None
+        assert self.base_model_args is not None
+        assert self.base_data_args is not None
+        assert self.base_inference_args is not None
+        active_model_path = self.base_model_args.model_path
+        # Only one request at a time runs on this pipeline instance.
+        with self._generate_lock:
+            torch.cuda.set_device(self.device)
+            actual_seed = normalize_seed(int(seed))
+            prompt_file = create_request_json(
+                task=internal_task,
+                prompt=prompt,
+                input_video=input_video,
+                input_image=input_image,
+                system_prompt=system_prompt,
+            )
+            save_dir = build_save_dir(internal_task)
+            save_dir.mkdir(parents=True, exist_ok=True)
+            request_started_at = datetime.now().isoformat(timespec="seconds")
+            # Deep-copy the base argument objects so per-request overrides
+            # never modify the shared base_* objects.
+            request_model_args = deepcopy(self.base_model_args)
+            request_model_args.cfg_text_scale = float(cfg_text_scale)
+            request_data_args = deepcopy(self.base_data_args)
+            request_data_args.val_dataset_config_file = str(prompt_file)
+            request_inference_args = deepcopy(self.base_inference_args)
+            request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
+            request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
+            request_inference_args.validation_data_seed = actual_seed
+            request_inference_args.validation_noise_seed = actual_seed
+            request_inference_args.video_height = int(height)
+            request_inference_args.video_width = int(width)
+            request_inference_args.num_frames = int(num_frames)
+            # Keep both the value shown to the user and the backend value;
+            # both are written to the record and the logs.
+            display_resolution = str(resolution)
+            backend_resolution = normalize_resolution_for_backend(display_resolution, internal_task)
+            request_inference_args.resolution = backend_resolution
+            request_inference_args.save_path_gen = str(save_dir)
+            request_inference_args.task = internal_task
+            request_inference_args.text_template = TEXT_TEMPLATE
+            request_inference_args.prompt_data_dict = {}
+            try:
+                print(
+                    "[lance_gradio_t2v_v2t] Start generation "
+                    f"| task={internal_task} | gpu={self.device} | seed={actual_seed} | "
+                    f"size={height}x{width} | frames={num_frames} | resolution={display_resolution}",
+                    flush=True,
+                )
+                val_data_cpu = self._build_request_batch(
+                    prompt_file=prompt_file,
+                    model_args=request_model_args,
+                    data_args=request_data_args,
+                    inference_args=request_inference_args,
+                )
+                # `elapsed` covers only the validate_on_fixed_batch call.
+                generate_start = time.perf_counter()
+                validate_on_fixed_batch(
+                    fsdp_model=self.model,
+                    vae_model=self.vae_model,
+                    tokenizer=self.tokenizer,
+                    val_data_cpu=val_data_cpu,
+                    training_args=request_inference_args,
+                    model_args=request_model_args,
+                    inference_args=request_inference_args,
+                    new_token_ids=self.new_token_ids,
+                    image_token_id=self.image_token_id,
+                    device=self.device,
+                    save_source_video=False,
+                    save_path_gen=request_inference_args.save_path_gen,
+                    save_path_gt="",
+                )
+                elapsed = time.perf_counter() - generate_start
+                save_prompt_results(request_inference_args.prompt_data_dict, request_inference_args.save_path_gen, self.logger)
+                clean_memory()
+                # Outputs are discovered by scanning the output directory.
+                video_path = find_generated_video(save_dir) if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} else None
+                # Remember the pre-interpolation path; `video_path` may be
+                # replaced by the interpolated file below.
+                original_video_path = video_path
+                rife_log = ""
+                rife_error = ""
+                frame_interpolation_enabled = normalize_frame_interpolation(enable_frame_interpolation) and internal_task in {TASK_T2V, TASK_VIDEO_EDIT}
+                if frame_interpolation_enabled and video_path is not None:
+                    # Best effort: if interpolation raises, the traceback is
+                    # kept in `rife_error` and `video_path` still points at
+                    # the original video.
+                    try:
+                        clean_memory()
+                        print(
+                            "[rife] Start frame interpolation "
+                            f"| task={internal_task} | gpu={self.device} | input={video_path}",
+                            flush=True,
+                        )
+                        video_path, rife_log = run_rife_interpolation(video_path, self.device, exp=1)
+                    except Exception:
+                        rife_error = traceback.format_exc()
+                        print(rife_error, flush=True)
+                image_path = find_generated_image(save_dir) if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} else None
+                text_result = extract_text_result(save_dir) if internal_task in UNDERSTANDING_TASKS else ""
+                record = {
+                    "request_started_at": request_started_at,
+                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
+                    "status": "success",
+                    "task": internal_task,
+                    "model_variant": self.model_variant,
+                    "model_path": active_model_path,
+                    "gpu": self.device,
+                    "prompt": prompt,
+                    "system_prompt": normalize_understanding_system_prompt(internal_task, system_prompt)
+                    if internal_task in UNDERSTANDING_TASKS
+                    else "",
+                    "input_video": input_video,
+                    "input_image": input_image,
+                    "seed": actual_seed,
+                    "height": int(height),
+                    "width": int(width),
+                    "num_frames": int(num_frames),
+                    "resolution": display_resolution,
+                    "backend_resolution": backend_resolution,
+                    "validation_num_timesteps": int(validation_num_timesteps),
+                    "validation_timestep_shift": float(validation_timestep_shift),
+                    "cfg_text_scale": float(cfg_text_scale),
+                    "frame_interpolation": frame_interpolation_enabled,
+                    "elapsed_seconds": round(elapsed, 3),
+                    "prompt_file": str(prompt_file),
+                    "output_dir": str(save_dir),
+                    "original_video_path": str(original_video_path) if original_video_path is not None else "",
+                    "video_path": str(video_path) if video_path is not None else "",
+                    "image_path": str(image_path) if image_path is not None else "",
+                    "text_result": text_result,
+                    "rife_error": rife_error,
+                }
+                # Downgrade the status when inference finished without
+                # raising but the expected artifact is missing.
+                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} and video_path is None:
+                    record["status"] = "completed_without_video"
+                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} and image_path is None:
+                    record["status"] = "completed_without_image"
+                if internal_task in UNDERSTANDING_TASKS and not text_result:
+                    record["status"] = "completed_without_text"
+                save_generation_record(record, save_dir)
+                logs = "\n".join(
+                    [
+                        "[lance_gradio_t2v_v2t] Inference finished in-process.",
+                        f"task={internal_task}",
+                        f"model_variant={self.model_variant}",
+                        f"model_path={active_model_path}",
+                        f"gpu={self.device}",
+                        f"seed={actual_seed}",
+                        f"height={height}",
+                        f"width={width}",
+                        f"num_frames={num_frames}",
+                        f"resolution={display_resolution}",
+                        f"backend_resolution={backend_resolution}",
+                        f"validation_num_timesteps={validation_num_timesteps}",
+                        f"validation_timestep_shift={validation_timestep_shift}",
+                        f"cfg_text_scale={cfg_text_scale}",
+                        f"frame_interpolation={frame_interpolation_enabled}",
+                        f"original_video_path={original_video_path or ''}",
+                        f"rife_error={rife_error.strip() if rife_error else ''}",
+                        f"elapsed={elapsed:.2f}s",
+                        f"output_dir={save_dir}",
+                        rife_log,
+                    ]
+                )
+                if internal_task in {TASK_T2V, TASK_VIDEO_EDIT}:
+                    if video_path is None:
+                        status = (
+                            "Inference completed, but no output video was found.\n\n"
+                            f"- Task: `{internal_task}`\n"
+                            f"- Model: `{self.model_variant}`\n"
+                            f"- Model path: `{active_model_path}`\n"
+                            f"- GPU: `{self.device}`\n"
+                            f"- Actual seed: `{actual_seed}`\n"
+                            f"- Output directory: `{save_dir}`"
+                        )
+                        return None, None, "", status, logs
+                    # On success the status text is left empty; the request
+                    # details are returned in `logs` only.
+                    status = ""
+                    return str(video_path), None, "", status, logs
+                if internal_task in {TASK_T2I, TASK_IMAGE_EDIT}:
+                    if image_path is None:
+                        status = (
+                            "Inference completed, but no output image was found.\n\n"
+                            f"- Task: `{internal_task}`\n"
+                            f"- Model: `{self.model_variant}`\n"
+                            f"- Model path: `{active_model_path}`\n"
+                            f"- GPU: `{self.device}`\n"
+                            f"- Actual seed: `{actual_seed}`\n"
+                            f"- Output directory: `{save_dir}`"
+                        )
+                        return None, None, "", status, logs
+                    # On success the status text is left empty; the request
+                    # details are returned in `logs` only.
+                    status = ""
+                    return None, str(image_path), "", status, logs
+                # Remaining case: an understanding task. The status text is
+                # left empty here as well, even when `text_result` is empty.
+                status = ""
+                return None, None, text_result, status, logs
+            except Exception:
+                # Top-level boundary for the request: log the traceback,
+                # persist a "failed" record and report the error to the
+                # caller instead of raising.
+                error_trace = traceback.format_exc()
+                print(error_trace, flush=True)
+                record = {
+                    "request_started_at": request_started_at,
+                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
+                    "status": "failed",
+                    "task": internal_task,
+                    "model_variant": self.model_variant,
+                    "model_path": active_model_path,
+                    "gpu": self.device,
+                    "prompt": prompt,
+                    "input_video": input_video,
+                    "input_image": input_image,
+                    "seed": actual_seed,
+                    "height": int(height),
+                    "width": int(width),
+                    "num_frames": int(num_frames),
+                    "resolution": display_resolution,
+                    "backend_resolution": backend_resolution,
+                    "validation_num_timesteps": int(validation_num_timesteps),
+                    "validation_timestep_shift": float(validation_timestep_shift),
+                    "cfg_text_scale": float(cfg_text_scale),
+                    "prompt_file": str(prompt_file),
+                    "output_dir": str(save_dir),
+                    "video_path": "",
+                    "image_path": "",
+                    "text_result": "",
+                    "error": error_trace,
+                }
+                save_generation_record(record, save_dir)
+                status = (
+                    "Inference failed.\n\n"
+                    f"- Task: `{internal_task}`\n"
+                    f"- Model: `{self.model_variant}`\n"
+                    f"- Model path: `{active_model_path}`\n"
+                    f"- GPU: `{self.device}`\n"
+                    f"- Actual seed: `{actual_seed}`\n"
+                    f"- Resolution: `{display_resolution}`\n"
+                    f"- Output directory: `{save_dir}`"
+                )
+                return None, None, "", status, error_trace
+class PipelinePool:
+    def __init__(self, gpu_ids: list[int], model_variant: str = MODEL_VARIANT_VIDEO) -> None:
+        if not gpu_ids:
+            raise ValueError("At least one GPU must be configured.")
+        self.gpu_ids = gpu_ids
+        self.model_variant = normalize_model_variant(model_variant)
+        self.pipelines = [
+            LanceT2VV2TPipeline(device_id=gpu_id, model_variant=self.model_variant)
+            for gpu_id in gpu_ids
+        ]
+        self._available = deque(self.pipelines)
+        self._condition = threading.Condition()
+    @property
+    def size(self) -> int:
+        return len(self.pipelines)
+    @property
+    def gpu_summary(self) -> str:
+        return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
+    def initialize_all(self) -> None:
+        print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
+        exceptions: list[Exception] = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
+            futures = {
+                executor.submit(pipeline.initialize): pipeline.device for pipeline in self.pipelines
+            }
+            for future in concurrent.futures.as_completed(futures):
+                gpu_id = futures[future]
+                try:
+                    future.result()
+                except Exception as exc:
+                    print(f"[startup][gpu:{gpu_id}][{self.model_variant}] Preload failed: {exc}", flush=True)
+                    exceptions.append(exc)
+        if exceptions:
+            raise RuntimeError(
+                f"{self.model_variant} preload failed on {len(exceptions)} GPU(s). Please check the terminal logs."
+            ) from exceptions[0]
+        print(
+            f"[startup][{self.model_variant}] GPU preload finished. Ready to handle {self.size} concurrent request(s).",
+            flush=True,
+        )
+    def acquire(self) -> LanceT2VV2TPipeline:
+        with self._condition:
+            while not self._available:
+                self._condition.wait()
+            return self._available.popleft()
+    def release(self, pipeline: LanceT2VV2TPipeline) -> None:
+        with self._condition:
+            self._available.append(pipeline)
+            self._condition.notify()
+    def unload_all(self) -> None:
+        print(f"[runtime][{self.model_variant}] Unloading model pool from GPU(s): {self.gpu_ids}", flush=True)
+        with self._condition:
+            while len(self._available) != len(self.pipelines):
+                self._condition.wait()
+        for pipeline in self.pipelines:
+            pipeline.unload()
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+        print(f"[runtime][{self.model_variant}] Model pool unloaded.", flush=True)
+    def generate(
+        self,
+        task: str,
+        prompt: str,
+        system_prompt: Optional[str],
+        input_video: Optional[str],
+        input_image: Optional[str],
+        height: int,
+        width: int,
+        num_frames: int,
+        seed: int,
+        resolution: str,
+        validation_num_timesteps: int,
+        validation_timestep_shift: float,
+        cfg_text_scale: float,
+        enable_frame_interpolation: bool,
+    ):
+        pipeline = self.acquire()
+        try:
+            with get_gpu_runtime_lock(pipeline.device):
+                return pipeline.generate(
+                    task=task,
+                    prompt=prompt,
+                    system_prompt=system_prompt,
+                    input_video=input_video,
+                    input_image=input_image,
+                    height=height,
+                    width=width,
+                    num_frames=num_frames,
+                    seed=seed,
+                    resolution=resolution,
+                    validation_num_timesteps=validation_num_timesteps,
+                    validation_timestep_shift=validation_timestep_shift,
+                    cfg_text_scale=cfg_text_scale,
+                    enable_frame_interpolation=enable_frame_interpolation,
+                )
+        finally:
+            self.release(pipeline)
+# Loaded pipeline pools keyed by model variant. get_pipeline_pool reads and
+# writes this dict while holding ACTIVE_POOL_LOCK.
+ACTIVE_PIPELINE_POOLS: dict[str, PipelinePool] = {}
+ACTIVE_POOL_LOCK = threading.Lock()
+# One lock per GPU id, created on demand by get_gpu_runtime_lock;
+# GPU_RUNTIME_LOCKS_LOCK guards the dict itself.
+GPU_RUNTIME_LOCKS: dict[int, threading.Lock] = {}
+GPU_RUNTIME_LOCKS_LOCK = threading.Lock()
+# Shown as "Queue limit" by build_status_markdown.
+QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
+# Presumably the variants handed to preload_pipeline_pools at startup - the
+# call site is not in this part of the file; TODO confirm.
+PRELOAD_MODEL_VARIANTS = [MODEL_VARIANT_VIDEO, MODEL_VARIANT_IMAGE]
+def get_gpu_runtime_lock(device_id: int) -> threading.Lock:
+    with GPU_RUNTIME_LOCKS_LOCK:
+        lock = GPU_RUNTIME_LOCKS.get(device_id)
+        if lock is None:
+            lock = threading.Lock()
+            GPU_RUNTIME_LOCKS[device_id] = lock
+        return lock
+def get_task_model_variant(task: str) -> str:
+    internal_task = normalize_task(task)
+    return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
+def get_pipeline_pool(task: str) -> PipelinePool:
+    model_variant = get_task_model_variant(task)
+    with ACTIVE_POOL_LOCK:
+        pipeline_pool = ACTIVE_PIPELINE_POOLS.get(model_variant)
+        if pipeline_pool is not None:
+            return pipeline_pool
+        gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
+        print(
+            f"[runtime] Loading Lance {model_variant} model pool without unloading existing pools.",
+            flush=True,
+        )
+        pipeline_pool = PipelinePool(gpu_ids, model_variant=model_variant)
+        pipeline_pool.initialize_all()
+        ACTIVE_PIPELINE_POOLS[model_variant] = pipeline_pool
+        return pipeline_pool
+def preload_pipeline_pools(gpu_ids: list[int], model_variants: list[str]) -> None:
+    for model_variant in model_variants:
+        normalized_variant = normalize_model_variant(model_variant)
+        if normalized_variant in ACTIVE_PIPELINE_POOLS:
+            continue
+        resolved_model_path = ensure_model_assets(normalized_variant)
+        print(
+            f"[startup][{normalized_variant}] Using Lance model path: {resolved_model_path}",
+            flush=True,
+        )
+        pipeline_pool = PipelinePool(gpu_ids, model_variant=normalized_variant)
+        pipeline_pool.initialize_all()
+        ACTIVE_PIPELINE_POOLS[normalized_variant] = pipeline_pool
+def run_task(
+    task: str,
+    prompt: str,
+    system_prompt: Optional[str],
+    input_video: Optional[str],
+    input_image: Optional[str],
+    height: int,
+    width: int,
+    num_frames: int,
+    seed: int,
+    resolution: str,
+    validation_num_timesteps: int,
+    validation_timestep_shift: float,
+    cfg_text_scale: float,
+    enable_frame_interpolation: bool,
+):
+    internal_task = normalize_task(task)
+    if internal_task == TASK_T2V:
+        num_frames = video_seconds_to_num_frames(num_frames)
+    pipeline_pool = get_pipeline_pool(task)
+    return pipeline_pool.generate(
+        task=task,
+        prompt=prompt,
+        system_prompt=system_prompt,
+        input_video=input_video,
+        input_image=input_image,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        seed=seed,
+        resolution=resolution,
+        validation_num_timesteps=validation_num_timesteps,
+        validation_timestep_shift=validation_timestep_shift,
+        cfg_text_scale=cfg_text_scale,
+        enable_frame_interpolation=enable_frame_interpolation,
+    )
+def build_status_markdown() -> str:
+    gpu_text = "unknown"
+    concurrency = 1
+    loaded_variants = "none"
+    if ACTIVE_PIPELINE_POOLS:
+        loaded_variants = ",".join(sorted(ACTIVE_PIPELINE_POOLS))
+        gpu_ids = sorted({gpu_id for pool in ACTIVE_PIPELINE_POOLS.values() for gpu_id in pool.gpu_ids})
+        gpu_text = ",".join(str(gpu_id) for gpu_id in gpu_ids)
+        concurrency = len(gpu_ids)
+    return (
+        f"**Status**  GPU: `{gpu_text}`  |  Max concurrency: `{concurrency}`  |  "
+        f"Queue limit: `{QUEUE_MAX_SIZE}`  |  Loaded models: `{loaded_variants}`  |  "
+        f"Switch mode: `dual resident`"
+    )
+def get_logo_data_uri() -> str:
+    if not LANCE_LOGO_PATH.exists():
+        return ""
+    encoded_logo = base64.b64encode(LANCE_LOGO_PATH.read_bytes()).decode("ascii")
+    return f"data:image/webp;base64,{encoded_logo}"
+def build_header_html() -> str:
+    """Return the HTML for the page header.
+
+    The header contains the logo (omitted when ``get_logo_data_uri`` returns
+    an empty string), the title, the author list and four badge links built
+    from the ``LANCE_*_URL`` constants.
+    """
+    logo_data_uri = get_logo_data_uri()
+    # Emit no <img> tag at all when there is no logo data.
+    logo_html = (
+        f'<img class="lance-logo" src="{logo_data_uri}" alt="Lance logo">'
+        if logo_data_uri
+        else ""
+    )
+    return f"""
+    <div class="lance-hero">
+        {logo_html}
+        <h1 class="lance-title">Lance: Unified Multimodal Modeling by Multi-Task Synergy</h1>
+        <div class="lance-authors">
+            <strong>
+                <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" target="_blank">Fengyi Fu</a><sup>*</sup>,
+                <a href="https://corleone-huang.github.io/" target="_blank">Mengqi Huang</a><sup>*,✉</sup>,
+                <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" target="_blank">Shaojin Wu</a><sup>*</sup>,
+                Yunsheng Jiang<sup>*</sup>,
+                Yufei Huo,
+                <a href="https://guojianzhu.com/" target="_blank">Jianzhu Guo</a><sup>✉,§</sup>
+            </strong><br>
+            Hao Li, Yinghang Song, Fei Ding, Qian He, Zheren Fu, Zhendong Mao, Yongdong Zhang<br>
+            <em>ByteDance</em>
+        </div>
+        <div class="lance-badges">
+            <a href="{LANCE_HOMEPAGE_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="Homepage" src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat">
+            </a>
+            <a href="{LANCE_PAPER_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="Paper" src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv">
+            </a>
+            <a href="{LANCE_HUGGING_FACE_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="Hugging Face" src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface">
+            </a>
+            <a href="{LANCE_GITHUB_URL}" target="_blank" rel="noopener noreferrer">
+                <img alt="GitHub" src="https://img.shields.io/badge/Code-GitHub-536af5?color=536af5&logo=github">
+            </a>
+        </div>
+    </div>
+    """
+def update_task_ui(task: str):
+    internal_task = normalize_task(task)
+    is_image_task = internal_task in IMAGE_TASKS
+    is_video_task = internal_task in VIDEO_TASKS
+    is_edit_task = internal_task in EDIT_TASKS
+    is_understanding_task = internal_task in UNDERSTANDING_TASKS
+    is_generation_task = internal_task in GENERATION_TASKS
+    show_media_input = is_edit_task or is_understanding_task
+    resolution_choices = IMAGE_RESOLUTION_CHOICES if is_image_task else VIDEO_RESOLUTION_CHOICES
+    resolution_value = DEFAULT_IMAGE_RESOLUTION if is_image_task else DEFAULT_RESOLUTION
+    aspect_ratio_value = DEFAULT_IMAGE_ASPECT_RATIO if is_image_task else DEFAULT_VIDEO_ASPECT_RATIO
+    width_value, height_value = get_size_for_aspect_ratio(internal_task, aspect_ratio_value)
+    size_markdown = format_size_markdown(internal_task, width_value, height_value)
+    system_prompt_choices = get_understanding_system_prompt_choices(internal_task)
+    if is_generation_task:
+        text_label = "Prompt"
+        text_placeholder = "Describe what you want to generate..."
+    elif is_edit_task:
+        text_label = "Instruction"
+        text_placeholder = "Describe the edit you want..."
+    else:
+        text_label = "Question"
+        text_placeholder = "Ask a question about the input..."
+    return (
+        gr.update(
+            label=text_label,
+            placeholder=text_placeholder,
+            visible=True,
+        ),
+        gr.update(
+            choices=system_prompt_choices,
+            value=system_prompt_choices[0],
+            visible=False,
+        ),
+        gr.update(label="Input Video", visible=show_media_input and is_video_task),
+        gr.update(label="Input Image", visible=show_media_input and is_image_task),
+        gr.update(value=aspect_ratio_value, visible=is_generation_task or is_edit_task),
+        gr.update(value=height_value),
+        gr.update(value=width_value),
+        gr.update(value=size_markdown, visible=is_generation_task or is_edit_task),
+        gr.update(visible=internal_task == TASK_T2V, value=DEFAULT_VIDEO_DURATION_SECONDS if internal_task == TASK_T2V else 1),
+        gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}, value=DEFAULT_FRAME_INTERPOLATION),
+        gr.update(choices=resolution_choices, value=resolution_value, visible=False),
+        gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}),
+        gr.update(visible=internal_task in {TASK_T2I, TASK_IMAGE_EDIT}),
+        gr.update(visible=is_understanding_task, value=""),
+        gr.update(visible=internal_task == TASK_T2V),
+        gr.update(visible=internal_task == TASK_VIDEO_EDIT),
+        gr.update(visible=internal_task == TASK_X2T_VIDEO),
+        gr.update(visible=internal_task == TASK_T2I),
+        gr.update(visible=internal_task == TASK_IMAGE_EDIT),
+        gr.update(visible=internal_task == TASK_X2T_IMAGE),
+    )
+def keep_example_clicks_from_changing_visibility(*examples_components) -> None:
+    for examples_component in examples_components:
+        dataset = getattr(examples_component, "dataset", None)
+        component_props = getattr(dataset, "component_props", None)
+        if not component_props:
+            continue
+        for props in component_props:
+            props.pop("visible", None)
+def build_demo() -> gr.Blocks:
+    with gr.Blocks(title="Lance", css=APP_CSS) as demo:
+        gr.HTML(build_header_html())
+        gr.Markdown(build_status_markdown(), elem_classes=["lance-status"], visible=False)
+        with gr.Row(elem_classes=["lance-main-row"]):
+            with gr.Column(scale=1, elem_classes=["lance-main-column"]):
+                task = gr.Radio(
+                    label="Task",
+                    choices=TASK_CHOICES,
+                    value=TASK_LABEL_VIDEO_GENERATION,
+                    elem_classes=["task-selector"],
+                )
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    lines=6,
+                    placeholder="Describe the video you want to generate...",
+                )
+                system_prompt = gr.Dropdown(
+                    label="System Prompt",
+                    choices=get_understanding_system_prompt_choices(TASK_X2T_VIDEO),
+                    value=V2T_QA_SYSTEM_PROMPT,
+                    visible=False,
+                )
+                input_video = gr.Video(label="Input Video", visible=False, elem_classes=["lance-display-frame"])
+                input_image = gr.Image(label="Input Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
+                with gr.Row(elem_classes=["generation-controls-row"]):
+                    enable_frame_interpolation = gr.Dropdown(
+                        label="Frame Interpolation",
+                        choices=[FRAME_INTERPOLATION_YES, FRAME_INTERPOLATION_NO],
+                        value=DEFAULT_FRAME_INTERPOLATION,
+                        elem_classes=["generation-control", "generation-dropdown-control"],
+                        min_width=0,
+                    )
+                    seed = gr.Number(
+                        label="Seed (-1 for random seed)",
+                        value=DEFAULT_BASIC_SEED,
+                        precision=0,
+                        elem_classes=["generation-control", "generation-value-control"],
+                        min_width=0,
+                        # info="-1 for random seed",
+                    )
+                    aspect_ratio = gr.Dropdown(
+                        label="Aspect Ratio",
+                        # choices=ASPECT_RATIO_CHOICES, # 原始版本，不显示 是否为 default
+                        choices=get_aspect_ratio_choices_for_task(TASK_T2V),
+                        value=DEFAULT_VIDEO_ASPECT_RATIO,
+                        elem_classes=["generation-control", "generation-dropdown-control"],
+                        min_width=0,
+                    )
+                    # real_size = gr.Markdown(format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT))
+                    real_size = gr.Textbox(
+                        label="Output Resolution",
+                        value=format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT),
+                        interactive=False,
+                        elem_classes=["generation-control", "generation-value-control"],
+                        min_width=0,
+                    )
+                resolution = gr.Dropdown(
+                    label="Resolution",
+                    choices=RESOLUTION_CHOICES,
+                    value=DEFAULT_RESOLUTION,
+                    visible=False,
+                )
+                height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
+                width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
+                num_frames = gr.Slider(
+                    minimum=1,
+                    maximum=10,
+                    step=1,
+                    value=DEFAULT_VIDEO_DURATION_SECONDS,
+                    label="Video Duration (seconds)",
+                )
+                # seed = gr.Number(
+                #     label="Seed",
+                #     value=DEFAULT_BASIC_SEED,
+                #     precision=0,
+                #     info="-1 means using a random seed each time",
+                # )
+                with gr.Accordion("Advanced Parameters", open=False):
+                    validation_num_timesteps = gr.Slider(
+                        minimum=1,
+                        maximum=100,
+                        step=1,
+                        value=DEFAULT_TIMESTEPS,
+                        label="Validation Num Timesteps",
+                    )
+                    with gr.Row():
+                        validation_timestep_shift = gr.Number(
+                            label="Validation Timestep Shift",
+                            value=DEFAULT_TIMESTEP_SHIFT,
+                        )
+                        cfg_text_scale = gr.Number(
+                            label="CFG Text Scale",
+                            value=DEFAULT_CFG_TEXT_SCALE,
+                        )
+                generation_example_inputs = [
+                    prompt,
+                    input_video,
+                    input_image,
+                ]
+            with gr.Column(scale=1, elem_classes=["lance-main-column"]):
+                output_video = gr.Video(label="Output Video", elem_classes=["lance-display-frame"])
+                output_image = gr.Image(label="Output Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
+                output_text = gr.Textbox(label="Output Text", lines=8, visible=False, elem_classes=["lance-display-frame"])
+                status = gr.Markdown("WAITING TO RUN.")
+                logs = gr.Textbox(label="Run Logs", lines=22, max_lines=30)
+        run_button = gr.Button("🚀 Generate", variant="primary", elem_classes=["lance-run-button"])
+        with gr.Group(visible=True, elem_classes=["prompt-examples", "example-panel"]) as video_generation_examples_group:
+            gr.Markdown("### Video generation recommended cases", elem_classes=["recommended-title"])
+            video_generation_examples = gr.Dataset(
+                samples=VIDEO_GENERATION_EXAMPLES,
+                components=[gr.Textbox(label="Prompt", visible=False)],
+                headers=["Prompt"],
+                show_label=False,
+                type="values",
+                layout="table",
+                samples_per_page=len(VIDEO_GENERATION_EXAMPLES),
+                elem_classes=["prompt-dataset"],
+            )
+        with gr.Group(visible=False, elem_classes=["example-panel"]) as video_edit_examples_group:
+            gr.Markdown("### Video edit recommended cases", elem_classes=["recommended-title"])
+            video_edit_examples = gr.Examples(
+                examples=VIDEO_EDIT_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=3,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        with gr.Group(visible=False, elem_classes=["example-panel"]) as video_understanding_examples_group:
+            gr.Markdown("### Video understanding recommended cases", elem_classes=["recommended-title"])
+            video_understanding_examples = gr.Examples(
+                examples=VIDEO_UNDERSTANDING_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=4,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        with gr.Group(visible=False, elem_classes=["prompt-examples", "example-panel"]) as image_generation_examples_group:
+            gr.Markdown("### Image generation recommended cases", elem_classes=["recommended-title"])
+            image_generation_examples = gr.Dataset(
+                samples=IMAGE_GENERATION_EXAMPLES,
+                components=[gr.Textbox(label="Prompt", visible=False)],
+                headers=["Prompt"],
+                show_label=False,
+                type="values",
+                layout="table",
+                samples_per_page=len(IMAGE_GENERATION_EXAMPLES),
+                elem_classes=["prompt-dataset"],
+            )
+        with gr.Group(visible=False, elem_classes=["example-panel"]) as image_edit_examples_group:
+            gr.Markdown("### Image edit recommended cases", elem_classes=["recommended-title"])
+            image_edit_examples = gr.Examples(
+                examples=IMAGE_EDIT_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=5,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        with gr.Group(visible=False, elem_classes=["example-panel"]) as image_understanding_examples_group:
+            gr.Markdown("### Image understanding recommended cases", elem_classes=["recommended-title"])
+            image_understanding_examples = gr.Examples(
+                examples=IMAGE_UNDERSTANDING_EXAMPLES,
+                inputs=generation_example_inputs,
+                label="",
+                examples_per_page=4,
+                cache_examples=False,
+                preprocess=False,
+                postprocess=False,
+            )
+        keep_example_clicks_from_changing_visibility(
+            video_generation_examples,
+            video_edit_examples,
+            video_understanding_examples,
+            image_generation_examples,
+            image_edit_examples,
+            image_understanding_examples,
+        )
+        task.change(
+            fn=update_task_ui,
+            inputs=[task],
+            outputs=[
+                prompt,
+                system_prompt,
+                input_video,
+                input_image,
+                aspect_ratio,
+                height,
+                width,
+                real_size,
+                num_frames,
+                enable_frame_interpolation,
+                resolution,
+                output_video,
+                output_image,
+                output_text,
+                video_generation_examples_group,
+                video_edit_examples_group,
+                video_understanding_examples_group,
+                image_generation_examples_group,
+                image_edit_examples_group,
+                image_understanding_examples_group,
+            ],
+        )
+        aspect_ratio.change(
+            fn=update_size_from_aspect_ratio,
+            inputs=[task, aspect_ratio],
+            outputs=[height, width, real_size],
+            queue=False,
+            show_api=False,
+        )
+        for examples_component in (video_edit_examples, video_understanding_examples, image_edit_examples, image_understanding_examples):
+            examples_component.load_input_event.then(
+                fn=reset_generation_defaults_for_task,
+                inputs=[task],
+                outputs=[aspect_ratio, height, width, num_frames, resolution, real_size],
+                queue=False,
+                show_api=False,
+            )
+        video_generation_examples.select(
+            fn=apply_prompt_example,
+            inputs=[task],
+            outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
+            queue=False,
+            show_api=False,
+        )
+        image_generation_examples.select(
+            fn=apply_prompt_example,
+            inputs=[task],
+            outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
+            queue=False,
+            show_api=False,
+        )
+        run_button.click(
+            fn=run_task,
+            inputs=[
+                task,
+                prompt,
+                system_prompt,
+                input_video,
+                input_image,
+                height,
+                width,
+                num_frames,
+                seed,
+                resolution,
+                validation_num_timesteps,
+                validation_timestep_shift,
+                cfg_text_scale,
+                enable_frame_interpolation,
+            ],
+            outputs=[output_video, output_image, output_text, status, logs],
+        )
+    return demo
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Lance multimodal Gradio")
+    parser.add_argument("--server-name", default=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"))
+    parser.add_argument("--server-port", type=int, default=int(os.getenv("GRADIO_SERVER_PORT", "7860")))
+    parser.add_argument("--share", action="store_true", default=env_flag("GRADIO_SHARE", False))
+    parser.add_argument(
+        "--gpus",
+        default=os.getenv("LANCE_GPUS", DEFAULT_GPUS),
+        help="Comma-separated GPU list, for example: 0,1,2,3,4,5,6",
+    )
+    parser.add_argument(
+        "--queue-size",
+        type=int,
+        default=int(os.getenv("LANCE_QUEUE_SIZE", str(DEFAULT_QUEUE_SIZE))),
+        help="Maximum number of queued Gradio requests.",
+    )
+    return parser.parse_args()
+def parse_gpu_ids(gpu_string: str) -> list[int]:
+    gpu_ids: list[int] = []
+    for item in gpu_string.split(","):
+        item = item.strip()
+        if not item:
+            continue
+        gpu_ids.append(int(item))
+    if not gpu_ids:
+        raise ValueError("No valid GPU IDs were parsed.")
+    return gpu_ids
+if __name__ == "__main__":
+    # Script entry point: parse CLI options, preload the pipeline pools for the
+    # requested GPUs, build the Gradio UI and launch it behind a bounded queue.
+    args = parse_args()
+    # Publish the selected GPU list through the environment
+    # (presumably read elsewhere in this file -- TODO confirm).
+    os.environ["LANCE_GPUS"] = args.gpus
+    # Module-level name rebound from the CLI value
+    # (presumably consumed by request handlers -- TODO confirm).
+    QUEUE_MAX_SIZE = args.queue_size
+    gpu_ids = parse_gpu_ids(args.gpus)
+    preload_pipeline_pools(gpu_ids, PRELOAD_MODEL_VARIANTS)
+    # Default concurrency equals the number of listed GPUs, never below 1.
+    default_concurrency_limit = max(1, len(gpu_ids))
+    demo = build_demo()
+    demo.queue(
+        max_size=args.queue_size,
+        default_concurrency_limit=default_concurrency_limit,
+    ).launch(
+        server_name=args.server_name,
+        server_port=args.server_port,
+        share=args.share,
+    )

assets/image-understanding/cases/image-understanding-case-02.png ADDED Viewed

Git LFS Details

SHA256: b6401dab4435a094d7b4bc31924bc85331f7d4e75f43f114b728522095e13365
Pointer size: 130 Bytes
Size of remote file: 50.3 kB

assets/image-understanding/cases/image-understanding-case-05.png ADDED Viewed

Git LFS Details

SHA256: 3853e3af85824429178a14a63255eba6f0e2e44dd47f7cc64c27fe3eefe765cb
Pointer size: 131 Bytes
Size of remote file: 949 kB

assets/image-understanding/cases/image-understanding-case-06.png ADDED Viewed

Git LFS Details

SHA256: d16216657d25789af4e6f7ef68adfea7f161bf944b65f9a6657c9b019bea34b3
Pointer size: 131 Bytes
Size of remote file: 223 kB

assets/logo/lance-logo.webp ADDED Viewed

Git LFS Details

SHA256: 5359f986a6a29e25b4eb92fd470e74c83d92581dc6fb22d2c4ac789637842934
Pointer size: 131 Bytes
Size of remote file: 461 kB

assets/video-understanding/videos/video-understanding-caption-long-01.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f070eefe27dc3f3d065926345299b996124dc1ee4372c223164ddfd0792ce1a
+size 5318845

assets/video-understanding/videos/video-understanding-caption-short-01.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5fcb4c18846571444ae024331a64e8740716e3b151f3e05a0d901b405b608da6
+size 2209818

assets/video-understanding/videos/video-understanding-vqa-01.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4f382ee52b21942d7840eef2843bf5c57ed4e5ff4bb958e2c4fa23635030c02b
+size 2673972

benchmarks/image_gen/DPG/DPG.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

benchmarks/image_gen/DPG/README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+[Chinese Version](./README_zh.md)
+# DPG Image Generation Evaluation
+Benchmark evaluation scripts for DPG based on the Lance model.
+## Files
+- `sample_DPG.py` - Python inference script
+- `sample_DPG.sh` - Launch script
+- `DPG.jsonl` - Evaluation dataset
+## Quick Start
+### Basic Usage
+```bash
+bash benchmarks/image_gen/DPG/sample_DPG.sh
+```
+Before running, edit the "Inference Parameters" section at the top of `benchmarks/image_gen/DPG/sample_DPG.sh`.
+## Parameters
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | Task type. DPG is fixed to image generation. |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | Number of images generated per case for the final grid. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | Image resolution. |
+| `MODEL_PATH` | `downloads/Lance_3B` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/DPG/DPG.jsonl` | Path to the evaluation data. |
+## How To Modify
+- Edit the "Inference Parameters" section at the top of `benchmarks/image_gen/DPG/sample_DPG.sh`.
+- After updating the parameters, run `bash benchmarks/image_gen/DPG/sample_DPG.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+## Output Format
+Results are saved in a structure like this:
+```
+results/DPG_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 0.png
+├── 1.png
+├── 2.png
+└── ...
+```

benchmarks/image_gen/DPG/README_zh.md ADDED Viewed

	@@ -0,0 +1,57 @@

+[English Version](./README.md)
+# DPG 图像生成评估
+基于 Lance 模型的 DPG 评估基准测试脚本。
+## 文件说明
+- `sample_DPG.py` - 推理 Python 脚本
+- `sample_DPG.sh` - 启动脚本
+- `DPG.jsonl` - 评估数据集
+## 快速开始
+### 基本用法
+```bash
+bash benchmarks/image_gen/DPG/sample_DPG.sh
+```
+运行前请直接修改 `benchmarks/image_gen/DPG/sample_DPG.sh` 顶部的“推理参数配置”区。
+## 参数说明
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | 任务类型，DPG 固定为图像生成 |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | 每个 case 生成的图像数量，用于拼接最终网格图 |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | 图像分辨率 |
+| `MODEL_PATH` | `downloads/Lance_3B` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/DPG/DPG.jsonl` | 评估数据路径 |
+## 修改方式
+- 请手动编辑 `benchmarks/image_gen/DPG/sample_DPG.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/image_gen/DPG/sample_DPG.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+## 保存格式
+结果会按照以下结构保存：
+```
+results/DPG_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 0.png
+├── 1.png
+├── 2.png
+└── ...
+```

benchmarks/image_gen/DPG/sample_DPG.py ADDED Viewed

	@@ -0,0 +1,509 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+import os.path as osp
+from copy import deepcopy
+from typing import Tuple, cast, Optional
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from safetensors.torch import load_file
+from PIL import Image
+from torchvision.utils import make_grid
+import numpy as np
+from tqdm import trange
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.lance import LanceConfig, Lance, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from common.utils.misc import tuple_mul, AutoEncoderParams
+from common.utils.logging import get_logger
+from common.val.utils import make_padded_latent
+from data.datasets_custom import ValidationDataset
+from config.config_factory import ModelArguments, DataArguments, TrainingArguments, EvaluationArguments, get_model_path
+def init_from_vlm_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments, log_rank0):
+    # NOTE: 初始化加载VLM模型走这里
+    def load_safetensors_state_dict(folder_path):
+        # 只选取safetensors文件，按文件名排序保证顺序
+        safetensor_files = sorted(
+            f for f in os.listdir(folder_path) if f.endswith(".safetensors")
+        )
+        state_dict = {}
+        for filename in safetensor_files:
+            file_path = osp.join(folder_path, filename)
+            state_dict.update(load_file(file_path))
+        return state_dict
+    state_dict = load_safetensors_state_dict(model_args.llm_path)
+    # 参数名的更改以适配Lance的参数名
+    for k in list(state_dict.keys()):
+        if "visual" in k:  # ViT and connector
+            state_dict[k.replace("visual", "vit_model")] = state_dict.pop(k)
+        else:
+            # 添加language_model前缀
+            state_dict["language_model." + k] = state_dict.pop(k)
+    result = model.load_state_dict(state_dict, strict=False)
+    clean_memory(state_dict)
+def init_from_model_path_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments):
+    # 统一从 model_path 加载训练好的 Lance checkpoint。
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+    model_path_ft = None
+    if osp.exists(model_path):
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+    # NOTE: position embeds are fixed sinusoidal embeddings, so we can just pop it off,
+    # which makes it easier to adapt to different resolutions.
+    if 'latent_pos_embed.pos_embed' in model_state_dict:
+        model_state_dict.pop('latent_pos_embed.pos_embed')
+    msg = model.load_state_dict(model_state_dict, strict=False)
+    clean_memory(model_state_dict)
+    return msg
+def clean_memory(*objects):
+    """清理内存并释放 GPU 缓存"""
+    for obj in objects:
+        del obj
+    import gc
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+def decode_video_tensor_for_dpg(v_list):
+    """
+    专门为 DPG 解码视频张量，保持原有的保存格式
+    """
+    N_target = len(v_list)
+    if N_target != 1:
+        from einops import rearrange
+        padded_videos_latent = [v.permute(1, 0, 2, 3) for v in v_list]
+        v_tc_hw = rearrange(padded_videos_latent, "n t c h w -> t c h (n w)")
+    else:
+        v_tc_hw = v_list[0].permute(1, 0, 2, 3)
+    v_tc_hw = v_tc_hw.float().clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round().clamp(0, 255).to(torch.uint8)
+    return v_tc_hw
+def resolve_dpg_paths(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+) -> None:
+    if not model_args.model_path:
+        raise ValueError("DPG requires --model_path to be provided explicitly.")
+    if not model_args.llm_path:
+        model_args.llm_path = model_args.model_path
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+    if not data_args.val_dataset_config_file:
+        data_args.val_dataset_config_file = get_model_path("dpg.data")
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    tokenizer: Qwen2Tokenizer,
+    val_data_cpu: dict,
+    training_args: TrainingArguments,
+    model_args: ModelArguments,
+    data_args: DataArguments,
+    inference_args: EvaluationArguments,
+    curr_step: int,
+    logger,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_source_video: bool = False,
+    save_path_gen: str = "",
+    save_path_gt: str = "",
+    sample_num_per_prompt: int = 1,
+):
+    """
+    验证逻辑，保持与原文件相同的保存格式
+    """
+    # 检查是否初始化了分布式环境
+    if dist.is_initialized():
+        is_rank0 = (dist.get_rank() == 0)
+    else:
+        is_rank0 = True
+    log_rank0 = logger.info if is_rank0 else (lambda *_: None)
+    val_data = val_data_cpu.cuda(device).to_dict()
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        # 计算 padded_latent
+        if "padded_videos" in val_data.keys():
+            val_data["padded_latent"] = make_padded_latent(val_data["padded_videos"], val_data["vae_data_mode"], vae_model)
+        # -------------------- GEN 分支 --------------------
+        tensor_list_for_grid = []
+        loop_iterator = trange(sample_num_per_prompt) if is_rank0 else range(sample_num_per_prompt)
+        # 支持断点重新生成
+        save_name = f"{save_path_gen}/{val_data['index']}.png"
+        if os.path.exists(save_name):
+            return None
+        for sample_num_per_prompt_index in loop_iterator:
+            # 采样生成（保持原参数）
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+                "timestep_shift": training_args.validation_timestep_shift,
+                "num_timesteps": training_args.validation_num_timesteps,
+                "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+                "val_padded_latent": val_data["padded_latent"],
+                "video_sizes": val_data["video_sizes"],
+                "cfg_text_scale": model_args.cfg_text_scale,
+                "cfg_interval": training_args.cfg_interval,
+                "cfg_renorm_min": training_args.cfg_renorm_min,
+                "cfg_renorm_type": training_args.cfg_renorm_type,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "max_samples": training_args.validation_max_samples,
+                "validation_noise_seed": training_args.validation_noise_seed + sample_num_per_prompt_index,
+                "apply_chat_template": training_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+                "image_token_id": image_token_id,
+                "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+                "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+                "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+                "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+                "video_grid_thw": val_data.get("video_grid_thw", None),
+                "caption": val_data.get("caption", None),
+                "sample_task": val_data["sample_task"],
+                "sample_modality": val_data["sample_modality"],
+                "cfg_type": training_args.cfg_type,
+                "cfg_uncond_token_id": training_args.cfg_uncond_token_id,
+                "index": val_data["index"],
+                "val_padded_videos": val_data["padded_videos"] if save_source_video else None,
+            }
+            if training_args.use_KVcache:
+                denoise_latent, captions, padded_videos, index = fsdp_model.validation_gen_KVcache(**params)
+            else:
+                denoise_latent, captions, padded_videos, index = fsdp_model.validation_gen(**params)
+            # 解码 + 保存
+            for i_val, latent in enumerate(denoise_latent):
+                v_list = [vae_model.vae_decode([latent_])[0] for latent_ in latent]
+                # 保持与原文件相同的保存格式
+                v_thwc = decode_video_tensor_for_dpg(v_list)
+                # 直接取第0帧
+                if v_thwc.shape[0] == 1:
+                    tensor_list_for_grid.append(v_thwc.squeeze(0).cpu())
+                else:
+                    raise NotImplementedError("需要保存图像")
+    # 保持原有的保存格式
+    grid_tensor = make_grid(tensor_list_for_grid, nrow=int(np.sqrt(sample_num_per_prompt)), padding=0, pad_value=255)
+    grid_numpy = grid_tensor.permute(1, 2, 0).numpy()
+    Image.fromarray(grid_numpy).save(save_name)
+def main():
+    # ========================= Env setup ==============================
+    assert torch.cuda.is_available()
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        GLOBAL_RANK = dist.get_rank()
+        WORLD_SIZE = dist.get_world_size()
+    else:
+        GLOBAL_RANK = 0
+        WORLD_SIZE = 1
+    LOCAL_RANK = GLOBAL_RANK % torch.cuda.device_count()
+    DEVICE = LOCAL_RANK
+    torch.cuda.set_device(DEVICE)
+    # ========================= Args and logger setup ==============================
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(
+        Tuple[ModelArguments, DataArguments, EvaluationArguments],
+        parser.parse_args_into_dataclasses(),
+    )
+    training_args = inference_args
+    # ========================= DPG 路径解析 ==============================
+    resolve_dpg_paths(model_args, data_args)
+    # NOTE validation_noise_seed 与 validation_data_seed 相同
+    training_args.validation_noise_seed = inference_args.evaluation_seed
+    training_args.validation_data_seed = inference_args.evaluation_seed
+    logger = get_logger()
+    log_rank0 = print if GLOBAL_RANK == 0 else (lambda *_: None)
+    # Set seed:
+    seed = training_args.global_seed * WORLD_SIZE + GLOBAL_RANK
+    set_seed(seed)
+    # ========================= LLM model setup ==============================
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = training_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = training_args.apply_qwen_2_5_vl_pos_emb
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+    if training_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+        clean_memory(vit_weights)
+    if training_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: AutoEncoderParams = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+    # Lance的配置
+    config = LanceConfig(
+        visual_gen=training_args.visual_gen,
+        visual_und=training_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if training_args.visual_und else None,
+        vae_config=vae_config if training_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=training_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if training_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=training_args,
+    )
+    model = model.to(DEVICE)
+    # Setup tokenizer for model:
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+    # 在加载ckpt前，初始化moe
+    if training_args.copy_init_moe:
+        language_model.init_moe()
+    init_from_model_path_if_needed(model, model_args)
+    # 现在再 resize
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        from common.model.hacks import hack_qwen2_5_vl_config
+        language_model = hack_qwen2_5_vl_config(language_model)
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+    if model_args.tie_word_embeddings:
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_world_embeddings 冲突'
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
+    model.eval()
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+    # Setup packed dataloader - 直接初始化简单的 DataConfig 对象
+    dataset_config = DataConfig(grouped_datasets={})
+    # 配置基本参数
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+    # 配置 VIT 相关参数
+    if training_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+    # 配置 VAE 相关参数
+    if training_args.visual_gen and vae_config:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        vae_downsample = tuple_mul(
+            model_args.latent_patch_size, (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial)
+        )
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = vae_downsample
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+    # fix: 共享dropout
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+    # 创建数据集
+    val_dataset = ValidationDataset(
+        jsonl_path= data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=training_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+        local_rank=GLOBAL_RANK,
+        world_size=WORLD_SIZE,
+    )
+    val_loader = DataLoader(
+            val_dataset,
+            batch_size=1,
+            num_workers=0,
+            pin_memory=True,
+            collate_fn=simple_custom_collate,
+            drop_last=True,
+            prefetch_factor=None,
+            persistent_workers=False,
+            multiprocessing_context=None,
+        )
+    val_loader_iter = iter(val_loader)
+    if not os.path.exists(inference_args.save_path_gen):
+        os.makedirs(inference_args.save_path_gen, exist_ok=True)
+    # 主循环
+    from tqdm import tqdm
+    import time
+    from datetime import datetime, timedelta
+    total_batches = len(val_loader)
+    pbar = tqdm(total=total_batches, desc="Validating", unit="batch", leave=True, ncols=120, disable=(GLOBAL_RANK != 0))
+    start_time = time.time()
+    for i in range(total_batches):
+        val_data_cpu = next(val_loader_iter)
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            tokenizer=tokenizer,
+            val_data_cpu=val_data_cpu,
+            training_args=training_args,
+            model_args=model_args,
+            data_args=data_args,
+            inference_args=inference_args,
+            curr_step=0,
+            logger=logger,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=DEVICE,
+            save_source_video=False,
+            save_path_gen=inference_args.save_path_gen,
+            save_path_gt="",
+            sample_num_per_prompt=inference_args.sample_num_per_prompt,
+        )
+        if GLOBAL_RANK == 0:
+            elapsed = time.time() - start_time
+            avg_time = elapsed / (i + 1)
+            eta_seconds = avg_time * (total_batches - i - 1)
+            expected_finish = datetime.now() + timedelta(seconds=eta_seconds)
+            finish_str = expected_finish.strftime('%Y-%m-%d %H:%M:%S')
+            pbar.set_postfix_str(f"ETA: {timedelta(seconds=int(eta_seconds))} | Finish: {finish_str}")
+            pbar.update(1)
+    if GLOBAL_RANK == 0:
+        pbar.close()
+    if dist.is_initialized():
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

benchmarks/image_gen/DPG/sample_DPG.sh ADDED Viewed

	@@ -0,0 +1,113 @@

+#!/bin/bash
+# Launcher for the DPG text-to-image benchmark.
+# Sources the shared helpers from benchmarks/sample_env.sh, prints the run
+# configuration, then starts sample_DPG.py through `accelerate launch`.
+# The script exits with the launcher's status so a failed run is not
+# reported as finished.
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+source "$SCRIPT_DIR/../../sample_env.sh"
+# ========================= Inference parameters (推理参数配置) =========================
+TASK_NAME="t2i"
+NUM_GPUS=8
+VALIDATION_NUM_TIMESTEPS=50
+VALIDATION_TIMESTEP_SHIFT=3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+SAMPLE_NUM_PER_PROMPT=4
+USE_KVCACHE=true
+VIDEO_HEIGHT=768
+VIDEO_WIDTH=768
+MODEL_PATH="downloads/Lance_3B"
+VAL_DATASET_CONFIG_FILE="benchmarks/image_gen/DPG/DPG.jsonl"
+# ========================= Auto-generated paths =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="kvcache_"
+fi
+# The result directory name encodes the key parameters plus a timestamp.
+SAVE_PATH_GEN="results/DPG_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH"
+    exit 1
+fi
+# ============================== Environment and distributed setup ==============================
+# These helpers come from sample_env.sh; the launch command below reads
+# NUM_MACHINES, TOTAL_RANK, MACHINE_RANK, MAIN_PROCESS_IP, MAIN_PROCESS_PORT,
+# NUM_REPLICATE and NUM_SHARD, which are presumably exported by them.
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+# ========================= Show the task configuration =========================
+echo "================================================"
+echo "DPG T2I 推理"
+echo "================================================"
+echo "GPU数量: ${NUM_GPUS}"
+echo "保存路径: ${SAVE_PATH_GEN}"
+echo "分辨率: ${VIDEO_HEIGHT}x${VIDEO_WIDTH}"
+echo "模型路径: ${MODEL_PATH}"
+if [ -n "$VAL_DATASET_CONFIG_FILE" ]; then
+    echo "数据路径: ${VAL_DATASET_CONFIG_FILE}"
+fi
+echo ""
+echo "关键参数："
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - evaluation_seed: ${EVALUATION_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]"
+echo "  - sample_num_per_prompt: ${SAMPLE_NUM_PER_PROMPT}"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+# ============================== Run inference ==============================
+# Note: change settings in the "Inference parameters" (推理参数配置) section at the top of this script.
+# Expansions are quoted so a value containing spaces or glob characters
+# is passed to the launcher as a single argument.
+accelerate launch \
+    --num_machines          "$NUM_MACHINES"         \
+    --num_processes         "$TOTAL_RANK"           \
+    --machine_rank          "$MACHINE_RANK"         \
+    --main_process_ip       "$MAIN_PROCESS_IP"      \
+    --main_process_port     "$MAIN_PROCESS_PORT"    \
+    --mixed_precision       bf16                    \
+    benchmarks/image_gen/DPG/sample_DPG.py         \
+    --model_path            "$MODEL_PATH" \
+    --val_dataset_config_file "$VAL_DATASET_CONFIG_FILE" \
+    --vit_type              qwen_2_5_vl_original \
+    --llm_qk_norm           true \
+    --llm_qk_norm_und       true \
+    --llm_qk_norm_gen       true \
+    --tie_word_embeddings   false \
+    --validation_num_timesteps "$VALIDATION_NUM_TIMESTEPS" \
+    --validation_timestep_shift "$VALIDATION_TIMESTEP_SHIFT" \
+    --copy_init_moe         true \
+    --use_flex              true \
+    --max_num_frames        1 \
+    --max_latent_size       64 \
+    --latent_patch_size     1 1 1 \
+    --num_replicate         "$NUM_REPLICATE" \
+    --num_shard             "$NUM_SHARD" \
+    --visual_und            true \
+    --visual_gen            true \
+    --vae_model_type        wan \
+    --apply_qwen_2_5_vl_pos_emb  true \
+    --apply_chat_template   false \
+    --cfg_type              0 \
+    --validation_data_seed  "$EVALUATION_SEED" \
+    --video_height          "$VIDEO_HEIGHT" \
+    --video_width           "$VIDEO_WIDTH" \
+    --task                  "$TASK_NAME" \
+    --save_path_gen         "$SAVE_PATH_GEN" \
+    --resolution            image_768res \
+    --text_template         true \
+    --sample_num_per_prompt "$SAMPLE_NUM_PER_PROMPT" \
+    --cfg_text_scale        "$CFG_TEXT_SCALE" \
+    --cfg_interval          "$CFG_INTERVAL_START" "$CFG_INTERVAL_END" \
+    --use_KVcache           "$USE_KVCACHE"
+# Capture the launcher's status immediately; without this check the
+# completion banner below was printed even when inference had failed.
+LAUNCH_STATUS=$?
+if [ "$LAUNCH_STATUS" -ne 0 ]; then
+    echo "错误: 推理失败 (退出码: ${LAUNCH_STATUS})" >&2
+    exit "$LAUNCH_STATUS"
+fi
+echo ""
+echo "================================================"
+echo "完成! 结果: ${SAVE_PATH_GEN}"
+echo "================================================"

benchmarks/image_gen/GEdit/GEdit_en.json ADDED Viewed

The diff for this file is too large to render. See raw diff

benchmarks/image_gen/GEdit/README.md ADDED Viewed

	@@ -0,0 +1,68 @@

+[Chinese Version](./README_zh.md)
+# GEdit Image Editing Evaluation
+Benchmark evaluation scripts for GEdit based on the Lance model.
+## Files
+- `sample_GEdit.py` - Python inference script
+- `sample_GEdit.sh` - Launch script
+- `GEdit_en.json` - Evaluation dataset
+## Quick Start
+### Basic Usage
+```bash
+bash benchmarks/image_gen/GEdit/sample_GEdit.sh
+```
+Before running, edit the "Inference Parameters" section (headed `推理参数配置` in the script) at the top of `benchmarks/image_gen/GEdit/sample_GEdit.sh`.
+Follow the instructions at `https://github.com/stepfun-ai/Step1X-Edit` to download the GEdit-Bench source images, and place all of them in `benchmarks/image_gen/GEdit/images/`.
+## Parameters
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `image_edit` | Task type. GEdit is fixed to image editing. |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
+| `MODEL_PATH` | `downloads/Lance_3B` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GEdit/GEdit_en.json` | Path to the evaluation data. |
+## How To Modify
+- Edit the "Inference Parameters" section (headed `推理参数配置` in the script) at the top of `benchmarks/image_gen/GEdit/sample_GEdit.sh`.
+- After updating the parameters, run `bash benchmarks/image_gen/GEdit/sample_GEdit.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+## Output Format
+Results are saved in a structure like this:
+```
+results/GEdit_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── fullset/
+│   ├── add/
+│   │   ├── en/
+│   │   │   ├── 000001.webp
+│   │   │   └── ...
+│   ├── remove/
+│   │   └── en/
+│   │       └── ...
+└── prompt.json
+```
+Each case generates one edited image by default, saved as `fullset/<task_type>/<instruction_language>/<key>.webp` inside the result directory. A `prompt.json` file is also written at the top of the result directory; it records the generated text for each saved image.
+## Notes
+- If you need to switch the model, dataset, or resolution, edit the script configuration at the top directly.
+- The default result directory automatically includes key parameters and a timestamp for easier experiment tracking.

benchmarks/image_gen/GEdit/README_zh.md ADDED Viewed

	@@ -0,0 +1,67 @@

+[English Version](./README.md)
+# GEdit 图像编辑评估
+基于 Lance 模型的 GEdit 评估基准测试脚本。
+## 文件说明
+- `sample_GEdit.py` - 推理 Python 脚本
+- `sample_GEdit.sh` - 启动脚本
+- `GEdit_en.json` - 评估数据集
+## 快速开始
+### 基本用法
+```bash
+bash benchmarks/image_gen/GEdit/sample_GEdit.sh
+```
+运行前请直接修改 `benchmarks/image_gen/GEdit/sample_GEdit.sh` 顶部的“推理参数配置”区。
+请参考 `https://github.com/stepfun-ai/Step1X-Edit` 下载 GEdit-Bench 的源图，并将所有图片放到 `benchmarks/image_gen/GEdit/images/` 中。
+## 参数说明
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `image_edit` | 任务类型，GEdit 固定为图像编辑 |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `MODEL_PATH` | `downloads/Lance_3B` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GEdit/GEdit_en.json` | 评估数据路径 |
+## 修改方式
+- 请手动编辑 `benchmarks/image_gen/GEdit/sample_GEdit.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/image_gen/GEdit/sample_GEdit.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+## 保存格式
+结果会按照以下结构保存：
+```
+results/GEdit_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── fullset/
+│   ├── add/
+│   │   ├── en/
+│   │   │   ├── 000001.webp
+│   │   │   └── ...
+│   ├── remove/
+│   │   └── en/
+│   │       └── ...
+├── prompt.json
+```
+每个 case 默认生成 1 张编辑结果图，保存为结果目录下的 `fullset/<task_type>/<instruction_language>/<key>.webp`；同时会在结果目录顶层额外写出 `prompt.json`，用于记录每张结果图对应的生成文本。
+## 注意事项
+- 如果需要切换模型、数据集或分辨率，请直接修改脚本顶部配置。
+- 默认结果目录会自动包含关键参数和时间戳，方便区分不同实验。

benchmarks/image_gen/GEdit/sample_GEdit.py ADDED Viewed

	@@ -0,0 +1,425 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+import os.path as osp
+from copy import deepcopy
+import json
+from typing import Tuple, cast, Optional
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from safetensors.torch import load_file
+from PIL import Image
+from tqdm import trange
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.lance import LanceConfig, Lance, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from common.utils.misc import tuple_mul, AutoEncoderParams
+from common.val.utils import make_padded_latent, decode_video_tensor
+from data.datasets_custom import ValidationDataset
+from config.config_factory import ModelArguments, DataArguments, TrainingArguments, EvaluationArguments, get_model_path
+def init_from_vlm_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments, log_rank0):
+    def load_safetensors_state_dict(folder_path):
+        safetensor_files = sorted(
+            f for f in os.listdir(folder_path) if f.endswith(".safetensors")
+        )
+        state_dict = {}
+        for filename in safetensor_files:
+            file_path = osp.join(folder_path, filename)
+            state_dict.update(load_file(file_path))
+        return state_dict
+    state_dict = load_safetensors_state_dict(model_args.llm_path)
+    for k in list(state_dict.keys()):
+        if "visual" in k:
+            state_dict[k.replace("visual", "vit_model")] = state_dict.pop(k)
+        else:
+            state_dict["language_model." + k] = state_dict.pop(k)
+    result = model.load_state_dict(state_dict, strict=False)
+    del state_dict
+    import gc; gc.collect(); torch.cuda.empty_cache()
+    return result
+def init_from_model_path_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments):
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+    model_path_ft = None
+    if osp.exists(model_path):
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+    if 'latent_pos_embed.pos_embed' in model_state_dict:
+        model_state_dict.pop('latent_pos_embed.pos_embed')
+    msg = model.load_state_dict(model_state_dict, strict=False)
+    del model_state_dict
+    import gc; gc.collect(); torch.cuda.empty_cache()
+    return msg
+def save_prompt_results(prompt_data_dict, save_path_gen):
+    prompt_json_path = os.path.join(save_path_gen, "prompt.json")
+    with open(prompt_json_path, 'w', encoding='utf-8') as f:
+        json.dump(prompt_data_dict, f, ensure_ascii=False, indent=2)
+def resolve_gedit_paths(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+) -> None:
+    if not model_args.model_path:
+        raise ValueError("GEdit requires --model_path to be provided explicitly.")
+    if not model_args.llm_path:
+        model_args.llm_path = model_args.model_path
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+    if not data_args.val_dataset_config_file:
+        data_args.val_dataset_config_file = get_model_path("gedit.data")
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    val_data_cpu: dict,
+    training_args: TrainingArguments,
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_path_gen: str = "",
+):
+    """Generate the edited image for one validation batch and save it to disk.
+
+    The batch is moved to ``device``; when it contains ``padded_videos`` they
+    are encoded into ``padded_latent`` with ``make_padded_latent``. The model's
+    ``validation_gen_KVcache`` or ``validation_gen`` is then called, depending
+    on ``inference_args.use_KVcache``. For each returned latent sequence the
+    last element is decoded with the VAE and written to
+    ``<save_path_gen>/fullset/<task_type>/<instruction_language>/<key>.webp``.
+    The matching caption is stored in ``inference_args.prompt_data_dict`` under
+    the saved file path.
+
+    Raises:
+        NotImplementedError: if the decoded output does not have exactly one frame.
+    """
+    # NOTE(review): annotated as dict, but .cuda()/.to_dict() are called on it,
+    # so this is presumably a project batch container -- confirm.
+    val_data = val_data_cpu.cuda(device).to_dict()
+    fsdp_model = fsdp_model.to(device=device, dtype=torch.bfloat16)
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        if "padded_videos" in val_data.keys():
+            val_data["padded_latent"] = make_padded_latent(val_data["padded_videos"], val_data["vae_data_mode"], vae_model)
+        # The output location is derived from the per-sample metadata.
+        metadata = val_data["additional_info"]
+        task_type = metadata["task_type"]
+        instruction_language = metadata["instruction_language"]
+        save_key = metadata["key"]
+        save_dir_current = os.path.join(save_path_gen, "fullset/{}/{}".format(task_type, instruction_language))
+        os.makedirs(save_dir_current, exist_ok=True)
+        # -------------------- GEN branch --------------------
+        # Keyword arguments passed to both generation entry points below.
+        params = {
+            "val_packed_text_ids": val_data["packed_text_ids"],
+            "val_packed_text_indexes": val_data["packed_text_indexes"],
+            "val_sample_lens": val_data["sample_lens"],
+            "val_packed_position_ids": val_data["packed_position_ids"],
+            "val_split_lens": val_data["split_lens"],
+            "val_attn_modes": val_data["attn_modes"],
+            "val_sample_N_target": val_data["sample_N_target"],
+            "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+            "timestep_shift": training_args.validation_timestep_shift,
+            "num_timesteps": training_args.validation_num_timesteps,
+            "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+            # NOTE(review): raises KeyError when the batch has neither
+            # "padded_videos" nor a precomputed "padded_latent".
+            "val_padded_latent": val_data["padded_latent"],
+            "video_sizes": val_data["video_sizes"],
+            "cfg_text_scale": model_args.cfg_text_scale,
+            "cfg_interval": training_args.cfg_interval,
+            "cfg_renorm_min": training_args.cfg_renorm_min,
+            "cfg_renorm_type": training_args.cfg_renorm_type,
+            "device": device,
+            "dtype": torch.bfloat16,
+            "new_token_ids": new_token_ids,
+            "max_samples": training_args.validation_max_samples,
+            "validation_noise_seed": training_args.validation_noise_seed,
+            "apply_chat_template": training_args.apply_chat_template,
+            "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+            "image_token_id": image_token_id,
+            "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+            "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+            "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+            "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+            "video_grid_thw": val_data.get("video_grid_thw", None),
+            "caption": val_data.get("caption", None),
+            "sample_task": val_data["sample_task"],
+            "sample_modality": val_data["sample_modality"],
+            "cfg_type": training_args.cfg_type,
+            "cfg_uncond_token_id": training_args.cfg_uncond_token_id,
+            "index": val_data["index"],
+            "val_padded_videos": None,
+        }
+        if inference_args.use_KVcache:
+            denoise_latent, captions, _, _ = fsdp_model.validation_gen_KVcache(**params)
+        else:
+            denoise_latent, captions, _, _ = fsdp_model.validation_gen(**params)
+        for i_val, latent in enumerate(denoise_latent):
+            # Only the last element of each latent sequence is decoded
+            # (presumably the final denoising state -- confirm).
+            target_latent = latent[-1]
+            # NOTE(review): vae_model is Optional in the signature but is used unconditionally here.
+            v_target = vae_model.vae_decode([target_latent])[0]
+            v_thwc = decode_video_tensor([v_target], save_path="", save_half=False)
+            if v_thwc.shape[0] != 1:
+                raise NotImplementedError(
+                    "GEdit benchmark only supports image output (max_num_frames=1), "
+                    f"but got {v_thwc.shape[0]} frames."
+                )
+            # The file name depends only on save_key, so several latents for the
+            # same batch would overwrite one another.
+            save_name = f'{save_dir_current}/{save_key}.webp'
+            Image.fromarray(v_thwc[0]).save(save_name)
+            inference_args.prompt_data_dict[save_name] = captions[i_val]
+def main():
+    assert torch.cuda.is_available()
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        GLOBAL_RANK = dist.get_rank()
+        WORLD_SIZE = dist.get_world_size()
+    else:
+        GLOBAL_RANK = 0
+        WORLD_SIZE = 1
+    LOCAL_RANK = GLOBAL_RANK % torch.cuda.device_count()
+    DEVICE = LOCAL_RANK
+    torch.cuda.set_device(DEVICE)
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(
+        Tuple[ModelArguments, DataArguments, EvaluationArguments],
+        parser.parse_args_into_dataclasses(),
+    )
+    training_args = inference_args
+    training_args.validation_noise_seed = training_args.validation_data_seed
+    log_rank0 = print if GLOBAL_RANK == 0 else (lambda *_: None)
+    seed = training_args.global_seed * WORLD_SIZE + GLOBAL_RANK
+    set_seed(seed)
+    resolve_gedit_paths(model_args, data_args)
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = training_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = training_args.apply_qwen_2_5_vl_pos_emb
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+    if training_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+        del vit_weights
+        import gc; gc.collect(); torch.cuda.empty_cache()
+    if training_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: AutoEncoderParams = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+    config = LanceConfig(
+        visual_gen=training_args.visual_gen,
+        visual_und=training_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if training_args.visual_und else None,
+        vae_config=vae_config if training_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=training_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if training_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=training_args,
+    )
+    model = model.to(DEVICE)
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+    if training_args.copy_init_moe:
+        language_model.init_moe()
+    init_from_model_path_if_needed(model, model_args)
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        from common.model.hacks import hack_qwen2_5_vl_config
+        language_model = hack_qwen2_5_vl_config(language_model)
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+    if model_args.tie_word_embeddings:
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_world_embeddings 冲突'
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
+    model.eval()
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+    dataset_config = DataConfig(grouped_datasets={})
+    if training_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+    if training_args.visual_gen:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        vae_downsample = tuple_mul(
+            model_args.latent_patch_size, (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial)
+        )
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = vae_downsample
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+    val_dataset = ValidationDataset(
+        jsonl_path=data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=training_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+    )
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=1,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=simple_custom_collate,
+        drop_last=True,
+    )
+    val_loader_iter = iter(val_loader)
+    if not hasattr(inference_args, "prompt_data_dict"):
+        inference_args.prompt_data_dict = {}
+    if not os.path.exists(inference_args.save_path_gen):
+        os.makedirs(inference_args.save_path_gen)
+    for epoch in trange(len(val_loader), desc="Validating", unit="batch", leave=True, ncols=80, disable=(GLOBAL_RANK != 0)):
+        try:
+            val_data_cpu = next(val_loader_iter)
+        except StopIteration:
+            break
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            val_data_cpu=val_data_cpu,
+            training_args=training_args,
+            model_args=model_args,
+            inference_args=inference_args,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=DEVICE,
+            save_path_gen=inference_args.save_path_gen,
+        )
+    if dist.is_initialized():
+        dist.barrier()
+        gathered = [None for _ in range(dist.get_world_size())]
+        dist.all_gather_object(gathered, inference_args.prompt_data_dict)
+        if GLOBAL_RANK == 0:
+            merged = {}
+            for d in gathered:
+                merged.update(d)
+            inference_args.prompt_data_dict = merged
+            save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+    elif GLOBAL_RANK == 0:
+        save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+    if dist.is_initialized():
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

benchmarks/image_gen/GEdit/sample_GEdit.sh ADDED Viewed

	@@ -0,0 +1,106 @@

+#!/bin/bash
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+source "$SCRIPT_DIR/../../sample_env.sh"
+# ========================= 推理参数配置 =========================
+TASK_NAME="image_edit"
+NUM_GPUS=8
+VALIDATION_NUM_TIMESTEPS=50
+VALIDATION_TIMESTEP_SHIFT=3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+USE_KVCACHE=true
+MODEL_PATH="downloads/Lance_3B"
+VAL_DATASET_CONFIG_FILE="benchmarks/image_gen/GEdit/GEdit_en.json"
+# ========================= 自动生成路径 =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="kvcache_"
+fi
+SAVE_PATH_GEN="results/GEdit_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH"
+    exit 1
+fi
+# ============================== 环境与分布式配置 ==============================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+# ========================= 显示任务配置 =========================
+echo "================================================"
+echo "GEdit 图像编辑评估"
+echo "================================================"
+echo "GPU数量: ${NUM_GPUS}"
+echo "保存路径: ${SAVE_PATH_GEN}"
+echo "模型路径: ${MODEL_PATH}"
+if [ -n "$VAL_DATASET_CONFIG_FILE" ]; then
+    echo "数据路径: ${VAL_DATASET_CONFIG_FILE}"
+fi
+echo ""
+echo "关键参数："
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - evaluation_seed: ${EVALUATION_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+# ============================== 执行推理 ==============================
+# 注意：请直接修改本脚本顶部的“推理参数配置”区
+accelerate launch \
+    --num_machines          $NUM_MACHINES      \
+    --num_processes         $TOTAL_RANK             \
+    --machine_rank          $MACHINE_RANK           \
+    --main_process_ip       $MAIN_PROCESS_IP        \
+    --main_process_port     $MAIN_PROCESS_PORT      \
+    --mixed_precision       bf16                    \
+    benchmarks/image_gen/GEdit/sample_GEdit.py         \
+    --model_path            "$MODEL_PATH" \
+    --val_dataset_config_file "$VAL_DATASET_CONFIG_FILE" \
+    --vit_type              qwen_2_5_vl_original \
+    --llm_qk_norm           true \
+    --llm_qk_norm_und       true \
+    --llm_qk_norm_gen       true \
+    --tie_word_embeddings   false \
+    --validation_num_timesteps $VALIDATION_NUM_TIMESTEPS \
+    --validation_timestep_shift $VALIDATION_TIMESTEP_SHIFT \
+    --copy_init_moe         true \
+    --use_flex              true \
+    --max_num_frames        1 \
+    --max_latent_size       64 \
+    --latent_patch_size     1 1 1 \
+    --num_replicate         $NUM_REPLICATE \
+    --num_shard             $NUM_SHARD \
+    --visual_und            true \
+    --visual_gen            true \
+    --vae_model_type        wan \
+    --apply_qwen_2_5_vl_pos_emb  true \
+    --apply_chat_template   false \
+    --cfg_type              0 \
+    --validation_data_seed  $EVALUATION_SEED \
+    --validation_max_samples 100000 \
+    --task                  $TASK_NAME \
+    --save_path_gen         $SAVE_PATH_GEN \
+    --resolution            image_768res \
+    --text_template         true \
+    --sample_num_per_prompt 1 \
+    --cfg_text_scale        $CFG_TEXT_SCALE \
+    --cfg_interval          $CFG_INTERVAL_START $CFG_INTERVAL_END \
+    --use_KVcache           $USE_KVCACHE
+echo ""
+echo "================================================"
+echo "完成! 结果: ${SAVE_PATH_GEN}"
+echo "================================================"

benchmarks/image_gen/GenEVAL/GenEVAL.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

benchmarks/image_gen/GenEVAL/README.md ADDED Viewed

	@@ -0,0 +1,73 @@

+[Chinese Version](./README_zh.md)
+# GenEVAL Image Generation Evaluation
+Scripts for evaluating the Lance model on the GenEVAL image generation benchmark.
+## Files
+- `sample_GenEVAL.py` - Python inference script
+- `sample_GenEVAL.sh` - Launch script (recommended)
+- `GenEVAL.jsonl` - Evaluation dataset
+## Quick Start
+### Basic Usage
+```bash
+bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh
+```
+Before running, edit the "Inference Parameters" section at the top of `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`.
+## Parameters
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | Task type. GenEVAL is fixed to image generation. |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | Number of images generated per case. GenEVAL defaults to 4 images. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | Image resolution. |
+| `MODEL_PATH` | `downloads/Lance_3B` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GenEVAL/GenEVAL.jsonl` | Path to the evaluation data. |
+## How To Modify
+- Edit the "Inference Parameters" section at the top of `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`.
+- After updating the parameters, run `bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+## Output Format
+Results are saved in a structure like this:
+```
+results/GenEVAL_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 00000/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ├── 0.png
+│       ├── 1.png
+│       ├── 2.png
+│       └── 3.png
+├── 00001/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ...
+```
+Each case generates 4 images by default (`sample_num_per_prompt=4`).
+## Notes
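+- To switch the model, dataset, or resolution, edit the configuration at the top of the script directly. Note that the `--resolution image_768res` preset is set in the `accelerate launch` command further down in the script rather than in the top section, so check it as well when changing `VIDEO_HEIGHT`/`VIDEO_WIDTH`.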
+- The ViT path is resolved automatically by the code and usually does not need to be configured separately.

benchmarks/image_gen/GenEVAL/README_zh.md ADDED Viewed

	@@ -0,0 +1,73 @@

+[English Version](./README.md)
+# GenEVAL 图像生成评估
+基于 Lance 模型的 GenEVAL 评估基准测试脚本。
+## 文件说明
+- `sample_GenEVAL.py` - 推理 Python 脚本
+- `sample_GenEVAL.sh` - 启动脚本（推荐使用）
+- `GenEVAL.jsonl` - 评估数据集
+## 快速开始
+### 基本用法
+```bash
+bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh
+```
+运行前请直接修改 `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` 顶部的“推理参数配置”区。
+## 参数说明
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | 任务类型，GenEVAL 固定为图像生成 |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | 每个 case 生成的图像数量（GenEVAL 默认为 4 张图） |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | 图像分辨率 |
+| `MODEL_PATH` | `downloads/Lance_3B` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GenEVAL/GenEVAL.jsonl` | 评估数据路径 |
+## 修改方式
+- 请手动编辑 `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+## 保存格式
+结果会按照以下结构保存：
+```
+results/GenEVAL_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 00000/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ├── 0.png
+│       ├── 1.png
+│       ├── 2.png
+│       └── 3.png
+├── 00001/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ...
+```
+每个案例默认生成 4 张图像（`sample_num_per_prompt=4`）。
+## 注意事项
+- 如果需要切换模型、数据集或分辨率，请直接修改脚本顶部配置。
+- ViT 路径默认由代码内部自动解析，无需单独配置。

benchmarks/image_gen/GenEVAL/sample_GenEVAL.py ADDED Viewed

	@@ -0,0 +1,463 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+import os.path as osp
+from copy import deepcopy
+import json
+from typing import Tuple, cast, Optional
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from safetensors.torch import load_file
+from PIL import Image
+from torchvision.utils import make_grid
+import numpy as np
+from tqdm import trange
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.lance import LanceConfig, Lance, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from common.utils.misc import tuple_mul, AutoEncoderParams
+from common.val.utils import make_padded_latent
+from data.datasets_custom import ValidationDataset
+from config.config_factory import ModelArguments, DataArguments, EvaluationArguments, get_model_path
+def init_from_model_path_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments):
+    # 统一从 model_path 加载训练好的 Lance checkpoint。
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+    model_path_ft = None
+    if osp.exists(model_path):
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+    # NOTE: position embeds are fixed sinusoidal embeddings, so we can just pop it off,
+    # which makes it easier to adapt to different resolutions.
+    if 'latent_pos_embed.pos_embed' in model_state_dict:
+        model_state_dict.pop('latent_pos_embed.pos_embed')
+    model.load_state_dict(model_state_dict, strict=False)
+    clean_memory(model_state_dict)
+def clean_memory(*objects):
+    """清理内存并释放 GPU 缓存"""
+    for obj in objects:
+        del obj
+    import gc
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+def decode_video_tensor_for_geneval(v_list):
+    """
+    专门为 GenEVAL 解码视频张量，保持原有的保存格式
+    """
+    N_target = len(v_list)
+    if N_target != 1:
+        from einops import rearrange
+        padded_videos_latent = [v.permute(1, 0, 2, 3) for v in v_list]
+        v_tc_hw = rearrange(padded_videos_latent, "n t c h w -> t c h (n w)")
+    else:
+        v_tc_hw = v_list[0].permute(1, 0, 2, 3)
+    v_tc_hw = v_tc_hw.float().clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round().clamp(0, 255).to(torch.uint8)
+    return v_tc_hw
+def resolve_geneval_paths(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+) -> None:
+    if not model_args.model_path:
+        raise ValueError("GenEVAL requires --model_path to be provided explicitly.")
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+    if not data_args.val_dataset_config_file:
+        data_args.val_dataset_config_file = get_model_path("geneval.data")
+def build_runtime_dataset_config(
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    vae_config: Optional[AutoEncoderParams],
+) -> DataConfig:
+    """
+    当前推理链不再依赖 dataset_config_file，运行期 DataConfig 由显式参数拼装。
+    """
+    dataset_config = DataConfig()
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+    dataset_config.max_duration = inference_args.max_duration
+    dataset_config.system_prompt_type = inference_args.system_prompt_type
+    if inference_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+    if inference_args.visual_gen and vae_config:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = tuple_mul(
+            model_args.latent_patch_size,
+            (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial),
+        )
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+    return dataset_config
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    val_data_cpu: dict,
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_source_video: bool = False,
+    save_path_gen: str = "",
+    sample_num_per_prompt: int = 1,
+):
+    """
+    Generate ``sample_num_per_prompt`` images for one validation sample and save them.
+
+    Output layout under ``save_path_gen/<index:05d>/``: ``metadata.jsonl`` (the sample's
+    ``additional_info``), ``samples/<k>.png`` for each generated image, and ``grid.png``
+    combining all of them. Sample ``k`` uses noise seed ``validation_noise_seed + k``.
+
+    Raises:
+        NotImplementedError: if a decoded output has more than one frame.
+    """
+    # Only rank 0 (or a non-distributed run) shows the per-sample progress bar.
+    if dist.is_initialized():
+        is_rank0 = (dist.get_rank() == 0)
+    else:
+        is_rank0 = True
+    # NOTE(review): annotated as ``dict``, but .cuda()/.to_dict() imply a batch object
+    # produced by the collate function -- confirm the actual type.
+    val_data = val_data_cpu.cuda(device).to_dict()
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        # Compute padded_latent from the padded videos when they are present.
+        if "padded_videos" in val_data.keys():
+            val_data["padded_latent"] = make_padded_latent(val_data["padded_videos"], val_data["vae_data_mode"], vae_model)
+        # Create a per-sample output folder named after the zero-padded val_data["index"].
+        index_save = val_data["index"]
+        index_save = f"{index_save:05d}"
+        os.makedirs(os.path.join(save_path_gen, index_save), exist_ok=True)
+        os.makedirs(os.path.join(save_path_gen, index_save, "samples"), exist_ok=True)
+        # Save metadata.jsonl
+        metadata = val_data["additional_info"]
+        with open(os.path.join(save_path_gen, index_save, "metadata.jsonl"), 'w') as f:
+            f.write(json.dumps(metadata, ensure_ascii=False) + "\n")
+        # -------------------- GEN branch --------------------
+        tensor_list_for_grid = []
+        loop_iterator = trange(sample_num_per_prompt) if is_rank0 else range(sample_num_per_prompt)
+        for sample_num_per_prompt_index in loop_iterator:
+            # Sampling / generation
+            # NOTE(review): cfg_text_scale is read from model_args while the other cfg_*
+            # options come from inference_args -- confirm this is intended.
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+                "timestep_shift": inference_args.validation_timestep_shift,
+                "num_timesteps": inference_args.validation_num_timesteps,
+                "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+                "val_padded_latent": val_data["padded_latent"],
+                "video_sizes": val_data["video_sizes"],
+                "cfg_text_scale": model_args.cfg_text_scale,
+                "cfg_interval": inference_args.cfg_interval,
+                "cfg_renorm_min": inference_args.cfg_renorm_min,
+                "cfg_renorm_type": inference_args.cfg_renorm_type,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "max_samples": inference_args.validation_max_samples,
+                "validation_noise_seed": inference_args.validation_noise_seed + sample_num_per_prompt_index,
+                "apply_chat_template": inference_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": inference_args.apply_qwen_2_5_vl_pos_emb,
+                "image_token_id": image_token_id,
+                "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+                "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+                "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+                "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+                "video_grid_thw": val_data.get("video_grid_thw", None),
+                "caption": val_data.get("caption", None),
+                "sample_task": val_data["sample_task"],
+                "sample_modality": val_data["sample_modality"],
+                "cfg_type": inference_args.cfg_type,
+                "cfg_uncond_token_id": inference_args.cfg_uncond_token_id,
+                "index": val_data["index"],
+                "val_padded_videos": val_data["padded_videos"] if save_source_video else None,
+            }
+            if inference_args.use_KVcache:
+                denoise_latent, _, _, _ = fsdp_model.validation_gen_KVcache(**params)
+            else:
+                denoise_latent, _, _, _ = fsdp_model.validation_gen(**params)
+            # Decode + save
+            for latent in denoise_latent:
+                v_list = [vae_model.vae_decode([latent_])[0] for latent_ in latent]
+                # Keep the same save format as the original file
+                v_thwc = decode_video_tensor_for_geneval(v_list)
+                # Use frame 0 directly; only single-frame outputs are supported here.
+                if v_thwc.shape[0] == 1:
+                    tensor_list_for_grid.append(v_thwc.squeeze(0).cpu())
+                    # Save the single image
+                    save_name = f"{save_path_gen}/{index_save}/samples/{sample_num_per_prompt_index}.png"
+                    Image.fromarray((v_thwc.squeeze(0).permute(1, 2, 0).cpu().numpy()).astype('uint8')).save(save_name)
+                else:
+                    raise NotImplementedError("需要保存图像")
+        # Save the grid image; the row length is floor(sqrt(sample_num_per_prompt)).
+        save_name = f"{save_path_gen}/{index_save}/grid.png"
+        grid_tensor = make_grid(tensor_list_for_grid, nrow=int(np.sqrt(sample_num_per_prompt)), padding=0, pad_value=255)
+        grid_numpy = grid_tensor.permute(1, 2, 0).numpy()
+        Image.fromarray(grid_numpy).save(save_name)
+def main():
+    # Entry point for GenEVAL sampling: sets up the (optionally distributed) CUDA
+    # environment, parses the arguments, builds the Lance model and loads its
+    # checkpoint, then generates and saves images for every validation sample.
+    # ========================= Env setup ==============================
+    assert torch.cuda.is_available()
+    # A distributed launch is detected through the RANK / WORLD_SIZE environment variables.
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        GLOBAL_RANK = dist.get_rank()
+        WORLD_SIZE = dist.get_world_size()
+    else:
+        GLOBAL_RANK = 0
+        WORLD_SIZE = 1
+    LOCAL_RANK = GLOBAL_RANK % torch.cuda.device_count()
+    DEVICE = LOCAL_RANK
+    torch.cuda.set_device(DEVICE)
+    # ========================= Args and logger setup ==============================
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(Tuple[ModelArguments, DataArguments, EvaluationArguments], parser.parse_args_into_dataclasses())
+    # ========================= GenEVAL path resolution ==============================
+    resolve_geneval_paths(model_args, data_args)
+    # NOTE: validation_noise_seed and validation_data_seed are both set to evaluation_seed.
+    # NOTE(review): this overwrites any --validation_data_seed given on the command line;
+    # confirm that the launch script passes the intended seed as --evaluation_seed.
+    inference_args.validation_noise_seed = inference_args.evaluation_seed
+    inference_args.validation_data_seed = inference_args.evaluation_seed
+    # Set seed:
+    seed = inference_args.global_seed * WORLD_SIZE + GLOBAL_RANK
+    set_seed(seed)
+    log_rank0 = print if GLOBAL_RANK == 0 else (lambda *_: None)
+    # ========================= LLM model setup ==============================
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = inference_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+    # The ViT is only built when visual understanding is enabled.
+    if inference_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+        clean_memory(vit_weights)
+    # The VAE is only built when visual generation is enabled.
+    if inference_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: AutoEncoderParams = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+    # Lance configuration
+    config = LanceConfig(
+        visual_gen=inference_args.visual_gen,
+        visual_und=inference_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if inference_args.visual_und else None,
+        vae_config=vae_config if inference_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=inference_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if inference_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=inference_args,
+    )
+    model = model.to(DEVICE)
+    # Setup tokenizer for model:
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+    # Initialize the MoE before loading the checkpoint
+    if inference_args.copy_init_moe:
+        language_model.init_moe()
+    init_from_model_path_if_needed(model, model_args)
+    # Resize the token embeddings now, after the checkpoint has been loaded
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+    # NOTE(review): the config hack applies to "qwen2_5_vl" only, not to
+    # "qwen_2_5_vl_original" (which is also accepted above) -- confirm this is intended.
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        from common.model.hacks import hack_qwen2_5_vl_config
+        language_model = hack_qwen2_5_vl_config(language_model)
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+    if model_args.tie_word_embeddings:
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        # With untied embeddings, input and output embedding weights must not share storage.
+        assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_world_embeddings 冲突'
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
+    model.eval()
+    # Some VAE wrappers (e.g. `WanVideoVAE`) are plain helper objects rather
+    # than `nn.Module`s, and their internal model is already switched to eval.
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+    dataset_config = build_runtime_dataset_config(
+        model_args=model_args,
+        inference_args=inference_args,
+        vae_config=vae_config,
+    )
+    # Build the dataset; the global rank and world size are passed to the dataset.
+    val_dataset = ValidationDataset(
+        jsonl_path= data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=inference_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+        local_rank=GLOBAL_RANK,
+        world_size=WORLD_SIZE,
+    )
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=1,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=simple_custom_collate,
+        drop_last=True,
+        prefetch_factor=None,
+        persistent_workers=False,
+        multiprocessing_context=None,
+    )
+    val_loader_iter = iter(val_loader)
+    if not os.path.exists(inference_args.save_path_gen):
+        os.makedirs(inference_args.save_path_gen, exist_ok=True)
+    # Main loop: one validation sample per iteration; the progress bar is shown on rank 0 only.
+    for _ in trange(len(val_loader), desc="Validating", unit="batch", leave=True, ncols=80, disable=(GLOBAL_RANK != 0)):
+        val_data_cpu = next(val_loader_iter)
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            val_data_cpu=val_data_cpu,
+            model_args=model_args,
+            inference_args=inference_args,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=DEVICE,
+            save_source_video=False,
+            save_path_gen=inference_args.save_path_gen,
+            sample_num_per_prompt=inference_args.sample_num_per_prompt,
+        )
+    if dist.is_initialized():
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh ADDED Viewed

	@@ -0,0 +1,110 @@

+#!/bin/bash
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+source "$SCRIPT_DIR/../../sample_env.sh"
+# ========================= 推理参数配置 =========================
+TASK_NAME="t2i"
+NUM_GPUS=8
+VALIDATION_NUM_TIMESTEPS=50
+VALIDATION_TIMESTEP_SHIFT=3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+SAMPLE_NUM_PER_PROMPT=4
+USE_KVCACHE=true
+VIDEO_HEIGHT=768
+VIDEO_WIDTH=768
+MODEL_PATH="downloads/Lance_3B"
+VAL_DATASET_CONFIG_FILE="benchmarks/image_gen/GenEVAL/GenEVAL.jsonl"
+# ========================= 自动生成路径 =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="kvcache_"
+fi
+SAVE_PATH_GEN="results/GenEVAL_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH"
+    exit 1
+fi
+# ============================== 环境与分布式配置 ==============================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+# ========================= 显示任务配置 =========================
+echo "================================================"
+echo "GenEVAL T2I 推理"
+echo "================================================"
+echo "GPU数量: ${NUM_GPUS}"
+echo "保存路径: ${SAVE_PATH_GEN}"
+echo "分辨率: ${VIDEO_HEIGHT}x${VIDEO_WIDTH}"
+echo "模型路径: ${MODEL_PATH}"
+if [ -n "$VAL_DATASET_CONFIG_FILE" ]; then
+    echo "数据路径: ${VAL_DATASET_CONFIG_FILE}"
+fi
+echo ""
+echo "关键参数："
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - evaluation_seed: ${EVALUATION_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]"
+echo "  - sample_num_per_prompt: ${SAMPLE_NUM_PER_PROMPT}"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+# ============================== 执行推理 ==============================
+# 注意：请直接修改本脚本顶部的“推理参数配置”区
+accelerate launch \
+    --num_machines          $NUM_MACHINES      \
+    --num_processes         $TOTAL_RANK             \
+    --machine_rank          $MACHINE_RANK           \
+    --main_process_ip       $MAIN_PROCESS_IP        \
+    --main_process_port     $MAIN_PROCESS_PORT      \
+    --mixed_precision       bf16                    \
+    benchmarks/image_gen/GenEVAL/sample_GenEVAL.py         \
+    --model_path            "$MODEL_PATH" \
+    --val_dataset_config_file "$VAL_DATASET_CONFIG_FILE" \
+    --vit_type              qwen_2_5_vl_original \
+    --llm_qk_norm           true \
+    --llm_qk_norm_und       true \
+    --llm_qk_norm_gen       true \
+    --tie_word_embeddings   false \
+    --validation_num_timesteps $VALIDATION_NUM_TIMESTEPS \
+    --validation_timestep_shift $VALIDATION_TIMESTEP_SHIFT \
+    --copy_init_moe         true \
+    --max_num_frames        1 \
+    --max_latent_size       64 \
+    --latent_patch_size     1 1 1 \
+    --visual_und            true \
+    --visual_gen            true \
+    --vae_model_type        wan \
+    --apply_qwen_2_5_vl_pos_emb  true \
+    --apply_chat_template   false \
+    --cfg_type              0 \
+    --validation_data_seed  $EVALUATION_SEED \
+    --video_height          $VIDEO_HEIGHT \
+    --video_width           $VIDEO_WIDTH \
+    --task                  $TASK_NAME \
+    --save_path_gen         $SAVE_PATH_GEN \
+    --resolution            image_768res \
+    --text_template         true \
+    --sample_num_per_prompt $SAMPLE_NUM_PER_PROMPT \
+    --cfg_text_scale        $CFG_TEXT_SCALE \
+    --cfg_interval          $CFG_INTERVAL_START $CFG_INTERVAL_END \
+    --use_KVcache           $USE_KVCACHE
+echo ""
+echo "================================================"
+echo "完成! 结果: ${SAVE_PATH_GEN}"
+echo "================================================"

benchmarks/sample_env.sh ADDED Viewed

	@@ -0,0 +1,107 @@

+#!/bin/bash
+find_available_port() {
+    # Print the first TCP port in [start_port, end_port) that can be bound on all
+    # interfaces. $1 = start port (default 6666), $2 = exclusive end port (default 8888).
+    # When no port in the range is free, start_port is printed as a best-effort
+    # fallback and a warning is written to stderr.
+    local start_port="${1:-6666}"
+    local end_port="${2:-8888}"
+    python3 - "$start_port" "$end_port" <<'PY'
+import socket
+import sys
+start_port = int(sys.argv[1])
+end_port = int(sys.argv[2])
+for port in range(start_port, end_port):
+    # The context manager closes the probe socket even when bind() fails.
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        try:
+            sock.bind(("", port))
+        except OSError:
+            continue
+    print(port)
+    raise SystemExit(0)
+print(
+    f"warning: no free port in [{start_port}, {end_port}); falling back to {start_port}",
+    file=sys.stderr,
+)
+print(start_port)
+PY
+}
+lance_setup_common_env() {
+    # Export the environment variables shared by all benchmark launch scripts.
+    # Each variable keeps a value already set (and non-empty) in the caller's
+    # environment and otherwise receives the default shown here.
+    : "${EXP_HW_20250819:=False}"
+    export EXP_HW_20250819
+    echo "EXP_HW_20250819: $EXP_HW_20250819"
+    : "${POSITION_EMBEDDING_3D_VERSION:=v2}"
+    export POSITION_EMBEDDING_3D_VERSION
+    echo "(shell) POSITION_EMBEDDING_3D_VERSION: $POSITION_EMBEDDING_3D_VERSION"
+    # Default to async CUDA execution for benchmark/inference throughput.
+    # Override with CUDA_LAUNCH_BLOCKING=1 only when debugging kernel failures.
+    : "${CUDA_LAUNCH_BLOCKING:=0}"
+    : "${NCCL_DEBUG:=VERSION}"
+    : "${TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC:=900}"
+    export CUDA_LAUNCH_BLOCKING NCCL_DEBUG TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC
+}
+lance_setup_distributed_env() {
+    # Resolve and export the distributed launch settings.
+    # $1 = GPUs per machine (default 1). Values already present in NUM_MACHINES,
+    # MACHINE_RANK, MAIN_PROCESS_IP and MAIN_PROCESS_PORT take precedence; otherwise
+    # they come from the ARNOLD_* platform variables when ARNOLD_WORKER_NUM is set,
+    # or from local single-machine defaults.
+    local num_gpus="${1:-1}"
+    local fallback_port=6666
+    local multi_node_platform=0
+    NUM_GPUS="$num_gpus"
+    if [ -n "${ARNOLD_WORKER_NUM:-}" ]; then
+        echo "使用平台分布式环境"
+        NUM_MACHINES="${NUM_MACHINES:-$ARNOLD_WORKER_NUM}"
+        MACHINE_RANK="${MACHINE_RANK:-${ARNOLD_ID:-0}}"
+        MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-${ARNOLD_WORKER_0_HOST:-127.0.0.1}}"
+        fallback_port="${ARNOLD_WORKER_0_PORT:-6666}"
+        # Multi-machine platform jobs use the platform port as-is, without probing.
+        [ "${NUM_MACHINES}" = "1" ] || multi_node_platform=1
+    else
+        echo "使用本地或显式配置的分布式环境"
+        NUM_MACHINES="${NUM_MACHINES:-1}"
+        MACHINE_RANK="${MACHINE_RANK:-0}"
+        MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-127.0.0.1}"
+    fi
+    # An explicitly provided MAIN_PROCESS_PORT is left untouched.
+    if [ -z "$MAIN_PROCESS_PORT" ]; then
+        if [ "$multi_node_platform" -eq 1 ]; then
+            MAIN_PROCESS_PORT="$fallback_port"
+            echo "多机任务使用平台 rendezvous 端口: $MAIN_PROCESS_PORT"
+        else
+            MAIN_PROCESS_PORT="$(find_available_port "$fallback_port" "$((fallback_port + 500))")"
+        fi
+    fi
+    TOTAL_RANK=$((NUM_MACHINES * NUM_GPUS))
+    export NUM_GPUS NUM_MACHINES MACHINE_RANK MAIN_PROCESS_IP MAIN_PROCESS_PORT TOTAL_RANK
+    echo "NUM_MACHINES: $NUM_MACHINES"
+    echo "NUM_GPUS: $NUM_GPUS"
+    echo "TOTAL_RANK: $TOTAL_RANK"
+    echo "MACHINE_RANK: $MACHINE_RANK"
+    echo "MAIN_PROCESS_IP: $MAIN_PROCESS_IP"
+    echo "MAIN_PROCESS_PORT: $MAIN_PROCESS_PORT"
+}
+lance_setup_shard_env() {
+    local num_shard="${1:-1}"
+    NUM_SHARD="$num_shard"
+    NUM_REPLICATE=$((TOTAL_RANK / NUM_SHARD))
+    export NUM_SHARD NUM_REPLICATE
+    echo "NUM_REPLICATE: $NUM_REPLICATE"
+    echo "NUM_SHARD: $NUM_SHARD"
+}

benchmarks/video_gen/Vbench/README.md ADDED Viewed

	@@ -0,0 +1,72 @@

+[Chinese Version](./README_zh.md)
+# VBench Video Generation Evaluation
+Benchmark evaluation scripts for VBench based on the Lance model.
+## Files
+- `sample_vbench.py` - Python inference script
+- `sample_vbench.sh` - Launch script (recommended)
+- `Vbench_recaption.jsonl` - Evaluation dataset
+## Quick Start
+### Basic Usage
+```bash
+# Run from the repository root: the script uses repo-relative paths.
+bash benchmarks/video_gen/Vbench/sample_vbench.sh
+```
+Before running, edit the "Inference Parameters" section at the top of `benchmarks/video_gen/Vbench/sample_vbench.sh`.
+## Parameters
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `t2v` | Task type. VBench is fixed to video generation. |
+| `VALIDATION_NUM_TIMESTEPS` | 30 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.0 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `SAMPLE_NUM_PER_PROMPT` | 5 | Number of videos generated for each regular prompt. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 480 / 848 | Video resolution (height / width). |
+| `NUM_FRAMES` | 50 | Number of output video frames. |
+| `MAX_NUM_FRAMES` | 121 | Maximum number of frames per sample. |
+| `MAX_LATENT_SIZE` | 64 | Maximum latent size. |
+| `RESOLUTION` | `video_480p` | Dataset resolution tag. |
+| `MODEL_PATH` | `downloads/Lance_3B_Video` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/video_gen/Vbench/Vbench_recaption.jsonl` | Path to the evaluation data. |
+| `CONFIG_JSON_PATH` | `""` | Optional training configuration JSON. |
+## How To Modify
+- Edit the "Inference Parameters" section at the top of `benchmarks/video_gen/Vbench/sample_vbench.sh`.
+- After updating the parameters, run `bash benchmarks/video_gen/Vbench/sample_vbench.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+## Output Format
+Results are saved in a structure like this:
+```
+results/Vbench_ts30_tss3.0_seed42_cfg4.0_kvcache_20260507_120000/
+├── In a still frame, a stop sign-0.mp4
+├── In a still frame, a stop sign-1.mp4
+├── a toilet, frozen in time-0.mp4
+├── ...
+├── prompt.json
+```
+Each prompt generates `SAMPLE_NUM_PER_PROMPT` videos by default, named as `original-prompt-sample-index.mp4`. A `prompt.json` file is also written to record the generated text.
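+If `temporal_flickering_prompts.json` exists next to `sample_vbench.py`, the prompts listed in it are sampled at least 25 times (the larger of `SAMPLE_NUM_PER_PROMPT` and 25), unless `quick_debug` is enabled. If the file does not exist, the script emits a warning and uses `SAMPLE_NUM_PER_PROMPT` for every prompt.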
+## Notes
+- If you need to switch the model, dataset, frame count, or resolution, edit the script configuration at the top directly.
+- The ViT path is resolved automatically by the code and usually does not need to be configured separately.
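+- `CONFIG_JSON_PATH` is an optional training configuration JSON. Values found in its `model_args`, `data_args`, and `training_args` sections are merged over the parsed command-line arguments; only `MODEL_PATH` and `VAL_DATASET_CONFIG_FILE` are always restored to the values set in the script.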

benchmarks/video_gen/Vbench/README_zh.md ADDED Viewed

	@@ -0,0 +1,72 @@

+[English Version](./README.md)
+# VBench 视频生成评估
+基于 Lance 模型的 VBench 评估基准测试脚本。
+## 文件说明
+- `sample_vbench.py` - 推理 Python 脚本
+- `sample_vbench.sh` - 启动脚本（推荐使用）
+- `Vbench_recaption.jsonl` - 评估数据集
+## 快速开始
+### 基本用法
+```bash
+# 请在仓库根目录下运行：脚本内使用的是相对仓库根目录的路径。
+bash benchmarks/video_gen/Vbench/sample_vbench.sh
+```
+运行前请直接修改 `benchmarks/video_gen/Vbench/sample_vbench.sh` 顶部的“推理参数配置”区。
+## 参数说明
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `t2v` | 任务类型，VBench 固定为视频生成 |
+| `VALIDATION_NUM_TIMESTEPS` | 30 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.0 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `SAMPLE_NUM_PER_PROMPT` | 5 | 每个普通 prompt 生成的视频数量 |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 480 / 848 | 视频分辨率（高 / 宽） |
+| `NUM_FRAMES` | 50 | 输出视频帧数 |
+| `MAX_NUM_FRAMES` | 121 | 单个样本最大帧数 |
+| `MAX_LATENT_SIZE` | 64 | latent size 上限 |
+| `RESOLUTION` | `video_480p` | 数据集分辨率标签 |
+| `MODEL_PATH` | `downloads/Lance_3B_Video` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/video_gen/Vbench/Vbench_recaption.jsonl` | 评估数据路径 |
+| `CONFIG_JSON_PATH` | `""` | 可选训练配置 JSON |
+## 修改方式
+- 请手动编辑 `benchmarks/video_gen/Vbench/sample_vbench.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/video_gen/Vbench/sample_vbench.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+## 保存格式
+结果会按照以下结构保存：
+```
+results/Vbench_ts30_tss3.0_seed42_cfg4.0_kvcache_20260507_120000/
+├── In a still frame, a stop sign-0.mp4
+├── In a still frame, a stop sign-1.mp4
+├── a toilet, frozen in time-0.mp4
+├── ...
+├── prompt.json
+```
+每个 prompt 默认生成 `SAMPLE_NUM_PER_PROMPT` 个视频，并按 `原始 prompt-采样序号.mp4` 命名；同时会额外写出 `prompt.json` 记录生成文本。
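+如果 `sample_vbench.py` 同目录下存在 `temporal_flickering_prompts.json`，其中列出的 prompt 会至少采样 25 次（取 `SAMPLE_NUM_PER_PROMPT` 与 25 中的较大值，开启 `quick_debug` 时除外）；该文件不存在时，脚本会给出警告，并对所有 prompt 直接使用 `SAMPLE_NUM_PER_PROMPT`。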
+## 注意事项
+- 如果需要切换模型、数据集、帧数或分辨率，请直接修改脚本顶部配置。
+- ViT 路径默认由代码内部自动解析，无需单独配置。
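+- `CONFIG_JSON_PATH` 是可选的训练配置 JSON。其中 `model_args`、`data_args`、`training_args` 三个部分的取值会合并并覆盖命令行解析得到的参数；只有 `MODEL_PATH` 和 `VAL_DATASET_CONFIG_FILE` 始终恢复为脚本中设置的值。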

benchmarks/video_gen/Vbench/Vbench_recaption.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

benchmarks/video_gen/Vbench/sample_vbench.py ADDED Viewed

	@@ -0,0 +1,559 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+import json
+import os.path as osp
+from copy import deepcopy
+from dataclasses import asdict, fields
+from pathlib import Path
+from typing import Optional, Tuple, cast
+import imageio
+import torch
+import torch.distributed as dist
+from safetensors.torch import load_file
+from torch.utils.data import DataLoader
+from tqdm import trange
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from config.config_factory import (
+    DataArguments,
+    EvaluationArguments,
+    ModelArguments,
+    TrainingArguments,
+    get_model_path,
+)
+from common.model.hacks import hack_qwen2_5_vl_config
+from common.utils.misc import AutoEncoderParams, tuple_mul
+from common.val.utils import decode_video_tensor, make_padded_latent
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from data.datasets_custom import ValidationDataset
+from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+PROMPT_JSON_FILENAME = "prompt.json"
+TEMPORAL_FLICKERING_SAMPLE_NUM = 25
+DEFAULT_VBENCH_DATA = "benchmarks/video_gen/Vbench/Vbench_recaption.jsonl"
+TEMPORAL_FLICKERING_PROMPT_FILE = (
+    Path(__file__).resolve().parent / "temporal_flickering_prompts.json"
+)
+def load_temporal_flickering_prompts() -> set[str]:
+    if not TEMPORAL_FLICKERING_PROMPT_FILE.exists():
+        warnings.warn(
+            f"Temporal flickering prompt file not found: {TEMPORAL_FLICKERING_PROMPT_FILE}. "
+            "Falling back to an empty prompt set.",
+            stacklevel=2,
+        )
+        return set()
+    with TEMPORAL_FLICKERING_PROMPT_FILE.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+    return set(data)
+PROMPT_WITH_TEMPORAL_FLICKERING = load_temporal_flickering_prompts()
+def clean_memory(*objects):
+    for obj in objects:
+        del obj
+    import gc
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+def init_from_model_path_if_needed(
+    model: Qwen2ForCausalLM,
+    model_args: ModelArguments,
+):
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+    model_path_ft = None
+    if osp.exists(model_path):
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+    if "latent_pos_embed.pos_embed" in model_state_dict:
+        model_state_dict.pop("latent_pos_embed.pos_embed")
+    model.load_state_dict(model_state_dict, strict=False)
+    clean_memory(model_state_dict)
+def resolve_vbench_paths(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+) -> None:
+    if not model_args.model_path:
+        raise ValueError("VBench requires --model_path to be provided explicitly.")
+    if not getattr(model_args, "llm_path", ""):
+        model_args.llm_path = model_args.model_path
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+    if not data_args.val_dataset_config_file:
+        data_args.val_dataset_config_file = DEFAULT_VBENCH_DATA
+def build_runtime_dataset_config(
+    model_args: ModelArguments,
+    training_args: TrainingArguments,
+    inference_args: EvaluationArguments,
+    vae_config: Optional[AutoEncoderParams],
+) -> DataConfig:
+    dataset_config = DataConfig()
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+    dataset_config.max_duration = inference_args.max_duration
+    dataset_config.system_prompt_type = inference_args.system_prompt_type
+    if training_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+    if training_args.visual_gen and vae_config:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = tuple_mul(
+            model_args.latent_patch_size,
+            (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial),
+        )
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+    return dataset_config
+def save_prompt_results(prompt_data_dict, save_path_gen: str):
+    prompt_json_path = os.path.join(save_path_gen, PROMPT_JSON_FILENAME)
+    with open(prompt_json_path, "w", encoding="utf-8") as f:
+        json.dump(prompt_data_dict, f, ensure_ascii=False, indent=2)
+def safe_instantiate(cls, cfg: dict, name: str):
+    valid_keys = {f.name for f in fields(cls)}
+    valid, invalid = {}, {}
+    for k, v in cfg.items():
+        if k in valid_keys:
+            valid[k] = v
+        else:
+            invalid[k] = v
+    if invalid:
+        print(f"[WARN] {name} 过滤无效参数: {invalid}")
+    return cls(**valid)
+def is_valid_value(value):
+    return value is not None
+def merge_args(original_args, override_args):
+    merged_dict = asdict(original_args)
+    override_dict = asdict(override_args)
+    for key, value in override_dict.items():
+        if is_valid_value(value):
+            merged_dict[key] = value
+    return original_args.__class__(**merged_dict)
+def apply_config_json_overrides(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+    inference_args: EvaluationArguments,
+):
+    if not inference_args.config_json_path or not inference_args.config_json_path.endswith(".json"):
+        return model_args, data_args, inference_args
+    model_path_original = model_args.model_path
+    val_dataset_config_file_original = data_args.val_dataset_config_file
+    with open(inference_args.config_json_path, "r", encoding="utf-8") as f:
+        config = json.load(f)
+    if "model_args" in config:
+        model_args = merge_args(
+            model_args,
+            safe_instantiate(ModelArguments, config["model_args"], "ModelArguments"),
+        )
+    if "data_args" in config:
+        data_args = merge_args(
+            data_args,
+            safe_instantiate(DataArguments, config["data_args"], "DataArguments"),
+        )
+    if "training_args" in config:
+        inference_args = merge_args(
+            inference_args,
+            safe_instantiate(EvaluationArguments, config["training_args"], "EvaluationArguments"),
+        )
+    model_args.model_path = model_path_original
+    if getattr(model_args, "llm_path", "") == "":
+        model_args.llm_path = model_path_original
+    data_args.val_dataset_config_file = val_dataset_config_file_original
+    return model_args, data_args, inference_args
+def get_sample_num_per_prompt(
+    inference_args: EvaluationArguments,
+    prompt: str,
+) -> int:
+    if prompt in PROMPT_WITH_TEMPORAL_FLICKERING:
+        if inference_args.quick_debug:
+            return min(inference_args.sample_num_per_prompt, 5)
+        return max(inference_args.sample_num_per_prompt, TEMPORAL_FLICKERING_SAMPLE_NUM)
+    return inference_args.sample_num_per_prompt
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    val_data_cpu: dict,
+    training_args: TrainingArguments,
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_path_gen: str,
+):
+    """Generate and save the videos for one validation batch.
+
+    For the batch's prompt, ``get_sample_num_per_prompt`` decides how many
+    samples to draw. Each sample is written to
+    ``{save_path_gen}/{prompt}-{sample_idx}.mp4``; samples whose file already
+    exists are skipped. The caption returned by the model is recorded in
+    ``inference_args.prompt_data_dict`` under the file's base name.
+
+    Raises:
+        ValueError: The batch has neither ``original_prompt_en`` nor ``caption``.
+    """
+    is_rank0 = not dist.is_initialized() or dist.get_rank() == 0
+    # NOTE(review): annotated as dict, but the object must provide .cuda() and
+    # .to_dict() — presumably the collate function's batch type; confirm.
+    val_data = val_data_cpu.cuda(device).to_dict()
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        if "padded_videos" in val_data:
+            val_data["padded_latent"] = make_padded_latent(
+                val_data["padded_videos"],
+                val_data["vae_data_mode"],
+                vae_model,
+            )
+        prompt = val_data.get("original_prompt_en") or val_data.get("caption")
+        if not prompt:
+            raise ValueError("VBench sample requires `original_prompt_en` or `caption` in dataset.")
+        sample_num_per_prompt = get_sample_num_per_prompt(inference_args, prompt)
+        loop_iterator = trange(sample_num_per_prompt, disable=(not is_rank0), leave=False, desc="Sampling")
+        for sample_idx in loop_iterator:
+            # The prompt text is used verbatim in the file name.
+            # NOTE(review): a prompt containing a path separator would change
+            # the target directory — confirm the dataset never contains one.
+            save_name = f"{save_path_gen}/{prompt}-{sample_idx}.mp4"
+            # Skip samples that already exist so an interrupted run can resume.
+            if os.path.exists(save_name):
+                continue
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+                "timestep_shift": training_args.validation_timestep_shift,
+                "num_timesteps": training_args.validation_num_timesteps,
+                "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+                "val_padded_latent": val_data["padded_latent"],
+                "video_sizes": val_data["video_sizes"],
+                "cfg_text_scale": model_args.cfg_text_scale,
+                "cfg_interval": training_args.cfg_interval,
+                "cfg_renorm_min": training_args.cfg_renorm_min,
+                "cfg_renorm_type": training_args.cfg_renorm_type,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "max_samples": training_args.validation_max_samples,
+                # Offset the noise seed by the sample index so each sample differs.
+                "validation_noise_seed": training_args.validation_noise_seed + sample_idx,
+                "apply_chat_template": training_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+                "image_token_id": image_token_id,
+                "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+                "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+                "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+                "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+                "video_grid_thw": val_data.get("video_grid_thw", None),
+                "caption": val_data.get("caption", None),
+                "sample_task": val_data["sample_task"],
+                "sample_modality": val_data["sample_modality"],
+                "cfg_type": training_args.cfg_type,
+                "cfg_uncond_token_id": training_args.cfg_uncond_token_id,
+                "index": val_data["index"],
+                "val_padded_videos": None,
+            }
+            if inference_args.use_KVcache:
+                denoise_latent, captions, _, _ = fsdp_model.validation_gen_KVcache(**params)
+            else:
+                denoise_latent, captions, _, _ = fsdp_model.validation_gen(**params)
+            # NOTE(review): every entry of denoise_latent is written to the same
+            # save_name, so with more than one entry later ones would overwrite
+            # earlier ones — presumably relies on batch_size=1; confirm.
+            # NOTE(review): vae_model is Optional in the signature but is used
+            # here without a None check.
+            for i_val, latent in enumerate(denoise_latent):
+                v_list = [vae_model.vae_decode([latent_])[0] for latent_ in latent]
+                v_thwc = decode_video_tensor(v_list)
+                imageio.mimsave(
+                    save_name,
+                    v_thwc,
+                    fps=inference_args.validation_video_saving_fps,
+                    format="mp4",
+                )
+                inference_args.prompt_data_dict[os.path.basename(save_name)] = captions[i_val]
+                clean_memory(v_list, v_thwc)
+            clean_memory(denoise_latent, captions)
+def main():
+    """Run VBench sampling: parse arguments, build the model, generate videos.
+
+    Steps: optional NCCL process-group setup, argument parsing plus optional
+    JSON overrides, model/tokenizer/VAE construction and checkpoint loading,
+    iteration over the validation dataset, and finally writing the merged
+    prompt-to-caption map from rank 0.
+    """
+    assert torch.cuda.is_available()
+    # Join the NCCL process group only when RANK and WORLD_SIZE are present in
+    # the environment; otherwise run as a single process.
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        global_rank = dist.get_rank()
+        world_size = dist.get_world_size()
+    else:
+        global_rank = 0
+        world_size = 1
+    local_rank = global_rank % torch.cuda.device_count()
+    device = local_rank
+    torch.cuda.set_device(device)
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(
+        Tuple[ModelArguments, DataArguments, EvaluationArguments],
+        parser.parse_args_into_dataclasses(),
+    )
+    # The evaluation arguments double as the "training_args" object used below.
+    training_args = inference_args
+    model_args, data_args, inference_args = apply_config_json_overrides(
+        model_args,
+        data_args,
+        inference_args,
+    )
+    # Re-bind: the override step may have returned a new arguments instance.
+    training_args = inference_args
+    resolve_vbench_paths(model_args, data_args)
+    training_args.validation_noise_seed = inference_args.evaluation_seed
+    training_args.validation_data_seed = inference_args.evaluation_seed
+    # Per-rank seed derived from the global seed.
+    seed = training_args.global_seed * world_size + global_rank
+    set_seed(seed)
+    # NOTE(review): log_rank0 is defined but not used anywhere in this function.
+    log_rank0 = print if global_rank == 0 else (lambda *_: None)
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = training_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = training_args.apply_qwen_2_5_vl_pos_emb
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+    # vit_config / vit_model only exist when visual_und is enabled; every later
+    # use is guarded by the same flag.
+    if training_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+        clean_memory(vit_weights)
+    if training_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: Optional[AutoEncoderParams] = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+    config = LanceConfig(
+        visual_gen=training_args.visual_gen,
+        visual_und=training_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if training_args.visual_und else None,
+        vae_config=vae_config if training_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=training_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if training_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=training_args,
+    )
+    model = model.to(device)
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+    if training_args.copy_init_moe:
+        language_model.init_moe()
+    init_from_model_path_if_needed(model, model_args)
+    # Grow the embedding table when special tokens were added, and keep the
+    # recorded vocabulary sizes in sync with the tokenizer.
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        language_model = hack_qwen2_5_vl_config(language_model)
+    # The video token id is the one passed on as "image_token_id".
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+    if model_args.tie_word_embeddings:
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        # With untied embeddings the input and output tables must not share storage.
+        assert (
+            model.language_model.get_input_embeddings().weight.data.data_ptr()
+            != model.language_model.get_output_embeddings().weight.data.data_ptr()
+        ), "tie_world_embeddings 冲突"
+    model = model.to(device=device, dtype=torch.bfloat16)
+    model.eval()
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+    dataset_config = build_runtime_dataset_config(
+        model_args=model_args,
+        training_args=training_args,
+        inference_args=inference_args,
+        vae_config=vae_config,
+    )
+    # NOTE(review): the global rank is passed as local_rank — presumably used
+    # to split the prompts across all ranks; confirm against ValidationDataset.
+    val_dataset = ValidationDataset(
+        jsonl_path=data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=training_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+        local_rank=global_rank,
+        world_size=world_size,
+    )
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=1,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=simple_custom_collate,
+        drop_last=True,
+        prefetch_factor=None,
+        persistent_workers=False,
+        multiprocessing_context=None,
+    )
+    val_loader_iter = iter(val_loader)
+    # prompt_data_dict is attached dynamically; it collects {file name: caption}.
+    if not hasattr(inference_args, "prompt_data_dict"):
+        inference_args.prompt_data_dict = {}
+    os.makedirs(inference_args.save_path_gen, exist_ok=True)
+    for _ in trange(
+        len(val_loader),
+        desc="Validating",
+        unit="batch",
+        leave=True,
+        ncols=80,
+        disable=(global_rank != 0),
+    ):
+        val_data_cpu = next(val_loader_iter)
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            val_data_cpu=val_data_cpu,
+            training_args=training_args,
+            model_args=model_args,
+            inference_args=inference_args,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=device,
+            save_path_gen=inference_args.save_path_gen,
+        )
+    # Collect every rank's caption map; rank 0 merges them and writes the JSON.
+    if dist.is_initialized():
+        dist.barrier()
+        gathered = [None for _ in range(dist.get_world_size())]
+        dist.all_gather_object(gathered, inference_args.prompt_data_dict)
+        if global_rank == 0:
+            merged = {}
+            for d in gathered:
+                merged.update(d)
+            inference_args.prompt_data_dict = merged
+            save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+    elif global_rank == 0:
+        save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+    if dist.is_initialized():
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

benchmarks/video_gen/Vbench/sample_vbench.sh ADDED Viewed

	@@ -0,0 +1,127 @@

+#!/bin/bash
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+source "$SCRIPT_DIR/../../sample_env.sh"
+# ========================= 推理参数配置 =========================
+TASK_NAME="t2v"
+NUM_GPUS=8
+VALIDATION_NUM_TIMESTEPS=30 # 30 # 50 # 10 # 30 # 50
+VALIDATION_TIMESTEP_SHIFT=3.0 # 3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+SAMPLE_NUM_PER_PROMPT=5
+USE_KVCACHE=true
+VIDEO_HEIGHT=480
+VIDEO_WIDTH=848
+NUM_FRAMES=50
+MAX_NUM_FRAMES=121
+MAX_LATENT_SIZE=64
+RESOLUTION="video_480p"
+MODEL_PATH="downloads/Lance_3B_Video"
+VAL_DATASET_CONFIG_FILE="benchmarks/video_gen/Vbench/Vbench_recaption.jsonl"
+# ========================= 自动生成路径 =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="kvcache_"
+fi
+SAVE_PATH_GEN="results/Vbench_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH"
+    exit 1
+fi
+# ============================== 环境与分布式配置 ==============================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+# ========================= 显示任务配置 =========================
+echo "================================================"
+echo "VBench T2V 推理"
+echo "================================================"
+echo "GPU数量: ${NUM_GPUS}"
+echo "保存路径: ${SAVE_PATH_GEN}"
+echo "分辨率: ${VIDEO_HEIGHT}x${VIDEO_WIDTH}"
+echo "输出帧数: ${NUM_FRAMES}"
+echo "最大帧数: ${MAX_NUM_FRAMES}"
+echo "模型路径: ${MODEL_PATH}"
+if [ -n "$VAL_DATASET_CONFIG_FILE" ]; then
+    echo "数据路径: ${VAL_DATASET_CONFIG_FILE}"
+fi
+if [ -n "$CONFIG_JSON_PATH" ]; then
+    echo "配置JSON: ${CONFIG_JSON_PATH}"
+fi
+echo ""
+echo "关键参数："
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - evaluation_seed: ${EVALUATION_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]"
+echo "  - num_frames: ${NUM_FRAMES}"
+echo "  - sample_num_per_prompt: ${SAMPLE_NUM_PER_PROMPT}"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+# ============================== 执行推理 ==============================
+# 注意：请直接修改本脚本顶部的“推理参数配置”区
+accelerate launch \
+    --num_machines                      $NUM_MACHINES           \
+    --num_processes                     $TOTAL_RANK             \
+    --machine_rank                      $MACHINE_RANK           \
+    --main_process_ip                   $MAIN_PROCESS_IP        \
+    --main_process_port                 $MAIN_PROCESS_PORT      \
+    --mixed_precision                   bf16                    \
+    benchmarks/video_gen/Vbench/sample_vbench.py \
+    --model_path                        "$MODEL_PATH" \
+    --val_dataset_config_file           "$VAL_DATASET_CONFIG_FILE" \
+    --config_json_path                  "$CONFIG_JSON_PATH" \
+    --vit_type                          qwen_2_5_vl_original \
+    --llm_qk_norm                       true \
+    --llm_qk_norm_und                   true \
+    --llm_qk_norm_gen                   true \
+    --tie_word_embeddings               false \
+    --validation_num_timesteps          $VALIDATION_NUM_TIMESTEPS \
+    --validation_timestep_shift         $VALIDATION_TIMESTEP_SHIFT \
+    --copy_init_moe                     true \
+    --use_flex                          true \
+    --max_num_frames                    $MAX_NUM_FRAMES \
+    --max_latent_size                   $MAX_LATENT_SIZE \
+    --latent_patch_size                 1 1 1 \
+    --num_replicate                     $NUM_REPLICATE \
+    --num_shard                         $NUM_SHARD \
+    --visual_und                        true \
+    --visual_gen                        true \
+    --vae_model_type                    wan \
+    --apply_qwen_2_5_vl_pos_emb         true \
+    --apply_chat_template               false \
+    --cfg_type                          0 \
+    --validation_video_saving_fps       12 \
+    --validation_log_type               direct \
+    --video_height                      $VIDEO_HEIGHT \
+    --video_width                       $VIDEO_WIDTH \
+    --num_frames                        $NUM_FRAMES \
+    --task                              $TASK_NAME \
+    --save_path_gen                     $SAVE_PATH_GEN \
+    --resolution                        $RESOLUTION \
+    --evaluation_seed                   $EVALUATION_SEED \
+    --text_template                     true \
+    --sample_num_per_prompt             $SAMPLE_NUM_PER_PROMPT \
+    --cfg_text_scale                    $CFG_TEXT_SCALE \
+    --cfg_interval                      $CFG_INTERVAL_START $CFG_INTERVAL_END \
+    --use_KVcache                       $USE_KVCACHE
+echo ""
+echo "================================================"
+echo "完成! 结果: ${SAVE_PATH_GEN}"
+echo "================================================"

benchmarks/video_gen/Vbench/temporal_flickering_prompts.json ADDED Viewed

	@@ -0,0 +1,77 @@

+[
+  "In a still frame, a stop sign",
+  "a toilet, frozen in time",
+  "a laptop, frozen in time",
+  "A tranquil tableau of alley",
+  "A tranquil tableau of bar",
+  "A tranquil tableau of barn",
+  "A tranquil tableau of bathroom",
+  "A tranquil tableau of bedroom",
+  "A tranquil tableau of cliff",
+  "In a still frame, courtyard",
+  "In a still frame, gas station",
+  "A tranquil tableau of house",
+  "indoor gymnasium, frozen in time",
+  "A tranquil tableau of indoor library",
+  "A tranquil tableau of kitchen",
+  "A tranquil tableau of palace",
+  "In a still frame, parking lot",
+  "In a still frame, phone booth",
+  "A tranquil tableau of restaurant",
+  "A tranquil tableau of tower",
+  "A tranquil tableau of a bowl",
+  "A tranquil tableau of an apple",
+  "A tranquil tableau of a bench",
+  "A tranquil tableau of a bed",
+  "A tranquil tableau of a chair",
+  "A tranquil tableau of a cup",
+  "A tranquil tableau of a dining table",
+  "In a still frame, a pear",
+  "A tranquil tableau of a bunch of grapes",
+  "A tranquil tableau of a bowl on the kitchen counter",
+  "A tranquil tableau of a beautiful, handcrafted ceramic bowl",
+  "A tranquil tableau of an antique bowl",
+  "A tranquil tableau of an exquisite mahogany dining table",
+  "A tranquil tableau of a wooden bench in the park",
+  "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers",
+  "In a still frame, a park bench with a view of the lake",
+  "A tranquil tableau of a vintage rocking chair was placed on the porch",
+  "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars",
+  "A tranquil tableau of the phone booth was tucked away in a quiet alley",
+  "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time",
+  "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside",
+  "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow",
+  "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water",
+  "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape",
+  "In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens",
+  "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels",
+  "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility",
+  "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity",
+  "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water",
+  "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night",
+  "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water",
+  "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square",
+  "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner",
+  "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy",
+  "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins",
+  "A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes",
+  "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades",
+  "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall",
+  "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels",
+  "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour",
+  "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting",
+  "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light",
+  "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon",
+  "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon",
+  "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space",
+  "In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk",
+  "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier",
+  "A tranquil tableau of a country estate's library featured elegant wooden shelves",
+  "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently",
+  "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm",
+  "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden",
+  "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface",
+  "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation",
+  "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms",
+  "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time"
+]

common/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+"""Common utilities package."""

common/model/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+from .hacks import hack_qwen2_5_vl_config
+__all__ = [
+    "hack_qwen2_5_vl_config",
+]

common/model/checks.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8

common/model/hacks.py ADDED Viewed

	@@ -0,0 +1,54 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+def hack_qwen2_5_vl_config(language_model):
+    """
+    Patch hard-coded Qwen2.5-VL fields onto ``language_model.config`` in place.
+
+    Sets four special token ids, a ``vision_config`` dict and a
+    ``rope_scaling`` dict on the config object. None of the values is read
+    from a checkpoint or tokenizer here; they are literals.
+
+    Args:
+        language_model: any object exposing a mutable ``config`` attribute.
+
+    Returns:
+        The same ``language_model`` object that was passed in.
+    """
+    # HACK: every value below is hard-coded.
+    cfg = language_model.config
+    special_token_ids = (
+        ("image_token_id", 151655),
+        ("video_token_id", 151656),
+        ("vision_start_token_id", 151652),
+        ("vision_end_token_id", 151653),
+    )
+    for attr_name, token_id in special_token_ids:
+        setattr(cfg, attr_name, token_id)
+    cfg.vision_config = dict(
+        depth=32,
+        hidden_act="silu",
+        hidden_size=1280,
+        intermediate_size=3420,
+        num_heads=16,
+        in_chans=3,
+        out_hidden_size=2048,
+        patch_size=14,
+        spatial_merge_size=2,
+        spatial_patch_size=14,
+        window_size=112,
+        fullatt_block_indexes=[7, 15, 23, 31],
+        tokens_per_second=2,
+        temporal_patch_size=2,
+    )
+    cfg.rope_scaling = dict(type="mrope", mrope_section=[16, 24, 24])
+    return language_model

common/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+from .distributed import (
+    get_global_rank,
+    get_local_rank,
+    get_world_size,
+    is_master,
+    get_device,
+    barrier_if_distributed,
+)
+from .logging import get_logger
+from .misc import AutoEncoderParams, tuple_mul
+from .tensor_ops import (
+    flatten,
+    unflatten,
+    rearrange,
+    repeat,
+    pack,
+    unpack,
+)
+__all__ = [
+    # distributed
+    "get_global_rank",
+    "get_local_rank",
+    "get_world_size",
+    "is_master",
+    "get_device",
+    "barrier_if_distributed",
+    # logging
+    "get_logger",
+    # misc
+    "AutoEncoderParams",
+    "tuple_mul",
+    # tensor_ops
+    "flatten",
+    "unflatten",
+    "rearrange",
+    "repeat",
+    "pack",
+    "unpack",
+]

common/utils/distributed.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+import os
+import torch
+import torch.distributed as dist
+def get_global_rank() -> int:
+    """
+    Return the global rank of this process.
+
+    The value is parsed from the ``RANK`` environment variable and falls
+    back to ``0`` when the variable is not set.
+    """
+    raw_rank = os.getenv("RANK", "0")
+    return int(raw_rank)
+def get_local_rank() -> int:
+    """
+    Return the local rank of this process.
+
+    The value is parsed from the ``LOCAL_RANK`` environment variable and
+    falls back to ``0`` when the variable is not set.
+    """
+    env_key = "LOCAL_RANK"
+    return int(os.getenv(env_key, "0"))
+def get_world_size() -> int:
+    """
+    Return the world size (total number of processes).
+
+    The value is parsed from the ``WORLD_SIZE`` environment variable and
+    falls back to ``1`` when the variable is not set.
+    """
+    declared_size = os.getenv("WORLD_SIZE", "1")
+    return int(declared_size)
+def is_master():
+    """
+    Tell whether the current process is the master process (rank 0).
+
+    When torch.distributed is unavailable or no process group has been
+    initialized, the process is treated as the master.
+    """
+    group_ready = dist.is_available() and dist.is_initialized()
+    if group_ready:
+        return dist.get_rank() == 0
+    return True
+def get_device() -> torch.device:
+    """
+    Return the CUDA device whose index is this process's local rank.
+    """
+    local_index = get_local_rank()
+    return torch.device("cuda", local_index)
+def barrier_if_distributed(*args, **kwargs):
+    """
+    Synchronize all processes when running under an initialized process group.
+
+    Args:
+        *args: positional arguments forwarded unchanged to ``dist.barrier``.
+        **kwargs: keyword arguments forwarded unchanged to ``dist.barrier``.
+
+    Returns:
+        Whatever ``dist.barrier`` returns, or ``None`` (no-op) when
+        torch.distributed is unavailable or no process group is initialized.
+    """
+    # Check availability before is_initialized(), the same guard is_master()
+    # uses: builds without distributed support may not expose
+    # is_initialized() at all.
+    if not dist.is_available() or not dist.is_initialized():
+        return None
+    return dist.barrier(*args, **kwargs)

common/utils/logging.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Logging utility functions.
+"""
+import logging
+import sys
+from typing import Optional
+from .distributed import get_global_rank, get_local_rank, get_world_size
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+    """
+    Return the logger called ``name``, configured for INFO output on stdout.
+
+    The level is set to INFO and propagation is turned off on every call.
+    A stdout ``StreamHandler`` is attached only when the logger has no
+    handlers of its own yet. When the world size is greater than one, the
+    record prefix also carries the global and local rank.
+
+    Args:
+        name: logger name passed to ``logging.getLogger``.
+
+    Returns:
+        The configured ``logging.Logger``.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+    # Fix: turn propagation off so ancestor loggers do not handle the record again.
+    logger.propagate = False
+    # Inspect only this logger's own handlers so ancestors have no influence.
+    if logger.handlers:
+        return logger
+    rank_tags = ""
+    if get_world_size() > 1:
+        rank_tags = f"[Rank:{get_global_rank()}]" + f"[LocalRank:{get_local_rank()}]"
+    record_format = (
+        "[%(asctime)s] "
+        + rank_tags
+        + "[%(pathname)s:%(lineno)d][%(threadName).12s][%(name)s][%(levelname).5s] %(message)s"
+    )
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    stdout_handler.setFormatter(logging.Formatter(record_format))
+    logger.addHandler(stdout_handler)
+    return logger

common/utils/misc.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+from dataclasses import dataclass
+@dataclass
+class AutoEncoderParams:
+    """
+    Container for autoencoder hyper-parameters.
+
+    The first three fields are required; the two factor fields have defaults.
+    How each value is consumed is not visible in this module.
+    """
+    # NOTE(review): presumably the spatial downsampling ratio — confirm against callers.
+    downsample_spatial: int
+    # NOTE(review): presumably the temporal downsampling ratio — confirm against callers.
+    downsample_temporal: int
+    # NOTE(review): presumably the number of latent channels — confirm against callers.
+    z_channels: int
+    # for flux (the original author's note on the two defaults below)
+    scale_factor: float = 0.3611
+    shift_factor: float = 0.1159
+def tuple_mul(a: tuple, b: tuple) -> tuple:
+    """
+    Multiply two equal-length tuples element by element.
+
+    Args:
+        a (tuple of numbers): first tuple.
+        b (tuple of numbers): second tuple; must have the same length as ``a``.
+
+    Returns:
+        tuple: the element-wise products, in order.
+
+    Raises:
+        ValueError: if the two tuples differ in length.
+    """
+    if len(a) == len(b):
+        products = [lhs * rhs for lhs, rhs in zip(a, b)]
+        return tuple(products)
+    raise ValueError("两个元组长度必须相等")