diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..4fb796ca52168292a5f18c47b2d046242e58e855 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,62 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/benchmarks/benchmark-overview.png filter=lfs diff=lfs merge=lfs -text
+assets/image-editing/image-editing-overview.webp filter=lfs diff=lfs merge=lfs -text
+assets/image-understanding/cases/image-understanding-case-01.png filter=lfs diff=lfs merge=lfs -text
+assets/image-understanding/cases/image-understanding-case-03.png filter=lfs diff=lfs merge=lfs -text
+assets/image-understanding/cases/image-understanding-case-04.png filter=lfs diff=lfs merge=lfs -text
+assets/image-understanding/cases/image-understanding-case-05.png filter=lfs diff=lfs merge=lfs -text
+assets/image-understanding/cases/image-understanding-case-06.png filter=lfs diff=lfs merge=lfs -text
+assets/logo/lance-logo.webp filter=lfs diff=lfs merge=lfs -text
+assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif filter=lfs diff=lfs merge=lfs -text
+assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-image/text-to-image-overview.webp filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-01.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-02.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-03.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-04.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-05.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-06.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-07.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/previews/text-to-video-demo-08.gif filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-01.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-02.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-03.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-04.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-05.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-06.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-07.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/text-to-video/videos/text-to-video-demo-08.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-01.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-02.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-03.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-04.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-05.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-06.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-07.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/previews/video-editing-demo-08.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-01.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-02.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-03.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-04.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-05.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-06.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-07.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-editing/videos/video-editing-demo-08.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/previews/video-understanding-caption-long-01.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/previews/video-understanding-caption-short-01.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/previews/video-understanding-vqa-01.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/previews/video-understanding-vqa-02.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/previews/video-understanding-vqa-03.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/previews/video-understanding-vqa-04.gif filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/videos/video-understanding-caption-long-01.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/videos/video-understanding-caption-short-01.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/videos/video-understanding-vqa-01.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/videos/video-understanding-vqa-02.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/videos/video-understanding-vqa-03.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/video-understanding/videos/video-understanding-vqa-04.mp4 filter=lfs diff=lfs merge=lfs -text
+config/examples/image_edit_examples/index000000_cond1.jpg filter=lfs diff=lfs merge=lfs -text
+config/examples/image_edit_examples/index000001_cond1.jpg filter=lfs diff=lfs merge=lfs -text
+config/examples/video_edit_examples/index000000_cond1.mp4 filter=lfs diff=lfs merge=lfs -text
+config/examples/video_edit_examples/index000001_cond1.mp4 filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..ce085919601de266040d8d576fa6bd8419965437
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,32 @@
+__pycache__/
+*.py[cod]
+*.pyo
+*.pyd
+*.so
+
+.Python
+.python-version
+.venv/
+venv/
+env/
+ENV/
+
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.coverage
+.coverage.*
+htmlcov/
+
+build/
+dist/
+*.egg-info/
+.eggs/
+
+.ipynb_checkpoints/
+
+.DS_Store
+
+# Project-specific ignores (downloads/ holds the model checkpoints; results/ is presumably generated output)
+results/
+downloads/
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..398a156ba77b9a88359fd2969b93c051f7fd574c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,587 @@
+<div align="center">
+  <img src="assets/logo/lance-logo.webp" alt="Lance logo" width="300">
+
+  <h1 align="center"><sup>Lance: Unified Multimodal Modeling by Multi-Task Synergy</sup></h1>
+  <p>
+    <strong>
+    <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Fengyi Fu</a><sup>*</sup>, 
+    <a href="https://corleone-huang.github.io/" style="text-decoration: none; color: inherit;">Mengqi Huang</a><sup>*,✉</sup>, 
+    <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Shaojin Wu</a><sup>*</sup>, 
+    Yunsheng Jiang<sup>*</sup>, 
+    Yufei Huo, 
+    <a href="https://guojianzhu.com/" style="text-decoration: none; color: inherit;">Jianzhu Guo</a><sup>✉,§</sup>
+    </strong><br>
+    Hao Li, 
+    Yinghang Song, 
+    Fei Ding, 
+    Qian He, 
+    Zheren Fu, 
+    Zhendong Mao, 
+    Yongdong Zhang
+    <br>
+    <em>ByteDance</em>
+    <br>
+    <sup>*</sup> Equal contribution &nbsp;&nbsp; <sup>✉</sup> Corresponding authors &nbsp;&nbsp; <sup>§</sup> Project lead
+  </p>
+  <p>
+    <a href="https://lance-project.github.io/" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat" alt="Homepage"></a>
+    <a href="https://arxiv.org/abs/2605.18678" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv" alt="arXiv"></a>
+    <a href="https://huggingface.co/bytedance-research/Lance" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface" alt="Model"></a>
+    <br>
+    English | <a href="./README_zh.md"><ins>简体中文</ins></a>
+  </p>
+</div>
+
+## 🌟 Highlights
+
+**Lance** is a 3B native unified multimodal model that supports **image and video understanding, generation, and editing** within a single framework.
+
+- **Efficient at 3B scale.** With only **3B active parameters**, Lance delivers strong performance across image generation, image editing, and video generation benchmarks.
+- **Trained from scratch.** Lance is built with a staged multi-task recipe and trained entirely from scratch within a **128-A100-GPU** budget.
+
+<div align="center">
+  <img src="assets/benchmarks/benchmark-overview.png" alt="Lance benchmark overview across image generation, image editing, video generation, and video understanding" width="980">
+</div>
+
+## 🎨 Demo
+
+### Text-to-Video
+
+<table align="center">
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-01.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-02.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-03.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-04.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-05.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-06.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-07.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-08.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+
+### Video Editing
+
+<table align="center">
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-01.mp4"><img src="assets/video-editing/previews/video-editing-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-02.mp4"><img src="assets/video-editing/previews/video-editing-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-03.mp4"><img src="assets/video-editing/previews/video-editing-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-04.mp4"><img src="assets/video-editing/previews/video-editing-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-05.mp4"><img src="assets/video-editing/previews/video-editing-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-06.mp4"><img src="assets/video-editing/previews/video-editing-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-07.mp4"><img src="assets/video-editing/previews/video-editing-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-08.mp4"><img src="assets/video-editing/previews/video-editing-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+
+### Multi-turn Consistency Editing
+
+<div align="center">
+  <a href="assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4">
+    <img src="assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif" alt="Lance multi-turn consistency editing demo" width="100%">
+  </a>
+</div>
+
+### Intelligent Video Generation
+
+<table align="center">
+  <tr>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-01.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-02.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-03.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-04.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+</table>
+
+### Video Understanding
+
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-01.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> How many times did the person launch objects on the table? Options: (A) 3 (B) 2 (C) 4</p>
+        <p><strong>Response:</strong> (A) 3</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-02.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-02.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> The person makes sets of repeated actions. How many distinct repeated actions did the person do? Options: (A) 2 (B) 3 (C) 4</p>
+        <p><strong>Response:</strong> (A) 2</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-03.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-03.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> In which direction does the purple sphere move in the video? Options: (A) Down and to the right. (B) Up and to the left. (C) Up and to the right. (D) The object is stationary.</p>
+        <p><strong>Response:</strong> (A) Down and to the right.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-04.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-04.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> What is the unrealistic phenomenon displayed in the video? Options: (A) The man can manipulate time via phone. (B) Man grabs an object through a phone screen. (C) Chocolate transforms into different objects. (D) Visible means of propulsion enables flight.</p>
+        <p><strong>Response:</strong> (B) Man grabs an object through a phone screen.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-short-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-short-01.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> Offer a succinct account of the culinary process shown in this video.</p>
+        <p><strong>Response:</strong> Add tomato puree and mix it well with chicken pieces.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-long-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-long-01.gif" width="100%">
+        </a>
+        <p><strong>Question:</strong> Provide a detailed description of the given video, capturing its key moments.</p>
+        <p><strong>Response:</strong> In a sunlit meadow, a small tortoiseshell butterfly rests on a purple flower. A bee, with black and yellow stripes, lands on the same flower. The butterfly flaps its wings gently, while the bee busies itself, collecting nectar. The flower sways slightly in the breeze. The butterfly then takes off, and the bee follows, both heading to the next flower. The scene is a vivid display of insect interaction in a natural setting, with the colors of the butterfly and the bee contrasting against the green background of the meadow. The video captures this peaceful moment in a short 6-second duration.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+
+### Text-to-Image Generation
+
+<div align="center">
+  <img src="assets/text-to-image/text-to-image-overview.webp" alt="Lance text-to-image generation examples" width="100%">
+</div>
+
+### Image Editing
+
+<div align="center">
+  <img src="assets/image-editing/image-editing-overview.webp" alt="Lance image editing examples" width="100%">
+</div>
+
+### Image Understanding
+
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-01.png" alt="Pie chart reasoning case" width="100%">
+        <p><strong>Question:</strong> Is the largest segment greater than sum of all the other segments?</p>
+        <p><strong>Response:</strong> Yes, the largest segment in the pie chart is the blue one, which is the most populous segment.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-02.png" alt="Survey chart about border security priorities" width="100%">
+        <p><strong>Question:</strong> What percentage of respondents want better border security?</p>
+        <p><strong>Response:</strong> 29%</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-03.png" alt="License plate recognition case" width="100%">
+        <p><strong>Question:</strong> What is the license plate number of the car?</p>
+        <p><strong>Response:</strong> The license plate number of the car is BX62 BFY.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-04.png" alt="Market research chart reasoning case" width="100%">
+        <p><strong>Question:</strong> According to the data from the proprietary market research, how much amount was spent on the promotional meetings and events during 1998?</p>
+        <p><strong>Response:</strong> According to the data from the proprietary market research, the total amount spent on the promotional meetings and events during 1998 was approximately $1.3 billion.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-05.png" alt="Colosseum description case" width="100%">
+        <p><strong>Question:</strong> What is the appearance of the Colosseum in Rome, Italy?</p>
+        <p><strong>Response:</strong> The Colosseum in Rome, Italy is a magnificent ancient amphitheater made of stone and concrete. It is characterized by its large, arched openings and its semi-circular shape. The Colosseum has been partially restored, and its original color has faded, but the architectural details, like the columns and the semi-circular arches, are still visible. The lighting in the image highlights the grandeur and historical significance of the Colosseum, making it stand out against the backdrop of the evening sky.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-06.png" alt="Total solar eclipse case" width="100%">
+        <p><strong>Question:</strong> How does a total solar eclipse look like from Earth?</p>
+        <p><strong>Response:</strong> A total solar eclipse looks like a dark circle with a bright, white outer edge. This is a result of the Earth's rotation and the Earth's shadow, which partially obscures the sun's light. The bright white edge of the eclipse is caused by the sun's high energy and its interaction with the Earth's atmosphere, while the dark part of the eclipse is due to the Earth's shadow and the surrounding air currents. The solar eclipse's shape, with its bright white edge and dark center, is similar to the shape of a full moon or a dark disk. It is a natural phenomenon that occurs in the atmosphere of the Earth and is an important part of the solar system.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+
+## 🚀 Installation
+
+### Recommended Environment
+
+- **Software:** Python 3.10+, CUDA 12.4+ (required)
+- **Hardware:** A GPU with at least 40GB VRAM is required for inference
+
+### Installation Steps
+```bash
+bash ./setup_env.sh
+```
+
+### Download Model Weights
+
+Please download all necessary model checkpoints from [Lance-3B on Hugging Face](https://huggingface.co/bytedance-research/Lance) and place them in the `downloads/` directory.
+
+## 📚 Usage
+
+### Inference
+
+Lance provides a unified command-line interface for all generation / editing / understanding tasks:
+
+```bash
+bash inference_lance.sh
+```
+
+- Before running, please configure the inference parameters at the top of `inference_lance.sh`.
+- **Supported tasks:** `t2i`, `t2v`, `image_edit`, `video_edit`, `x2t_image`, and `x2t_video`. You can modify `TASK_DEFAULT_CONFIGS` in `inference_lance.py` to customize the default data samples for each task.
+- **Note:** For all tasks, we recommend following the `prompt` format used in the provided examples when writing input prompts, as this typically leads to better generation quality.
+
+#### Available Tasks
+
+| Task Name              | Description                                      | Example JSON                                 |
+|------------------------|--------------------------------------------------|----------------------------------------------|
+| `t2v`                  | Text-to-Video generation                         | `config/examples/t2v_example.json`           |
+| `t2i`                  | Text-to-Image generation                         | `config/examples/t2i_example.json`           |
+| `image_edit`           | Image editing                                    | `config/examples/image_edit_example.json`    |
+| `video_edit`           | Video editing                                    | `config/examples/video_edit_example.json`    |
+| `x2t_image`            | Image understanding                              | `config/examples/x2t_image_example.json`     |
+| `x2t_video`            | Video understanding                              | `config/examples/x2t_video_example.json`     |
+
+For understanding examples:
+
+- `config/examples/x2t_image_example.json`: image understanding examples for visual question answering and image-based reasoning.
+- `config/examples/x2t_video_example.json`: video understanding examples for video question answering and video captioning.
+
+#### Parameters
+
+You can configure the following hyperparameters at the top of the `inference_lance.sh` script:
+
+| Parameter | Default Value | Description |
+| --- | --- | --- |
+| `MODEL_PATH` | `"downloads/Lance_3B"` | Path to the downloaded Lance model weights (`Lance_3B` or `Lance_3B_Video`). |
+| `NUM_GPUS` | `1` | Number of GPUs to use for inference. |
+| `VALIDATION_NUM_TIMESTEPS` | `30` | Number of denoising steps (e.g., 30 or 50). |
+| `VALIDATION_TIMESTEP_SHIFT` | `3.5` | Timestep shift parameter for flow matching scheduling. |
+| `CFG_TEXT_SCALE` | `4.0` | Classifier-Free Guidance (CFG) scale for text conditioning. |
+| `VALIDATION_DATA_SEED` | `42` | Random seed for generation reproducibility. |
+| `NUM_FRAMES` | `50` | Number of frames for video generation (Max: 121). *Unused for image tasks.* |
+| `VIDEO_HEIGHT` / `VIDEO_WIDTH` | `768` | Spatial resolution. *Unused for editing tasks (determined by input image/video).* |
+| `RESOLUTION` | `"video_480p"` | Base resolution preset (`image_768res` or `video_480p`). |
+
+### Gradio
+```bash
+python lance_gradio_t2v_v2t.py --gpus 0 --server-port 7860
+```
+
+### Benchmarks
+
+#### DPG-Bench Evaluation
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Models</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Global</th>
+      <th align="center">Entity</th>
+      <th align="center">Attribute</th>
+      <th align="center">Relation</th>
+      <th align="center">Other</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="8"><i>Generation-only Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">83.27</td><td align="center">82.43</td><td align="center">80.91</td><td align="center">86.76</td><td align="center">80.41</td><td align="center">74.65</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">90.97</td><td align="center">89.61</td><td align="center">88.39</td><td align="center">90.58</td><td align="center">89.83</td><td align="center">83.50</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">87.90</td><td align="center">91.01</td><td align="center">88.83</td><td align="center">80.70</td><td align="center">88.68</td><td align="center">84.08</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">74.35</td><td align="center">90.00</td><td align="center">88.96</td><td align="center">90.87</td><td align="center">88.33</td><td align="center">83.84</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">91.32</td><td align="center">91.56</td><td align="center">92.02</td><td align="center">94.31</td><td align="center">92.73</td><td align="center">88.32</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="8"><i>Unified Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center">86.90</td><td align="center">88.90</td><td align="center">89.40</td><td align="center">89.32</td><td align="center">89.48</td><td align="center">84.19</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center">88.81</td><td align="center">88.83</td><td align="center">90.18</td><td align="center">89.37</td><td align="center">90.27</td><td align="center">83.57</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center">89.00</td><td align="center"><b>91.78</b></td><td align="center">89.96</td><td align="center">91.81</td><td align="center"><b>91.64</b></td><td align="center">86.14</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">88.94</td><td align="center">90.37</td><td align="center"><u>91.29</u></td><td align="center">90.82</td><td align="center">88.67</td><td align="center">85.07</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>90.39</u></td><td align="center">90.78</td><td align="center">90.68</td><td align="center">90.29</td><td align="center">88.77</td><td align="center">85.18</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>90.42</b></td><td align="center"><u>91.68</u></td><td align="center">90.94</td><td align="center"><u>91.87</u></td><td align="center"><u>90.73</u></td><td align="center"><b>86.76</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center">89.50</td><td align="center">91.40</td><td align="center"><b>92.07</b></td><td align="center">91.91</td><td align="center">88.81</td><td align="center"><u>86.54</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>83.89</b></td><td align="center"><b>91.07</b></td><td align="center"><b>89.36</b></td><td align="center"><b>93.38</b></td><td align="center"><b>80.80</b></td><td align="center"><b>84.67</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+<p align="center"><em><sup>†</sup> indicates methods that use LLM rewriters for prompt rewriting before generation.</em></p>
+
+#### GenEval Evaluation
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Models</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">1-Obj.</th>
+      <th align="center">2-Obj.</th>
+      <th align="center">Count</th>
+      <th align="center">Colors</th>
+      <th align="center">Position</th>
+      <th align="center">Attr.</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="9"><i>Generation-only Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">0.98</td><td align="center">0.74</td><td align="center">0.39</td><td align="center">0.85</td><td align="center">0.15</td><td align="center">0.23</td><td align="center">0.55</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">0.96</td><td align="center">0.87</td><td align="center">0.47</td><td align="center">0.83</td><td align="center">0.43</td><td align="center">0.45</td><td align="center">0.67</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">0.99</td><td align="center">0.94</td><td align="center">0.72</td><td align="center">0.89</td><td align="center">0.33</td><td align="center">0.60</td><td align="center">0.74</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">0.98</td><td align="center">0.93</td><td align="center">0.75</td><td align="center">0.93</td><td align="center">0.68</td><td align="center">0.65</td><td align="center">0.82</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">0.99</td><td align="center">0.92</td><td align="center">0.89</td><td align="center">0.88</td><td align="center">0.76</td><td align="center">0.77</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="9"><i>Unified Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center">0.89</td><td align="center">0.59</td><td align="center">0.90</td><td align="center">0.79</td><td align="center">0.66</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center"><b>1.00</b></td><td align="center">0.95</td><td align="center">0.64</td><td align="center">0.88</td><td align="center">0.55</td><td align="center">0.76</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center">0.87</td><td align="center">0.58</td><td align="center">0.92</td><td align="center">0.52</td><td align="center">0.62</td><td align="center">0.76</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">0.98</td><td align="center">0.95</td><td align="center"><b>0.84</b></td><td align="center"><u>0.95</u></td><td align="center">0.78</td><td align="center">0.77</td><td align="center">0.88</td>
+    </tr>
+    <tr>
+      <td align="left">Mogao</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center"><u>0.83</u></td><td align="center">0.93</td><td align="center">0.84</td><td align="center">0.80</td><td align="center"><u>0.89</u></td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>0.99</u></td><td align="center">0.94</td><td align="center">0.74</td><td align="center">0.91</td><td align="center">0.77</td><td align="center">0.74</td><td align="center">0.85</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center">0.81</td><td align="center">0.91</td><td align="center"><b>0.88</b></td><td align="center"><b>0.83</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center"><u>0.96</u></td><td align="center">0.80</td><td align="center">0.91</td><td align="center">0.84</td><td align="center">0.76</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>1.00</b></td><td align="center"><b>0.94</b></td><td align="center"><b>0.84</b></td><td align="center"><b>0.97</b></td><td align="center"><b>0.87</b></td><td align="center"><b>0.81</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+<p align="center"><em><sup>†</sup> indicates methods that use LLM rewriters for prompt rewriting before generation.</em></p>
+
+#### GEdit-Bench Evaluation
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Models</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">BC</th>
+      <th align="center">CA</th>
+      <th align="center">MM</th>
+      <th align="center">MC</th>
+      <th align="center">PB</th>
+      <th align="center">ST</th>
+      <th align="center">SA</th>
+      <th align="center">SR</th>
+      <th align="center">SRp</th>
+      <th align="center">TM</th>
+      <th align="center">TT</th>
+      <th align="center">Avg/G_O</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="14"><i>Generation-only Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Gemini 2.0</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">6.32</td>
+    </tr>
+    <tr>
+      <td align="left">GPT Image 1</td><td align="center">-</td><td align="center">6.96</td><td align="center">6.85</td><td align="center">7.10</td><td align="center">5.41</td><td align="center">6.74</td><td align="center">7.44</td><td align="center">7.51</td><td align="center">8.73</td><td align="center">8.55</td><td align="center">8.45</td><td align="center">8.69</td><td align="center">7.49</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image-Edit</td><td align="center">20B</td><td align="center">8.23</td><td align="center">8.30</td><td align="center">7.33</td><td align="center">8.05</td><td align="center">7.49</td><td align="center">6.74</td><td align="center">8.57</td><td align="center">8.09</td><td align="center">8.29</td><td align="center">8.48</td><td align="center">8.50</td><td align="center">8.01</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="14"><i>Unified Models</i></td>
+    </tr>
+    <tr>
+      <td align="left">Lumina-DiMOO</td><td align="center">8B</td><td align="center">3.43</td><td align="center">4.27</td><td align="center">3.08</td><td align="center">2.77</td><td align="center">4.74</td><td align="center">5.19</td><td align="center">4.44</td><td align="center">3.80</td><td align="center">4.38</td><td align="center">2.68</td><td align="center">4.20</td><td align="center">3.91</td>
+    </tr>
+    <tr>
+      <td align="left">Ovis-U1</td><td align="center">1.2B</td><td align="center"><u>7.49</u></td><td align="center">6.88</td><td align="center">6.21</td><td align="center">4.79</td><td align="center">5.98</td><td align="center"><u>6.46</u></td><td align="center">7.49</td><td align="center"><u>7.25</u></td><td align="center"><u>7.27</u></td><td align="center">4.48</td><td align="center">6.31</td><td align="center">6.42</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL</td><td align="center">7B</td><td align="center">7.32</td><td align="center">6.91</td><td align="center">6.38</td><td align="center">4.75</td><td align="center">4.57</td><td align="center">6.15</td><td align="center"><b>7.90</b></td><td align="center">7.16</td><td align="center">7.02</td><td align="center"><u>7.32</u></td><td align="center">6.22</td><td align="center">6.52</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center">7.08</td><td align="center">7.05</td><td align="center">6.38</td><td align="center"><u>7.02</u></td><td align="center"><u>6.03</u></td><td align="center">6.27</td><td align="center">7.13</td><td align="center">6.55</td><td align="center">6.33</td><td align="center">6.59</td><td align="center"><u>6.85</u></td><td align="center">6.66</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U (w/ CoT)</td><td align="center">1.7B</td><td align="center">7.05</td><td align="center"><b>7.87</b></td><td align="center"><u>6.50</u></td><td align="center">6.99</td><td align="center">5.77</td><td align="center">6.10</td><td align="center">7.33</td><td align="center">7.16</td><td align="center">7.12</td><td align="center"><b>7.36</b></td><td align="center">6.46</td><td align="center"><u>6.88</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>7.73</b></td><td align="center"><u>7.74</u></td><td align="center"><b>7.28</b></td><td align="center"><b>7.83</b></td><td align="center"><b>7.50</b></td><td align="center"><b>7.03</b></td><td align="center"><u>7.64</u></td><td align="center"><b>7.85</b></td><td align="center"><b>7.71</b></td><td align="center">4.46</td><td align="center"><b>7.57</b></td><td align="center"><b>7.30</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+#### VBench Evaluation (Video Generation)
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">Type</th>
+      <th align="left">Model</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Total Score ↑</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" rowspan="12"><i>Gen. Only</i></td>
+      <td align="left">ModelScope</td><td align="center">1.7B</td><td align="center">75.75</td>
+    </tr>
+    <tr>
+      <td align="left">LaVie</td><td align="center">3B</td><td align="center">77.08</td>
+    </tr>
+    <tr>
+      <td align="left">Show-1</td><td align="center">6B</td><td align="center">78.93</td>
+    </tr>
+    <tr>
+      <td align="left">AnimateDiff-V2</td><td align="center">-</td><td align="center">80.27</td>
+    </tr>
+    <tr>
+      <td align="left">VideoCrafter-2.0</td><td align="center">-</td><td align="center">80.44</td>
+    </tr>
+    <tr>
+      <td align="left">CogVideoX</td><td align="center">5B</td><td align="center">81.61</td>
+    </tr>
+    <tr>
+      <td align="left">Kling</td><td align="center">-</td><td align="center">81.85</td>
+    </tr>
+    <tr>
+      <td align="left">Open-Sora-2.0</td><td align="center">-</td><td align="center">81.71</td>
+    </tr>
+    <tr>
+      <td align="left">Gen-3</td><td align="center">-</td><td align="center">82.32</td>
+    </tr>
+    <tr>
+      <td align="left">Step-Video-T2V</td><td align="center">30B</td><td align="center">81.83</td>
+    </tr>
+    <tr>
+      <td align="left">Hunyuan Video</td><td align="center">-</td><td align="center">83.43</td>
+    </tr>
+    <tr>
+      <td align="left">Wan2.1-T2V</td><td align="center">14B</td><td align="center">83.69</td>
+    </tr>
+    <tr>
+      <td align="center" rowspan="6"><i>Unified</i></td>
+      <td align="left">HaploOmni</td><td align="center">7B</td><td align="center">78.10</td>
+    </tr>
+    <tr>
+      <td align="left">Emu3</td><td align="center">8B</td><td align="center">80.96</td>
+    </tr>
+    <tr>
+      <td align="left">VILA-U</td><td align="center">7B</td><td align="center">74.01</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">2B</td><td align="center">81.34</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">1.5B</td><td align="center"><u>84.06</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>85.11</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+#### Running Benchmarks
+
+Ready-to-run benchmark scripts are provided under `benchmarks/`:
+
+| Benchmark              | Modality | Script                                                        |
+|------------------------|----------|---------------------------------------------------------------|
+| GenEval (image gen)    | Image    | `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`              |
+| DPG (image gen)        | Image    | `benchmarks/image_gen/DPG/sample_DPG.sh`                      |
+| GEdit (image edit)     | Image    | `benchmarks/image_gen/GEdit/sample_GEdit.sh`                  |
+| VBench (video gen)     | Video    | `benchmarks/video_gen/Vbench/sample_vbench.sh`                |
+
+
+## 📄 License
+
+Copyright 2025 Bytedance Ltd. and/or its affiliates.
+
+## 🙏 Acknowledgements
+
+We would like to thank the contributors of [BAGEL](https://github.com/ByteDance-Seed/bagel), [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), and [Wan2.2](https://github.com/Wan-Video/Wan2.2) for their open research and contributions.
+
+## 💖 Citation
+
+If you find **Lance** useful for your project or research, please consider giving this repo a 🌟 and citing our work using the following BibTeX:
+
+```bibtex
+@misc{fu2026lanceunifiedmultimodalmodeling,
+      title         = {Lance: Unified Multimodal Modeling by Multi-Task Synergy},
+      author        = {Fengyi Fu and Mengqi Huang and Shaojin Wu and Yunsheng Jiang and Yufei Huo and Hao Li and Yinghang Song and Fei Ding and Jianzhu Guo and Qian He and Zheren Fu and Zhendong Mao and Yongdong Zhang},
+      year          = {2026},
+      eprint        = {2605.18678},
+      archivePrefix = {arXiv},
+      primaryClass  = {cs.CV},
+      url           = {https://arxiv.org/abs/2605.18678},
+}
+```
+
+## 📞 Contact
+
+For questions, issues, or collaborations, please contact [Mengqi Huang](https://corleone-huang.github.io/) and [Jianzhu Guo](https://guojianzhu.com/).
diff --git a/README_zh.md b/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..15034fd9d269406887e2b0bcd0180355bcc349db
--- /dev/null
+++ b/README_zh.md
@@ -0,0 +1,587 @@
+<div align="center">
+  <img src="assets/logo/lance-logo.webp" alt="Lance logo" width="300">
+
+  <h1 align="center"><sup>Lance: Unified Multimodal Modeling by Multi-Task Synergy</sup></h1>
+  <p>
+    <strong>
+    <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Fengyi Fu</a><sup>*</sup>, 
+    <a href="https://corleone-huang.github.io/" style="text-decoration: none; color: inherit;">Mengqi Huang</a><sup>*,✉</sup>, 
+    <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" style="text-decoration: none; color: inherit;">Shaojin Wu</a><sup>*</sup>, 
+    Yunsheng Jiang<sup>*</sup>, 
+    Yufei Huo, 
+    <a href="https://guojianzhu.com/" style="text-decoration: none; color: inherit;">Jianzhu Guo</a><sup>✉,§</sup>
+    </strong><br>
+    Hao Li, 
+    Yinghang Song, 
+    Fei Ding, 
+    Qian He, 
+    Zheren Fu, 
+    Zhendong Mao, 
+    Yongdong Zhang
+    <br>
+    <em>ByteDance</em>
+    <br>
+    <sup>*</sup> 共同一作 &nbsp;&nbsp; <sup>✉</sup> 通讯作者 &nbsp;&nbsp; <sup>§</sup> 项目负责人
+  </p>
+  <p>
+    <a href="https://lance-project.github.io/" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat" alt="Homepage"></a>
+    <a href="https://arxiv.org/abs/2605.18678" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv" alt="arXiv"></a>
+    <a href="https://huggingface.co/bytedance-research/Lance" style="text-decoration: none; margin: 0 8px;"><img src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface" alt="Model"></a>
+    <br>
+    <a href="./README.md"><ins>English</ins></a> | 简体中文
+  </p>
+</div>
+
+## 🌟 亮点
+
+**Lance** 是一个 3B 参数、原生统一的多模态模型，在单一框架下同时支持 **图像与视频的理解、生成和编辑**。
+
+- **3B 规模高效强大。** 仅使用 **3B 激活参数**，Lance 即可在图像生成、图像编辑和视频生成等基准上取得强劲表现。
+- **从零训练。** Lance 采用分阶段多任务训练配方，在 **128 张 A100 GPU** 的预算内从零完成训练。
+
+<div align="center">
+  <img src="assets/benchmarks/benchmark-overview.png" alt="Lance benchmark overview across image generation, image editing, video generation, and video understanding" width="980">
+</div>
+
+## 🎨 演示
+
+### 文生视频
+
+<table align="center">
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-01.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-02.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-03.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-04.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-05.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-06.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-07.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/text-to-video/videos/text-to-video-demo-08.mp4"><img src="assets/text-to-video/previews/text-to-video-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+
+### 视频编辑
+
+<table align="center">
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-01.mp4"><img src="assets/video-editing/previews/video-editing-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-02.mp4"><img src="assets/video-editing/previews/video-editing-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-03.mp4"><img src="assets/video-editing/previews/video-editing-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-04.mp4"><img src="assets/video-editing/previews/video-editing-demo-04.gif" width="100%"></a></td>
+  </tr>
+  <tr>
+    <td><a href="assets/video-editing/videos/video-editing-demo-05.mp4"><img src="assets/video-editing/previews/video-editing-demo-05.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-06.mp4"><img src="assets/video-editing/previews/video-editing-demo-06.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-07.mp4"><img src="assets/video-editing/previews/video-editing-demo-07.gif" width="100%"></a></td>
+    <td><a href="assets/video-editing/videos/video-editing-demo-08.mp4"><img src="assets/video-editing/previews/video-editing-demo-08.gif" width="100%"></a></td>
+  </tr>
+</table>
+
+### 多轮一致性编辑
+
+<div align="center">
+  <a href="assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4">
+    <img src="assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif" width="100%">
+  </a>
+</div>
+
+### 智能视频生成
+
+<table align="center">
+  <tr>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-01.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-01.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-02.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-02.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-03.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-03.gif" width="100%"></a></td>
+    <td><a href="assets/intelligent-video/videos/intelligent-video-demo-04.mp4"><img src="assets/intelligent-video/previews/intelligent-video-demo-04.gif" width="100%"></a></td>
+  </tr>
+</table>
+
+### 视频理解
+
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-01.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> How many times did the person launch objects on the table? Options: (A) 3 (B) 2 (C) 4</p>
+        <p><strong>回答：</strong> (A) 3</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-02.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-02.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> The person makes sets of repeated actions. How many distinct repeated actions did the person do? Options: (A) 2 (B) 3 (C) 4</p>
+        <p><strong>回答：</strong> (A) 2</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-03.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-03.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> In which direction does the purple sphere move in the video? Options: (A) Down and to the right. (B) Up and to the left. (C) Up and to the right. (D) The object is stationary.</p>
+        <p><strong>回答：</strong> (A) Down and to the right.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-vqa-04.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-vqa-04.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> What is the unrealistic phenomenon displayed in the video? Options: (A) The man can manipulate time via phone. (B) Man grabs an object through a phone screen. (C) Chocolate transforms into different objects. (D) Visible means of propulsion enables flight.</p>
+        <p><strong>回答：</strong> (B) Man grabs an object through a phone screen.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-short-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-short-01.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> Offer a succinct account of the culinary process shown in this video.</p>
+        <p><strong>回答：</strong> Add tomato puree and mix it well with chicken pieces.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <a href="assets/video-understanding/videos/video-understanding-caption-long-01.mp4">
+          <img src="assets/video-understanding/previews/video-understanding-caption-long-01.gif" width="100%">
+        </a>
+        <p><strong>问题：</strong> Provide a detailed description of the given video, capturing its key moments.</p>
+        <p><strong>Response:</strong> In a sunlit meadow, a small tortoiseshell butterfly rests on a purple flower. A bee, with black and yellow stripes, lands on the same flower. The butterfly flaps its wings gently, while the bee busies itself, collecting nectar. The flower sways slightly in the breeze. The butterfly then takes off, and the bee follows, both heading to the next flower. The scene is a vivid display of insect interaction in a natural setting, with the colors of the butterfly and the bee contrasting against the green background of the meadow. The video captures this peaceful moment in a short 6-second duration.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+
+### 文生图
+
+<div align="center">
+  <img src="assets/text-to-image/text-to-image-overview.webp" alt="Lance text-to-image generation examples" width="100%">
+</div>
+
+### 图像编辑
+
+<div align="center">
+  <img src="assets/image-editing/image-editing-overview.webp" alt="Lance image editing examples" width="100%">
+</div>
+
+### 图像理解
+
+<div align="center">
+  <table align="center">
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-01.png" alt="Pie chart reasoning case" width="100%">
+        <p><strong>问题：</strong> Is the largest segment greater than sum of all the other segments?</p>
+        <p><strong>Response:</strong> Yes, the largest segment in the pie chart is the blue one, which is the most populous segment.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-02.png" alt="Survey chart about border security priorities" width="100%">
+        <p><strong>问题：</strong> What percentage of respondents want better border security?</p>
+        <p><strong>Response:</strong> 29%</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-03.png" alt="License plate recognition case" width="100%">
+        <p><strong>问题：</strong> What is the license plate number of the car?</p>
+        <p><strong>Response:</strong> The license plate number of the car is BX62 BFY.</p>
+      </td>
+    </tr>
+    <tr>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-04.png" alt="Market research chart reasoning case" width="100%">
+        <p><strong>问题：</strong> According to the data from the proprietary market research, how much amount was spent on the promotional meetings and events during 1998?</p>
+        <p><strong>Response:</strong> According to the data from the proprietary market research, the total amount spent on the promotional meetings and events during 1998 was approximately $1.3 billion.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-05.png" alt="Colosseum description case" width="100%">
+        <p><strong>问题：</strong> What is the appearance of the Colosseum in Rome, Italy?</p>
+        <p><strong>Response:</strong> The Colosseum in Rome, Italy is a magnificent ancient amphitheater made of stone and concrete. It is characterized by its large, arched openings and its semi-circular shape. The Colosseum has been partially restored, and its original color has faded, but the architectural details, like the columns and the semi-circular arches, are still visible. The lighting in the image highlights the grandeur and historical significance of the Colosseum, making it stand out against the backdrop of the evening sky.</p>
+      </td>
+      <td align="left" valign="top" width="33%">
+        <img src="assets/image-understanding/cases/image-understanding-case-06.png" alt="Total solar eclipse case" width="100%">
+        <p><strong>问题：</strong> How does a total solar eclipse look like from Earth?</p>
+        <p><strong>Response:</strong> A total solar eclipse looks like a dark circle with a bright, white outer edge. This is a result of the Earth's rotation and the Earth's shadow, which partially obscures the sun's light. The bright white edge of the eclipse is caused by the sun's high energy and its interaction with the Earth's atmosphere, while the dark part of the eclipse is due to the Earth's shadow and the surrounding air currents. The solar eclipse's shape, with its bright white edge and dark center, is similar to the shape of a full moon or a dark disk. It is a natural phenomenon that occurs in the atmosphere of the Earth and is an important part of the solar system.</p>
+      </td>
+    </tr>
+  </table>
+</div>
+
+## 🚀 安装
+
+### 推荐环境
+
+- **软件环境：** Python 3.10+，CUDA 12.4+（必需）
+- **硬件环境：** 推理至少需要一张显存不低于 40GB 的 GPU
+
+### 安装步骤
+```bash
+bash ./setup_env.sh
+```
+
+### 下载模型权重
+
+请从 [Hugging Face 上的 Lance-3B](https://huggingface.co/bytedance-research/Lance) 下载所需的全部模型权重，并放置到 `downloads/` 目录下。
+
+## 📚 使用方法
+
+### 推理
+
+Lance 为生成、编辑和理解任务提供了统一的命令行入口：
+
+```bash
+bash inference_lance.sh
+```
+
+- 运行前，请先在 `inference_lance.sh` 顶部配置推理参数。
+- **支持任务：** `t2i`、`t2v`、`image_edit`、`video_edit`、`x2t_image` 和 `x2t_video`。你也可以在 `inference_lance.py` 中修改 `TASK_DEFAULT_CONFIGS`，自定义每个任务默认使用的数据样例。
+- **注意：** 对于所有任务，建议在编写输入 prompt 时参考提供示例中的 `prompt` 格式，这通常有助于获得更好的生成效果。
+  
+#### 可用任务
+
+| 任务名 | 说明 | 示例 JSON |
+|------------------------|--------------------------------------------------|----------------------------------------------|
+| `t2v` | 文生视频 | `config/examples/t2v_example.json` |
+| `t2i` | 文生图 | `config/examples/t2i_example.json` |
+| `image_edit` | 图像编辑 | `config/examples/image_edit_example.json` |
+| `video_edit` | 视频编辑 | `config/examples/video_edit_example.json` |
+| `x2t_image` | 图像理解 | `config/examples/x2t_image_example.json` |
+| `x2t_video` | 视频理解 | `config/examples/x2t_video_example.json` |
+
+关于理解任务的示例文件：
+
+- `config/examples/x2t_image_example.json`：用于图像理解示例，包括视觉问答和基于图像的推理。
+- `config/examples/x2t_video_example.json`：用于视频理解示例，包括视频问答和视频描述。
+
+#### 参数说明
+
+你可以在 `inference_lance.sh` 顶部配置以下超参数：
+
+| 参数 | 默认值 | 说明 |
+| --- | --- | --- |
+| `MODEL_PATH` | `"downloads/Lance_3B"` | 下载后的 Lance 模型权重路径（如 `Lance_3B` 或 `Lance_3B_Video`）。 |
+| `NUM_GPUS` | `1` | 用于推理的 GPU 数量。 |
+| `VALIDATION_NUM_TIMESTEPS` | `30` | 去噪步数（例如 30 或 50）。 |
+| `VALIDATION_TIMESTEP_SHIFT` | `3.5` | Flow matching 调度中的 timestep shift 参数。 |
+| `CFG_TEXT_SCALE` | `4.0` | 文本条件的 CFG（Classifier-Free Guidance）系数。 |
+| `VALIDATION_DATA_SEED` | `42` | 用于复现实验的随机种子。 |
+| `NUM_FRAMES` | `50` | 视频生成帧数（最大 121）。*图像任务不使用该参数。* |
+| `VIDEO_HEIGHT` / `VIDEO_WIDTH` | `768` | 空间分辨率。*编辑任务不使用该参数（由输入图像/视频决定）。* |
+| `RESOLUTION` | `"video_480p"` | 基础分辨率预设（如 `image_768res` 或 `video_480p`）。 |
+
+### Gradio
+```bash
+python lance_gradio_t2v_v2t.py --gpus 0 --server-port 7860
+```
+
+### 基准评测
+
+#### DPG-Bench 评测
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Global</th>
+      <th align="center">Entity</th>
+      <th align="center">Attribute</th>
+      <th align="center">Relation</th>
+      <th align="center">Other</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="8"><i>仅生成模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">83.27</td><td align="center">82.43</td><td align="center">80.91</td><td align="center">86.76</td><td align="center">80.41</td><td align="center">74.65</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">90.97</td><td align="center">89.61</td><td align="center">88.39</td><td align="center">90.58</td><td align="center">89.83</td><td align="center">83.50</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">87.90</td><td align="center">91.01</td><td align="center">88.83</td><td align="center">80.70</td><td align="center">88.68</td><td align="center">84.08</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">74.35</td><td align="center">90.00</td><td align="center">88.96</td><td align="center">90.87</td><td align="center">88.33</td><td align="center">83.84</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">91.32</td><td align="center">91.56</td><td align="center">92.02</td><td align="center">94.31</td><td align="center">92.73</td><td align="center">88.32</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="8"><i>统一模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center">86.90</td><td align="center">88.90</td><td align="center">89.40</td><td align="center">89.32</td><td align="center">89.48</td><td align="center">84.19</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center">88.81</td><td align="center">88.83</td><td align="center">90.18</td><td align="center">89.37</td><td align="center">90.27</td><td align="center">83.57</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center">89.00</td><td align="center"><b>91.78</b></td><td align="center">89.96</td><td align="center">91.81</td><td align="center"><b>91.64</b></td><td align="center">86.14</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">88.94</td><td align="center">90.37</td><td align="center"><u>91.29</u></td><td align="center">90.82</td><td align="center">88.67</td><td align="center">85.07</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>90.39</u></td><td align="center">90.78</td><td align="center">90.68</td><td align="center">90.29</td><td align="center">88.77</td><td align="center">85.18</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>90.42</b></td><td align="center"><u>91.68</u></td><td align="center">90.94</td><td align="center">91.87</td><td align="center"><u>90.73</u></td><td align="center"><b>86.76</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center">89.50</td><td align="center">91.40</td><td align="center"><b>92.07</b></td><td align="center"><u>91.91</u></td><td align="center">88.81</td><td align="center"><u>86.54</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>83.89</b></td><td align="center"><b>91.07</b></td><td align="center"><b>89.36</b></td><td align="center"><b>93.38</b></td><td align="center"><b>80.80</b></td><td align="center"><b>84.67</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+<p align="center"><em><sup>†</sup> 表示该方法在生成前使用 LLM rewriter 进行提示词改写。</em></p>
+
+#### GenEval 评测
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">1-Obj.</th>
+      <th align="center">2-Obj.</th>
+      <th align="center">Count</th>
+      <th align="center">Colors</th>
+      <th align="center">Position</th>
+      <th align="center">Attr.</th>
+      <th align="center">Overall</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="9"><i>仅生成模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">SDXL</td><td align="center">3.5B</td><td align="center">0.98</td><td align="center">0.74</td><td align="center">0.39</td><td align="center">0.85</td><td align="center">0.15</td><td align="center">0.23</td><td align="center">0.55</td>
+    </tr>
+    <tr>
+      <td align="left">DALL-E 3</td><td align="center">-</td><td align="center">0.96</td><td align="center">0.87</td><td align="center">0.47</td><td align="center">0.83</td><td align="center">0.43</td><td align="center">0.45</td><td align="center">0.67</td>
+    </tr>
+    <tr>
+      <td align="left">SD3-Medium</td><td align="center">2B</td><td align="center">0.99</td><td align="center">0.94</td><td align="center">0.72</td><td align="center">0.89</td><td align="center">0.33</td><td align="center">0.60</td><td align="center">0.74</td>
+    </tr>
+    <tr>
+      <td align="left">FLUX.1-dev</td><td align="center">12B</td><td align="center">0.98</td><td align="center">0.93</td><td align="center">0.75</td><td align="center">0.93</td><td align="center">0.68</td><td align="center">0.65</td><td align="center">0.82</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image</td><td align="center">20B</td><td align="center">0.99</td><td align="center">0.92</td><td align="center">0.89</td><td align="center">0.88</td><td align="center">0.76</td><td align="center">0.77</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="9"><i>统一模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Janus-Pro-7B</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center">0.89</td><td align="center">0.59</td><td align="center">0.90</td><td align="center">0.79</td><td align="center">0.66</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">OmniGen2</td><td align="center">4B</td><td align="center"><b>1.00</b></td><td align="center">0.95</td><td align="center">0.64</td><td align="center">0.88</td><td align="center">0.55</td><td align="center">0.76</td><td align="center">0.80</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center">0.87</td><td align="center">0.58</td><td align="center">0.92</td><td align="center">0.52</td><td align="center">0.62</td><td align="center">0.76</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL<sup>†</sup></td><td align="center">7B</td><td align="center">0.98</td><td align="center">0.95</td><td align="center"><b>0.84</b></td><td align="center"><u>0.95</u></td><td align="center">0.78</td><td align="center">0.77</td><td align="center">0.88</td>
+    </tr>
+    <tr>
+      <td align="left">Mogao</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center"><u>0.83</u></td><td align="center">0.93</td><td align="center">0.84</td><td align="center">0.80</td><td align="center"><u>0.89</u></td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center"><u>0.99</u></td><td align="center">0.94</td><td align="center">0.74</td><td align="center">0.91</td><td align="center">0.77</td><td align="center">0.74</td><td align="center">0.85</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">7B</td><td align="center"><b>1.00</b></td><td align="center"><b>0.97</b></td><td align="center">0.81</td><td align="center">0.91</td><td align="center"><b>0.88</b></td><td align="center"><b>0.83</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+    <tr>
+      <td align="left">TUNA-2</td><td align="center">7B</td><td align="center"><u>0.99</u></td><td align="center"><u>0.96</u></td><td align="center">0.80</td><td align="center">0.91</td><td align="center">0.84</td><td align="center">0.76</td><td align="center">0.87</td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>1.00</b></td><td align="center"><b>0.94</b></td><td align="center"><b>0.84</b></td><td align="center"><b>0.97</b></td><td align="center"><b>0.87</b></td><td align="center"><b>0.81</b></td><td align="center"><b>0.90</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+<p align="center"><em><sup>†</sup> 表示该方法在生成前使用 LLM rewriter 进行提示词改写。</em></p>
+
+#### GEdit-Bench 评测
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">BC</th>
+      <th align="center">CA</th>
+      <th align="center">MM</th>
+      <th align="center">MC</th>
+      <th align="center">PB</th>
+      <th align="center">ST</th>
+      <th align="center">SA</th>
+      <th align="center">SR</th>
+      <th align="center">SRp</th>
+      <th align="center">TM</th>
+      <th align="center">TT</th>
+      <th align="center">Avg/G_O</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" colspan="14"><i>仅生成模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Gemini 2.0</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">-</td><td align="center">6.32</td>
+    </tr>
+    <tr>
+      <td align="left">GPT Image 1</td><td align="center">-</td><td align="center">6.96</td><td align="center">6.85</td><td align="center">7.10</td><td align="center">5.41</td><td align="center">6.74</td><td align="center">7.44</td><td align="center">7.51</td><td align="center">8.73</td><td align="center">8.55</td><td align="center">8.45</td><td align="center">8.69</td><td align="center">7.49</td>
+    </tr>
+    <tr>
+      <td align="left">Qwen-Image-Edit</td><td align="center">20B</td><td align="center">8.23</td><td align="center">8.30</td><td align="center">7.33</td><td align="center">8.05</td><td align="center">7.49</td><td align="center">6.74</td><td align="center">8.57</td><td align="center">8.09</td><td align="center">8.29</td><td align="center">8.48</td><td align="center">8.50</td><td align="center">8.01</td>
+    </tr>
+    <tr>
+      <td align="center" colspan="14"><i>统一模型</i></td>
+    </tr>
+    <tr>
+      <td align="left">Lumina-DiMOO</td><td align="center">8B</td><td align="center">3.43</td><td align="center">4.27</td><td align="center">3.08</td><td align="center">2.77</td><td align="center">4.74</td><td align="center">5.19</td><td align="center">4.44</td><td align="center">3.80</td><td align="center">4.38</td><td align="center">2.68</td><td align="center">4.20</td><td align="center">3.91</td>
+    </tr>
+    <tr>
+      <td align="left">Ovis-U1</td><td align="center">1.2B</td><td align="center"><u>7.49</u></td><td align="center">6.88</td><td align="center">6.21</td><td align="center">4.79</td><td align="center">5.98</td><td align="center"><u>6.46</u></td><td align="center">7.49</td><td align="center"><u>7.25</u></td><td align="center"><u>7.27</u></td><td align="center">4.48</td><td align="center">6.31</td><td align="center">6.42</td>
+    </tr>
+    <tr>
+      <td align="left">BAGEL</td><td align="center">7B</td><td align="center">7.32</td><td align="center">6.91</td><td align="center">6.38</td><td align="center">4.75</td><td align="center">4.57</td><td align="center">6.15</td><td align="center"><b>7.90</b></td><td align="center">7.16</td><td align="center">7.02</td><td align="center"><u>7.32</u></td><td align="center">6.22</td><td align="center">6.52</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U</td><td align="center">1.7B</td><td align="center">7.08</td><td align="center">7.05</td><td align="center">6.38</td><td align="center"><u>7.02</u></td><td align="center"><u>6.03</u></td><td align="center">6.27</td><td align="center">7.13</td><td align="center">6.55</td><td align="center">6.33</td><td align="center">6.59</td><td align="center"><u>6.85</u></td><td align="center">6.66</td>
+    </tr>
+    <tr>
+      <td align="left">InternVL-U (w/ CoT)</td><td align="center">1.7B</td><td align="center">7.05</td><td align="center"><b>7.87</b></td><td align="center"><u>6.50</u></td><td align="center">6.99</td><td align="center">5.77</td><td align="center">6.10</td><td align="center">7.33</td><td align="center">7.16</td><td align="center">7.12</td><td align="center"><b>7.36</b></td><td align="center">6.46</td><td align="center"><u>6.88</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>7.73</b></td><td align="center"><u>7.74</u></td><td align="center"><b>7.28</b></td><td align="center"><b>7.83</b></td><td align="center"><b>7.50</b></td><td align="center"><b>7.03</b></td><td align="center"><u>7.64</u></td><td align="center"><b>7.85</b></td><td align="center"><b>7.71</b></td><td align="center">4.46</td><td align="center"><b>7.57</b></td><td align="center"><b>7.30</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+#### VBench 评测（视频生成）
+
+<div align="center">
+<table align="center">
+  <thead>
+    <tr>
+      <th align="left">类型</th>
+      <th align="left">模型</th>
+      <th align="center">#&nbsp;Params.</th>
+      <th align="center">Total Score ↑</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td align="center" rowspan="12"><i>仅生成模型</i></td>
+      <td align="left">ModelScope</td><td align="center">1.7B</td><td align="center">75.75</td>
+    </tr>
+    <tr>
+      <td align="left">LaVie</td><td align="center">3B</td><td align="center">77.08</td>
+    </tr>
+    <tr>
+      <td align="left">Show-1</td><td align="center">6B</td><td align="center">78.93</td>
+    </tr>
+    <tr>
+      <td align="left">AnimateDiff-V2</td><td align="center">-</td><td align="center">80.27</td>
+    </tr>
+    <tr>
+      <td align="left">VideoCrafter-2.0</td><td align="center">-</td><td align="center">80.44</td>
+    </tr>
+    <tr>
+      <td align="left">CogVideoX</td><td align="center">5B</td><td align="center">81.61</td>
+    </tr>
+    <tr>
+      <td align="left">Kling</td><td align="center">-</td><td align="center">81.85</td>
+    </tr>
+    <tr>
+      <td align="left">Open-Sora-2.0</td><td align="center">-</td><td align="center">81.71</td>
+    </tr>
+    <tr>
+      <td align="left">Gen-3</td><td align="center">-</td><td align="center">82.32</td>
+    </tr>
+    <tr>
+      <td align="left">Step-Video-T2V</td><td align="center">30B</td><td align="center">81.83</td>
+    </tr>
+    <tr>
+      <td align="left">Hunyuan Video</td><td align="center">-</td><td align="center">83.43</td>
+    </tr>
+    <tr>
+      <td align="left">Wan2.1-T2V</td><td align="center">14B</td><td align="center">83.69</td>
+    </tr>
+    <tr>
+      <td align="center" rowspan="6"><i>统一模型</i></td>
+      <td align="left">HaploOmni</td><td align="center">7B</td><td align="center">78.10</td>
+    </tr>
+    <tr>
+      <td align="left">Emu3</td><td align="center">8B</td><td align="center">80.96</td>
+    </tr>
+    <tr>
+      <td align="left">VILA-U</td><td align="center">7B</td><td align="center">74.01</td>
+    </tr>
+    <tr>
+      <td align="left">Show-o2</td><td align="center">2B</td><td align="center">81.34</td>
+    </tr>
+    <tr>
+      <td align="left">TUNA</td><td align="center">1.5B</td><td align="center"><u>84.06</u></td>
+    </tr>
+    <tr>
+      <td align="left">🌟 <b>Lance (Ours)</b></td><td align="center"><b>3B</b></td><td align="center"><b>85.11</b></td>
+    </tr>
+  </tbody>
+</table>
+</div>
+
+#### 运行基准评测
+
+`benchmarks/` 目录下提供了可直接运行的基准评测脚本：
+
+| 基准 | 模态 | 脚本 |
+|------------------------|----------|---------------------------------------------------------------|
+| GenEval（图像生成） | 图像 | `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` |
+| DPG-Bench（图像生成） | 图像 | `benchmarks/image_gen/DPG/sample_DPG.sh` |
+| GEdit-Bench（图像编辑） | 图像 | `benchmarks/image_gen/GEdit/sample_GEdit.sh` |
+| VBench（视频生成） | 视频 | `benchmarks/video_gen/Vbench/sample_vbench.sh` |
+
+
+## 📄 许可证
+
+Copyright 2025 Bytedance Ltd. and/or its affiliates.
+
+## 🙏 致谢
+
+我们感谢 [BAGEL](https://github.com/ByteDance-Seed/bagel)、[Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) 和 [Wan2.2](https://github.com/Wan-Video/Wan2.2) 的贡献者，感谢他们开放的研究与社区贡献。
+
+## 💖 引用
+
+如果 **Lance** 对您的项目或研究有帮助，欢迎为本仓库点亮 🌟，并使用以下 BibTeX 引用我们的工作：
+
+```bibtex
+@misc{fu2026lanceunifiedmultimodalmodeling,
+      title         = {Lance: Unified Multimodal Modeling by Multi-Task Synergy},
+      author        = {Fengyi Fu and Mengqi Huang and Shaojin Wu and Yunsheng Jiang and Yufei Huo and Hao Li and Yinghang Song and Fei Ding and Jianzhu Guo and Qian He and Zheren Fu and Zhendong Mao and Yongdong Zhang},
+      year          = {2026},
+      eprint        = {2605.18678},
+      archivePrefix = {arXiv},
+      primaryClass  = {cs.CV},
+      url           = {https://arxiv.org/abs/2605.18678},
+}
+```
+
+## 📞 联系方式
+
+如有问题、反馈或合作需求，请联系 [Mengqi Huang](https://corleone-huang.github.io/) 和 [Jianzhu Guo](https://guojianzhu.com/)。
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..b9c4e583bde0cdf5385d4993a344f9c1696be630
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,11 @@
+# Security and privacy
+If you discover a potential security issue in this project, or believe you may have found one, please notify the ByteDance security team through our [security center](https://security.bytedance.com/src) or [vulnerability reporting email](mailto:src@bytedance.com). Please **do not** create public GitHub Issues.
+ 
+We will assess the vulnerability based on the Common Vulnerability Scoring System (CVSS 3.1). The security team will keep you updated on key progress and may request further information or guidance from you. You are welcome to contact us via the email or website mentioned above to ask questions or discuss disclosure matters.
+ 
+To protect the security of our customers, ByteDance requests that you do not publish or share information regarding the vulnerability in any public forum, nor publish or share data involving users, until the vulnerability has been remediated and our users have been notified. Please understand that the time required for remediation depends on the severity of the vulnerability and the scope of the impact.
+
+Individuals, companies, and security teams may wish to publish security advisories on their own websites or other forums. Please contact us via the email or website mentioned above prior to publication to discuss the information that can be disclosed and to coordinate the disclosure timeline.
+
+# Bug Bounty Reward
+For details, see the [bug bounty reward policy](https://bytedance.larkoffice.com/docx/ZstQd7bbooDctqxBCAmcFasOngd). If you have any questions about the rules, please contact us via [https://src.bytedance.com/home](https://src.bytedance.com/home).
\ No newline at end of file
diff --git a/assets/benchmarks/benchmark-overview.png b/assets/benchmarks/benchmark-overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..41e22eed8564007ed62087b08378579789c75714
--- /dev/null
+++ b/assets/benchmarks/benchmark-overview.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:317bd35ef72d37ac743eac2dcdcd66e9b83d7f7181b6578702cf54ec10ea61d5
+size 1342098
diff --git a/assets/image-editing/image-editing-overview.webp b/assets/image-editing/image-editing-overview.webp
new file mode 100644
index 0000000000000000000000000000000000000000..0a405720c1e6a4f6a9da129e72600b973a3ac521
--- /dev/null
+++ b/assets/image-editing/image-editing-overview.webp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ea51f3d7b64859b5c60e8da9863e4674fea60acaf1e4c29995837137e88360f
+size 459994
diff --git a/assets/image-understanding/cases/image-understanding-case-01.png b/assets/image-understanding/cases/image-understanding-case-01.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f63b15b227901626c908f3a02448ad059c56226
--- /dev/null
+++ b/assets/image-understanding/cases/image-understanding-case-01.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dca7c23bbdc486815df8cdfe681effbf323f8f324a1180b1626347ee60c56e04
+size 126264
diff --git a/assets/image-understanding/cases/image-understanding-case-02.png b/assets/image-understanding/cases/image-understanding-case-02.png
new file mode 100644
index 0000000000000000000000000000000000000000..30549e9bd9c77aee9952ce172f4706107c912eed
Binary files /dev/null and b/assets/image-understanding/cases/image-understanding-case-02.png differ
diff --git a/assets/image-understanding/cases/image-understanding-case-03.png b/assets/image-understanding/cases/image-understanding-case-03.png
new file mode 100644
index 0000000000000000000000000000000000000000..96f3deea3b984e09d4e9a26e9102779dcaaa85a0
--- /dev/null
+++ b/assets/image-understanding/cases/image-understanding-case-03.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07ec27822042a52ab89c299846863849e9c4bb974a338a59eb068cbcdca18d67
+size 1073728
diff --git a/assets/image-understanding/cases/image-understanding-case-04.png b/assets/image-understanding/cases/image-understanding-case-04.png
new file mode 100644
index 0000000000000000000000000000000000000000..77d4406803e3b5f842a4941c130eef35d34f4e11
--- /dev/null
+++ b/assets/image-understanding/cases/image-understanding-case-04.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b47c54a47e552b00db42b8867dbb4f46844aa4b95c5caba605691c787ad27e40
+size 112103
diff --git a/assets/image-understanding/cases/image-understanding-case-05.png b/assets/image-understanding/cases/image-understanding-case-05.png
new file mode 100644
index 0000000000000000000000000000000000000000..b38bc7829601f45890391568ce3e9e0cd6ba51aa
--- /dev/null
+++ b/assets/image-understanding/cases/image-understanding-case-05.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3853e3af85824429178a14a63255eba6f0e2e44dd47f7cc64c27fe3eefe765cb
+size 948665
diff --git a/assets/image-understanding/cases/image-understanding-case-06.png b/assets/image-understanding/cases/image-understanding-case-06.png
new file mode 100644
index 0000000000000000000000000000000000000000..fbbb939939425ffeae8cad02c99ae3dc5725b337
--- /dev/null
+++ b/assets/image-understanding/cases/image-understanding-case-06.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d16216657d25789af4e6f7ef68adfea7f161bf944b65f9a6657c9b019bea34b3
+size 223310
diff --git a/assets/intelligent-video/previews/intelligent-video-demo-01.gif b/assets/intelligent-video/previews/intelligent-video-demo-01.gif
new file mode 100644
index 0000000000000000000000000000000000000000..411118bcb3dbd96f835083677a86fb932d9fb753
Binary files /dev/null and b/assets/intelligent-video/previews/intelligent-video-demo-01.gif differ
diff --git a/assets/intelligent-video/previews/intelligent-video-demo-02.gif b/assets/intelligent-video/previews/intelligent-video-demo-02.gif
new file mode 100644
index 0000000000000000000000000000000000000000..18b61d07b3dfb5ce3a004919ab86244bc896fa07
Binary files /dev/null and b/assets/intelligent-video/previews/intelligent-video-demo-02.gif differ
diff --git a/assets/intelligent-video/previews/intelligent-video-demo-03.gif b/assets/intelligent-video/previews/intelligent-video-demo-03.gif
new file mode 100644
index 0000000000000000000000000000000000000000..9ae4ab63b1c58a4b085e66b59e84813b2de5d1f3
Binary files /dev/null and b/assets/intelligent-video/previews/intelligent-video-demo-03.gif differ
diff --git a/assets/intelligent-video/previews/intelligent-video-demo-04.gif b/assets/intelligent-video/previews/intelligent-video-demo-04.gif
new file mode 100644
index 0000000000000000000000000000000000000000..ce1481ff3593b2eefbb4c8638e7f5282361b54c1
Binary files /dev/null and b/assets/intelligent-video/previews/intelligent-video-demo-04.gif differ
diff --git a/assets/intelligent-video/videos/intelligent-video-demo-01.mp4 b/assets/intelligent-video/videos/intelligent-video-demo-01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..c4b643c6fcc06fc1e18c2ffb28850247b4219261
Binary files /dev/null and b/assets/intelligent-video/videos/intelligent-video-demo-01.mp4 differ
diff --git a/assets/intelligent-video/videos/intelligent-video-demo-02.mp4 b/assets/intelligent-video/videos/intelligent-video-demo-02.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..e1479faf215ae22401de2cac990c39b49965793b
Binary files /dev/null and b/assets/intelligent-video/videos/intelligent-video-demo-02.mp4 differ
diff --git a/assets/intelligent-video/videos/intelligent-video-demo-03.mp4 b/assets/intelligent-video/videos/intelligent-video-demo-03.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..06d11bb6aef28e28680e54b9751a1617701fb71a
Binary files /dev/null and b/assets/intelligent-video/videos/intelligent-video-demo-03.mp4 differ
diff --git a/assets/intelligent-video/videos/intelligent-video-demo-04.mp4 b/assets/intelligent-video/videos/intelligent-video-demo-04.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..2fdc429de8187425fba9e1cbadf7f41b05f89477
Binary files /dev/null and b/assets/intelligent-video/videos/intelligent-video-demo-04.mp4 differ
diff --git a/assets/logo/lance-logo.webp b/assets/logo/lance-logo.webp
new file mode 100644
index 0000000000000000000000000000000000000000..a3d89c457858783e3af467dca94fd2536654c080
--- /dev/null
+++ b/assets/logo/lance-logo.webp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5359f986a6a29e25b4eb92fd470e74c83d92581dc6fb22d2c4ac789637842934
+size 460894
diff --git a/assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif b/assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif
new file mode 100644
index 0000000000000000000000000000000000000000..d33bd916fef490b7973e0947bd70b62d22296b35
--- /dev/null
+++ b/assets/multi-turn-editing/previews/multi-turn-editing-demo-01.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fe1c324cf4244d5f22f4efc009664a45cc71beb1830a076eee569f8690bfc40
+size 1687897
diff --git a/assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4 b/assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..08c6cbe77b57b0150bd3168687133d9b079383e6
--- /dev/null
+++ b/assets/multi-turn-editing/videos/multi-turn-editing-demo-01.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf8a36b7a94b0d4769d6ce4c5ccad25e3cf5bb22a000f57e4e5cbb05b7854fb4
+size 1915287
diff --git a/assets/text-to-image/text-to-image-overview.webp b/assets/text-to-image/text-to-image-overview.webp
new file mode 100644
index 0000000000000000000000000000000000000000..89670cdee635128a37263c9f549cd36be7c26294
--- /dev/null
+++ b/assets/text-to-image/text-to-image-overview.webp
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:643ab6df451db427702cfda3319d471f37c9a7f8bb0646b966961495590a665e
+size 1145204
diff --git a/assets/text-to-video/previews/text-to-video-demo-01.gif b/assets/text-to-video/previews/text-to-video-demo-01.gif
new file mode 100644
index 0000000000000000000000000000000000000000..d00ccb5e5dc0461c3e9a844acf566fab10400248
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-01.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0a9258e7f77cfd96aa759e2cdad36faedb646067e43ad53160dc3484120d1ce
+size 1433406
diff --git a/assets/text-to-video/previews/text-to-video-demo-02.gif b/assets/text-to-video/previews/text-to-video-demo-02.gif
new file mode 100644
index 0000000000000000000000000000000000000000..7828039aa87bc652a1d1b8ed2d2df1f492775f84
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-02.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f3583aa5f687d288e94807b24256f7b248cc9fb990978f77ef54a925ee1ff3a
+size 1093312
diff --git a/assets/text-to-video/previews/text-to-video-demo-03.gif b/assets/text-to-video/previews/text-to-video-demo-03.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4bca3a6807d81381c90a48a5ff9e7a43fbc510cc
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-03.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e68d947fad2e9190a668ca5dcf55445a06ca422563763a1f1557f458aa1c49f
+size 1066542
diff --git a/assets/text-to-video/previews/text-to-video-demo-04.gif b/assets/text-to-video/previews/text-to-video-demo-04.gif
new file mode 100644
index 0000000000000000000000000000000000000000..415bc1e7f6a8c7a6b29c1b247992e09b5639ae59
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-04.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d6ae9f7a192f823071f6d72bd58f376e9911c07c8213a755dfee94efae16f3e
+size 1294291
diff --git a/assets/text-to-video/previews/text-to-video-demo-05.gif b/assets/text-to-video/previews/text-to-video-demo-05.gif
new file mode 100644
index 0000000000000000000000000000000000000000..1476deb719d28accb9d6da72913a4cfd7e6a0b54
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-05.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:389975bd2d5b5ce5669d46382543e1a059a5753215146a80d27ccdc7c13d5399
+size 1834178
diff --git a/assets/text-to-video/previews/text-to-video-demo-06.gif b/assets/text-to-video/previews/text-to-video-demo-06.gif
new file mode 100644
index 0000000000000000000000000000000000000000..5e66854a92084fda34eaa481fcec0778a0cf6953
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-06.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1645a20ebf29b955f33a759d475e35f119223e67591e0d4e347bb56a68df3bc
+size 1436696
diff --git a/assets/text-to-video/previews/text-to-video-demo-07.gif b/assets/text-to-video/previews/text-to-video-demo-07.gif
new file mode 100644
index 0000000000000000000000000000000000000000..a8030274b04d67716f9dad479df87292eaf4e3a9
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-07.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c59dfd54006305ac4f502df50dc1158ed2f25382aff38e7f021f72980c12ba62
+size 1259506
diff --git a/assets/text-to-video/previews/text-to-video-demo-08.gif b/assets/text-to-video/previews/text-to-video-demo-08.gif
new file mode 100644
index 0000000000000000000000000000000000000000..b0695b074382a78c82c9a692539b09438fa403e1
--- /dev/null
+++ b/assets/text-to-video/previews/text-to-video-demo-08.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7047d46e0977970ffd23c334cc4aefe1d356e71592a7ba8311be5bc53def7c33
+size 1806277
diff --git a/assets/text-to-video/videos/text-to-video-demo-01.mp4 b/assets/text-to-video/videos/text-to-video-demo-01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..af4407bea18870032a47ec252b8f4fd9b1595f57
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-01.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b76e0bffa453aa61da2be454dab728da967badcd2c80d5bb3b330e1810a65cc
+size 7186230
diff --git a/assets/text-to-video/videos/text-to-video-demo-02.mp4 b/assets/text-to-video/videos/text-to-video-demo-02.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..77a444c78c117ff44042c5a63ba21d84888bb049
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-02.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0b04500bd433bc5705e080001a02d7647252dff4e7cdb48e42efba5a8c721a1
+size 2826861
diff --git a/assets/text-to-video/videos/text-to-video-demo-03.mp4 b/assets/text-to-video/videos/text-to-video-demo-03.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..3698e769244906975b4eb43fd501f013fc607233
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-03.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6d1254b88c0ee823083a0c256c25adbe2b20ad0eb1f27a31e6dc62e35f4ecb5
+size 2854502
diff --git a/assets/text-to-video/videos/text-to-video-demo-04.mp4 b/assets/text-to-video/videos/text-to-video-demo-04.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..00e68e6380200c1fde252acb86b200f3f10c1a76
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-04.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05f42163771e05b9104b27e2133793dd94697a1ecba2bd1bf2ce103de8d3ddfb
+size 4744471
diff --git a/assets/text-to-video/videos/text-to-video-demo-05.mp4 b/assets/text-to-video/videos/text-to-video-demo-05.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..0944bc8650c52e239790bcdfd24faa77f6c2f8ce
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-05.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2fc022110d19f12f5f3222990d69812dbbf3bd6ae7beb953d5d1d4a12858674f
+size 2854867
diff --git a/assets/text-to-video/videos/text-to-video-demo-06.mp4 b/assets/text-to-video/videos/text-to-video-demo-06.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..5d282cc517183e8d00f520583535af39ace2529c
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-06.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02d0fc0fe120589e94b7739caee15ff2ddef0e461a7d124b8eaa4724853d995f
+size 3397515
diff --git a/assets/text-to-video/videos/text-to-video-demo-07.mp4 b/assets/text-to-video/videos/text-to-video-demo-07.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a487aafe563a7e5d0b553ecd8474a3d7201abfaf
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-07.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17466d43ce3439a6d58148d0634f2d365ae20debf12bfa87d3d39e29708a91ff
+size 5506590
diff --git a/assets/text-to-video/videos/text-to-video-demo-08.mp4 b/assets/text-to-video/videos/text-to-video-demo-08.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..4546475016575a134543086dd1fa2d5ecb7a7edb
--- /dev/null
+++ b/assets/text-to-video/videos/text-to-video-demo-08.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22215f0fdfe4a6ddc21977fecf972c98a3c9be739fb3b96e2523706a5da90362
+size 3313897
diff --git a/assets/video-editing/previews/video-editing-demo-01.gif b/assets/video-editing/previews/video-editing-demo-01.gif
new file mode 100644
index 0000000000000000000000000000000000000000..2f12e274c9df9fb742aea9b32a7d742f531f09c4
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-01.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dff9e03f3729a1fd8bc75d7077be61201554a45b9c1a222131b98bc7cc4993f6
+size 1099065
diff --git a/assets/video-editing/previews/video-editing-demo-02.gif b/assets/video-editing/previews/video-editing-demo-02.gif
new file mode 100644
index 0000000000000000000000000000000000000000..6f616571b143a3bbb3683a514e653f638c5d3b2e
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-02.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e42f280ad1d6a378a634b8936c34b23a360ef94995b26bf18e9e5cf0d01fb1f7
+size 1075431
diff --git a/assets/video-editing/previews/video-editing-demo-03.gif b/assets/video-editing/previews/video-editing-demo-03.gif
new file mode 100644
index 0000000000000000000000000000000000000000..19f804e8a7ad21c77c9bafb11e1922c74b0cff11
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-03.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09800ebe2b5d58d83052b614176e9c25b346f9bc548f134ce051476a0e76cd8b
+size 1463279
diff --git a/assets/video-editing/previews/video-editing-demo-04.gif b/assets/video-editing/previews/video-editing-demo-04.gif
new file mode 100644
index 0000000000000000000000000000000000000000..0bd96e241d0c6579be393a35270f2067f2fe7fdc
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-04.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e39f3ba103d157d258832b93aada67fc1090be882809db0db2bc86335433a011
+size 817773
diff --git a/assets/video-editing/previews/video-editing-demo-05.gif b/assets/video-editing/previews/video-editing-demo-05.gif
new file mode 100644
index 0000000000000000000000000000000000000000..988ee3eaec613ea3caf901c2a44c390474b99c7b
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-05.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5104ded46ecb6acd40cea01280bf7b8760bd8fb6c4a6f0af2d3dc782f0d8ee72
+size 1595814
diff --git a/assets/video-editing/previews/video-editing-demo-06.gif b/assets/video-editing/previews/video-editing-demo-06.gif
new file mode 100644
index 0000000000000000000000000000000000000000..a0d48537f844c71f6296d39d5524f2a303cbac21
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-06.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6e0863f9269901ad560407476a06c38a9947f46b5d0e40af378a989e8741427a
+size 1372032
diff --git a/assets/video-editing/previews/video-editing-demo-07.gif b/assets/video-editing/previews/video-editing-demo-07.gif
new file mode 100644
index 0000000000000000000000000000000000000000..f400f4814bde50d296a3c76bc5c92ced1848461b
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-07.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:636a6653462d38116a2431f727750afbb25faf5d0a1ec120430c46dffe0d0acd
+size 783546
diff --git a/assets/video-editing/previews/video-editing-demo-08.gif b/assets/video-editing/previews/video-editing-demo-08.gif
new file mode 100644
index 0000000000000000000000000000000000000000..2cb3cae388a5021d4cf7389427faadb02beea81d
--- /dev/null
+++ b/assets/video-editing/previews/video-editing-demo-08.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:773958d112d939f83df708c9cc206707b8cc1eb1a1bf8f0bed826c42d5cb3283
+size 1045809
diff --git a/assets/video-editing/videos/video-editing-demo-01.mp4 b/assets/video-editing/videos/video-editing-demo-01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..5f041046b9df68de6612243bd8552bfec7bbe031
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-01.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d64ade5b3c4c1cb658211e95ab0cc3a5b0b05c5ec3debcad56117ed968cb2d5a
+size 1495837
diff --git a/assets/video-editing/videos/video-editing-demo-02.mp4 b/assets/video-editing/videos/video-editing-demo-02.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..923c7806ca55e373994d77cdf0e818c0877902b9
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-02.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f3a36f748c8de337c4c63efccde66939116dd28538c95a46714aae64a3609eb
+size 1511686
diff --git a/assets/video-editing/videos/video-editing-demo-03.mp4 b/assets/video-editing/videos/video-editing-demo-03.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..9acc255a207ca7dd5f06229cc0270cab8f7ad57e
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-03.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ff730fa5da6c03e6cdcd6ca439a5bb18c6bacf842e21924b4e549c70cd1a197
+size 1654537
diff --git a/assets/video-editing/videos/video-editing-demo-04.mp4 b/assets/video-editing/videos/video-editing-demo-04.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..edf66e6844718e409fcdbc4b156b304e60fe4912
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-04.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef54e4925d7ac1e4d682bb438aa1e10dae889d5c3a23f3c96da7c33e6403f07c
+size 686458
diff --git a/assets/video-editing/videos/video-editing-demo-05.mp4 b/assets/video-editing/videos/video-editing-demo-05.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..47076a5fbe9e04b7b32f4e7702c8ad857bd36cb4
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-05.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b9178fd4239b775b5690221bd9212ef220d64bf84e255e2dcaba3675ace5541
+size 892673
diff --git a/assets/video-editing/videos/video-editing-demo-06.mp4 b/assets/video-editing/videos/video-editing-demo-06.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..c572e801dac07355d7f9927ea19f4fc0a3bd8008
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-06.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88388ea12ce0b1e556f1e7923d1c0acedf77295bb35d034ca976ebc039d27f41
+size 476837
diff --git a/assets/video-editing/videos/video-editing-demo-07.mp4 b/assets/video-editing/videos/video-editing-demo-07.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..67e30980a56048afe6c3848d2baea4f4ed4d2cf1
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-07.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d775b2cddfba65f480753d20b87a7973a341ca5dc44b8e87ab75d30a0c908e2d
+size 531418
diff --git a/assets/video-editing/videos/video-editing-demo-08.mp4 b/assets/video-editing/videos/video-editing-demo-08.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..19a5b520bf645a23ba8705eafbe89dc761a9d217
--- /dev/null
+++ b/assets/video-editing/videos/video-editing-demo-08.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9541ee67391691c58409ae97a5921747d0c38ad759b216bbbc5826d246b2d74
+size 1468408
diff --git a/assets/video-understanding/previews/video-understanding-caption-long-01.gif b/assets/video-understanding/previews/video-understanding-caption-long-01.gif
new file mode 100644
index 0000000000000000000000000000000000000000..eedd6ac56f2d7f5821ad0d0caade016fd808bbd1
--- /dev/null
+++ b/assets/video-understanding/previews/video-understanding-caption-long-01.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48b125e68667e3d06fdd25b0b0b5d77f58d4550e551bb63cf577b5fca3b7e785
+size 3053170
diff --git a/assets/video-understanding/previews/video-understanding-caption-short-01.gif b/assets/video-understanding/previews/video-understanding-caption-short-01.gif
new file mode 100644
index 0000000000000000000000000000000000000000..c0495f0d7c7227515ad48cab8530dcb4bc6586c7
--- /dev/null
+++ b/assets/video-understanding/previews/video-understanding-caption-short-01.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ed01f2bd5658a4550ea8d0b80b9eacb291f73ac3370d285384df00d2e5c1d9e
+size 1060291
diff --git a/assets/video-understanding/previews/video-understanding-vqa-01.gif b/assets/video-understanding/previews/video-understanding-vqa-01.gif
new file mode 100644
index 0000000000000000000000000000000000000000..28b3b376965487155291f4f208e9d579b30c45f1
--- /dev/null
+++ b/assets/video-understanding/previews/video-understanding-vqa-01.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ceb1618f7fd417bca8be3a705ebc75eef13e536b848354a2368b15a93614e35c
+size 1433496
diff --git a/assets/video-understanding/previews/video-understanding-vqa-02.gif b/assets/video-understanding/previews/video-understanding-vqa-02.gif
new file mode 100644
index 0000000000000000000000000000000000000000..8a68d16474c9fae3bfa92a28847ea4bb1a77f91c
--- /dev/null
+++ b/assets/video-understanding/previews/video-understanding-vqa-02.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6af48bf796b792ae31d950573629a6b86620761aaa91f41b5cdfff2cd09e9688
+size 1053056
diff --git a/assets/video-understanding/previews/video-understanding-vqa-03.gif b/assets/video-understanding/previews/video-understanding-vqa-03.gif
new file mode 100644
index 0000000000000000000000000000000000000000..362e85d6c7cf3107a6f66774e9ea98fdd37a92fb
--- /dev/null
+++ b/assets/video-understanding/previews/video-understanding-vqa-03.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1fec5e13137c6594652c56a524ef1a61612af2c38d6dcd8ab1eac1de223b1f1
+size 170163
diff --git a/assets/video-understanding/previews/video-understanding-vqa-04.gif b/assets/video-understanding/previews/video-understanding-vqa-04.gif
new file mode 100644
index 0000000000000000000000000000000000000000..19aca48b3a97718db6872612b149b86be193146d
--- /dev/null
+++ b/assets/video-understanding/previews/video-understanding-vqa-04.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1430704532c298c02943e12f12633a56e7122dd959d7adb1dd1c1b09d180178
+size 1596549
diff --git a/assets/video-understanding/videos/video-understanding-caption-long-01.mp4 b/assets/video-understanding/videos/video-understanding-caption-long-01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..18ef888dc37e7fc94a3f379d98d48b0a1198cdd1
--- /dev/null
+++ b/assets/video-understanding/videos/video-understanding-caption-long-01.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f070eefe27dc3f3d065926345299b996124dc1ee4372c223164ddfd0792ce1a
+size 5318845
diff --git a/assets/video-understanding/videos/video-understanding-caption-short-01.mp4 b/assets/video-understanding/videos/video-understanding-caption-short-01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..993bcddeb0abe4941be2e6d42d932102cffcdcb7
--- /dev/null
+++ b/assets/video-understanding/videos/video-understanding-caption-short-01.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5fcb4c18846571444ae024331a64e8740716e3b151f3e05a0d901b405b608da6
+size 2209818
diff --git a/assets/video-understanding/videos/video-understanding-vqa-01.mp4 b/assets/video-understanding/videos/video-understanding-vqa-01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..7a9b2e331969aedae2a150682b28341ac74d454f
--- /dev/null
+++ b/assets/video-understanding/videos/video-understanding-vqa-01.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f382ee52b21942d7840eef2843bf5c57ed4e5ff4bb958e2c4fa23635030c02b
+size 2673972
diff --git a/assets/video-understanding/videos/video-understanding-vqa-02.mp4 b/assets/video-understanding/videos/video-understanding-vqa-02.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1562546f0fff499fdb607c187776472b8dca565f
--- /dev/null
+++ b/assets/video-understanding/videos/video-understanding-vqa-02.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:02572c7b087e7b36867aade045d9c43e9e62e6496aff36dbcbdc599eed03a276
+size 2323559
diff --git a/assets/video-understanding/videos/video-understanding-vqa-03.mp4 b/assets/video-understanding/videos/video-understanding-vqa-03.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..06b1ee5e8fef304b8e763dd77291dfbe1a99eb9b
--- /dev/null
+++ b/assets/video-understanding/videos/video-understanding-vqa-03.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cc93946ae538475878a9af4238a50a690b661795975d8cb513f3320d71ae0ca
+size 195524
diff --git a/assets/video-understanding/videos/video-understanding-vqa-04.mp4 b/assets/video-understanding/videos/video-understanding-vqa-04.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a5d7b3a9eac5d5a9a063849ceee0da2d574ff3b3
--- /dev/null
+++ b/assets/video-understanding/videos/video-understanding-vqa-04.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5dc46b40014a4a4685d081ae9c5dfb60b48ae17f5f9dbbd8cd6a746be11d3ce
+size 2486318
diff --git a/benchmarks/image_gen/DPG/DPG.jsonl b/benchmarks/image_gen/DPG/DPG.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..8cb9945b300136da1928ac260524d65c8e514128
--- /dev/null
+++ b/benchmarks/image_gen/DPG/DPG.jsonl
@@ -0,0 +1,1065 @@
+{"index": "0", "data": "An expansive field, blanketed by the soft light of morning, cradles a collection of eight cabbages, their green heads round and plump. These vegetables are nestled among rows of rich soil, dotted with glistening droplets of dew that cling to their crinkled leaves. As wisps of mist begin to lift, the cabbages lie poised, ready for the day's impending harvest."}
+{"index": "1", "data": "An eye-catching vibrant red pickup truck with a stout and rectangular build is parked on the sandy shores as dusk sets in. The truck's glossy paint contrasts with the soft, amber hues of the setting sun reflected off the vehicle's surface. In the background, the gentle waves of the ocean can be heard as they meet the beach, with the silhouette of palm trees swaying gently in the evening breeze."}
+{"index": "10", "data": "On a rustic wooden table, three ripe eggplants with a glossy royal purple skin are carefully arranged in a neat row. Their plump, oblong shapes complement the table's textured surface, and they cast soft shadows in the warm, ambient light. Nearby, the woven pattern of a tan-colored napkin peeks out from beneath the vibrant, richly colored vegetables."}
+{"index": "100", "data": "An immaculate white vanity desk featuring an array of beauty products, among which a soft-bristled cosmetics brush and a sleek black eyeliner pencil are neatly arranged. To the side, a stark and dissonant element, a metallic handgun, lies on the textured surface of a grey concrete floor, creating a jarring juxtaposition against the delicate makeup tools. The vanity itself is positioned against a white wall, illuminated by the natural light streaming in from a nearby window."}
+{"index": "101", "data": "Inside a dimly lit room, the low luminance emanates from a bedside lamp casting a soft glow upon the nightstand. There lies a travel magazine, its pages open to a vivid illustration of a car driving along a picturesque landscape. Positioned on the image is a light pink toothbrush, its bristles glistening in the ambient light. Beside the magazine, the textured fabric of the bedspread is just discernible, contributing to the composed and quiet scene."}
+{"index": "102", "data": "Two multicolored butterflies with delicate, veined wings gently balance atop a vibrant, orange tangerine in a bustling garden. The tangerine, with its glossy, dimpled texture, is situated on a wooden table, contrasting with the greenery of the surrounding foliage and flowers. The butterflies, appearing nearly small in comparison, add a touch of grace to the scene, complementing the natural colors of the verdant backdrop."}
+{"index": "103", "data": "In a spacious loft with high ceilings and exposed brick walls, the morning light filters through large windows, casting a soft glow on a pair of trendy, high-top sneakers. These sneakers, made of rugged leather with bold laces, contrast sharply with the ornate, metallic vintage coffee machine standing next to them. The coffee machine, with its intricate details and polished finish, reflects the light beautifully, setting a striking juxtaposition against the practical, street-style footwear on the polished concrete floor."}
+{"index": "104", "data": "Two sleek blue showerheads, mounted against a backdrop of white ceramic tiles, release a steady stream of water. The water cascades down onto a vivid, crisp green pear that is centrally positioned directly beneath them. The pear's smooth and shiny surface gleams as the water droplets rhythmically bounce off, creating a tranquil, almost rhythmic sound in the otherwise silent bathroom."}
+{"index": "105", "data": "A traditional Venetian mask with intricate designs and feather embellishments is placed on a polished wooden table. Next to the mask, there sits a substantial golden trophy, shining with reflected light, its surface etched with small, detailed engravings. The table and its prestigious items are set against a deep sepia-toned backdrop that enhances the objects' visual appeal."}
+{"index": "106", "data": "During the twilight hour, an individual can be seen extending an arm towards the sky, pointing at a trio of wild birds gliding through the rich deep blue of the early evening sky. The birds' silhouettes contrast distinctly against the fading light, their wings spread wide as they soar. The person is silhouetted against the dusky sky, creating a peaceful scene of human connection with nature."}
+{"index": "107", "data": "On a rich maroon stage, a quintet of grand pianos gleams under the stage lights, their polished black surfaces reflecting their surroundings. Each piano is encircled by a small scatter of walnuts, their round, brown shapes contrasting with the sleek lines of the instruments. The pianos are methodically arranged, allowing ample space for performers to maneuver around them during a recital."}
+{"index": "108", "data": "In a silent library setting, bathed in the soft glow of ceiling lights, stand two metal tripods, each clasping a single piece of white notepaper. Both tripods are positioned at the end of a long, mahogany table scattered with books and reference materials, hinting at a late-night study session. The library's rows of bookshelves cast long shadows on the carpeted floor, creating an atmosphere of calm scholarly focus as the clock strikes midnight."}
+{"index": "109", "data": "Adjacent to each other in a room, a large rectangular bed draped in a navy-blue comforter sits parallel to a square-shaped nightstand with a matte finish. The nightstand holds an angular lamp and a small stack of hardcover books. The two pieces of furniture are positioned on a plush beige carpet that covers the majority of the floor space."}
+{"index": "11", "data": "A vintage-style kitchen featuring an integrated dishwasher that's finished with a panel matching the surrounding warm wood cabinetry. Above the dishwasher, a muted stone countertop is adorned with an assortment of vintage kitchen tools and fresh green herbs in terracotta pots. The room exudes a rustic charm, with exposed ceiling beams and a classic farmhouse sink completing the homely setting."}
+{"index": "110", "data": "A single, long green onion with its pristine white root end and vibrant green top stands upright in a tall, thin, clear glass vase. The delicate onion contrasts with the smooth and solid form of the vase which rests upon the speckled granite kitchen countertop. The morning light filters through a nearby window, bathing the onion and vase in a gentle, diffused glow that highlights their subtle textures and colors."}
+{"index": "111", "data": "In the dim light, five pairs of diverse shoes are gathered together on the concrete ground, near the wheel of a gleaming gold sports car. The car boasts a polished finish that rivals the sun's radiance, and its sleek curves are evident even in the shadows. The shoes, ranging from worn sneakers to shiny leather loafers, create a contrasting ensemble against the luxury vehicle's opulent appearance."}
+{"index": "112", "data": "Scattered across the polished hardwood floor, five vibrantly yellow cobs of corn lie in a haphazard formation, untouched and contrasting with the dark grain of the wood. Nearby, a mop with a wooden handle rests against a light grey wall, its stringy head soaked and discolored from use. The room is illuminated by the subdued, monochrome light of early morning, casting soft shadows across the cobs and the floor."}
+{"index": "113", "data": "A polished round silver bracelet rests beside a large square game board made of rich mahogany wood. Despite its compact size, the bracelet radiates with a bright shimmer, contrasting starkly against the muted tones of the chess squares on the game board. Each chess piece is meticulously aligned, creating a visual harmony between the circular curves of the bracelet and the straight edges of the board."}
+{"index": "114", "data": "In an elegantly simple room, two pairs of sandals—one blue and one red—sit tidily beneath a quaint, square wooden side table featuring a weathered finish that suggests a touch of rustic charm. The side table, which casts a soft shadow onto the textured salmon pink wall behind it, provides a harmonious balance to the vibrant footwear. The floor beneath the table is a polished light hardwood, reflecting a faint glow from the natural light entering the room."}
+{"index": "115", "data": "A delicate silver ring with a sleek band sits gracefully next to a large, dark leather wallet that seems to overflow with cards and receipts. Both items rest on a polished wooden bedside table, whose grain texture subtly complements the metallic luster of the ring. The wallet's bulky appearance emphasizes the fine simplicity of the ring, highlighting the stark difference in their sizes and designs."}
+{"index": "116", "data": "In the fading light of late afternoon, a scene unfolds in the autumn park, where a pair of worn brown boots stands firm upon a bed of fallen orange leaves. Attached to these boots are two vibrant blue balloons, gently swaying in the cool breeze. The balloons cast soft shadows on the ground, nestled among the trees with their leaves transitioning to auburn hues. Nearby, a wooden bench sits empty, inviting passersby to witness the quiet juxtaposition of the still footwear and the dancing balloons."}
+{"index": "117", "data": "A gleaming pair of golden cymbals lies in stark contrast to a collection of round, static bottles of toiletries arranged neatly on a shelf. The metallic sheen of the cymbals is emphasized by the surrounding muted tones of the shampoo and lotion containers. The bathroom shelf upon which they rest is made of polished oak, adding a warm touch to the setting."}
+{"index": "118", "data": "A tranquil scene where two brown, fibrous coconuts rest on the lush green grass beside a solitary deer, which is lying down, taking a respite. The deer has a rich, brown coat with white spots and appears to be at ease in the natural surroundings. In the background, a few trees with verdant leaves provide a quiet backdrop to this peaceful setting."}
+{"index": "119", "data": "In the warm hue of the setting sun, a well-used wooden cutting board leans against the gray, splintered slats of an aging backyard fence. Nearby, a bright red stop sign, its paint slightly faded and peeling from years of service, is planted firmly beside a quaint garden shed with peeling blue paint and a rusty door handle. The grass, tinged orange by the sunset's glow, is dotted with dandelions and whispers of the day's end breeze."}
+{"index": "12", "data": "Two spiraling strands of rich, crimson-colored pasta rest elegantly on the surface of a polished dark wooden table, the grain of the wood accentuating their vibrant hue. This rustic Italian kitchen is bathed in the warm, golden light of the late afternoon sun, which highlights the intricate texture of the pasta. The table, set amidst traditional décor and terracotta pots filled with fresh herbs, offers a tranquil setting for this simple yet captivating culinary display."}
+{"index": "120", "data": "In an otherwise silent room, the melodic rhythm of a recorder being played gently reverberates throughout the space. A single lantern with a hexagonal frame and intricate metalwork sits nearby, emitting a soft, warm light that flickers subtly, illuminating the immediate vicinity. The walls, which are painted a muted cream color, catch the lantern's glow, creating a cozy ambiance around the musician."}
+{"index": "121", "data": "A rustic, vintage wooden desk sitting in the corner of an antique store, its surface aged to a warm honey color. Upon the desk lies a silver flute, its polished surface gently reflecting the soft, golden light from the late afternoon sun streaming through a nearby window. Behind the flute, an array of eclectic cleaning products, from old-fashioned feather dusters to vintage glass bottles, is artfully arranged, adding to the charm of the setting. These items catch the sunlight in a way that casts an array of subtle shadows and highlights across the desk's surface."}
+{"index": "122", "data": "An old fashioned, metallic fan with a rounded base and rusted blades stands next to an ornate, antique knife with a weathered bone handle. Both items are propped against a timeworn wooden wall that bears the marks of age and the warm, golden hue of the afternoon sunlight filtering through a nearby window. The dust particles in the air catch the light, highlighting the vintage aura of the scene."}
+{"index": "123", "data": "In the bustling heart of the city, under the clear midday sky, stands a sparkling white sink, its porcelain surface gleaming in the sunlight. It is an unusual sight: the sink, vastly larger than a nearby old burgundy bicycle with a wicker basket, positioned as if it were a modern art installation. The surrounding concrete ground of the cityscape contrasts sharply with the clean and polished texture of the oversized sink."}
+{"index": "124", "data": "Under the bright midday light, a sleek glass display shelf within a boutique shop showcases an array of cosmetics. Three lipsticks, each a different shade of pink, red, and burgundy, are aligned meticulously beside a pair of shining black leather shoes, reflecting the overhead lighting. The shoes boast a perfect sheen, hinting at their unworn condition, and they sit adjacent to the lipsticks, adding contrast with their sharp, polished appearance against the soft textures of the makeup."}
+{"index": "125", "data": "A sizable rectangular storage box with a smooth, beige surface is positioned directly adjacent to a diminutive, round cherry with a glossy red exterior. The cherry's vibrant color contrasts sharply with the muted hue of the box. Both the box and the cherry rest on a white marble countertop that amplifies their distinct shapes and sizes."}
+{"index": "126", "data": "A tall metallic baseball bat, standing upright on a lush green field, is gently enshrouded by a soft pink towel with frayed edges. The clear blue sky overhead provides a stark contrast to the vibrant color of the towel. Surrounding the bat, there are a few white daisies scattered, their petals swaying slightly in the gentle breeze."}
+{"index": "127", "data": "A giant faux oyster, with a rough textured exterior akin to rock, opens to reveal its sleek interior resembling a lustrous pearl. From within its spacious maw, a vintage camera with black leather casing and silver details gently slides out as if it were an oversized pearl escaping the confines of its shell. The camera is caught mid-motion, creating a sense of dynamic action against the still life of the display."}
+{"index": "128", "data": "Beneath the expansive night sky, sprinkled with myriad twinkling stars, a flag adorning the pinnacle of a towering lighthouse drifts gently in the evening breeze, its hues subdued by the soft silver glow of the moon. The sturdy structure of the lighthouse, painted in alternating bands of white and red, stands as a stalwart sentinel on the rugged shoreline. At the water's edge, the rough texture of the rocks is intermittently made visible by the intermittent wash of foamy waves, where a solitary lobster, its dark, glossy carapace reflecting the moonlight, slowly makes its way out of the embrace of the ocean."}
+{"index": "129", "data": "A well-organized glass display cabinet contains a collection of seven hairdryers, each boasting a sleek design and different hues. Adjacent to this assortment, three luxurious silver watches are elegantly showcased, their reflective surfaces glinting under the soft glow of the evening light filtering through a nearby window. The watches are arranged on a plush velvet stand that enhances their sophisticated appearance."}
+{"index": "13", "data": "A sleek, black rectangular keyboard lies comfortably on the luxurious beige carpet of a quiet home office, bathed in the gentle sunlight of early afternoon. The keys of the keyboard show signs of frequent use, and it's positioned diagonally across the plush carpet, which is textured with subtle patterns. Nearby, a rolling office chair with a high back and adjustable armrests sits invitingly, hinting at a quick break taken by its usual occupant."}
+{"index": "130", "data": "On a brisk morning, a delicate white paper napkin gently enfolds a vibrant yellow mango, contrasting sharply with the fruit’s robust size. The mango, plump and juicy, sits prominently at the center of a light wooden dining table. Around it, the early daylight casts soft shadows, highlighting the texture of the napkin's folds as it cradles the mango."}
+{"index": "131", "data": "A spherical tissue dispenser sits flush against the curved edge of a white porcelain sink, creating a harmonious visual continuity. The tissue dispenser, with its polished silver finish, reflects the soft light in the room, contrasting with the sink's matte surface. The basin itself is neatly embedded into a marble countertop, which stretches across the bathroom, punctuated by chrome faucets that gleam under the overhead lighting."}
+{"index": "132", "data": "In the corner of a dimly lit room, a rectangular, charcoal-gray computer box exhibits its sharp edges as it rests beside an elegantly draped, cherry-red silk bow tie. The contrast between the electronic's matte finish and the tie's glossy sheen is striking. The computer appears dormant, while the bow tie seems almost ready to be picked up and worn to an event. Behind these objects, the ambient lighting casts soft shadows against the plain, light-colored wall."}
+{"index": "133", "data": "In the midst of a bustling cityscape under the bright midday sun, a solitary wooden bench with peeling green paint sits empty on the sidewalk. A city worker dressed in a reflective orange vest is actively disinfecting the bench surface, using a clear spray bottle filled with a blue cleaning solution. Passersby continue with their day, navigating around the cleaning activity, while the noise of the city hums in the background."}
+{"index": "134", "data": "A giant brass tuba with its large bell facing upwards, eclipsing a petite penguin below. The penguin, with its characteristic black and white plumage, appears to waddle curiously around the towering instrument. This peculiar scene unfolds against the backdrop of a rich crimson sunset, casting a warm glow over the entire setting."}
+{"index": "135", "data": "Chrome silver medals, both engraved with intricate designs, can be seen gently spinning within the drum of a white washing machine. The machine is placed beside a window through which bright sunlight streams, casting a warm glow on the machine's glossy surface. The gentle rotation of the medals creates a soft clinking sound against the steel interior of the washer, which is otherwise filled with clothes on a slow spin cycle."}
+{"index": "136", "data": "As the dawn breaks, two personal care items, a toothbrush with white and blue bristles alongside a tube of toothpaste, are methodically placed under the protective cover of an awning adorned with a geometric hexagonal pattern. The soft morning light gently illuminates the scene, casting subtle shadows beneath the items on the reflective glass surface they rest upon. The awning's shade provides a tranquil, organized backdrop to the start of the day."}
+{"index": "137", "data": "Amidst the serene setting of a river bank, a single, bright rectangular bar of soap sits prominently on the rough, earthy terrain. Its surface gleams under the sunlight, contrasting with the natural surroundings. Several feet away, a curious black bear with a glossy coat cautiously approaches, nostrils flaring as it investigates the unfamiliar scent. The dense evergreen forest that lines the river's edge creates a lush, green backdrop to this unusual encounter."}
+{"index": "138", "data": "Three perfectly shaped dumplings, with their pleats meticulously crimped, sitting in solitude on a reflective glass surface which captures their image under the soft glow of a moonlit night. The large mirror is resting against a wall with a faint pattern, amplifying the quietude of midnight. The room is hushed, with the only movement being the gentle shift of shadows as the night deepens, highlighting the stillness of the scene."}
+{"index": "139", "data": "a pyramid-shaped tablet made of a smooth, matte grey stone stands in the foreground, its sharp edges contrasting with the wild, verdant foliage of the surrounding jungle. nearby, a crescent-shaped swing hangs from a sturdy tree branch, crafted from a polished golden wood that glimmers slightly under the dappled sunlight filtering through the dense canopy above. the swing's smooth surface and gentle curve invite a sense of calm amidst the lush greenery."}
+{"index": "14", "data": "Three vibrant green lettuce leaves gently float on the surface of crystal-clear water in a shallow white porcelain basin. The sunlight catches the delicate veins of the leaves, highlighting their fresh, crisp texture. Nearby, tiny air bubbles cling to the edges of the leaves and the smooth inner surface of the basin."}
+{"index": "140", "data": "An outdoor setting illuminated by sunlight, showcasing three vibrantly red towels, meticulously folded and placed on a concrete surface. Beside this tidy arrangement, a sleek white scooter stands parked, its handlebar casting a slender shadow on the ground. The contrasting colors create a striking visual against the backdrop of a clear blue sky."}
+{"index": "141", "data": "A solitary, white swan gracefully makes its way across the tranquil surface of a still lake, its reflection almost perfect in the water. Above, mounted on the sturdy branches of dense, leafy trees, three black surveillance cameras silently observe the scene. Their lenses, though inactive and unblinking, appear to follow the swan's serene passage, contrasting starkly with the natural beauty of the early morning. The lake is surrounded by a lush greenery that gently sways in the light breeze, undisturbed by the technological sentinels."}
+{"index": "142", "data": "a bright neon pink hurdle stands out against the backdrop of a golden setting sun, casting long shadows on the sand below. perched upon the hurdle, a vibrant orange crab clings tightly, its pincers silhouetted against the dimming sky. in the distance, waves can be seen gently crashing onto the shore, reflecting the rich hues of the sunset."}
+{"index": "143", "data": "A compact blue speaker with a sleek design sits atop the edge of a vibrant yellow bathroom sink. The sink is situated against a pristine white tiled wall, contrasting sharply with its vivid color. The texture of the speaker appears smooth and modern, clearly distinguishable from the glossy finish of the ceramic sink basin."}
+{"index": "144", "data": "During a tranquil evening, a grey seal with inquisitive eyes explores a vibrantly illustrated book left on the coarse sands of a deserted beach. The book's pages flutter in the gentle sea breeze, revealing bursts of purples, greens, and reds. Nearby, scattered seashells and washed-up seaweed provide a natural backdrop to this unusual scene."}
+{"index": "145", "data": "Three brown chickens with glossy feathers are gathered around a large, metallic silver hammer lying on the ground. The hammer's handle is engraved with intricate designs, and it reflects the sunlight, drawing the birds' attention. The chickens appear to be pecking and bobbing their heads curiously, as if they are performing a dance in a circle around the tool. Nearby, blades of green grass can be seen poking through the soil, adding contrast to the scene."}
+{"index": "146", "data": "A surreal scene of a giant pink, rubber glove emerging from the golden sands of the beach during the orange hues of sunset. The enormous glove, with its fingers outstretched, gently grips a tiny, sunlit yellow scallop shell. The tranquil ocean in the background ripples gently beneath a sky painted with the colors of dusk."}
+{"index": "147", "data": "In the midst of a bustling cityscape, during the golden hour of sunset, stands an enormous trombone that stretches upwards like a unique city monument. Its brass surface is painted with bold sections of red, green, and yellow, mimicking the familiar sequence of a traffic signal. Around its base, people and vehicles move about, creating a dynamic contrast between the stillness of the instrument and the motion of the city life."}
+{"index": "148", "data": "A vivid yellow chair with a smooth, plastic texture sits adjacent to a sleek red treadmill in a compact, square-shaped room. The wall behind these pieces of equipment is painted a vibrant turquoise blue, which contrasts sharply with the equipment. The sun shines through a window, illuminating the space at midday, casting soft shadows on the light grey flooring."}
+{"index": "149", "data": "A quaint Parisian bistro table, with an ornate metal base, sits on a cobbled street, its surface hosting a classic French kettle with an elegant, sweeping curvilinear profile and a glossy finish that catches the sunlight. Next to it lies a soft, felt French beret, in a deep shade of navy blue, adding a touch of artistic flair to the setting. The backdrop is a bustling Paris afternoon, with the silhouette of the Eiffel Tower looming in the distance, framed by the vibrant green leaves of trees lining the avenue."}
+{"index": "15", "data": "An ornate silver cosmetics mirror gracefully positions itself upon a pristine white marble vanity top. Surrounding the mirror are various high-end makeup products and delicate perfume bottles, each catching the room's natural light in their uniquely colored glass. The vanity itself, sleek in design with clean lines, is nestled in a space that feels both modern and timeless."}
+{"index": "150", "data": "In a dimly lit room during twilight, a sleek, white toothbrush with angular bristles rests against the curvature of a modern, black computer monitor. The soft glow of the screen illuminates the bristles, casting long shadows on the desk. Nearby, the monitor's base reflects a faint shimmer, hinting at the gradual transition from day to night outside the window."}
+{"index": "151", "data": "In a dimly lit room, a sleek, black projector casts a bright image onto a large white screen for an evening presentation. Beside the bed, a small nightstand made of polished dark wood carries a solitary leather wallet, lying undisturbed amidst the quietude of the space. The wallet's smooth surface reflects the faint glow emanating from the projector, subtly highlighting its details and the meticulous stitching along its edges."}
+{"index": "152", "data": "An array of bathroom essentials arranged neatly on a cool, gray marble countertop, consisting of a toothbrush holder, soap dispenser, and matching container, all with a metallic finish. To the side, a juicy, pink halved grapefruit rests on a delicate ceramic plate adorned with a subtle floral pattern. The toiletry set is thoughtfully placed to create an inviting and organized display next to the grapefruit."}
+{"index": "153", "data": "A transparent glass flask stands upright on a smooth, dark surface, filled with bright green beans that sharply contrast against the stunning backdrop depicting a starry night sky. The intricately painted celestial scene features deep blues and purples, with dots of white representing distant stars. The curvature of the flask magnifies the beans and some of the stars, creating an enchanting visual effect."}
+{"index": "154", "data": "A bright white air conditioner, its sleek rectangular form protruding slightly, is mounted high on a beige wall above eye level. Directly below, on the tiled kitchen floor, stands a vintage round gas stove, painted in a vibrant shade of red and featuring classic white knobs. The contrast between the modernity of the air conditioner and the rustic charm of the gas stove creates a distinctive look in the room."}
+{"index": "155", "data": "A vibrant hot pink cosmetic bag with a textured quilted pattern sits on the gray tile floor between two glossy, white ceramic urinals. The bag is partially unzipped, revealing an array of makeup brushes and beauty products peeking out. To the side, the chrome flush pipes of the urinals glint under the bright bathroom lighting."}
+{"index": "156", "data": "In a dimly lit room during the nighttime, a red, circular router casts a soft blinking light, indicating activity. Beside it, a paintbrush with bristles stained in hues of blue and green lies abandoned on a wooden desk. The surface of the desk is slightly cluttered, with various papers scattered around the still equipment."}
+{"index": "157", "data": "A fluffy white pillow rests against the cool glass of a large window, accompanied by a pair of sleek black binoculars positioned on the ledge. The windowsill, bathed in the early morning light, also houses a small potted plant with vibrant green leaves, offering a contrast to the neutral tones of the room. Outside the window, the faint pre-dawn hues hint at the promise of a new day."}
+{"index": "158", "data": "In the early morning light, a vibrant mango-colored hot-air balloon, with its size dwarfing a majestic brass trumpet, begins its peaceful ascent. The balloon's large, bulbous shape stands out sharply against the pale blues and soft pinks of the dawning sky. Its woven basket, crafted from sturdy pale wood, carries passengers who peer out in awe over the landscape below. The fabric of the balloon is smooth and taut, capturing the warming rays of the sun as it climbs higher into the expansive atmosphere."}
+{"index": "16", "data": "A hefty, red-painted chainsaw rests prominently on the surface of a sturdy, solid oak table. The metal components of the chainsaw catch the soft glow of the early morning sunlight, causing a shimmering effect. The wood's rich, deep grain stands in contrast to the boldness of the chainsaw, and there are wood shavings scattered nearby, hinting at recent activity."}
+{"index": "160", "data": "As the sky transitions into hues of orange and purple during twilight, a sleek silver sailboat cuts through the calm waters. On the deck of the boat, a small makeshift kitchen setup can be seen, where someone is carefully preparing long, string-like noodles. The gleaming surface of the boat reflects the fading sunlight, and the gentle ripples on the water create a tranquil scene. Nearby, the silhouette of the coastline is just barely visible against the evening sky."}
+{"index": "161", "data": "Within the confines of a modern bathroom bathed in the subtle light of early morning, a sleek chrome showerhead glistens, mounted on a wall of finely veined marble tiles. Aside from the minimalist fixtures, a luxurious leather satchel with a hint of luster from its high-quality material casually occupies space on the smooth, cool surface of a marble countertop beside a pristine white sink. In the background, fluffy white towels are neatly stacked on a wooden shelf, providing a soft contrast to the room's hard surfaces."}
+{"index": "162", "data": "A playful monkey with a chestnut coat and bright eyes is clumsily handling a crimson red heart-shaped tea pot. The monkey sits in a verdant jungle environment, surrounded by an array of glossy green leaves and suspended vines. The tea pot, with its glossy ceramic finish, reflects the dappled sunlight that filters through the dense canopy overhead."}
+{"index": "163", "data": "Two vibrant red jugs are carefully positioned below a trio of open black umbrellas, which stand stark against the backdrop of a grey, stormy sky. The jugs rest on the wet, glistening concrete, while the umbrellas, with their smooth, nylon fabric catching the breeze, provide a sharp contrast in both color and texture. Each umbrella casts a protective shadow over the jugs, seemingly safeguarding them from the impending rain."}
+{"index": "164", "data": "In an open outdoor setting, a decorative coffee table with elaborate wood carvings and curved legs is positioned under the expansive blue sky of a clear afternoon. On the surface of the table, an out-of-place toothbrush with white and blue bristles sits alone. The table stands on a patch of vibrant green grass, and no other items or furniture are immediately visible in the vicinity."}
+{"index": "165", "data": "In a modern kitchen, a square, chrome toaster with a sleek finish sits prominently on the marble countertop, its size dwarfing the nearby red vintage rotary telephone, which is placed quaintly on a wooden dining table. The telephone's vibrant red hue contrasts with the neutral tones of the kitchen, and its cord coils gracefully beside it. The polished surfaces of both the toaster and the telephone catch the ambient light, adding a subtle shine to their respective textures."}
+{"index": "166", "data": "A modern office space featuring a sleek, transparent glass desk upon which five identical flat, rectangular smartphones are arranged in a precise, evenly spaced line. Each phone reflects the overhead lighting, emphasizing their dark, glossy screens. To the side of this technological display, a single roll of white toilet paper stands out, its soft texture a stark contrast to the smooth glass surface, poised precariously at the desk's edge as if forgotten in haste."}
+{"index": "167", "data": "Three sleek remotes are aligned perfectly next to a simple black picture frame, which encloses a monochrome photograph. These objects rest on the glossy surface of a rich mahogany coffee table. Surrounding them, the soft glow of a nearby lamp illuminates the living room's plush, earth-toned furniture, casting subtle shadows that contribute to a peaceful evening ambiance."}
+{"index": "168", "data": "A whimsical scene unfolds as two pairs of spectacles rest comically on the bridge of a horse's nose, glinting slightly against the fading light of a vibrant sunset. The backdrop is a quintessential rural farm, complete with wooden fences and distant rolling hills painting the horizon in hues of orange and purple. The glasses, one with round, golden frames and the other with square, jet-black rims, add a touch of eccentricity to the tranquil pastoral setting."}
+{"index": "169", "data": "A white radiator, attached to a pale yellow wall with a faint floral wallpaper border, radiates warmth and gently increases the temperature of the room. Near the radiator, a black analog scale sits idle on the bathroom's tiled floor, marked with white and gray hexagonal patterns. The scale stands alone, surrounded by the room's muted colors and simple decor, its needle motionless, poised for the next reading."}
+{"index": "17", "data": "A bright red lighter rests atop a polished wooden desk, its surface reflecting the soft overhead lighting. The desk itself boasts a rich and deep mahogany color, free from clutter except for the solitary lighter. Off to the side of the desk stands a green potted plant, adding a touch of vibrancy to the otherwise methodical arrangement."}
+{"index": "170", "data": "Amidst the subtle lighting of a storefront display as dusk sets in, three striking neon green high heels captivate the attention of passersby. Each shoe features a sleek stiletto heel and is carefully arranged in a coordinated fashion, creating an eye-catching contrast against the shop's understated backdrop. Positioned off to one side, a mop with a silver, curved metal handle leans casually against the window, its presence adding an unexpected twist to the display's overall composition."}
+{"index": "171", "data": "A vibrant yellow rabbit, its fur almost glowing with cheerfulness, bounds energetically across a sprawling meadow dotted with a constellation of wildflowers. The creature's sizeable, red-framed glasses slip comically to the tip of its nose with each jubilant leap. As the first rays of sunlight cascade over the horizon, they illuminate the dew-draped blades of grass, casting the rabbit's exuberant shadow against the fresh green canvas."}
+{"index": "172", "data": "A well-loved silver pot emits a gentle steam on a modern gas stove with blue flames licking at its base. In the foreground, a hand wields a vivid green marker that dances across the open pages of a sketchbook, which is sprawled casually on a nearby wooden kitchen table. The sketchbook contains whimsical drawings, random doodles intertwined with occasional splashes of color, capturing the spontaneous bursts of creativity."}
+{"index": "173", "data": "In the dim light of twilight, a room is visible with a backpack resting against the wall; the backpack is a deep blue with hints of grey and sports several small pockets along its exterior. Next to it, a neon green toothbrush stands upright, propped against a clear drinking glass on a wooden nightstand. Outside the window behind them, the sky transitions to a deep navy hue, dotted with twinkling stars that create a tranquil backdrop."}
+{"index": "174", "data": "Two intricately designed bracelets with patterns of gold and silver hues rest at the base of a tall playground slide, which towers above them with its vibrant red and yellow plastic sides. The bracelets catch the fading light of a radiant sunset that paints the sky in brilliant shades of orange and pink. In the near distance, the silhouette of the playground swings can be briefly made out against the colorful backdrop."}
+{"index": "175", "data": "Within the discolored interior of a derelict building, an old claw-foot bathtub sits against a crumbling wall, its once-white enamel stained with time. Inside the tub rests a pair of vibrant, high-top sneakers—reds, blues, and yellows clashing with the drab surroundings. The sneakers, juxtaposed with the peeled paint and cracked tiles, suggest an untold story of hasty departure. Despite being inanimate, they seem to take on a life of their own as shadows elongate with the approach of midnight, casting an uncanny aura over the scene. Nearby, a window with broken panes allows the moonlight to filter in, further illuminating the sneakers' abandonment."}
+{"index": "176", "data": "Within an expansive auditorium, a single megaphone with a metallic finish and a bold black stripe lies abandoned on a vast, textured orange carpet. Rows of empty seats surround the area, hinting at the silence of an event now passed. The natural afternoon light filters in through large windows, casting long, soft shadows across the carpet and the solitary object at rest."}
+{"index": "177", "data": "A small, red candle with a flickering flame is placed on the bathroom countertop, emitting a soft glow beside the large, square, white porcelain toilet. The candle's subtle shimmer reflects off the polished chrome fixtures of the bathroom, creating a warm ambiance. The size contrast between the tall, slender candle and the robust toilet forms a unique visual pairing in the compact space."}
+{"index": "178", "data": "A dining room setting showcasing an unusually large red bell pepper with a shiny, slightly wrinkled texture, prominently placed beside a diminutive golden medal with a red ribbon on a polished wooden dining table. The pepper's vibrant hue contrasts with the medal's gleaming surface. The scene is composed in natural light, highlighting the intricate details of the pepper's surface and the reflective quality of the medal."}
+{"index": "179", "data": "An animated scene depicts a bright orange traffic cone in a playful balancing act with a small, pinkish-red hat on an imaginary, straight line. The cone's texture appears rough and rigid, in stark contrast to the soft, fabric texture of the diminutive hat. Both items are similarly hued but differ vastly in shape and size, creating a whimsical and absurd visual."}
+{"index": "18", "data": "In the illumination of the evening, a trio of square-shaped white induction cookers sit neatly arranged on a large central cooking island with a smooth, marbled countertop. Around them are scattered various cooking utensils, a cutting board hosting a half-chopped onion, and a crystal-clear glass bowl filled with ripe cherry tomatoes. The overhead lights cast a soft glow on the metallic surface of the kitchen vent hood above the island, reflecting the cookers below."}
+{"index": "180", "data": "A slice of vibrant red watermelon, with its green rind visible, serves as a plate for a single, orange-hued cooked shrimp. The curves of the shrimp follow the crescent shape of the melon, contrasting in both color and texture. Beside the watermelon, there are droplets of water, suggesting the fruit's juicy freshness."}
+{"index": "181", "data": "A picturesque winter morning is captured within a cozy kitchen, where a sturdy blue ladder stands erect near a window veiled with delicate patterns of frost. The morning light reveals the contrast between the ladder and the set of sleek, ebony chopsticks lying in repose on the surface of the highly polished wooden kitchen table. The table also bears the serene presence of a white porcelain teapot, poised as if waiting to pour warmth into the chilly day."}
+{"index": "182", "data": "A solitary camel, with its characteristic humps and creamy beige coat, slowly ambles beside a striking, plush, round red couch, which seems oddly out of place in the vast desert landscape. The harsh midday sun casts a sharp shadow from the camel, dwarfing the small couch in comparison. Around the odd couple, the endless sea of sand contrasts with the vibrant red upholstery, as no other objects or signs of life interrupt the peculiar desert scene."}
+{"index": "183", "data": "In the fading light of dusk, three circular targets aglow with bright neon lights hang suspended in the tranquil evening air. Below, a well-worn pair of skating shoes, marked with scuffs and bearing the tale of countless rides, sits abandoned on the weathered wooden slats of an old park bench. Around the bench, the soft glow of nearby street lamps casts a subtle light, creating a gentle contrast with the vivid colors of the floating targets."}
+{"index": "184", "data": "A spacious study room featuring a large rectangular oak desk dominated by a scarlet barbell used as an unconventional paperweight. The desk surface is scattered with papers, books, and a sleek, silver laptop. Flanking the desk are tall bookshelves filled with an array of books, and the sunlight filters through a large window, illuminating the space with natural light."}
+{"index": "185", "data": "Under a vast expanse of clear blue sky, a dilapidated piece of machinery with peeling orange paint and signs of rust is carefully maneuvering a pair of red bricks. The vehicle, possibly an aging forklift or crane, creaks as it transports the heavy materials across a barren construction site. Each brick has a rough texture, accentuated by the bright sunlight casting sharp shadows on the ground."}
+{"index": "186", "data": "During the warm glow of a dwindling summer evening, a particularly fussy feline with distinctive calico markings is perched atop a garden table. The cat, seemingly indifferent to its surroundings, sports a pair of large, reflective aviator sunglasses that sit comically upon its small, furry face. Around the cat, there are scattered pots of blooming flowers, contributing to the charm of the scene, and in the background, hints of orange and pink skies are visible through the foliage."}
+{"index": "187", "data": "On a smooth, polished wooden desk, there are five small square-shaped power converters with a metallic finish neatly placed beside each other. Behind them, two round pink erasers lie in contrast to the wood grain, slightly showing signs of use with pencil shavings nearby. The desktop itself is bathed in the warm glow of the room's lighting, highlighting the gentle textures of the wood and the matte surfaces of the converters and erasers."}
+{"index": "188", "data": "In the midst of a sprawling savanna, a regal lion with a golden mane moves gracefully near a white washing machine and matching drying machine, which appear incongruous against the rolling dunes. The sun blazes down, casting sharp shadows that contour the sand around the appliances and the feline figure. Around this unusual scene, tufts of dry grass sway slightly in the hot breeze, accentuating the isolation of the domestic objects in this wild, open landscape."}
+{"index": "189", "data": "Under the warm glow of an overhead light, a shiny chrome showerhead is poised above a pristine white bathtub with clawed feet. The porcelain surface of the tub is speckled with droplets of water, ready to embrace the evening's tranquility. To the side of the bathtub, an assortment of lavender-scented bath products and fluffy towels are neatly arranged, hinting at the luxurious bath time ritual that awaits."}
+{"index": "19", "data": "An elegant brass saxophone rests upright on a stand in the center of a dimly lit stage, its polished surface catching the limited light and casting a warm glow. The slight shimmer of the instrument stands in stark contrast to the dark, empty chairs surrounding it, evoking a sense of anticipation. Its curves and keys are highlighted by a single beam of spotlight, waiting to come to life in the quiet twilight of the auditorium."}
+{"index": "190", "data": "A stainless steel fork with tines pointed upward lies atop a navy blue pencil case with white zippers, bathed in the warm glow of early morning sunlight. The pencil case, slightly ajar, reveals an assortment of colored pencils and a pair of scissors peeking out. The items are resting on a wooden desk, which has the grain pattern gently highlighted by the sun's rays."}
+{"index": "191", "data": "Two worn pairs of leather boots lie haphazardly on a dusty barn floor, their laces tangled and their muddy soles facing outward. They are adjacent to a tall, weathered wooden barrel that stands upright, bearing the marks and scratches of frequent use. The golden light from the setting sun filters through the gaps in the barn's wooden slats, casting elongated shadows that stretch towards the old, creaking doorway."}
+{"index": "192", "data": "A curious vessel, with an architecture reminiscent of a giant green broccoli, basks in the bright sunlight, casting a shimmer across its intricate, leaf-like structures. It floats serenely in the midst of a vast ocean, the water around it sparkling as if sprinkled with diamonds due to the sun's reflection. The horizon stretches endlessly, with the clear blue sky meeting the deep azure of the sea at a distant line."}
+{"index": "193", "data": "An aged wooden nightstand with an elegant finish stands beside a tall bed, supporting an oversized violin that stretches beyond its edges. The gentle evening light cascades through an adjacent window, casting a warm amber glow on the instrument's polished surface. Intricate details on the violin's body glimmer subtly, capturing the essence of its classical beauty."}
+{"index": "194", "data": "At a playground, there are two colorful plushie toys, fashioned to resemble cheerful little characters, securely riding on the backs of small mechanical sheep. These sheep are on a circular track, endlessly circling beneath the warm glow of an orange sunset that blankets the sky. The surrounding play area is dotted with other equipment, but these plushie toys, with their exaggerated smiles and bright button eyes, stand out against the fading daylight."}
+{"index": "195", "data": "As the sun begins its descent in the late afternoon sky, a pair of brown leather boots can be seen tapping against a wooden pier, shedding the remnants of the ocean's saltwater. Nearby, a brightly colored surfboard, decorated with swirls of blue and yellow, stands propped up against a palm tree, basking in the warmth of the sun to dry off. Across the sandy beach, the waves gently lap at the shore, a rhythmic soundtrack to this tranquil scene."}
+{"index": "196", "data": "A vast, metallic extractor, its surface reflecting the pale hues of dawn, towers beside a wooden cello that sits elegantly aglow in the morning light. Both objects are bathed in a gentle, warm ambience as the sun rises, casting long, soft shadows across the floor. The extractor's robust shape and industrial design create a striking visual juxtaposition with the smooth, curved silhouette of the cello."}
+{"index": "197", "data": "In a sunlit kitchen, a vibrant array of green vegetables flourish in a wall-mounted planter positioned directly above a white power outlet. The lush leaves present a stark contrast to the crisp, clean paint of the wall. Sunlight streams in from a nearby window, casting a natural glow that enhances the deep greens of the spinach, kale, and herbs thriving within the urban indoor garden setup."}
+{"index": "198", "data": "Amidst the golden hour's warm glow, a triumvirate of sturdy, vibrant orange carrots lies clustered together, their smooth surface catching the light, on an aged wooden picnic table with a history etched into its grain. Adjacent to these garden-fresh vegetables, two pristine wine glasses with a deep ruby-red brilliance stand side by side, seemingly in anticipation of a celebratory toast. The rustic table is positioned outdoors, where the sun's descending rays cast an amber tapestry over the scene, enhancing the natural beauty and rich colors of the food and drink assembled for an evening feast."}
+{"index": "199", "data": "An aged and weathered workbench is laden with tools and materials, its surface coated with a fine layer of sawdust. A rolled-out tape measure and a wooden ruler lie parallel across the bench, both extending their lengths to aid in an unseen project. Beside these measuring tools, a ceramic ashtray cradles a lit cigar and a smoldering cigarette, their trails of smoke winding upwards. The ashtray is situated to the left of the measuring tools, and behind it, various unfinished woodworks can be glimpsed, suggesting the area is frequented by a craftsman."}
+{"index": "2", "data": "An elegant and modern bathroom featuring a sleek, white rectangular bathtub filled with a froth of soap bubbles. The bathtub rests upon a floor of gray, matte tiles that complement the room's minimalistic design. Against the room's far wall stands a large window that frames the warm, amber hues of a sunset, casting a tranquil glow throughout the space."}
+{"index": "20", "data": "An ornate royal carriage, painted in deep red with golden trim, stands prominently against a landscape blanketed in pristine snow. Behind it, the silhouettes of tall pine trees dusted with white can be discerned through the soft haze of a winter's day. In front of the carriage, the snow-covered ground glistens under the subtle light of the afternoon sun."}
+{"index": "200", "data": "A gleaming stainless steel showerhead mounted on a beige tiled wall, with beads of water steadily dripping from its nozzle onto the glossy white porcelain urinal below. The urinal is flanked by grey dividers, ensuring privacy, and the floor beneath is speckled with small, wet splashes reflecting the dim early morning light filtering through a frosted window just out of view. The metallic fixtures of the showerhead are complemented by the chrome handles and plumbing visible under the urinal, creating a cohesive and functional restroom vignette."}
+{"index": "201", "data": "In the calm and pristine setting of a forest blanketed in snow, three colorful skiboards are positioned vertically against a tree trunk, their pointed ends etched with vibrant patterns and designs standing out against the white backdrop. Beside them, two wooden hockey sticks lie, their shafts crossing over one another in a silent contest of height, both adorned with faded tape near the blade. The smooth, icy surface of a nearby frozen pond can be glimpsed through the trees, hinting at a recent winter sport rendezvous."}
+{"index": "202", "data": "An industrial scene featuring a large, rusty orange excavator with a weathered appearance and a mechanical arm extended. It is in the process of lifting heavy, metallic cubes with a patina of age, loading them into the bed of a weathered dark green pickup truck. The backdrop consists of a dilapidated factory with fading paint and broken windows, evidence of a once-thriving industrial era."}
+{"index": "203", "data": "Inside a bathroom with white tiled walls, a pastel pink toothbrush is propped up next to a towering yellow mop with a fluffy fiber head. Despite their size difference, they both lean casually against the same wall, with the toothbrush appearing diminutive when compared to the mop. The floor is speckled with small water droplets, hinting that the mop may have been recently used."}
+{"index": "204", "data": "In the quiet glow of the afternoon sun, a bright yellow helmet and a dusty green backpack are spotted perched on the roof of a silver SUV. The helmet, with minor scuff marks and a dark visor, sits slightly askew, as if hastily placed there. The backpack, partially unzipped with a water bottle peeking out, leans against the vehicle's black roof rack, hinting at a recent adventure or a pause in a journey."}
+{"index": "205", "data": "In a tranquil forest clearing by the water's edge, a large orange tent with its entrance zipped shut is pitched on a grassy knoll overlooking a serene lake. A few steps away from the tent, on the forest floor scattered with autumn leaves, a circular gold bracelet catches the sunlight, creating a subtle sparkle amid the natural surroundings. The nearby trees cast gentle shadows over the area, enhancing the peaceful ambiance of the outdoor scene."}
+{"index": "206", "data": "In a dimly lit room, a stark black blackboard and a pristine white whiteboard are mounted next to each other on a pale wall, creating a striking contrast. The whiteboard is clean, while the blackboard has faint remnants of chalk dust. Directly below them, on a brown wooden table, rests a single pink folder, its bright color standing out in the quiet shadow of the evening."}
+{"index": "207", "data": "A contemporary bathroom features a white hanging shelf, securely mounted on a pale blue wall, showcasing an assortment of toiletries. The items include a glass container of cotton swabs, a small plant in a terracotta pot, and an array of bottles with chrome caps reflecting the soft overhead light. Below the shelf, a plush cotton towel, perfectly fluffed and of a gentle lavender hue, hangs elegantly over a polished chrome towel bar."}
+{"index": "208", "data": "On the lush green field, five orange cones are neatly arranged in a line. Nearby, three brown leather American footballs are scattered, their white laces and dimpled texture visible against the vibrant grass. In the background, the white lines marking the playing field add to the sense of organization and the sport being played."}
+{"index": "209", "data": "In the soft glow of the waning daylight that filters through the window of an old-fashioned boutique, a tie with a classic diamond pattern rests elegantly on top of a pair of well-used skater sneakers. The sneakers exhibit a history of adventures in their scuffed edges and faded canvas. Set against a backdrop of antique furnishings and eclectic trinkets, the tie's smooth texture contrasts with the rough fabric of the shoes."}
+{"index": "21", "data": "An elegant, lustrous emerald green necktie is carelessly strewn across the slick, polished surface of a dark mahogany dining table. The fine silk texture of the tie contrasts with the wood's deep, rich grain. Beside the tie, a scattering of loose papers and a silver pen give an impression of a hasty departure, perhaps after a morning of work or a formal event."}
+{"index": "210", "data": "An elegant array of footwear with ten pairs of high heels standing prominently, their height casting slender shadows on the vintage wooden panel backdrop that suggests they are in the midst of a boutique's display. These heels, in hues ranging from deep crimson to glossy black, distinctly tower over the organized collection of other shoes that occupy the lower shelves. The soft, diffused light filtering through the boutique's windows indicates it's late afternoon, which gently illuminates the shoes' textures and silhouettes against the retro wood paneling."}
+{"index": "211", "data": "Four bright yellow slippers are neatly aligned on a beige carpet, their fuzzy texture suggesting warmth and comfort. Next to them, a pair of elegant burgundy bow ties with a silk finish lie gracefully, hinting at a formal event. The color contrast between the vivid slippers and the refined bow ties creates a unique and curious combination."}
+{"index": "212", "data": "In a spacious room, an ornate vintage clock with a large, circular face and roman numerals stands prominently against a pastel-colored wall, its intricate hands pointing to the time. Beside it, a small, oval-shaped mirror with an elegant antique frame hangs neatly, reflecting a portion of the room’s interior. The contrast in size between the commanding presence of the clock and the modest mirror creates a unique focal point in the space."}
+{"index": "213", "data": "A delectable chocolate cake, light and fluffy in texture, with a rich dark brown hue, sits atop an ornate wooden table that features intricate carvings on its edges. To the right of the cake, a pristine white plate cradles seven plump sausages that bear the marks of grilling, their skins a dark, crispy brown contrasted against the plate's stark whiteness. Behind this culinary display, the backdrop reveals a contemporary kitchen characterized by sleek stainless-steel appliances and a smooth marble countertop bathed in the warm glow of pendant lighting."}
+{"index": "214", "data": "Under the soft glow of a rising sun, a round jade-colored table supports six freshly steamed baozi, their white wrappers slightly translucent, emitting tender wisps of steam. Neatly accompanying them are four ice cream cones, each boasting a different, vivid hue, ranging from the deep purple of blackberry to the cheerful yellow of mango. The morning light accentuates the contrast between the warm fog lifting from the baozi and the frosty sheen on the scoops of ice cream."}
+{"index": "215", "data": "A deep red rose with plush petals sits elegantly coiled atop an ivory, intricately patterned lace napkin. The napkin rests on a rustic wooden table that contributes to the charming garden setting. As the late evening sun casts a warm golden hue over the area, the shadows of surrounding foliage dance gently around the rose, enhancing the romantic ambiance. Nearby, the green leaves of the garden plants provide a fresh and verdant backdrop to the scene."}
+{"index": "216", "data": "A striking dishwasher with a glossy, rainbow-colored exterior stands out in a neatly-arranged kitchen with white marble countertops. Inside the open dishwasher, five stainless steel pots are being methodically cleaned by the powerful water jets. Surrounding the dishwasher are orderly arranged kitchen utensils and a spice rack filled with an assortment of colorful spices in clear glass jars."}
+{"index": "217", "data": "On a rainy day, three umbrellas with bright and varied colors—yellow, red, and blue—are opened wide and positioned upright on a worn, wooden table. Their fabric canopies are dotted with fresh raindrops, capturing the soft, diffused light of a hazy morning. Beside these umbrellas lies a classic round watch with a leather strap and a polished face that reflects the muted light. The watch and umbrellas share the table's space, hinting at a paused moment in a day that has just begun."}
+{"index": "218", "data": "As the sun sets, casting a warm glow over the playground, a striking bright orange baseball bat lies half-buried in the sandy ground near the curved metal slide. The slide, painted in bold primary colors, forms a perfect arc that glistens with the residual daylight. Sparse footprints around the area hint at earlier games and laughter, now quiet in the evening's calm."}
+{"index": "219", "data": "A vibrant green calculator rests neatly beside a trio of pristine white board erasers, neatly organized on a spacious, uncluttered wooden desk surface. The desk's polished finish reflects the soft, ambient light of the room, highlighting the contrast between the calculator’s buttons and its body. To the right of the calculator and erasers, there is ample space for writing materials or additional office supplies, suggesting an environment conducive to productivity and order."}
+{"index": "22", "data": "In the midst of a vibrant garden, a cylindrical green cup stands alone on a stone path, its surface reflecting the bright afternoon sunlight. The cup, with a smooth finish, is surrounded by blossoming flowers and lush greenery. The shadows of nearby plants dance on the cup as gentle breezes sway their leaves."}
+{"index": "220", "data": "Three glossy billiards balls, each distinctly colored in solid hues of red, yellow, and blue, are captured in a dynamic roll across a vibrant green felt billiards table. In stark contrast, a pair of granite curling stones with colored handles rest motionless nearby, their polished surfaces reflecting the soft, ambient lighting of the room. The billiards balls, significantly smaller in size compared to the hefty curling stones, navigate the expanse of the table with ease and precision."}
+{"index": "221", "data": "An outsized dolphin with a sleek, gray body glides through the blue waters, while a small, fluffy chicken with speckled brown and white feathers stands on the nearby sandy shore, appearing diminutive in comparison. The dolphin's fins cut through the water, creating gentle ripples, while the chicken pecks at the ground, seemingly oblivious to the vast size difference. The stark contrast between the dolphin's smooth, aquatic grace and the chicken's terrestrial, feathered form is highlighted by their proximity to one another."}
+{"index": "222", "data": "A well-worn traditional fire tong with a darkened, metal finish rests gently beside a contemporary stainless steel extractor on a rugged, aged wooden table. The table's surface reveals the texture of the grain and bears the marks of frequent use. Sunlight pours through a nearby window, casting a soft, warm glow that highlights the contrast between the old and the new kitchen tools."}
+{"index": "223", "data": "Amidst the darkness of a quiet room, illuminated by the ethereal blue light of a holographic display that projects a visualization of cyberspace, sits a sleek cell phone and a modern router/modem on a polished wooden desk. The intricate patterns of the wood grain are faintly visible under the neon glow, which casts soft reflections on the devices' surfaces. The router/modem, with its blinking indicator lights, stands beside the phone, which shows a clock on its screen indicating the late hour."}
+{"index": "224", "data": "Three exquisitely crafted violins, with their elegant F-shaped sound holes and smooth, glossy wooden surfaces, rest gently beside a grand piano. The piano's polished ebony finish reflects the soft, gentle curves of the violins that lie on a velvet-lined case. Behind them, the piano's open lid reveals its intricate strings and hammers, inviting a symphony of sounds to be played."}
+{"index": "225", "data": "In the middle of a cozy room with a vintage charm, a circular wooden dining table takes the stage, its surface adorned with a decorative vase and a few scattered books. The room's warmth is maintained by an old-fashioned radiator humming steadily in the corner, a testament to its long service. As dusk approaches, the waning sunlight softly permeates the space through a window with a delicate frost pattern, casting a gentle glow that enhances the room's rustic ambiance."}
+{"index": "226", "data": "On a worn-out stretch of city pavement, a bent, brown cigar lies discarded, its rough texture contrasting against the dull gray concrete. Nearby, a rust-covered key with intricate grooves rests haphazardly, hinting at long-forgotten locks and doors. The pavement's surface is littered with small pebbles and debris, telling tales of the bustling urban life that treads over it each day."}
+{"index": "227", "data": "On the soft, warm sand of the beach, a fluffy white rabbit with rounded ears is caught in a curious moment, gently placing its paw on the ribbed surface of a pink scallop shell. The scallop, slightly open, reveals its smooth interior contrasting with its coarse outer texture, while hues of pink and orange from the setting sun reflect off its surface. There's a tranquil ocean backdrop with the gentle ebbing of the tide, and the fading daylight casts a golden glow over the scene, highlighting the rabbit's soft fur and the shell's subtle color."}
+{"index": "228", "data": "A cozy bathroom features a pristine, white claw-foot bathtub on a backdrop of pastel green tiles. Adjacent to the tub, a tower of soft, white toilet paper is neatly stacked, glimmering gently in the diffuse glow of the afternoon sunlight streaming through a frosted window. The gentle curvature of the tub contrasts with the straight lines of the stack, creating a harmonious balance of shapes within the intimate space."}
+{"index": "229", "data": "Three vibrant red fire extinguishers stand out prominently against a backdrop of intense orange flames that are consuming the scene. The fire extinguishers, with their glossy, metallic surfaces, appear gigantic in comparison to the diminutive five sheets of white notepaper scattered haphazardly around the area. The notepapers bear slight curling at their edges, suggesting subtle warping from the heat of the surrounding inferno."}
+{"index": "23", "data": "A vivid pair of crimson, round headphones rests on the smooth surface of a transparent glass table. The glass reflects the gentle glow of the dim morning light, which filters through a nearby window. Around the headphones, there's a scattering of paper and pens, hinting at a quiet workspace that is not currently in use."}
+{"index": "230", "data": "The dresser is adorned with a vibrant pink lipstick tube that stands out against the dark wood finish. Beside it, two necklaces with glittering pendants capture the waning light of the evening, each sparkle enhancing the jewelry's intricate details. The necklaces are elegantly draped over a small stand, creating a luxurious display on the otherwise unadorned surface of the dresser."}
+{"index": "231", "data": "A brightly colored hot air balloon with vibrant stripes of red, yellow, and blue hangs in the clear sky, its large round shape contrasting against the fluffy white clouds. Below it, a sleek black scooter with red accents speeds along a concrete pathway, its rider leaning forward in a hurry. The balloon moves at a leisurely pace, starkly contrasting with the frenetic energy of the scooter's rapid movement on the ground."}
+{"index": "232", "data": "A vividly colored balloon with hues of red and blue hovers gently above a wooden cosmetics table. The table holds a delicate synthetic bristle brush dusted with powder to one side and a sleek black eyeliner pencil lying parallel to the brush. Nearby, an open compact mirror reflects the floating balloon, adding a sense of depth to the scene."}
+{"index": "233", "data": "Three sleek black tripods standing in a row, their legs slightly splayed on a grey, granulated floor for stable support. Positioned prominently in front of them is a compact, silver remote control with a smooth, metallic finish. The arrangement suggests a photographic studio setup, with the remote likely used to control the cameras mounted on each tripod."}
+{"index": "234", "data": "An old, vibrant red heavy truck with visible signs of wear and weathering stands stationary near a weathered stop sign of the same color. The truck is parked on a street lined with historical buildings featuring peeling paint and brick facades that speak to the rustic charm of the town. The scene is further characterized by cobblestone pathways and antique street lamps, suggesting a place that has withstood the passage of time."}
+{"index": "235", "data": "Amidst the soft hues of twilight, two towering traffic lights preside over a bustling intersection, casting a brilliant scarlet glow that demands the attention of all nearby. Beneath their authoritative presence, a modest yellow crosswalk sign attempts to assert its own importance, though its illumination is meek in comparison. The red lights reflect faintly on the glossy hoods of cars waiting patiently for the signal to change, while the pedestrian lines lie painted starkly on the dark asphalt below."}
+{"index": "236", "data": "A frisky golden retriever with a shiny, shaggy coat stands next to a life-sized penguin statue with a sleek, glossy surface in the midst of a bustling public park. The dog, with its tongue playfully hanging out, seems to be in mid-bark or mid-laugh, directed at the stoic, black and white penguin which stands in stark contrast to the dog's exuberant pose. The park is bathed in the warm glow of the afternoon sun, casting long shadows on the green lawn speckled with dandelions. Nearby, children's laughter can be heard as they play, oblivious to this charming and whimsical scene."}
+{"index": "237", "data": "A quiet scene is set within a busy commercial kitchen, where stainless steel surfaces are bustling with activity. In one corner, a gleaming white porcelain cup sits beside a large stainless steel basin, both impeccably clean and ready for their next purpose. The walls, clad in white subway tiles, reflect the glimmer of overhead lights, giving the room a bright, active atmosphere. In the background, the hum of ovens and stovetops blends with the rhythmic chopping of diligent cooks preparing for the morning rush."}
+{"index": "238", "data": "On the cool, silken sands of a deserted beach, a pair of blue sandals lies side by side, their straps glistening gently in the bright moonlight. Next to the sandals, a white protective face mask is placed neatly, its ear loops partially buried in the fine grains of sand. The tranquil nocturnal shoreline stretches far into the distance, with the rhythmic sound of waves creating a peaceful backdrop for the inanimate companions."}
+{"index": "239", "data": "In the quiet of the evening, a clean, square tissue box with a floral pattern rests atop a folding table beside a pair of modern, metallic washing and drying machines. The laundry room is brightly illuminated, highlighting the soft blue hue of the machines and the sparkling cleanliness of their chrome accents. Along the wall, rows of wooden shelves hold neatly folded towels and various cleaning supplies."}
+{"index": "24", "data": "Five red hockey sticks, each with a slender shape and a worn texture from frequent use, are propped against the frosty rink's white boards. The sky above casts a dim, featureless gray light over the area, accentuating the early morning stillness. On the icy surface of the rink, illuminated by the ambient outdoor lighting, the sticks' shadows form elongated silhouettes, showcasing their readiness for the day's practice."}
+{"index": "240", "data": "On a sturdy, wooden drafting table lies a vintage brass scale, gleaming with a polished finish alongside an extended yellow tape measure casually strewn across the table's surface. The table, crafted from a rich, dark hardwood, distinctly contrasts with stark white blueprints and rolled-up scrolls flanking the scale. The tape measure is partially coiled, with black and red measurement markings clearly visible and stretches out towards the edge of the table, indicating precise dimensions for an ongoing project."}
+{"index": "241", "data": "On a high exterior wall, two large white air conditioning units sit securely bracketed, their vents showing signs of weathering from constant exposure to the elements. Beside them, a rail mounted to the wall supports five sleek black hangers, their long forms casting faint shadows under the faint glow of the nearby street lamp. Above, the dark night sky stretches endlessly, with stars twinkling subtly far in the distance."}
+{"index": "242", "data": "Amidst a hazy urban setting enveloped by a soft gray mist, a bright red fire truck speeds forward, sirens blaring and lights flashing. Its large wheels churn as the truck speeds past, sending a line of bright orange traffic cones tumbling in disarray. Behind the fire truck, the blurred shapes of city buildings loom, adding to the urgency of the scene as the vehicle rushes to an unseen emergency."}
+{"index": "243", "data": "Inside a peaceful home, the kitchen area features a stainless steel faucet, its sleek surface catching the light, towering just above a straw broom with bristles slightly frayed from use. The broom leans casually against a cream-colored wall, adjacent to the polished granite countertop that houses the sink and faucet. Around them, the kitchen is filled with other everyday utensils and appliances, creating a sense of domestic normalcy."}
+{"index": "244", "data": "Three delicious-looking sandwiches, overflowing with fresh lettuce, crisp cucumber, and ripe avocado slices, are neatly aligned on a sleek, glass-top table. Each sandwich is encased in a golden-brown bread that has been lightly toasted to perfection. The sunlight pouring through a nearby window casts a warm glow, accentuating the vibrant colors of the vegetables and making the table's glass surface shine with reflected light."}
+{"index": "245", "data": "In a clear blue tropical sea, a ripe yellow banana bobs on the gentle waves alongside a brown, hairy coconut. The fruit duo is surrounded by vibrant coral visible beneath the water's surface. Near the horizon, one can spot a small island with lush green palm trees swaying in the breeze."}
+{"index": "246", "data": "A vivid crimson cuboid-shaped recorder is positioned upright beside a circular, reflective CD with a transparent tone, resting delicately on a polished mahogany surface. The layout is complemented by a collection of other musical paraphernalia scattered around, including a pair of headphones with soft, cushioned earpieces and a stack of assorted CDs in colorful cases. The pristine surface beneath them reflects the gleaming light, highlighting the contrast between the sharp angles of the recorder and the smooth, rounded edges of the CD."}
+{"index": "247", "data": "A sleek, silver hair dryer and a pristine white bar of soap are neatly placed next to one another on a marble bathroom countertop. The backdrop of the setting sun casts a warm, orange glow through the window, illuminating the array of toiletries and fluffy towels in soft light. The reflective surface of the countertop enhances the soothing ambiance created by the sun's fading rays."}
+{"index": "248", "data": "The countertop hosts a vibrant red microwave that towers over a smaller, deep green coffee machine sitting beside it. The microwave has a sleek digital display, while the coffee machine features an array of buttons and a glass carafe. Behind these appliances, a white tiled backsplash completes the kitchen scene."}
+{"index": "249", "data": "A pristine white baseball with red stitching is captured mid-air, as it's forcefully struck against a backdrop of a dusky evening sky showing hues of orange and purple. Below, a geometrically intriguing hexagonal game board with multicolored spaces rests atop the dark wood of an antique desk. The elegant desk shows the patina of age and is intricately carved, lending a sense of history and dignity to the scene."}
+{"index": "25", "data": "A robust pigeon, with grey and white feathered plumage, sits comfortably on the sturdy branch of a venerable oak tree, replete with sprawling arms and knotted bark. Below, the mossy roots of the tree stretch out into the cobblestone paths of a charming village, where small, thatched-roof cottages neighbor each other. Sunlight dapples through the dense leaf canopy above, casting playful shadows on the scene below."}
+{"index": "250", "data": "An expansive airport terminal alive with the hurried steps of travelers, featuring white tiled floors and large glass windows revealing views of parked airplanes. Overhead, a lattice of metal beams supports a ceiling from where seven dome-shaped black surveillance cameras keep a vigilant eye on the bustling concourse. Amidst the flow of people, rows of blue and grey seating areas are interspersed with digital flight information displays."}
+{"index": "251", "data": "A small, round glass flask sits filled with a brightly colored, luminous potion on an aged wooden tabletop, its contours clear and sharp. The flask seems tiny in comparison to the massive stainless steel rice cooker positioned in the corner of the room, its steam vent puffing gently as it diligently prepares a sizeable meal for a nocturnal feast. The tabletop's surface is scattered with a few pieces of parchment and an assortment of dried herbs, which adds to the contrast between the delicate glassware and the robust kitchen appliance."}
+{"index": "252", "data": "In the depths of a vibrant underwater scene, a large, dark red fish glides gracefully through the water, its scales glistening with the filtered sunlight from above. Surrounding the fish is a lively coral reef, bustling with an array of corals in striking hues of purple, yellow, and green, their unique and intricate forms providing a stunning backdrop. Tiny, iridescent fish dart around the nooks of the corals, adding to the dynamic and rich tapestry of marine life inhabiting this tranquil aquatic world."}
+{"index": "253", "data": "An interior wall is fitted with four square-shaped power outlets, uniformly aligned in a horizontal row directly above a ruby red, round stool. The wall's paint is a soft cream color, creating a contrast with the vibrant red of the stool. The stool's smooth texture and glossy finish reflect the ambient light, emphasizing its bold hue in the otherwise neutral-toned room."}
+{"index": "254", "data": "A gleaming red sports car with aerodynamic curves and a polished finish stands out against the backdrop of the subdued twilight. Beside it rests a sleek black bicycle, its slender frame casting a long shadow on the concrete as the day gives way to night. The street is devoid of pedestrians, offering a tranquil scene with the vehicles motionless under the fading light."}
+{"index": "255", "data": "On a bright day, a trio of sunshine yellow shrimp can be seen scuttling about the base of a striking zebra, its coat a contrast of black and white stripes reminiscent of a starry sky without a moon. The zebra stands casually on a patch of vibrant green grass, seemingly unfazed by the small crustaceans exploring around its hooves. The sun casts a warm glow on the scene, enhancing the vivid colors and casting short shadows on the ground beneath the animals."}
+{"index": "256", "data": "A metal ladder, silver and slightly weathered, stands against a partially-built brick wall within a construction site. Nearby, a sturdy shovel with a wooden handle and a dirt-stained blade leans against a mound of earth. Small clumps of soil spill onto the ground beside it. In the background, the silhouette of an unfinished building looms against the night sky, its shape barely visible under the faint glow of a distant street lamp."}
+{"index": "257", "data": "Amid the soft glow of twilight, a lone deer with a coat of warm brown and subtle spots stands still on the grassy bank of a tranquil lake. As it looks on, five geese with bright white feathers and orange beaks are captured in a moment of energetic flight, their wings beating in unison as they rise from the water's edge. The backdrop is painted with the serene surface of the lake, which reflects the blushing sky and the silhouettes of distant trees."}
+{"index": "258", "data": "An image captured at dusk where the warm glow of the setting sun can be seen reflecting off the stainless-steel exterior of a pan that sizzles with seven perfectly round meatballs. Adjacent to the pan, on a hot grill, two thick, marbled steaks emit an appetizing aroma as they cook to a medium-rare finish. Random drops of oil pop and dance on the heated surfaces, signifying the heat at which these meats are being prepared."}
+{"index": "259", "data": "As the first rays of the morning sun spill over the market, a glimmer catches the eye from beneath a stark white awning, where a series of five ornate, golden cosmetic mirrors are carefully arranged. Each mirror, varying in size from small handheld to large tabletop versions, reflects the vibrant hustle and bustle of the early market goers. The stands surrounding the mirrors boast an array of colors from the goods for sale, highlighting the diverse cultural tapestry of the community."}
+{"index": "26", "data": "A gray donkey with a tuft of dark mane lies tranquilly beneath the expansive branches of a large oak tree that sits at the edge of a meandering river. The river's crystal-clear waters reflect the lush greenery of the banks and the vibrant blue sky above. In the background, the gentle slope of the riverbank is dotted with wildflowers and tall grasses, creating a picture of pastoral serenity."}
+{"index": "260", "data": "The scene is set on an unpolished wooden tabletop where the contrasting textures of a faded pink eraser, showing signs of frequent use, and a chrome-finished screwdriver with a glossy handle catch the eye. Both items are basked in the glow of the midday sun, highlighting the fine layer of dust that covers the table's surface. The screwdriver lies parallel to the table's edge, while the eraser is placed haphazardly near a scattering of pencil shavings."}
+{"index": "261", "data": "A playful scene unfolds as a monkey with auburn fur cavorts amidst a trio of ducks on the bank of a tranquil pond. The setting sun bathes the area in a soft, golden glow, casting elongated shadows on the ground. Each duck, with its glossy feathers reflecting the light, pecks at the grass while the monkey's agile form is silhouetted against the amber sky."}
+{"index": "262", "data": "An eye-catching bright red megaphone rests on its side, situated in close proximity to a sleek black microphone that stands upright on a dark stage. The stage itself is equipped with various electronic devices and cables running across its surface, hinting at the preparations for an upcoming event. The microphone, with its polished metal finish, gleams under the stage lights, waiting to project the voice of the speaker into the night."}
+{"index": "263", "data": "In the expansive grasslands, bathed in the orange hues of the setting sun, a tall giraffe gracefully bends its long neck toward a tranquil pond to sip water. Beside the pond, a small, bright red crab scuttles amongst the stones and reeds at the water's edge. The pond’s surface reflects the myriad colors of twilight, while in the background, the silhouettes of acacia trees stand against the dimming sky."}
+{"index": "264", "data": "A small, teardrop-shaped candle with a pale blue hue graces the surface of a large, cubic storage box. The box itself features a textured, glossy white finish and sits squarely in the corner of a room. To the side of the box, there's a stack of neatly folded towels in varying shades of beige and cream."}
+{"index": "265", "data": "A majestic white crane with outstretched wings captured in the act of taking flight from a patch of green grass. In the foreground, an ambulance emblazoned with vibrant red crosses races past, its siren lights ablaze with urgency against the evening sky. The cityscape beyond is silhouetted by the fading hues of dusk, with the outlines of buildings casting long shadows as the day comes to a close."}
+{"index": "266", "data": "An old-fashioned kitchen setting with a cast-iron kettle and a ceramic teapot sitting atop a rough-hewn, wooden table that bears the marks and patina of age. The kettle's metallic surface has a dull gleam, reflecting the warm ambient light, while the teapot, adorned with a floral pattern, adds a touch of nostalgia to the setting. In the background, there is a window with curtains partially drawn, allowing for a soft natural light to fill the room. Nearby, a woven basket filled with dried flowers accentuates the rustic charm of the cozy interior."}
+{"index": "267", "data": "A vintage rectangular carriage, with intricate metalwork and wooden panels, occupies the foreground, resting on the uneven cobblestone street. Above it, the sky is overcast with dark, billowing clouds that seem to loom ominously. Nearby, a sleek, rounded hoverboard with a glossy finish gracefully glides in circles around the carriage, its futuristic design contrasting sharply with the historical vehicle."}
+{"index": "268", "data": "On a rustic wooden table, a vibrant purple paintbrush, significantly larger than the standard size, lies beside a pair of silver-grey pliers with a matte finish. The bristles of the paintbrush appear soft and well-used, indicating a tool that has seen many a canvas, while the pliers show signs of wear, with slight scratches and smudges from frequent handling. The contrast between the artistic instrument and the utilitarian tool is striking against the natural grain of the wooden surface."}
+{"index": "269", "data": "On a marble countertop, three colorful cleaning product bottles are neatly arranged next to a stainless steel sink that glints under the gentle morning sunlight. The blue, green, and yellow bottles contrast with the cool metallic sheen of the sink and the light reflects softly off their smooth surfaces. Behind the sink, the wall is adorned with white subway tiles that add a touch of simplicity to the setting."}
+{"index": "27", "data": "Three sleek silver wheelchairs, each featuring square-shaped seats and metallic frames, are arranged in a tidy row. The wheelchairs, with their black cushioned seats and shiny armrests, sit on a smooth gray concrete floor. The background reveals a soft beige wall, suggesting the setting may be a clinical or therapeutic environment."}
+{"index": "270", "data": "A deserted park scene illuminated by a soft moonlight where an orange frisbee lies on the grass, slightly tilted to one side. Nearby, a wooden cello and its bow rest in solitude against a weathered park bench, their elegant forms casting long shadows on the pavement. The surrounding trees sway gently in the breeze, indifferent to the forgotten items left in the wake of an earlier emergency rehearsal."}
+{"index": "271", "data": "In a room bathed in the warm glow of the late afternoon sun, a single large golden camera sits prominently on a desk. This camera, with its polished metallic finish, outshines and is notably bigger than the two smaller silver monitors positioned on either side of it. The edges of the monitors are reflecting the soft light, creating a contrast with the camera's shining surface."}
+{"index": "272", "data": "An aged and quaint room, lined with crinkled wallpaper, houses a row of four spherical, silver projectors resting on a weathered shelving unit at the rear. These projectors cast bright, focused beams of light toward the room's center, where an expansive antique oak desk sits solemnly. On the desk’s polished surface, three electronic keyboards, each with a different design and layout, are neatly arranged, waiting to be played."}
+{"index": "273", "data": "Three white golf balls are precisely placed on the black, moving conveyor of a large treadmill. The golf balls, significantly smaller in scale, appear almost like tiny planets gliding along the treadmill's expansive surface. The ambient light casts a soft glow on the scene, accentuating the contrast between the smooth texture of the golf balls and the textured belt of the treadmill, as the treadmill operates in a room with fading daylight filtering through a nearby window."}
+{"index": "274", "data": "An open pencil case with a variety of colored pencils strewn about lies adjacent to a set of sleek, black binoculars resting on the smooth surface of an aged oak desk. The desk, marked by the patina of use, features intricate grain patterns and is sprinkled with fine papers and an antique brass lamp. The combination of school supplies and exploratory equipment suggests a space dedicated to both study and adventure."}
+{"index": "275", "data": "An immense grey elephant with gently flapping ears lumbers towards a bubbling stream, a stark contrast against the lush green grass. Meanwhile, in the cool, reflective waters, a sleek black swan glides gracefully, its red beak standing out against its dark feathers. The gentle ripples caused by the swan's movement disrupt the reflection of the overhanging trees on the water's surface."}
+{"index": "276", "data": "A rustic, warm-toned wooden table holds a white ceramic plate piled high with steaming dumplings, the pleats carefully crimped, indicating handcrafted care. Next to it sits a round, earthy-toned bowl filled with ripe, purple plums, their skins glossy and taut. The gentle glow of the setting sun casts a soft light over the scene, illuminating the golden wheat fields and a distant barn in the backdrop, painting a picturesque countryside tableau."}
+{"index": "277", "data": "A modern kitchen vignette where a sleek, black induction cooker is placed on a polished granite countertop. Its surface glows as it heats a stainless steel pot, from which tendrils of steam rise, indicating a simmering soup within. Adjacent to the cooker, a vibrant red blender is in operation, its contents swirling at high speed, presenting a dynamic contrast to the tranquil act of the soup slowly cooking. The countertop around these appliances is dotted with various culinary tools and ingredients, embodying the lively activity typical of meal preparation time."}
+{"index": "278", "data": "In the early hours of the day, a spacious, brightly-lit public restroom showcases two gleaming white toilets with shiny silver flush handles neatly aligned against a pale blue-tiled wall. Nearby, a pristine white urinal, also spotless and ready for use, is equipped with an automatic flush sensor. The floor beneath these fixtures is polished to a high shine, reflecting the overhead lights, and the entire space is absent of any visible debris or grime, emphasizing the meticulous cleanliness of the facility."}
+{"index": "279", "data": "Three red wine glasses, casting shimmering reflections from the morning sunlight, are evenly filled to the halfway point with a deep crimson liquid. Beside them, four matte purple plates rest unoccupied on a smooth, dark wooden dining table. The arrangement of the tableware suggests a pause in a meal, or the anticipation of company yet to arrive."}
+{"index": "28", "data": "A beautiful red kite, with a pattern of yellow suns and blue moons on its surface, floats effortlessly against the backdrop of a pristine blue sky. Its square shape is outlined by a strong frame, and long, flowing tails that dance whimsically in the wind. Below, the soft golden sands of the beach stretch out, creating a striking contrast with the kite's vivid colors."}
+{"index": "280", "data": "A modern, spacious room bathed in natural sunlight, featuring a clean white wall and a hardwood floor with a subtle sheen. In the center, a pair of white sneakers sits side-by-side, neatly positioned on the floor. Wrapped around them is a single brown leather belt with a polished buckle that glints in the light, creating a sense of order within the space."}
+{"index": "281", "data": "A modern kitchen scene where a stainless steel gas stove ignites with a soft blue flame, casting a warm glow on the surroundings. Close by on the granite countertop rests a green glass bottle, which reflects the flickering light, creating subtle reflections around it. The stove is surrounded by various cooking utensils and a spice rack filled with an assortment of colorful spices."}
+{"index": "282", "data": "Two slender bamboo-colored chopsticks lie diagonally atop a smooth, round wooden cutting board with a rich grain pattern. The chopsticks, tapered to fine points, create a striking contrast against the cutting board's more robust and circular form. Around the board, there are flecks of freshly chopped green herbs and a small pile of julienned carrots, adding a touch of color to the scene."}
+{"index": "283", "data": "A small, silver lighter with a gentle flame flickering at its tip is placed beside a large, gleaming golden trophy on a polished wooden table. The trophy is intricately designed, with handles on each side and an engraving that signifies some kind of sporting achievement. The wooden table's surface reflects a faint glow from the lighter's flame, adding a subtle warmth to the surrounding cool metallic and wooden textures."}
+{"index": "284", "data": "Several large, cylindrical metal barrels, stacked in a pyramid formation, stand under a grey, overcast sky. Adjacent to these barrels is a massive, dark-colored SUV with tinted windows, both resting on the cracked pavement of an abandoned gas station. The desolate scene is illuminated by the diffuse light of the midday sun, which barely seeps through the thick cloud cover above."}
+{"index": "285", "data": "In a vibrant green field, a cinnamon-colored cat with stripes is in full sprint, its body low to the ground as it chases after a spherical toy designed to look like a charcoal-colored antelope. The toy is rolling unevenly across the terrain, causing tufts of grass to sway in its wake. The cat's fur ripples with each agile movement, and its intense focus is evident even at a distance."}
+{"index": "286", "data": "Under the soft hues of twilight, a standard black stapler can be seen next to a robust, orange and metallic chainsaw. Both items are cast in the gentle light, creating elongated shadows on the ground beneath them. The textures are distinctly different – the stapler's smooth, plastic finish contrasts sharply with the rugged, seemingly worn appearance of the chainsaw. They sit unusually together on a patch of grass still holding onto the day's warmth."}
+{"index": "287", "data": "In an expansive field, a chestnut horse with a flowing mane is captured in the midst of a powerful buck, its hooves churning up tufts of green grass. Adjacent to the field, a playful seal can be seen in a clear blue pond, its slick body arcing gracefully as it performs an energetic flip, splashing water droplets into the air. The surrounding area is dotted with wildflowers and a wooden fence encloses the field, maintaining a boundary between the terrestrial exuberance of the horse and the aquatic acrobatics of the seal."}
+{"index": "288", "data": "In the bathroom, a sleek circular metal sink reflects the soft overhead lighting, creating a serene and clean atmosphere. Next to the sink, three cylindrical bars of green soap are meticulously lined up, each embossed with intricate patterns, waiting to be used. The soaps rest upon a white marble countertop that contrasts with the cool, brushed finish of the sink."}
+{"index": "289", "data": "A luxury bathroom featuring a pristine white bathtub with a soft, fluffy white towel neatly folded on its edge. Resting next to the towel is a bar of pink, scented soap poised on a ceramic soap dish. The tub is situated near a frosted window that allows for natural light to fill the room, highlighting the clean lines and elegant fixtures of the space."}
+{"index": "29", "data": "A picturesque bridge bathed in the warm glow of the early morning sun, flanked by two tall antique street lights. The street lights, with their ornate metalwork and frosted glass, cast elongated shadows across the weathered stone pathway of the bridge. The tranquil scene is further accentuated by the absence of pedestrians, giving the impression of a moment frozen in time just after dawn."}
+{"index": "290", "data": "A home office scene reveals a cylindrical, light-gray extension cord neatly coiled around the base of a sleek, black printer. The intricate texture of the cord contrasts with the printer's smooth, glossy surface. Near the printer, an array of paperwork is scattered, and the faint moonlight streaming through the window casts a soft glow on the scene at midnight."}
+{"index": "291", "data": "An aged crimson oven occupies the corner of a rustic kitchen, its window revealing the golden-brown crust of bread as it bakes within. Next to it, a towering, polished metallic spoon leans against a weathered brick wall, reflecting the soft kitchen light. Nearby, a scattering of flour and a wooden rolling pin rest on a worn, marble countertop."}
+{"index": "292", "data": "A rustic wooden table bathed in soft afternoon sunlight, showcasing a hearty, crusty loaf of freshly baked brown bread alongside a richly colored, firm purple eggplant. The textures of the bread's crackled crust juxtapose with the eggplant's smooth, glossy skin. Nearby, a folded linen napkin and an assortment of herbs suggest preparations for a savory meal."}
+{"index": "293", "data": "At a bustling farmer's market stall, a ripe hamimelon with its signature netted green rind and succulent orange flesh lies beside a crisp head of lettuce, which displays vibrant green leaves. The two fresh produce items bask under the diffuse amber light of the late afternoon sun, which casts a soft glow over the array of fruits and vegetables. Around them, an assortment of seasonal goods is neatly arranged on wooden tables, inviting customers to sample the local harvest."}
+{"index": "294", "data": "A robust, honey-toned wooden ladder resting against an ivory wall which is illustrated with an array of whimsical, pencil-drawn doodles depicting imaginative scenes. The doodles range from swirling galaxies to playful characters scattered across the wall's expansive canvas. The texture of the wood of the ladder contrasts with the smoothness of the wall, emphasizing the artisanal creativity imbued in the space."}
+{"index": "295", "data": "In a spacious room, three brown wooden coffee tables of varying sizes stand upon a large, ornate red carpet with intricate patterns. The tables are arranged in a semicircular fashion, supporting an array of decorative items including small potted plants and coasters. The plush texture of the carpet contrasts with the smooth, polished surface of the tables, creating a harmonious visual effect within the space."}
+{"index": "296", "data": "A sleek, crisp white Formula 1 car with sponsor logos emblazoned on it is parked upon a pier with smooth, polished marble slabs reflecting the sun's gleam. Beside the car, gently swaying on the clear azure waters, is a rustic wooden boat with weathered planks and faded paint. The boat's small bobbing motion contrasts with the stillness of the powerful racing car, making for an unusual yet fascinating combination at the water's edge."}
+{"index": "297", "data": "In a homey living area, against the backdrop of a pastel-hued wall, a sturdy mahogany side table stands adjacent to a plush, large teal couch. The side table features a lamp with a cream shade and a scattering of paperback books stacked neatly on its surface. Positioned comfortably next to the couch, the side table complements the warm ambiance, while the couch offers an inviting place to relax, covered with throw pillows in shades of teal, mustard, and gray."}
+{"index": "298", "data": "A quaint scene unfolds on a polished wooden table, drenched in the warm glow of the afternoon sun, where a quintet of deep purple, woven baskets with a round shape is meticulously arranged. Each basket cradles a square, rich crimson wallet that stands out against the violet hues. Delicate shadows cast by the baskets and wallets dance upon the table, accentuating their colors and contours in the sunlight."}
+{"index": "299", "data": "A polished brown leather briefcase with visible stitching details rests on a white tablecloth, displaying a sense of organization amidst the surrounding environment. Beside the briefcase, a vibrant red fedora hat provides a striking contrast against the pristine table covering. The table, placed in a room with light beige walls, gives an impression of a professional setting with a touch of personal style."}
+{"index": "3", "data": "A spacious room, where the soft glow of the evening light cascades through a nearby window, gently illuminating an antique mahogany desk. Atop the polished surface stands a single, ornate globe, its vibrant shades of green, blue, and brown continents contrasting beautifully against the deep blue of the oceans. The globe, detailed with meridian lines and country borders, spins gracefully on its axis, showcasing the intricacies of our world. Surrounding the globe on the desk are scattered vintage ink pens and crisp, ivory stationery, alluding to the quiet musings of a travel enthusiast or a learned scholar lost in thoughts of distant lands."}
+{"index": "30", "data": "A group of fresh, green asparagus is bundled tightly together, standing upright against a clear glass container with a water droplet pattern, giving them a soldier-like appearance. They exhibit a bright green hue that graduates to a paler shade toward their fibrous stems, enhancing their natural gradient. The spears are set against a neutral-toned, textured backdrop that contrasts boldly with their vivid color and linear form."}
+{"index": "31", "data": "An array of freshly baked goods is presented on a rectangular silver tray with a reflective surface. To one side, vibrant sun-yellow lemon tarts, their delicate, flaky pastry crusts cradling a glistening citrus filling, are arranged neatly in a row. Adjacent to them, slightly purple-blueberry muffins, their tops golden brown and dusted with a fine layer of sugar, exhibit a contrasting texture. The pastries are placed against a backdrop of a marbled white countertop, with soft natural light enhancing their appetizing colors."}
+{"index": "32", "data": "In an expansive gymnasium, five vibrant neon orange basketballs are meticulously arranged in a perfect line on the polished, glossy hardwood court. The sheen of the floor reflects the fluorescent overhead lights and the silhouettes of the basketball hoops that stand at each end of the court. The basketballs, with their distinct black lines and textured surfaces, provide a stark contrast to the tan and amber hues of the wooden planks beneath them."}
+{"index": "33", "data": "Golden, crispy French fries are strewn across a bustling kitchen counter, which is topped with a speckled granite surface. Amid a flurry of midday meal preparations, the scattered fries mingle with an array of ingredients and kitchen tools. Just beside the clutter, a stainless steel fryer continues to bubble away with the promise of more golden treats to come."}
+{"index": "34", "data": "An assortment of artisanal cheese wheels, each exhibiting a distinct texture and color palette, ranging from pale creamy whites to rich oranges, are spread across a rough-hewn wooden table brimming with character. The warm rays of the early morning sun filter through a window to the side, lending a natural illumination that highlights the subtle hues of the cheeses. In the background, the soft shadows cast by the window frame accentuate the contours of the rustic table, contributing to the inviting display of dairy delights."}
+{"index": "35", "data": "Three graceful antelopes are seen grazing on the sparse, golden grasses of the sprawling savannah under the soft, purpling skies of dawn. The silhouette of an acacia tree punctuates the horizon as the first light of day gently begins to illuminate the vast, open plain. With delicate movements, the animals move slowly, their tan and white coats blending subtly with the earthy tones of their serene surroundings."}
+{"index": "36", "data": "An elegant pair of glasses with a unique, gold hexagonal frame laying on a smooth, dark wooden surface. The thin metal glints in the ambient light, highlighting the craftsmanship of the frame. The clear lenses reflect a faint image of the room's ceiling lights. To the side of the glasses, a leather-bound book is partially open, its pages untouched."}
+{"index": "37", "data": "In the dimly lit interior of a spacious wardrobe, a neat row of five hangers stands out, each one a different brilliant hue—ranging from a deep royal blue to a bright lemon yellow. They are spaced evenly apart, casting soft shadows against the dark wooden back of the closet. The smooth, plastic texture of the hangers contrasts with the rough texture of the wardrobe's interior, and their curved shapes seem almost to beckon items of clothing to drape over them."}
+{"index": "38", "data": "An array of delicate, wavy potato chips loosely spread across the surface of an elegant, dark mahogany table. The wood grain is prominently visible under the sporadically placed chips, with soft light accentuating their undulating texture. In the background, a muted, empty ceramic bowl hints at the recent snack session that took place."}
+{"index": "39", "data": "On a clear warm day, the sun radiates down on a sandy beach where a close-up of a wafer cone reveals chocolate ice cream beginning to melt down its textured sides. In the background, the sea glistens and reflects the sunlight, with gentle waves lapping at the shore. The ice cream's rich brown tones contrast sharply with the blue and turquoise hues of the ocean, creating a striking visual. Nearby, a few colorful beach umbrellas are dotted along the water's edge, offering shade to beachgoers."}
+{"index": "4", "data": "On a smooth, beige desktop, four ballpoint pens with blue, black, silver, and red barrels are meticulously arranged at right angles to each other, creating a rectangular outline. In the center of this rectangle, five wooden pencils with freshly sharpened tips are placed with their erasers touching, forming a precise circle. The stark contrast between the rigid geometry of the pens and the soft curve of the pencils is evident upon the uniform background of the desk's surface."}
+{"index": "40", "data": "Two glossy black motorcycle helmets are securely mounted on a white wall adorned with various tools and metal racks. The helmets, with their visors closed, are suspended next to each other by sturdy hooks, reflecting the fluorescent lights above. The wall's smooth texture starkly contrasts the matte finish of the protective gear."}
+{"index": "41", "data": "Five red fire trucks sit parked in a semi-circle amidst a dense blanket of gray early morning fog enveloping the area. Each truck is equipped with ladders, hoses, and flashing emergency lights that pierce through the mist. The area surrounding the trucks is clear and spacious, suggesting an open field or wide street, allowing for quick movement in case an emergency call comes in."}
+{"index": "42", "data": "As the amber hues of dusk blanket the sky, a sleek Formula 1 car, emblazoned with vibrant colors and sponsorship logos, races around the asphalt track, its engine thundering defiantly against the quieting day. The powerful headlights cut through the dimming light, illuminating the course ahead while the car’s aerodynamic shape slices through the cool evening air. The grandstands, now silhouetted by the setting sun, are filled with a blurred sea of spectators, their cheers muffled by the roar of high-performance engines vying for the lead. The racing circuit is lined with bright white track limits and colored curbstones, highlighting the boundaries as the car expertly navigates each turn."}
+{"index": "43", "data": "A cluster of plump, purple grapes, their surface kissed by the morning's dew, reflects the soft, golden light of the rising sun. Each grape, tightly packed alongside its fellows, shows off a frosty sheen indicating the cool freshness of early day. They hang delicately from a green vine that's draped across a rustic, wooden trellis in a peaceful garden."}
+{"index": "44", "data": "A bright yellow tennis ball lies in stark contrast on the vibrant green of a freshly mowed grass court. The ball's fuzzy texture is highlighted by the sunlight, casting a small shadow on the neatly trimmed lawn. Nearby the baseline, the white chalk lines distinctly mark the boundaries of the playing field, creating a geometric harmony on the court."}
+{"index": "45", "data": "At a busy city crossroads, three square-shaped signs, featuring a bold crosswalk symbol in bright yellow, command attention from pedestrians and motorists alike. Positioned against the backdrop of a bustling urbanscape, the signs possess a reflective quality, enhancing their visibility even amidst the chaotic street movement. Anchored securely to the pavement, these uniform signs present the familiar pedestrian crossing imagery, delineating safe walking zones in an area dense with traffic."}
+{"index": "46", "data": "A vivid scene unfolds where several deep red, perfectly round tomatoes spill from a woven brown basket onto a rustic wooden tabletop. The basket lies on its side as the plump tomatoes scatter across the surface, some touching the dark green leaves of a nearby herb plant. In the background, the blurred outline of an open kitchen window lets in soft, natural light, casting gentle shadows around the fallen produce."}
+{"index": "47", "data": "A slender, black comb glides effortlessly through a cascade of golden locks that fall over a woman's shoulder. The smooth teeth of the comb reflect the overhead lighting as it navigates through the silky strands. Surrounding the scene, hair care products and a mirror can be seen on a marble countertop, hinting at a grooming routine in progress."}
+{"index": "48", "data": "Eight vibrant green beach balls, each with a glossy texture, are randomly scattered across the golden sand that is patterned with the alternating shades of sunlight and shadow. Nearby, gentle waves lap at the shoreline, creating a rhythmic sound that complements the serene beachscape. The balls cast soft shadows on the sand, evidencing the bright midday sun overhead."}
+{"index": "49", "data": "Two emerald green briefcases, both with a finish that's smooth to the touch, rest side by side upon the polished surface of an aged oak desk. The wood grain detailing on the desk is prominently displayed in the soft, diffused light of the afternoon sun that filters through a nearby window. Each briefcase is adorned with silver clasps that catch the light, creating a subtle but striking contrast against the dark, rich tones of the wood beneath them."}
+{"index": "5", "data": "An illuminated calculator with a sleek, dark casing and round, raised buttons sits flat on a wooden office table. The sun's fading light gently seeps through the window, casting a warm glow over the calculator's surface and the scattered papers around it. Shadows from nearby objects subtly play across the table, enriching the tranquil setting of a quiet study space."}
+{"index": "50", "data": "Three vibrant red dumbbells are neatly aligned on the polished wooden floor of a well-illuminated gym. The afternoon sunlight streams through the large windows, casting a warm glow on the equipment. In the background, there are rows of exercise machines and mirrors reflecting the interior of the space."}
+{"index": "51", "data": "A close-up view reveals a single, immaculate ace of spades resting on the surface of a polished mahogany table. The rich, dark wood of the table reflects the soft ambient light of the room. This card is positioned slightly askew, with the sharp, distinctly printed symbols contrasting against the smooth, even texture of the table's finish."}
+{"index": "52", "data": "Three juicy, vibrant red strawberries, each dotted with tiny yellow seeds, lying in contrast against a pristine white ceramic plate. The berries are clustered closely together, positioned near the center of the smooth, reflective surface. The glossy texture of the fruit is highlighted by the bright natural light illuminating the clean, simplistic setup."}
+{"index": "53", "data": "Two cylindrical-shaped, golden lamps with a brushed metallic finish stand side by side, casting a warm glow on the dark wooden bedside table. The illumination from the lamps highlights a small stack of books and a pair of reading glasses placed next to them. In the background, the subtle outline of a neatly made bed can be seen, with the surrounding area cloaked in the soft shadows of the midnight hour."}
+{"index": "54", "data": "In the gentle light of the early morning, three red stuffed animals—two teddy bears and a plush fox—are propped against a soft pastel-colored wall within a peaceful nursery room. The wall itself is painted in a gradient of pastel hues, creating a calming backdrop for the vibrant toys. The toys' plush fabric appears soft to the touch, and they sit closely together as if in a huddled group, providing a cheerful contrast to the subtle tones of the room. Nearby, a white wooden crib with delicate bedding completes the serene setting, signifying the presence of a young child's space."}
+{"index": "55", "data": "Two zebras, their black and white stripes creating a dizzying effect, are galloping side by side across the sprawling savanna, tinged golden by the sunlight. The texture of the grass appears rough and dry, a testament to the arid climate. Dust is being kicked up by their hooves, and in the background, sparse acacia trees punctuate the otherwise open landscape."}
+{"index": "56", "data": "On a clean, organized office desk, there are three red staplers arranged in a precise line. Each stapler has a sleek, rectangular shape with a glossy finish that reflects the overhead lighting. They sit atop a polished, dark brown wooden surface surrounded by scattered paper clips and a few pens, providing a contrast to the vibrant red color of the staplers."}
+{"index": "57", "data": "An open laptop with a sleek metallic finish and a black keyboard sits centered on a wooden desk. The desk displays an assortment of office supplies, such as a pen holder, a stack of notebooks, and a potted green plant to one side. Visible on the laptop's screen is a colorful array of icons against a bright wallpaper, indicating it is powered on and ready for use."}
+{"index": "58", "data": "On a stretch of sunlit sidewalk, three identical cylindrical blue parking meters stand in a neat line, each adorned with a digital display and coin slot. Their metallic surfaces gleam intermittently as pedestrians pass by, casting fleeting shadows along the paved walkway. Positioned uniformly, they oversee the adjacent parked cars, quietly awaiting the next round of patrons to deposit their change."}
+{"index": "59", "data": "Three sleek, dark wooden boats are resting along the banks of a tranquil, azure blue lake. The smooth surface of the water reflects the clear sky above and the lush greenery surrounding the lake's edge. Positioned close to each other, the boats' oars are tucked neatly inside, hinting at a recent or upcoming journey across the calm waters."}
+{"index": "6", "data": "A majestic parrot with vibrant green, red, and blue feathers glides effortlessly across the bright blue sky. Its impressive wingspan is fully outstretched, catching the warm sunlight as it soars high above the tree line. Below, the landscape features rolling hills and patches of dense forest."}
+{"index": "60", "data": "In a tranquil expanse of ocean, tinged with the warm hues of the setting sun, five vivid red lifesaving rings can be seen gently bobbing on the surface of the calm water. The sinking sun casts an orange glow across the horizon, reflecting its fiery colors on the water's glass-like surface. The rings, spaced evenly apart, form a striking contrast with the deep blue of the sea, creating a picturesque scene devoid of any other vessels or swimmers."}
+{"index": "61", "data": "A modern kitchen featuring a polished granite countertop where a square-shaped stainless steel rice cooker stands conspicuously. The appliance's surface gleams under the natural morning light that filters through the window. Behind it, the kitchen wall is clad in a vibrant orange wallpaper with subtle patterns, adding a dash of color to the culinary space."}
+{"index": "62", "data": "An old skateboard with scuffed edges and faded stickers leans delicately against the rough texture of a red brick wall. Its wheels, well-worn and dusty, hint at many adventures it might have seen. Nearby, the early morning sun casts a soft glow on the ground, accentuating the contrasting textures of the brick and the smooth, aged wood of the skateboard deck."}
+{"index": "63", "data": "Atop the smooth green felt of a billiard table, three glossy red cue sticks lay orderly in parallel, as if awaiting their players. Around the edges of the table, a scattering of billiard balls rests in the calm before a match, each reflecting the warm, fading light of dusk creeping in through nearby windows. The room is quiet, and the ambiance suggests the anticipation of an evening game, with the only movements being the gentle sway of a ceiling fan above and the shadows stretching across the floor as the sun sets outside."}
+{"index": "64", "data": "An old wooden stool with three legs and a unique triangular seat placed by the window, bathed in the soft glow of moonlight as the clock strikes midnight. The varnish on its surface has faded, hinting at its age and the many years it has stood there. Beside it, a tall grandfather clock, its pendulum swinging steadily, marks the late hour within the quiet room."}
+{"index": "65", "data": "A vibrant display of ten round-shaped boxing gloves arranged in five pairs, each pair a different neon color, against a backdrop of a wall covered in colorful graffiti. The wall's artwork features an array of abstract designs and tags, adding an urban feel to the showcase. The gloves appear to be made of a glossy, leather-like material, and they are hung at eye level for easy viewing and selection."}
+{"index": "66", "data": "An antique rickshaw with chipped and faded red paint stands solemnly under the vast expanse of an early morning sky tinged with the soft hues of dawn. The worn seat, hinting at many years of service, looks out over an empty cobblestone street that hints at the day's quiet beginning. Rustic details of the rickshaw's metalwork become more apparent in the gentle morning light, indicating its rich history and the countless stories it could tell."}
+{"index": "67", "data": "A solitary microphone with a silver finish stands erect on an empty stage, its round, black base firmly planted on the dark wooden floorboards. The reflective surface of the microphone gleams under the bright stage lights. Directly behind the microphone, the backdrop is a rich, crimson curtain that hangs gracefully from the ceiling to the floor."}
+{"index": "68", "data": "An artist with fingers dusted in multicolored pigments is holding a set of five cylindrical-shaped paint brushes that sport vibrant blue handles. These brushes are gently splayed between skilled fingers, poised like a conductor's baton ready to orchestrate strokes on a pristine white canvas that awaits on an easel. The bristles of the brushes appear soft and unused, contrasting with the worn-in look of the artist's hand, evidencing countless hours of previous creations."}
+{"index": "69", "data": "A group of five unique fish with deep blue, almost iridescent bodies, glide gracefully just above the sandy seabed. These aquatic creatures resemble delicate sea bubbles, with their round and translucent appearances. The surrounding waters are a calm shade of blue, casting a serene light on the ocean floor where patches of coral and seashells can be glimpsed."}
+{"index": "7", "data": "Inside a warm room with a large window showcasing a picturesque winter landscape, three gleaming ruby red necklaces are elegantly laid out on the plush surface of a deep purple velvet jewelry box. The gentle glow from the overhead light accentuates the rich color and intricate design of the necklaces. Just beyond the glass pane, snowflakes can be seen gently falling to coat the ground outside in a blanket of white."}
+{"index": "70", "data": "Two golden-brown spring rolls with a perfectly crispy texture sit invitingly on a woven bamboo mat. The setting sun casts a warm, orange hue over the scene, highlighting the glistening sheen of the freshly fried appetizers. Near the spring rolls, a small dish of dipping sauce reflects the sunset's glow, enticing one to indulge in the savory treat."}
+{"index": "71", "data": "In a contemporary office with minimalist design elements, five square-shaped, black printers are arranged in a row along a white countertop. They are humming with activity as they produce documents, with sheets of white paper steadily emerging from their feed trays. Their glossy surfaces reflect the soft light coming from the overhead fixtures, and each printer displays a small, glowing screen indicating its current status. Surrounding the printers, swivel chairs with ergonomic designs and translucent glass partitions contribute to the overall sleek and professional ambiance of the workspace."}
+{"index": "72", "data": "In the open expanse of a school's sports field, under the clear blue sky of a radiant sunny day, four vibrant American footballs are captured in mid-flight. The footballs, featuring hues of red, blue, yellow, and green, are spherical in shape, contrasting sharply with the green turf below. Each ball glistens in the sunlight as they arc gracefully above the field, momentarily suspended against the backdrop of a few wispy clouds."}
+{"index": "73", "data": "In the center of a dimly lit stage, a vintage green guitar with a glossy finish leans gently against a classic black amplifier. The polished wooden flooring of the stage reflects the warm, golden hue from a single spotlight above, which casts an inviting glow over the instrument. A microphone stand with a chrome finish stands to the left of the guitar, poised for the evening's performance."}
+{"index": "74", "data": "Four square potted plants, each nestled in vibrant yellow pots, are aligned neatly on a gray concrete surface outside. They are being gently watered, with droplets of water glistening on their lush green leaves as the sun begins to set in the background. The surrounding area is quiet, and the soft light of the early evening casts a warm glow on the scene."}
+{"index": "75", "data": "Two woven baskets with brown and tan hues rest gently on the vibrant green grass in a peaceful meadow. The larger basket, slightly ajar, reveals a cozy blanket peeping out, while the smaller one seems to be packed with a picnic setup. Surrounding them, a few wildflowers add specks of color to the tranquil scene, uninterrupted by the gentle breeze swaying the grass blades."}
+{"index": "76", "data": "In a dimly lit bathroom with off-white tiles, there sits a wooden shelf against the wall, supporting five rolls of toilet paper arranged neatly side by side. Each roll is compact and untouched, displaying a uniform quilted texture. The shelf is positioned above a small waste bin, and to the side, a soft, pale blue towel hangs loosely."}
+{"index": "77", "data": "Five savory pizzas with vibrant red tomato sauce and a generous amount of golden cheese are being baked to perfection inside the warm, rustic brick oven. The oven, radiating heat, has a wooden handle peeking out, suggesting it's an old-fashioned, manually-operated one. The flames at the back cast a flickering glow, highlighting the oven's arch and the earthen tones of the bricks."}
+{"index": "78", "data": "Two vibrant aqua blue dolphins are gracefully leaping over the mirror-like sea surface, illuminated by the warm hues of an awe-inspiring sunset. The endless sea stretches into the horizon, reflecting the oranges and pinks of the fading sun, while the waves gently lap against each other. Nearby, the calm water is disrupted by the playful splashes created by the dolphins' acrobatics, encapsulating a moment of pure joy and freedom in nature."}
+{"index": "79", "data": "three sleek, brass faucets with a modern, geometric shape stand aligned, streaming clear water into a white basin below. the sink is surrounded by a marble countertop, which reflects the light and brings out the warm golden tones of the metal. droplets of water can be seen splashing gently around the base of each faucet, indicating the force of the flowing water."}
+{"index": "8", "data": "An eye-catching red hoverboard floats above the steel gray asphalt of a bustling urban street, surrounded by the golden hues of the setting sun during evening rush hour. The road is lined with tall buildings casting long shadows, and the air is filled with the sounds of the city’s activity. The hoverboard's sleek design and vibrant color contrast sharply with the muted tones of the concrete environment."}
+{"index": "80", "data": "Two bars of soap, one lavender and the other oatmeal-colored, are neatly placed beside a vibrant yellow pineapple with a thick, green crown. They rest on a white porcelain dish that contrasts with the dark granite countertop. Behind them, a small kitchen window lets in natural light that helps highlight their distinct textures and colors."}
+{"index": "81", "data": "In a brightly-lit laundry room, a pair of ripe yellow bananas rest nonchalantly against the sleek surface of a silver washing machine. The spacious room is adorned with colorful splashes of paint on the walls, creating a lively and playful atmosphere. Sunlight streams through a nearby window, enhancing the vibrancy of the space and casting soft shadows around the cheerful fruits."}
+{"index": "82", "data": "In a dimly lit space, a vibrant green broom with stiff bristles is being used to sweep a muted, grey floor. Shadows stretch across the room, illuminated by the soft orange glow of a smoldering cigarette that has been left unattended. The faint light creates an eerie dance of wisps of smoke in the blue hues of the twilight. Nearby, a simple wooden chair and a metal bucket can be seen, suggesting the room's utilitarian purpose."}
+{"index": "83", "data": "Three cows, a mix of white and brown patches, are lazily grazing in an expansive meadow under the soft glow of the afternoon sun. Before them, stands a large blackboard, planted firmly in the grass, creating a striking contrast with the surrounding lush greenery. Above them, a few wispy clouds are scattered in the otherwise clear blue sky."}
+{"index": "84", "data": "On a reflective metallic table, there is a brightly colored handbag featuring a floral pattern next to a freshly sliced avocado, its green flesh and brown pit providing a natural contrast to the industrial surface. The table is set for lunch, with silverware and a clear glass water bottle positioned neatly beside the avocado. The juxtaposition of the colorful fashion accessory and the rich texture of the avocado creates a striking visual amidst the midday meal setting."}
+{"index": "85", "data": "A vivid nuclear green electric drill in action, its bit spinning rapidly as it bores into a thick, black leather belt. The power tool's textured grip ensures a firm hold and starkly contrasts with the sleek, dark surface of the belt, which is held down against a sturdy wooden workbench. Around the drill, shreds of black material and wood dust are scattered, evidence of the tool's forceful endeavor."}
+{"index": "86", "data": "A medium-sized brown dog with a shiny coat stands in a large room filled with warm, yellow sunlight streaming in from a nearby window. The curious canine directs its attention towards a pristine white porcelain urinal situated awkwardly in the center of the otherwise unfurnished space. The room's aging, cracked walls, painted a once-bright shade of blue, provide a stark contrast to the urinal's smooth and clean surface."}
+{"index": "87", "data": "A collection of three vibrant magenta hats, each featuring a unique pattern and texture, are arranged side by side on a dark, polished wooden surface. Nearby, two translucent bottles with intricate designs reflect the ambient light. The bottles are carefully positioned to the right of the hats, and their contents cast a slight shadow on the wood grain."}
+{"index": "88", "data": "A dusty brown baseball glove, looking worn from many games, dramatically overshadows a pair of small silver pliers lying next to it. Both items rest on a wooden workbench, and the glove's leather creases hint at its frequent use. The pliers' red handles are slightly ajar, suggesting a recent task left unfinished."}
+{"index": "89", "data": "Two glossy red high heels with pointed toes and slender stiletto heels are positioned ominously over a diminutive grey mouse cowering on the creamy white floor. The mouse's fur is slightly ruffled, betraying a sense of trepidation, as it looks up at the imposing footwear. The contrasting size and color between the vibrant shoes and the mouse emphasize the stark difference in their presence."}
+{"index": "9", "data": "Two richly purple-colored folders resting on a wooden table flooded with warm sunlight. The table's surface reflects a subtle sheen, highlighting the folders' smooth texture and the shadows they cast. Around the folders, the table is mostly clear, save for a silver pen lying diagonally near one of the folders."}
+{"index": "90", "data": "A worn piece of luggage placed beside a metal spoon, both lying atop an aged, dusty table. The dim light of the moon barely illuminates the objects, revealing their outlines in the quiet darkness of midnight. Shadows stretch across the table's surface, enhancing the stillness of the nocturnal scene."}
+{"index": "91", "data": "In the waning twilight, two canvas tents, conical in shape, are nestled closely on a field of soft, lush grass, their silhouettes casting gentle shadows. Near these temporary abodes, a sturdy square table sits under the open sky, with a lone black billiard ball positioned at its center, unmoving and glossy. The scene is quiet and still, as though the entire world is holding its breath for the break shot that will commence the morning's game of billiards."}
+{"index": "92", "data": "Two square-shaped pink erasers rest on the tiled floor next to a pristine white porcelain toilet. The erasers feature slight smudges from use and are positioned closely to each other. In the background, the metal toilet flush handle gleams under the bright bathroom light, and a soft blue bath mat lies a short distance away, partially visible in the scene."}
+{"index": "93", "data": "A pair of striking royal blue slippers with a plush, round appearance and a cozy texture stands prominently in the foreground, drawing the eye with their vibrant color. They contrast with the muted, distant backdrop where a weathered green truck, heavy and sturdy, sits idle with traces of dust and use marking its surface. The background is bathed in the warm glow of a burnt-orange sunset that casts long shadows and highlights the outlines of other objects scattered around the faded and pebbled ground."}
+{"index": "94", "data": "Two bright red skiboards are propped up against the rugged bark of a tall pine tree, their bindings reflecting the sunlight that filters through the branches. Next to them, a pair of black skating shoes with neon green laces sits neatly on the freshly fallen snow. The surrounding area is quiet and untouched, suggesting an anticipation for an exhilarating afternoon of winter sports activities."}
+{"index": "95", "data": "The kitchen counter, bathed in soft afternoon light, showcases a collection of five golden keys strewn haphazardly across its white surface. Behind them rests a dark-colored microwave, designed with a nostalgic, retro aesthetic that contrasts with the modern simplicity of the surroundings. Nearby, a transparent vase holds a sprig of fresh greenery, adding a touch of life to the scene."}
+{"index": "96", "data": "Four sleek airplanes, with shiny metallic surfaces reflecting the sunlight, fly in a tight formation through the clear blue sky. Below them, an orange and black zigzagged extension cord lies haphazardly across a dusty brown field, contrasting with the precise aerobatics above. The planes' trailing jet streams create parallel lines that transiently scar the vastness of the open sky."}
+{"index": "97", "data": "A vibrant pink pig trots through a snowy landscape, a bright blue backpack strapped securely to its back. The pig's thick coat contrasts with the soft white blanket of snow that covers the ground around it. As it moves, the blue backpack stands out against the pig's colorful hide and the winter scene, creating a striking visual amidst the serene, frost-covered backdrop."}
+{"index": "98", "data": "On a polished wooden table, a small circular red earphone rests, its size notably less than that of the rectangular-shaped green tape dispenser placed beside it. The table surface reflects the soft glow of the morning light, accentuating the smooth texture of the earphone's surface and the matte finish of the green tape dispenser. In the background, the grain of the wood can be seen, giving a warm ambiance to the arrangement of objects on the table."}
+{"index": "99", "data": "In the spacious backyard, a large square green trash bin stands firmly on the grass, its solid structure stark against the natural setting. Approaching it, a vibrant red baseball rolls steadily across the lawn, its round shape contrasting with the rectangular contours of the bin. The grass, slightly damp, leaves a faint trail on the ball as it moves closer to the stationary receptacle."}
+{"index": "COCOval2014000000010363", "data": "A sleek gray cat balances on the roof of a polished black car. The car is situated in a driveway, flanked by neatly trimmed hedges on either side. Sunlight reflects off the car's surface, highlighting the cat's poised stance as it surveys its surroundings."}
+{"index": "COCOval2014000000014635", "data": "A woman dressed in a warm, stylish coat is seated at a small round table. She appears to be in a cozy indoor setting, possibly a café or a waiting area. The table is adorned with a vase containing a single flower, adding a touch of elegance to the scene."}
+{"index": "COCOval2014000000023272", "data": "A calico cat, sporting a patchwork of orange, black, and white fur, is comfortably nestled on top of a sleek Mercedes Benz. The luxury vehicle, with its emblem glinting in the light, is stationary, possibly in a quiet residential area. The cat, with its eyes gently shut, appears to be in a state of serene repose, oblivious to the world around it."}
+{"index": "COCOval2014000000041572", "data": "A vintage black and white photograph captures a tense moment on a baseball field. In the foreground, a batter stands ready at home plate, his posture poised for the incoming pitch. On the mound, the pitcher is caught mid-windup, the baseball clutched tightly in his hand, with the rest of the players positioned strategically around the diamond."}
+{"index": "COCOval2014000000042810", "data": "A woman dressed in athletic attire, featuring a crisp white top and a vibrant pink skirt, is poised on a tennis court. She is in the midst of a serve, with the tennis ball held aloft in one hand and her racket ready in the other. The court is marked with white lines and surrounded by a high fence, indicating a dedicated space for the sport."}
+{"index": "COCOval2014000000045844", "data": "In an open area, two children are energetically swinging their rackets on a makeshift tennis court outlined with chalk on the ground. Makeshift nets are set up, and the kids are wearing casual sports attire, indicating an informal game. The surrounding area is spacious enough to accommodate their game, with a few trees casting a gentle shade nearby."}
+{"index": "COCOval2014000000053916", "data": "A serene grassland dotted with acacia trees under a clear sky. Two zebras stand side by side on the green grass, their stripes contrasting with the surrounding foliage. The shade from a large tree offers them a cool respite from the sun's warmth."}
+{"index": "COCOval2014000000055053", "data": "A small town road lined with quaint houses and a smattering of trees on either side. In the middle of the road, a trio of white sheep ambles along, seemingly at ease amidst the quiet neighborhood. The road stretches out ahead, curving slightly as it disappears into the distance."}
+{"index": "COCOval2014000000063595", "data": "A tennis court with a player dressed in a vibrant red shirt preparing for a serve. The player is positioned at the baseline, racket in hand, focused on the upcoming play. The court is marked with white lines, and a net stretches across the center, dividing the playing field."}
+{"index": "COCOval2014000000064574", "data": "A cozy indoor setting with a soft, textured blanket spread out on a flat surface. Resting upon the blanket are a sleek pen and an assortment of personal hair care products, including brushes and combs. The items are neatly arranged, suggesting a moment of grooming or self-care preparation."}
+{"index": "COCOval2014000000070471", "data": "An adult is seated on a beige couch in a living room, holding a small child in their lap. The child, with a mischievous glint in their eye, is playfully biting into a black television remote. Around them, the room is filled with scattered toys and a plush, colorful rug on the floor."}
+{"index": "COCOval2014000000077222", "data": "a uniformed soldier kneeling down to meet the eye level of a group of young children. the children are gathered around with expressions of curiosity and excitement. the soldier's smile is warm and friendly as he extends his hand for a handshake or high-five."}
+{"index": "COCOval2014000000084701", "data": "A spacious living room featuring a large, comfortable couch facing a modern coffee table. Alongside the main furniture, there's a stack of suitcases and travel bags, hinting at recent travels or an upcoming trip. The room is completed with tasteful decor and a large window allowing natural light to fill the space."}
+{"index": "COCOval2014000000085298", "data": "A young girl with her hair tied back stands in the middle of a tennis court, gripping a tennis racket with determination. The court is marked with white lines, indicating the boundaries for the game. Around the court, a tall fence can be seen, enclosing the area for players and spectators alike."}
+{"index": "COCOval2014000000092768", "data": "An elderly gentleman with a weathered face sits on a park bench, cradling a well-worn wooden guitar in his arms. His fingers are poised over the strings, ready to play a tune. The bench is situated along a path lined with trees that provide a canopy of shade overhead."}
+{"index": "COCOval2014000000095297", "data": "a man is seated at a rustic wooden table, holding an oversized slice of pizza in his hands. the pizza is loaded with an array of colorful toppings, and strings of melted cheese stretch with each bite he takes. the table also has a few scattered napkins and a half-empty soda glass beside a pizza box with the lid ajar."}
+{"index": "COCOval2014000000096306", "data": "An indoor recreational space with several table tennis tables set up for play. Players are engaged in matches, with some practicing serves while others are in the midst of a rally. The room is filled with the sound of ping pong balls being struck back and forth across the tables."}
+{"index": "COCOval2014000000100343", "data": "A cozy kitchen scene where a plush teddy bear is seated amongst a bunch of ripe bananas. The bananas are resting on a wooden countertop, which also features a variety of other groceries and kitchen utensils. In the background, a window allows natural light to illuminate the teddy bear's soft fur and the vibrant yellow of the bananas."}
+{"index": "COCOval2014000000101456", "data": "A spacious kitchen with a large wooden table at the center, cluttered with an assortment of pots, pans, and cooking utensils. The table is surrounded by matching wooden chairs, and the walls are lined with shelves filled with spices and kitchenware. Sunlight streams in through a window, casting a warm glow on the array of cookware."}
+{"index": "COCOval2014000000103161", "data": "a white ceramic plate holding a simple meal consisting of a few roasted potatoes and an egg sandwich with a golden yolk peeking out. The plate is set upon a wooden dining table, accompanied by a fork and a knife lying beside it. The sandwich is made with toasted bread, adding a crunchy texture to the dish."}
+{"index": "COCOval2014000000110587", "data": "A man stands in front of a bathroom mirror, a toothbrush lodged in his mouth as he brushes his teeth. The bathroom is equipped with a white porcelain sink and a small window that lets in natural light. On the counter, there's a tube of toothpaste lying next to a cup filled with assorted dental hygiene tools."}
+{"index": "COCOval2014000000113571", "data": "A man stands at a table with a colorful birthday cake adorned with lit candles. He carefully slices the cake with a long knife, preparing to serve the guests. The table is covered with a festive tablecloth and scattered with other party accessories."}
+{"index": "COCOval2014000000126046", "data": "An urban street corner featuring freshly painted blue arrows on the asphalt, directing traffic flow. Several bright orange traffic cones are strategically placed to guide vehicles around a construction area. In the background, a traffic light hangs above, glowing green, signaling drivers to proceed with caution."}
+{"index": "COCOval2014000000126671", "data": "A clean, bright bathroom featuring a white toilet next to a bathtub. The tub is equipped with a shower that has a striped curtain partially drawn. The walls are tiled, and a small window allows natural light to filter in, illuminating the space."}
+{"index": "COCOval2014000000136271", "data": "An array of colorful fruit bins line the front of a local produce stand, each filled with fresh, ripe selections. Above each bin, clear signs display the prices, inviting passersby to browse and purchase. The stand is neatly organized, showcasing a variety of fruits from apples to exotic mangoes, catering to the tastes of a diverse clientele."}
+{"index": "COCOval2014000000146190", "data": "a spacious green field under a clear blue sky, with a single figure standing in the center. The person is captured in mid-action, tossing a brightly colored frisbee with a focused expression. Around them, the grass sways gently, and in the distance, a line of trees can be seen marking the field's boundary."}
+{"index": "COCOval2014000000164121", "data": "A spacious kitchen featuring natural wooden cabinets that show signs of frequent use. The countertop is cluttered with an assortment of kitchen gadgets, utensils, and a bowl of fresh fruit. Sunlight streams in, illuminating the space and highlighting the details of the cluttered surface."}
+{"index": "COCOval2014000000166358", "data": "A sprawling snow resort bustling with activity, with skiers and snowboarders dotting the slopes. Multiple ski lifts are in operation, carrying guests to the top of the white, powdery hills. The resort features a large lodge at the base, where more people can be seen enjoying the amenities."}
+{"index": "COCOval2014000000167696", "data": "a life-sized zebra sculpture positioned in the center of a well-manicured garden space. The garden is dotted with a variety of lush plants and flowers, providing a vibrant backdrop to the monochromatic statue. A gravel pathway winds around the garden, inviting visitors to view the sculpture from different angles."}
+{"index": "COCOval2014000000180784", "data": "A quaint, single-engine propeller plane with a vibrant paint job rests in the middle of a wide-open grassy field. The field is bordered by a simple fence and a few scattered trees in the distance. The plane's small size suggests it's designed for personal or recreational use, and it sits idle under the clear sky."}
+{"index": "COCOval2014000000180800", "data": "Rows of neatly arranged wooden shelves line the interior of a local grocery store's produce section. On one of the shelves, small metal pails filled with bright, ripe oranges catch the eye. The oranges are stacked to the brim, offering a fresh and colorful selection to shoppers passing by."}
+{"index": "COCOval2014000000183648", "data": "A group of three individuals positioned beside a towering elephant in an open area. The elephant's gray skin contrasts with the colorful clothing of the people. They appear to be engaged in a moment of interaction, with the vast sky above them and the ground beneath scattered with dry grass."}
+{"index": "COCOval2014000000187240", "data": "A vibrant red bus is parked on a bustling city street, its size overshadowing the adjacent white van. The street is lined with a mix of small shops and residential buildings, each with their own unique facades. The vehicles are positioned parallel to the curb, with the bus's destination sign clearly visible above its windshield."}
+{"index": "COCOval2014000000191919", "data": "A pastoral scene unfolds with a herd of cows scattered across a lush green field, leisurely grazing under the open sky. The field is bordered by a simple wooden fence, and in the distance, a cluster of trees can be seen lining the horizon. The cows, with their varied brown and white patterns, appear content in their tranquil environment, occasionally lifting their heads to survey their peaceful surroundings."}
+{"index": "COCOval2014000000205054", "data": "A cozy room featuring a sturdy wooden table at the center. Atop the table, a gray cat is comfortably sprawled out, basking in the tranquility of the space. The table also holds a few scattered papers and a potted plant, adding a touch of lived-in charm to the scene."}
+{"index": "COCOval2014000000206496", "data": "A group of tourists is seated atop a large elephant, which is part of a caravan of elephants walking in a line. Each elephant carries passengers securely fastened in a howdah, and they are led by guides walking alongside them. The procession of elephants moves through a natural trail surrounded by dense foliage."}
+{"index": "COCOval2014000000211560", "data": "A large black bear with a thick coat of fur is sprawled out on a rocky outcrop. Its gaze is fixed directly on the camera, giving a sense of direct engagement with the viewer. The rocks where the bear rests are surrounded by a smattering of green foliage, hinting at a forested habitat."}
+{"index": "COCOval2014000000212403", "data": "A rider atop a chestnut horse in the middle of a spacious pasture enclosed by a wooden fence. The pasture is dotted with patches of green grass and the occasional tree, providing a serene setting for the horse and rider. In the distance, a barn can be seen, completing the pastoral scene."}
+{"index": "COCOval2014000000214123", "data": "two women are seated at a modern kitchen island with sleek countertops. they appear to be engaged in a casual conversation over cups of coffee. the kitchen is equipped with contemporary appliances and a vase of fresh flowers adds a touch of color to the space."}
+{"index": "COCOval2014000000217133", "data": "an aged pickup truck sits with its hood propped open, revealing a dusty and intricate engine beneath. The vehicle's body shows signs of wear and rust, hinting at its long service and many journeys. The truck is parked on a stretch of gravel, with no other vehicles in immediate sight."}
+{"index": "COCOval2014000000224622", "data": "a towering clock tower stands against the backdrop of a clear blue sky. the clock face is visible, showing the time, with intricate details around its frame. the structure's architecture is a blend of classic and modern elements, with the tower rising prominently above the surrounding buildings."}
+{"index": "COCOval2014000000229427", "data": "Two people, a man and a woman, are standing in a spacious living room, each holding a game controller in their hands. They are focused on a large television screen in front of them, which is displaying a colorful video game interface. Around them, the room is furnished with a comfortable couch and a coffee table scattered with magazines and remote controls."}
+{"index": "COCOval2014000000231527", "data": "A rustic wooden table with a natural grain finish, bathed in soft light. On its surface, a cluster of ripe oranges is arranged next to two glass jars filled with a vibrant orange marmalade. The jars catch the light, highlighting the rich color and texture of the contents within."}
+{"index": "COCOval2014000000239274", "data": "A large pontoon boat, its deck lined with rows of seats, is filled with passengers. The boat is just beginning to move away from the dock, creating ripples on the water's surface. The shoreline is dotted with trees and a few buildings that are slowly receding into the background as the ferry starts its journey across the calm lake."}
+{"index": "COCOval2014000000241677", "data": "A scenic trail where a group of riders are mounted on their horses, moving in a line. The horses appear well-groomed, and the riders are dressed in casual riding attire. The path they are on is bordered by trees, and the group seems to be enjoying a leisurely ride in the countryside."}
+{"index": "COCOval2014000000245497", "data": "a young skateboarder caught mid-air as they leap off the last step of a concrete staircase. the skateboarder's focus and determination are evident in their posture. the steps are part of a public urban area, with metal railings on one side and a grassy patch visible in the background."}
+{"index": "COCOval2014000000261981", "data": "A sleek black cat with a smart tie around its neck lounges comfortably on a neatly made bed. The bed features a white duvet and a selection of plush pillows against a simple headboard. The room is illuminated by soft light, highlighting the cat's glossy fur and the playful contrast of its formal accessory."}
+{"index": "COCOval2014000000264619", "data": "A sandy beach scene where a man stands at the water's edge, clutching a brightly colored surfboard under his arm. In the distance, two wind gliders are skimming across the surface of the ocean, their sails billowing in the breeze. The horizon is dotted with small boats, and the sky above is clear, hinting at favorable conditions for water sports."}
+{"index": "COCOval2014000000276149", "data": "A snowy landscape bustling with activity as numerous individuals are scattered throughout the area. People are dressed in warm winter clothing, some are engaged in building a snowman, while others are enjoying a snowball fight. The ground is covered in a thick blanket of fresh snow that crunches underfoot, with footprints crisscrossing in various directions."}
+{"index": "COCOval2014000000277051", "data": "A rustic wooden table set outside, perhaps in a garden or patio area. On its surface, a pair of small birds are perched, casually observing their surroundings. The table shows signs of weathering, indicating it's been a part of the outdoor scenery for some time."}
+{"index": "COCOval2014000000284772", "data": "A brown dog paddles through the clear blue water, its eyes focused ahead. In its mouth, it firmly holds a brightly colored Frisbee, seemingly proud of its retrieval. The sun reflects off the water's surface, creating a sparkling effect around the swimming canine."}
+{"index": "COCOval2014000000309495", "data": "A pristine white bathroom with a ceramic toilet installed against a tiled wall. Beside the toilet, a single roll of toilet paper rests on a holder within arm's reach. The floor is tiled in a matching white, reflecting the cleanliness of the space."}
+{"index": "COCOval2014000000310532", "data": "A pristine white bathroom featuring a freestanding tub adjacent to a matching white sink. The walls are adorned with minimalist decor and the floor is tiled in a subtle grey pattern. Natural light streams in, illuminating the chrome fixtures and clean lines of the bathroom's design."}
+{"index": "COCOval2014000000311879", "data": "A modern kitchen interior featuring stainless steel appliances, including a sleek oven and a large refrigerator. The countertops are clean and spacious, with a few cooking utensils neatly arranged. The kitchen is well-organized, with ample cabinet space above and below the countertops."}
+{"index": "COCOval2014000000313130", "data": "a modern kitchen features a sleek silver refrigerator standing beside a matching microwave. The appliances are set against a wall with a subtle paint finish, and the room is illuminated by natural light streaming in from a nearby window. The clean lines and minimalist design suggest a contemporary home with an emphasis on functionality."}
+{"index": "COCOval2014000000319830", "data": "An individual stands at the water's edge, a fishing rod in hand, poised and focused on the task at hand. The bank is lined with reeds and rocks, providing a natural habitat for the fish. In the distance, the gentle flow of the water creates a serene backdrop for this tranquil fishing scene."}
+{"index": "COCOval2014000000321522", "data": "The kitchen boasts a vintage aesthetic, complete with classic appliances that hark back to a bygone era. Retro pictures adorn the walls, adding to the nostalgic charm of the space. The countertops are lined with period-appropriate gadgets and decorative items, complementing the overall old-fashioned theme."}
+{"index": "COCOval2014000000325237", "data": "Pedestrians with umbrellas navigate a wet sidewalk glistening from the rainfall. The street alongside is lined with lampposts and trees that are gently swaying in the breeze. Puddles have formed on the ground, reflecting the overcast sky and the city buildings that loom in the background."}
+{"index": "COCOval2014000000328512", "data": "A close-up image of a giraffe's face, with its large, brown eyes staring directly into the camera lens. The giraffe's long neck and distinctive patterned fur are clearly visible against a backdrop of blue sky and a few scattered clouds. Its ears are perked up, giving it a look of curiosity as it gazes at the viewer."}
+{"index": "COCOval2014000000339120", "data": "An outdoor tennis court with a green surface and white boundary lines. A man and a child are engaged in a playful tennis match, each holding a racket and focusing on the ball. The court is surrounded by a tall fence, and there are trees casting shadows on the ground nearby."}
+{"index": "COCOval2014000000341973", "data": "Two young girls stand side by side in a brightly lit room, each holding a colorful doughnut with a smile. They are dressed casually, and the room behind them features a neutral-colored wall and a few hanging pictures. Their cheerful expressions suggest they are enjoying a fun moment together during a friendly gathering."}
+{"index": "COCOval2014000000342593", "data": "A young boy is seated comfortably in a cushioned chair in a well-organized living room. He is intently focused on the television screen in front of him, gripping a video game controller with both hands. Around him, the room is adorned with a few framed pictures and a plant on the windowsill, adding a touch of homeliness to the space."}
+{"index": "COCOval2014000000354868", "data": "An aged locomotive, its surface weathered and worn, chugs forcefully along the tracks of a sprawling rail yard. The yard is a maze of intersecting rails, scattered with various railcars and equipment. The train's determined movement gives the impression of urgency as it navigates through the industrial landscape."}
+{"index": "COCOval2014000000360132", "data": "a tall giraffe standing next to a water hole, its long neck extended towards the lush leaves of an acacia tree. the tree's branches are just within reach of the giraffe's curious gaze. the scene is set in a sunlit savannah with the water hole reflecting the clear blue sky."}
+{"index": "COCOval2014000000361885", "data": "A spacious living area with modern furnishings and a large flat-screen TV mounted on the wall. In the center, a man is standing with a Wii remote in hand, ready to play a video game. The room is well-organized, with no visible clutter and a sleek, contemporary design."}
+{"index": "COCOval2014000000365511", "data": "A suburban street lined with power lines that intersect above, casting a web of shadows on the pavement below. To the side, a solitary tree stands tall, its branches reaching towards the lines. Nearby, a street sign is clearly visible, indicating directions or street names for passersby."}
+{"index": "COCOval2014000000367228", "data": "A youthful individual is engaged in flying a colorful kite in a spacious grassy field adjacent to a calm body of water. The sky above is clear, allowing the kite to ascend steadily on the gentle breeze. In the background, the water's edge is lined with reeds and small bushes, complementing the serene outdoor activity."}
+{"index": "COCOval2014000000367905", "data": "A man wearing a black rash guard is caught mid-fall from his surfboard amidst the ocean waves. The surfboard is visible, slightly tilted, indicating the loss of balance. The ocean around him is a deep blue, and the horizon can be seen in the distance with a clear sky overhead."}
+{"index": "COCOval2014000000377183", "data": "A modern office space featuring a sleek desk with a computer set up, including a monitor, keyboard, and mouse. Beside the computer, there's a printer with a stack of paper next to it. An ergonomic office chair is positioned in front of the desk, ready for someone to sit down and start working."}
+{"index": "COCOval2014000000390241", "data": "An overturned bus lying on its side on a stretch of road. The bus's wheels are exposed to the sky, and its windows are shattered, with glass fragments scattered around. The scene is cordoned off with yellow caution tape, indicating an area of investigation or cleanup."}
+{"index": "COCOval2014000000398222", "data": "A spacious dining room filled with a large table set for a feast, surrounded by guests engaged in lively conversation. The table is adorned with fine china, gleaming silverware, and an array of colorful dishes ready to be enjoyed. Soft lighting casts a warm glow over the scene, highlighting the faces of the many attendees who have come together to enjoy the dinner party."}
+{"index": "COCOval2014000000403792", "data": "Two snowboarders, clad in colorful winter gear, are seated on a chairlift, their snowboards hanging below them. The lift cables stretch upward, disappearing into the distance as they ascend the snowy mountain. Around them, the landscape is dotted with pine trees and the trails carved by previous adventurers."}
+{"index": "COCOval2014000000406315", "data": "A playful scene unfolds in a spacious room where a young boy with a mischievous grin is climbing into a large, open suitcase. The suitcase lies on a soft carpet, surrounded by toys and clothes scattered about. The boy seems to be attempting a game of hide-and-seek, tucking himself into the suitcase with a look of anticipation."}
+{"index": "COCOval2014000000410889", "data": "A freshly baked pizza sits on a wooden table, its crust golden and edges slightly charred. The top is generously adorned with a variety of colorful toppings, including slices of pepperoni, chunks of bell pepper, and melted cheese. A sprinkling of fresh basil leaves adds a touch of green to the vibrant dish."}
+{"index": "COCOval2014000000411953", "data": "A man dressed in a crisp white shirt and sleek black tie is seated with a guitar in his hands. He is focused intently on the strings, fingers positioned to strum a chord. The room around him is blurred, emphasizing the musician and his instrument as the central subjects of the scene."}
+{"index": "COCOval2014000000413552", "data": "A cozy indoor setting where a woman is seated with a baby on her lap. The infant, dressed in a colorful outfit, gazes directly into the lens of the camera with a curious expression. The background is softly blurred, highlighting the interaction between the child and the camera."}
+{"index": "COCOval2014000000417946", "data": "A scenic outdoor area captured in a photograph, featuring a lush green lawn with a variety of plants and trees. In the background, there's a clear blue sky with a few scattered clouds. The image conveys a sense of openness and natural beauty."}
+{"index": "COCOval2014000000424349", "data": "A chef dressed in a white apron and a tall chef's hat is sliding a freshly topped pizza into a large industrial oven. The professional kitchen is equipped with stainless steel countertops and a variety of cooking utensils hanging from a rack above. The warm glow from the oven illuminates the chef's focused expression as they carefully handle the pizza peel."}
+{"index": "COCOval2014000000425925", "data": "A bustling city street lined with various shops and cafes, leading the eye towards a distinctive building in the distance. The building is notable for its maroon steeple, which houses clocks just beneath a prominent standing platform. The architecture of the building stands out against the urban landscape, hinting at historical significance amidst the modern city life."}
+{"index": "COCOval2014000000434657", "data": "a solitary surfer is riding a wave, his silhouette outlined against the ocean's expanse. the water around him is a gradient of blues, with white foam cresting at the wave's peak. in the distance, the shoreline is visible, hinting at a vast beach beyond."}
+{"index": "COCOval2014000000441411", "data": "A domestic cat with a sleek coat stands alert next to a plush brown teddy bear. The teddy bear is seated on a hardwood floor, its size comparable to the feline companion beside it. Both the cat and the teddy bear are positioned near a light-colored wall, with a hint of a houseplant's green leaves in the corner of the scene."}
+{"index": "COCOval2014000000447553", "data": "A serene beach scene with a vast expanse of sand stretching out towards the horizon. In the distance, a couple is seen with a colorful kite soaring high in the sky above them. The beach is devoid of other visitors, giving the couple a private moment with the gentle sea breeze."}
+{"index": "COCOval2014000000449726", "data": "A rustic piece of crusty bread with a circular cutout in the center, where a fried egg sits perfectly cooked. The yolk is still runny, and the edges of the egg are slightly crispy. The plate on which the bread and egg rest is white, providing a stark contrast to the golden brown hues of the meal."}
+{"index": "COCOval2014000000465130", "data": "The room features a neatly made bed with a plush comforter, positioned against a soft-colored wall. Adjacent to it, a cozy chair and a couch offer additional seating options, while a sleek TV stand and a sturdy desk complete the room's furnishings. The desk is set up with a lamp and some stationery, ready for work or study."}
+{"index": "COCOval2014000000471528", "data": "An eye-catching graphic designed to capture the attention of writers across the country. It features bold text and vibrant colors to announce the National Novel Writing Month event. The graphic includes motivational slogans and the dates of the event, encouraging authors to unleash their creativity and join the challenge."}
+{"index": "COCOval2014000000479732", "data": "A hefty sandwich filled with an assortment of meats and vegetables is nestled within a white Styrofoam container. The container sits on a plain surface, with a few crumbs scattered around it. The sandwich appears to be freshly made, with the contents slightly spilling out the sides due to its generous fillings."}
+{"index": "COCOval2014000000493435", "data": "A park bench made of weathered wood, with a large, flat-screen television set precariously on top. Behind the bench, an open umbrella with a colorful pattern provides a stark contrast to the bench's muted tones. The scene is set on a paved path with grass on either side, hinting at a public space with an unusual arrangement of objects."}
+{"index": "COCOval2014000000508291", "data": "a vast open field dotted with patches of greenery and a few scattered trees under a clear blue sky. two giraffes are seen gracefully moving across the terrain, their long necks and legs casting shadows on the ground. in the distance, a gentle slope rises to meet the horizon, hinting at the expansive savannah that stretches beyond."}
+{"index": "COCOval2014000000509270", "data": "A vibrant yellow kite soars high in the sky, its long tail fluttering in the wind. Below it, a diverse array of other kites of various shapes and sizes fill the air, creating a colorful spectacle. The kites dance and dip against the backdrop of a clear blue sky, occasionally crossing paths as they ride the gentle breeze."}
+{"index": "COCOval2014000000513096", "data": "Two men are standing in a room with walls adorned with historical memorabilia. On the left, a man dressed in a crisp military uniform gazes intently at a series of glass cases. Inside the cases, an array of bullet shells is meticulously arranged and labeled for display, while the man in the suit on the right points towards one of the exhibits, seemingly explaining its significance."}
+{"index": "COCOval2014000000516248", "data": "An office desk with a sleek, modern computer monitor that dominates the space. To the right, a hand is poised over a wireless mouse, guiding the cursor across the screen. The desk surface is clean except for a few essential items like a keyboard, a notepad, and a cup of pens beside the computer setup."}
+{"index": "COCOval2014000000516542", "data": "A series of colorful signs are lined up along the roadside at the town's edge, each promoting different library events. The signs feature dates and times for upcoming book sales, author readings, and children's story hours. The backdrop to these signs is a quiet street that leads into the heart of the town, with the library itself visible in the distance."}
+{"index": "COCOval2014000000537211", "data": "A serene lakeside setting where a wooden dock extends into the water. A man is casually seated at the edge, enjoying a hot dog, with a relaxed posture that suggests a leisurely day. The water is calm, and the dock appears to be a popular spot for visitors to take in the view and enjoy a bite to eat."}
+{"index": "COCOval2014000000542910", "data": "A woman with an intense expression is interacting with an unseen person to the side of the frame. Her facial expression is one of playful aggression, with her teeth bared in a mock snarl. She appears to be in a brightly lit room, with casual attire suggesting a comfortable, informal setting."}
+{"index": "COCOval2014000000546965", "data": "A large transport truck with a two-tiered trailer loaded with multiple cars of different colors. The truck is parked on a wide, paved area, possibly a parking lot or a vehicle distribution center. Each car is securely fastened to the trailer, ready for delivery to their respective destinations."}
+{"index": "COCOval2014000000559400", "data": "Rows of fresh produce line the interior of a bustling grocery store. Bags filled with crisp apples and plump grapes are neatly arranged on the shelves, inviting shoppers to add them to their carts. The fruits' vibrant colors stand out against the backdrop of other grocery items in the background."}
+{"index": "COCOval2014000000566470", "data": "Two wooden rowboats with a weathered finish are resting side by side on the sandy shore. The boats are empty, with their oars tucked neatly inside, hinting at a recent return from a journey on the water. In the background, the gentle waves of the sea can be seen, with the horizon stretching wide beyond them."}
+{"index": "COCOval2014000000567609", "data": "An interior space featuring a collection of yellow ceramic ornaments neatly arranged on a shelf. In the center of the room stands a table with a clear glass vase filled with a bouquet of fresh flowers. The walls are adorned with framed artwork, complementing the warm tones of the ceramics and floral arrangement."}
+{"index": "COCOval2014000000572260", "data": "A room with walls lined with shelves filled to the brim with an assortment of objects. Each shelf is crowded with items ranging from books and vases to small trinkets and photo frames. The collection appears eclectic, with no immediate sense of order, giving the space a lived-in and personalized feel."}
+{"index": "COCOval2014000000580698", "data": "A large brown bear is partially submerged in a clear, tranquil body of water. The sun casts a warm glow on the bear's fur, highlighting its texture and creating a serene scene. The water ripples gently around the bear as it sits calmly, enjoying the warmth of the sunlight."}
+{"index": "countbench0", "data": "A close-up photograph provides a bird's-eye view of an elegant tea selection box, partitioned into neat compartments. Each section cradles a different variety of tea, carefully wrapped in paper sachets with small labels indicating their flavors. The box itself boasts a light gray base with artistic splashes of vibrant oranges and pinks, adding a touch of vivid color to the array of teas presented within."}
+{"index": "countbench1", "data": "A dynamic composition captures the essence of good fortune with four ace playing cards, each standing upright on its corner, aligned in a perfect formation, reflecting onto the sleek surface below them. The playing cards possess a crisp white background, accented by the vibrant red and black symbols designating hearts, diamonds, clubs, and spades. This visual is set against a stark black and deep red backdrop, creating a striking contrast that emphasizes the cards and their significance. The photograph is credited to Samantha Craddock, encapsulating the 'Feeling Lucky' concept with clarity and artistic flair."}
+{"index": "countbench10", "data": "An elegantly designed bathroom features two white pedestal sinks stationed at either end, framing a custom-built cabinet nestled snugly in between. This unique cabinet is crafted to match the sleek aesthetic of the pedestals, finished in a soft beige hue with silver handles that add a touch of sophistication. It offers the perfect blend of the classic pedestal style with the practicality and storage of a traditional vanity, ensuring functionality without compromising on elegance."}
+{"index": "countbench11", "data": "A set of four green plastic food containers displayed against a stark white background, each captured from a distinct angle to showcase the varying perspectives of their design. The containers exhibit a smooth texture and a slightly reflective surface that catches the light subtly. Arranged neatly, they demonstrate the versatility of their form and capacity through the different foreshortenings presented."}
+{"index": "countbench12", "data": "A collection of eight elegant side chairs from the 1950s, designed by Gianni Vigorelli, each stands 37 1/2 inches tall. The frames are made from pearwood, featuring a warm, golden-brown hue with a smooth finish that highlights the wood's natural grain. The seats and backrests are upholstered in a sleek, black vinyl, providing a comfortable yet durable seating surface. These chairs measure 16 1/2 inches in width and 17 3/8 inches in depth, making them well-proportioned for a variety of dining spaces."}
+{"index": "countbench13", "data": "An assortment of vibrant candies in various shapes and textures, each distinctly highlighted against a pristine white backdrop. The set features nine unique compositions, showcasing the candies from different angles and perspectives, allowing for the play of light and shadow to emphasize their colorful foreshortenings. Among the mix, glistening hard candies reflect the light, soft gummy treats exhibit their stretchy texture, and foil-wrapped chocolates add a metallic contrast to the collection."}
+{"index": "countbench14", "data": "A collection of four intricately designed bookmarks, each featuring black and white doodles of various flowers and ornamental patterns. These bookmarks are tailored for adults who enjoy coloring, offering a creative and relaxing activity. The detailed vector illustrations are perfect for bringing to life with a splash of color, and the bookmarks are crafted to be both functional and aesthetically pleasing."}
+{"index": "countbench15", "data": "Three individuals donned in matching black security uniforms stand against a stark white backdrop. Their focused expressions are partially concealed as they peer intently through binoculars, each facing a different cardinal direction as though scanning for signs of activity. The central figure is slightly elevated, on a raised platform, giving the impression that they are overseeing the area with heightened vigilance."}
+{"index": "countbench16", "data": "A collection of six finely crafted porcelain plates, each adorned with intricate blue and white \"Merryman\" patterns, indicative of their unique and valuable nature. These plates, likely originating from London in the year 1752, have an undeniable historical charm and are thought to be from the Lambay estate. They are estimated to be valued between £20,000 to £30,000, reflecting their rarity and the craftsmanship of the era."}
+{"index": "countbench17", "data": "An array of five monochromatic photographs, captured by the esteemed Gordon Parks, is meticulously arranged on a dark grey wall. Each image serves as a poignant window into the lives affected by poverty in Rio de Janeiro during the tumultuous 1960s. The high-contrast black and white tones of the photos bring stark attention to the subjects within, highlighting the raw emotional intensity and the historical significance of the scenes depicted."}
+{"index": "countbench18", "data": "A cheerful family of four gathered around a wooden dining table set within a brightly lit kitchen. The table is laden with a scrumptious breakfast spread, including bowls of colorful fruit, pitchers of juice, and plates of toast. Two young children, a boy and a girl, their faces lit up with joy, are seated between their parents, eagerly reaching for their favorite dishes. Behind them, the kitchen is cozy and welcoming, complete with white appliances and a vase of fresh flowers adding a touch of warmth to the scene."}
+{"index": "countbench19", "data": "Nine differently colored labels, each featuring the iconographic representation of a Central Processing Unit, aligned neatly for visual comparison. These square icons vary in shades from vibrant red to deep blue, with the CPU symbol prominently displayed in the center. The texture of the labels appears smooth, and they are arranged in a grid pattern on a plain, light background that enhances their visibility in the illustration."}
+{"index": "countbench2", "data": "The sky is adorned with the impressive formation of four sleek Blades aircraft, each painted in vibrant hues of red and white, with their contrails presenting a mesmerizing spectacle as they perform 26 consecutive loops, an endeavor set to break a world record. Mike Newman, who is visually impaired, daringly executed the initial loop with commendable skill before his co-pilot seamlessly assumed control for the remainder of the gravity-defying feat. Onlookers on the ground watch in awe as these agile planes carve perfect circles in the azure canvas above, their engines humming in harmonious unison."}
+{"index": "countbench20", "data": "a collection of four vibrantly colored, circular buttons, each featuring a simplistic heart icon with a prominent plus sign at its center, signaling the option to add to favorites. These flat-designed icons are isolated against a clean background, making them immediately identifiable for user interface purposes. Each button presents a different hue: one is red, another blue, the third one is a sunny yellow, and the last one is green, creating a visually appealing and intuitive set for users to interact with."}
+{"index": "countbench21", "data": "A vibrant tableau depicting ten children of various ages aligned on a long wooden bench in an outdoor setting, each with a unique expression of merriment. The bench, weathered and sturdy, supports their collective weight as they pose for the photo, surrounded by lush greenery and brightly colored balloons tethered to the bench ends. The children are dressed in casual party attire, with several sporting colorful hats, and the scene is illuminated by the warm glow of a string of lights dangling overhead, suggestive of a festive celebration in their midst."}
+{"index": "countbench22", "data": "A digital vector illustration depicting a serene winter scene with six tall spruce trees silhouetted in black. The trees are set against a crisp white background that is dotted with an array of delicate, intricate snowflakes, varying in size and design. This image conveys the stillness and beauty of a snowy landscape without any other elements to distract from the stark contrast between the dark evergreens and the wintry backdrop."}
+{"index": "countbench23", "data": "A colorful collection of four cartoon-styled calendars, each uniquely illustrating the essence of a different season. The spring calendar bursts with shades of green and pink, featuring blooming flowers and sprouting leaves. The summer calendar glows with vibrant sun motifs and vivid blue skies. Autumn is represented with warm oranges and browns, showcasing falling leaves and harvest themes. The winter calendar is adorned with soft whites and blues, depicting snowy scenes and cozy fireside images. Each calendar is distinct, yet they all share a whimsical charm that captures the spirit of their respective seasons."}
+{"index": "countbench24", "data": "A quaint collection of waterproof stickers, each featuring whimsical designs of cats and dogs intertwined with imagery of meaty meatballs and festive New Year couplets. The set comprises a small, curated group of five unique stickers, each with its own vibrant color scheme and glossy texture. These stickers are neatly displayed in a section designated for pet lovers and holiday enthusiasts, providing a charming and functional decoration that can withstand the elements."}
+{"index": "countbench25", "data": "A detailed analysis on the future of Liverpool's striker line-up, contemplating the value and potential each player brings to the team. The focus is on the eight forwards currently with the club, assessing their skills and considering whether they should be retained or sold, especially with the imminent signing of Christian Benteke. Charts and statistics accompany the evaluation, providing a comprehensive overview of the players' performances to inform the decision-making process."}
+{"index": "countbench26", "data": "an elegant set of six vintage silver tablespoons, each intricately crafted with the 1847 Rogers Ambassador pattern. The spoons, exhibiting a fine luster and delicate engravings, lie gracefully aligned on a dark velvet cloth. The ornate design of each handle reflects the care and craftsmanship of the historic silverware, with flourishes and floral motifs adorning the metal surface."}
+{"index": "countbench27", "data": "Seven cylindrical brown glass beer bottles are symmetrically arranged on a reflective surface, casting subtle shadows and clear reflections on the stark white background. Each bottle stands upright, showcasing their identical shape and size, with labels facing forward that hint at their various contents. The glossy finish of the bottles contrasts with the matte texture of the background, emphasizing the simplicity and symmetry of the composition."}
+{"index": "countbench28", "data": "An expansive dining space features a large wooden table with three additional extension leaves, providing ample room for guests. Around the table, there are six matching chairs, each crafted with elegant design, and among them, a Leander high chair stands out, specially designed for a child to join the family meals. The table is set under a hanging chandelier that casts a warm glow over the polished surface of the wood."}
+{"index": "countbench29", "data": "A collection of nine beautifully crafted labels, designed specifically for the summer season, each featuring shades of blue and yellow. These labels are presented in a vector format, with item number 20445318, showcasing various summer-themed icons and motifs. The assortment includes labels of different shapes such as circular, rectangular, and even ribbon-like banners, ideal for summer sales, events, or product packaging in a sunny and vibrant design."}
+{"index": "countbench3", "data": "Ten School Resource Officers are set to serve in seven local school districts, as part of the School Resource Officer (SRO) Program led by Broome County District Attorney Steve Cornwell. The officers, in their distinct uniforms, will be a significant presence in the school environment, providing security and fostering relationships with the students. The schools, each with its unique architectural design and color scheme, will benefit from this added layer of security and community engagement. This initiative is a significant part of the district attorney's efforts to ensure a safe and conducive learning environment for the students."}
+{"index": "countbench30", "data": "A collection of nine vibrant, assorted speech bubble stickers, each displaying the acronym 'LOL' in bold, playful lettering. The stickers feature a kaleidoscope of colors such as pink, yellow, blue, and green, set against a pristine white background, emphasizing their colorful appeal. Each sticker has a unique shape and size, adding to the diversity of the set, and their surfaces are smooth with a slight sheen, suggesting they are made of a high-quality adhesive material suitable for a variety of surfaces."}
+{"index": "countbench31", "data": "an interior design arrangement showcasing a pair of iconic Eames RAR chairs in a sleek black finish, with their characteristic smooth curves and wooden rocker legs. Adjacent to the chairs, there's a contemporary black bedroom furniture set that includes a bed frame, nightstands, and a dresser, all featuring minimalist lines and matte surfaces. The entire setup is part of a home design concept that blends modern aesthetics with classic mid-century elements."}
+{"index": "countbench32", "data": "A carefully arranged still life photo of a natural bird's nest containing three beige, speckled eggs. The nest is positioned on a subdued, dark backdrop to enhance the texture and detail of the twigs and eggs. The lighting is focused to create soft shadows and highlights that reveal the intricate patterns on the eggs and the interwoven structure of the nest."}
+{"index": "countbench33", "data": "A vibrant canvas stretched across a 2cm thick wooden frame, adorned with the illustration of abstract, beautiful female faces depicted on three separate segments of the canvas. The artwork displays a unique juxtaposition of blooming flowers and the delicate features of women, with a rich color palette that includes hues of purple, red, and yellow. Each of the three portions of the canvas seamlessly connects, creating a continuous visual narrative that celebrates feminine beauty and botanical elegance."}
+{"index": "countbench34", "data": "A tranquil pond with vibrant clear blue water, where three pristine white waterlilies float gracefully on the surface. Each waterlily has a perfectly formed shape with delicate petals radiating outwards from a yellow center. The smooth surface of the water reflects the sky above, enhancing the serenity of the scene, as gentle ripples emanate outward from the blossoms."}
+{"index": "countbench35", "data": "A rustic wooden wall displaying a row of five golden stars representing Amazon feedback. Next to the stars, there's a human index finger pointing at the highest star, implying a top rating. The wood grain texture is visible around the shiny golden stars, and the contrast between the natural wood and the metallic gleam of the stars is striking."}
+{"index": "countbench36", "data": "An artistic array of six vibrant mugs featuring a spectrum of colors, each with a smooth, glossy finish indicative of gradient mesh design elements. These mugs are depicted in a two-dimensional vector format, suitable for stock imagery, and their handles are gracefully curved to the right, allowing for easy visual differentiation. The mugs' colors transition flawlessly from one to another, showcasing the use of digital illustration techniques for a realistic appearance."}
+{"index": "countbench37", "data": "Two metal baking sheets rest on a kitchen countertop; one holds a spread of raw, vibrant green broccoli and pale white cauliflower florets, while the other displays the same vegetables transformed by heat, their colors now a deeper green and brown, edges crisped from baking. These trays provide a visual contrast between their pre and post-oven states, showcasing the effects of roasting on their textures and hues."}
+{"index": "countbench38", "data": "An image of a serious-looking businessman, dressed in a tailored grey suit with a crisp white shirt and a dark tie. He is depicted in an unusual setting, lifting two surprisingly heavy dumbbells, his arms flexed showing significant effort. This unexpected juxtaposition against an office background, complete with a wooden desk and a black ergonomic chair, creates a striking visual."}
+{"index": "countbench39", "data": "A collection of images displaying a male nurse, each capturing different poses and facial expressions. He is dressed in a pristine white coat, which contrasts against the deep blue scrubs underneath. In one picture, he holds a stethoscope with a concentrated expression, while another shows him with a reassuring smile, offering a comforting presence. A clipboard is clutched in his hand in a third image, where his brow is furrowed in thought. The background features a clean, clinical setting with beige walls and medical equipment in view."}
+{"index": "countbench4", "data": "A comprehensive PDF Cross Stitch Pattern available for instant download, featuring an assortment of ten meticulously crafted designs. Each pattern showcases a unique cacti or succulent plant housed within a geometric terrarium, optional for those who wish to add an extra layer of complexity to their craft. These patterns embody intricate details that celebrate the natural beauty of desert flora, enhanced by the sharp angles and clear lines of the terrariums."}
+{"index": "countbench5", "data": "A collection of nine vibrant VIP sign icons, indicating membership or exclusive status. Each label varies in color, providing a rainbow spectrum from red to violet, and they are designed with a sleek, glossy texture. The symbols feature a simple, bold font that stands out against the solid background, rendering them ideal for vector illustrations where distinction is key."}
+{"index": "countbench6", "data": "A detailed vector illustration showcasing four separate square compositions, each highlighting different aspects of waste management and recycling processes. The vibrant images feature a variety of flat design icons and pictograms that represent sorting bins, recycling symbols, and cleaning equipment. Each square has a distinct color palette to categorize the type of waste, ranging from organic to plastic, metal, and paper, emphasizing the importance of recycling and proper disposal."}
+{"index": "countbench7", "data": "In the vibrant backyard of a suburban home in Yarmouth, Maine, USA, three children of varying ages (a toddler around 2-3 years old, a preschooler aged 4-5, and a young child around 6-7 years) are gleefully playing in the spray of a water sprinkler. The lawn, lush and green, serves as a soft cushion under their bare feet as they run and jump through the scattering droplets of water. The afternoon sun highlights the joyful expressions on their faces as they engage in this quintessential summertime activity."}
+{"index": "countbench8", "data": "In a spacious living room, two 3-seater sofas with plush white cushions are symmetrically positioned opposite each other, creating a welcoming conversational area. Between them sits a nature grey 39\" coffee table made of Ostuni white mahogany, giving off a rustic yet modern vibe. Surrounding the central coffee table, three matching lounge armchairs with grey upholstery offer additional seating, enticing guests to relax and enjoy the space. The arrangement of the furniture promotes easy movement and social interaction within the room."}
+{"index": "countbench9", "data": "An illustration depicting a group of six stylish girls clad in vibrant, eye-catching swimsuits, each showcasing a unique design and an array of lively colors that stand out against the plain background. The characters are accessorized with various beach-related items such as sunglasses, hats, and beach balls, providing a sense of summer fun. The artwork, created using a vector style, boasts clean lines and a modern aesthetic that captures the energy of a joyful, sunny day at the beach."}
+{"index": "diffusiondb0", "data": "A visually striking digital portrayal of a futuristic woman, designed with an exquisite attention to detail that showcases her elegant pose, encased in an array of meticulously rendered leaves. Produced by the skilled artist Janice Sung, the image is a high-definition 4K creation that glows with stunning volumetric lighting effects. The woman's appearance is characterized by a level of hyper-realism that captures the intricate textures of her skin and the leaves, enhanced by the luminous, fantasy-inspired ambiance."}
+{"index": "diffusiondb1", "data": "In the digital artwork, Pinocchio and Geppetto are portrayed in exquisite detail, riding vintage bicycles along a cobblestone street, reminiscent of the early 20th century. Pinocchio's wooden texture is intricately rendered, with the grain of the wood and the joins of his limbs visible in ultra-high definition. Geppetto, beside him, is clothed in period-appropriate attire, his expression one of joy and concentration. The scene is crafted in 4K and 8K resolutions, boasting lifelike realism and clarity that highlights every nuance of the characters and setting. This vivid representation, popular among enthusiasts on Artstation, showcases a level of detail that elevates it to a piece of highly detailed digital art."}
+{"index": "diffusiondb10", "data": "An impressively detailed pencil illustration of Maggie Smith in the character of Reverend Mother is generating buzz on the ArtStation platform. The artwork, which has garnered awards for its lifelike quality, demonstrates a finesse reminiscent of Artgerm and Greg Rutkowski's dynamic strokes, with compositions that subtly hint at the influence of Alphonse Mucha's style. Its cinematic feel is accentuated by the careful play of light and shadow, earning acclaim and trending status among the art community."}
+{"index": "diffusiondb11", "data": "An ominous figure looms within the frame, a dark demonic entity with an array of infinite, wispy wings that cascade behind it like a shadowy aurora. Each translucent wing shimmers with a sinister glow, suggesting an otherworldly portal to a malevolent dimension, illuminated in an eerie spectrum of dark neochrome colors. The unsettling image, reminiscent of the haunting styles of Beksinski and Gammell, is captured through the lens of a 35mm camera, casting a tangible unease as though the entity could breach the confines of its two-dimensional prison at any moment."}
+{"index": "diffusiondb12", "data": "A high-resolution portrait of the acclaimed actress Julianne Moore trending on Pinterest, captured by the lens of photographer Kyle Thompson. Her hair is styled in full platinum blonde waves that frame her intensely expressive, pale-skinned visage with high detail. The photograph, exuding realism and superior quality, showcases her piercing gaze that imparts a sense of subtle strength."}
+{"index": "diffusiondb13", "data": "a captivating matte painting by Artem Demura, showcasing a dimly lit room filled with tall wooden bookshelves crammed with books of diverse sizes and colors. In the center, a sturdy, aged table is set, its surface cluttered with an assortment of items, perhaps long abandoned. The room is drenched in shadows, but shafts of volumetric lighting pierce through the darkness, highlighting the fine dust particles suspended in the air and illuminating the intricate details of the room's contents."}
+{"index": "diffusiondb14", "data": "This is a vibrant, digitally-created watercolor illustration portraying an apocalyptic scene with sharp focus and a smooth finish. The artwork, by James Jean, features Rossdraws' signature style with elements reminiscent of Frank Frazetta's fantasy aesthetics, incorporating Mcbess's bold linework, and infused with the ethereal quality of Sakimichan's enchantments. The dynamic composition showcases a whirlwind of colors that vividly depicts the chaotic yet mesmerizing moment at the end of the world."}
+{"index": "diffusiondb15", "data": "An awe-inspiring depiction of a gargantuan space squid, its tentacles wrapped tightly around a vividly colored planet, as it seems to consume the celestial body. The backdrop is a tapestry of outer space, scattered with twinkling stars and nebulous clouds, rendered in exquisite detail that highlights the surrealism of this cosmic horror. The artwork, suitable for an 8K resolution display, showcases the level of intricacy often celebrated on platforms like CGSociety, where digital fantasy artistry is pushed to the limits."}
+{"index": "diffusiondb16", "data": "a collection of rare and vibrant CS: GO holo stickers, each emblazoned with the logos of iconic esports teams. The Titan Katowice 2014 sticker shines with a prismatic effect, while the iBuyPower Katowice 2014 sticker boasts a delicate balance of red and black hues. The Reason Gaming Katowice 2014 and LDLC.com Katowice 2014 stickers both display a mesmerizing holographic sheen, capturing the eye with every tilt and turn. These stickers represent a significant piece of esports history and are a must-have for enthusiasts and collectors alike."}
+{"index": "diffusiondb17", "data": "In an expansive sewer bathed in shadow, a mechanical spider looms, crafted with astonishing detail that suggests it could spring to life at any moment. The fine drawing, accentuated with hyper-realistic textures, showcases the intricacy of its design, each metallic segment reflecting the scant warm light that filters through the gloom. The artwork is presented in an ultra-high definition 4K resolution, capturing the essence of this eerie, mechanical wonder in a moody, warm-lit environment."}
+{"index": "diffusiondb18", "data": "A visual timelapse piece crafted in the style of Botticelli's 'Birth of Venus' using vibrant acrylic paints. The scene is bustling with a kaleidoscope of colors, showcasing a central figure that emerges with ethereal grace amidst the bold, sharp focus of its details. Surrounding this figure, the canvas is brought to life with an array of vivid hues and painstakingly rendered unrealistic elements that create an explosion of fantastical imagery."}
+{"index": "diffusiondb19", "data": "A surreal figure appears to be sculpted from intertwining tendrils of gray smoke and whirling flurries of snow, giving the impression of a man caught in a blizzard. In one hand, this ethereal being holds what looks to be a gateway to the cosmos, depicted in a photorealistic manner with vibrant nebulae and star clusters visible within its confines. The entire scene is a highly detailed octane render, showcasing sharp contrasts and the interplay of light and shadow that imbues the image with a sense of depth and complexity."}
+{"index": "diffusiondb2", "data": "An intricate character sheet showcasing a demogorgon design, with artistic influences drawn from creators like Moebius, Greg Rutkowski, Zabrocki, Karlkka, and Jayison Devadas. The concept art, detailed and rich in texture, is vividly rendered in an 8K resolution and features a zenith view, capturing the monstrosity of the creature. The unique pincushion lens effect enhances the ultra-wide angle perspective, making the demogorgon appear even more imposing. This piece is currently trending on Artstation, highlighting the artistic community’s appreciation for Phuoc Quan’s distinctive style."}
+{"index": "diffusiondb20", "data": "The image captures a whimsical scene with a brown tabby cat, its fur patterned in shades of dark brown, black, and light taupe. The cat, situated as if in the throes of space, is portrayed with a transparent, gleaming bubble encasing its head like an astronaut's helmet. Around it, an assortment of smaller bubbles float serenely in the imagined cosmos, with a creatively interpreted Saturn adorned with rings in the backdrop, providing an aura of interstellar exploration."}
+{"index": "diffusiondb21", "data": "A thought-provoking piece of digital art that has gained popularity on ArtStation depicts a surreal scene where an open binder notebook serves as a door, standing incongruously amidst a dense woodland setting. The trees surrounding the notebook are rendered in meticulous detail, their bark dark and textured against the misty backdrop. The overall feel of the image evokes an eerie sense of a thriller, with the peculiar juxtaposition of the school supply and the natural environment inviting viewers to ponder the story behind it."}
+{"index": "diffusiondb22", "data": "an intricately detailed character drawing by Bastien Lecouffe Deharme featuring an elderly gentleman with long, flowing grey hair. He possesses majestic, large grey wings that extend from his back, conveying both wisdom and strength. The refinement of the character is further accentuated by a polished monocle nestled over one keen eye, and he is dressed in sophisticated attire that hints at a bygone era of elegance."}
+{"index": "diffusiondb23", "data": "Augusta National Golf Club is showcased under ethereal conditions, with the renowned first and second holes entirely submerged in water. The scene is illuminated by a soft, ambient glow that filters through the morning fog, casting delicate light rays across the tranquil expanse. The photography captures the unexpected beauty of the iconic golf course, paying homage to the Masters, despite nature's inundation."}
+{"index": "diffusiondb24", "data": "A digital composition featuring a man with finely detailed, lifelike features standing beside a whimsically fantastical creature reminiscent of the renowned Studio Ghibli's style. The creature is adorned with a smooth, glossy coat that gives off the impression of a vibrant array of textures. Both figures are positioned against a stunningly crisp and clear 8k resolution backdrop that accentuates the intricate details and colors of their surroundings."}
+{"index": "diffusiondb25", "data": "an elaborate digital masterpiece that features the artist Tommy Cash, rendered in the distinctive and psychedelic style of Alex Grey combined with the nostalgic Americana of Norman Rockwell. The high-resolution 8K image is bursting with ornate details and intricate patterns, creating a fantasy-like atmosphere. Tommy Cash is depicted with hyper-realistic skin tones and textures, surrounded by a kaleidoscopic array of symbolic elements and vibrant colors."}
+{"index": "diffusiondb26", "data": "In the depicted oil painting, we can observe Yair Lapid, gripping a torch where flames flicker wildly, cast in a dramatic, chiaroscuro light reminiscent of Michelangelo's style. He stands as a central figure amongst a cluster of people who are rendered with great detail, showcasing distinct yet harmonious facial expressions and postures. The background of the scene is a muted blend of earthy tones, creating a stark contrast that highlights the central composition of Yair and his companions."}
+{"index": "diffusiondb27", "data": "A striking image depicting an imagined distant galaxy teeming with life, inspired by the work of the renowned artist Caspar David Friedrich, is showcased in high quality on Artstation. The matte painting presents a vibrant scene filled with ethereal colors and pulsating with otherworldly energy. As viewers delve into the scene, they encounter intricate details of celestial bodies and the silhouettes of people populating this fantasy cosmos."}
+{"index": "diffusiondb28", "data": "An intricate visual display combining the unique styles of Geert Goiris, Sally Mann, and Paolo Roversi, synthesizing into a singular piece of award-winning concept art entitled \"Portrait of Chaos.\" The artwork features a compelling blend of surreal landscapes, enigmatic portraits, and ethereal scenes, each contributing a distinct texture and emotion. The colors in the image are a juxtaposition of muted tones and stark contrasts, with an emphasis on the play of light and shadow."}
+{"index": "diffusiondb29", "data": "a detailed and vividly colored painting by Edward Hopper that captures a group of mallgoths with pale complexions and dark, tattered clothing congregating in a dimly lit Hot Topic store. The scene is set against the backdrop of black and red shelves crammed with band merchandise and gothic accessories, while fluorescent lights cast an eerie glow over the scene. Each zombified figure appears to be engaged in conversation or browsing through the various items, creating a still yet dynamic tableau within the bustling environment of the mall."}
+{"index": "diffusiondb3", "data": "A bustling subway scene comes to life in a digital painting that has become a favorite on Artstation, showcasing a diverse crowd where an Indian woman stands out, attired in a vivid orange traditional garment. Beside her, a Chinese man is depicted in sharp focus, wearing a light blue shirt and holding onto a strap for balance. Concept art by renowned artists Artgerm, Greg Rutkowski, and Magali Villeneuve, the illustration captures the energy of the tube through intricate details and realistic textures. The background is filled with a multitude of passengers, each rendered distinctly to emphasize the crowded nature of the tube."}
+{"index": "diffusiondb30", "data": "A highly detailed photorealistic illustration displaying a cowboy in a dynamic, cinematic lighting setup. The cowboy, rendered in stunning 8k resolution, stands at 6000 mm tall, capturing every facet of his rugged attire from the leather boots to the wide-brimmed hat. In the background, the bokeh effect beautifully blurs the lights, creating a striking contrast with the sharpness of the cowboy's figure in the foreground."}
+{"index": "diffusiondb31", "data": "An imaginative abode, reminiscent of the signature style of the renowned architect Zaha Hadid, floats ethereally above ground on a sizable cloud. The structure boasts a sleek, white, curvilinear form enveloped in lush, vibrant green vegetation, with hanging vines gracefully draping its sides. The house's futuristic design contrasts with the organic tapestry of flora, creating a harmonious blend of nature and avant-garde architecture."}
+{"index": "diffusiondb32", "data": "A digital anime-style illustration of Peter Thiel that showcases intricate details and vibrant colors, trending on ArtStation. He is depicted wearing a meticulously designed Saiyan uniform from the Dragon Ball Z series, with the uniform featuring a prominent blue and orange color scheme. The portrait captures Thiel with a determined expression, his hair styled in the iconic spiky Saiyan fashion, set against a simple, nondescript background to emphasize the character design."}
+{"index": "diffusiondb33", "data": "a highly intricate and vibrant cityscape that reflects a fusion of Moebius's imaginative design and Makoto Shinkai's detailed animation style. The streets are aglow with neon signs in a kaleidoscope of colors, casting reflections on the glossy, rain-slicked pavements. Towering skyscrapers with glowing windows rise towards a starless night sky, as the artwork garners significant attention and praise on ArtStation."}
+{"index": "diffusiondb34", "data": "An exquisite collection of premium 2D magical spell art assets, showcasing minimalistic designs with elegant gradients. Each piece is meticulously crafted, allowing for seamless kit-bash integration into digital projects. These paid assets flaunt a refined aesthetic ideal for enhancing any visual piece requiring a touch of enchantment."}
+{"index": "diffusiondb35", "data": "an intense scene unfolds on the streets of San Francisco, where bursts of orange muzzle flashes cut through the gray smoke billowing in the air. Armed police officers take cover behind their black and white patrol cars, exchanging fire with sharply dressed individuals, suspected members of the mafia, wielding shiny pistols. The chaos is concentrated on a street lined with historic red-brick buildings, their windows reflecting the glaring emergency lights of police vehicles. Nearby, discarded bullet casings glint on the asphalt, evidence of the fierce battle taking place."}
+{"index": "diffusiondb36", "data": "A lively movie set from the 1980's, characterized by vibrant neon lights and retro decor. In the foreground, a group of glamorous actresses donning glittery costumes and bold makeup strike dramatic poses. Behind them, vintage cameras and crew members are captured mid-action, surrounded by era-specific props and posters adorning the walls."}
+{"index": "diffusiondb37", "data": "An artistic representation bringing together the unique styles of Artgerm, Greg Rutkowski, and Alphonse Mucha to showcase a character that bears a striking resemblance to Vitalik Buterin, resembling the fictional creature Gollum. The character is given an elongated, gaunt figure with expressive, large eyes, and is illustrated amidst an Art Nouveau backdrop, reminiscent of Mucha's work. The use of vibrant colors and detailed linework reflects the collaborative influence of the three artists, creating a striking and memorable piece."}
+{"index": "diffusiondb38", "data": "in the whimsical artwork by Remedios Varo, a boy roams through a cobbled street with a giant yellow and black butterfly on a leash. The butterfly towers over the boy, its wings a complex network of vibrant hues and patterns, contrasting with the dull grays and browns of the surrounding buildings. The boy, dressed in a simple green shirt and brown trousers, looks up in awe at his colossal companion as they proceed on their surreal promenade."}
+{"index": "diffusiondb39", "data": "a grainy black-and-white image from a 1920s television show, featuring the iconic figure of Batman. He is captured mid-dance, with exaggerated gestures typical of vaudeville performances of the time. The scene shows him on a simple stage with minimal props, evoking the early days of television when the focus was purely on the performer's charisma and talent."}
+{"index": "diffusiondb4", "data": "A captivating portrait by the acclaimed artist Krenz Cushart, showcasing a young girl with ethereal beauty, gracefully suspended amidst the soft, billowy clouds. Her delicate features are rendered with meticulous attention to detail, enhanced by the artist's intricate brush strokes that bring life to her flowing hair and the subtle play of light across her visage. The artwork, suffused with a dreamlike quality, has garnered widespread admiration and is currently a sensation on the Pixiv Fanbox platform."}
+{"index": "diffusiondb5", "data": "a captivating painting dominated by rich, deep colors and an eclectic mix of artistic styles, featuring a gothic clown girl as the central figure. The clown girl's enigmatic expression is captured through the bold strokes and distorted forms reminiscent of Francis Bacon and Adrian Ghenie's work. Her attire is detailed with intricate patterns and textures, echoing the finesse of James Jean and Petra Cortright's digital artistry, while sections of the background blend seamlessly into the abstract realism characteristic of Gerhard Richter. The subtle, haunting illustrations accompanying the figure bear the distinctive mark of Takato Yamamoto. This 8 thousand-dollar masterpiece is an intriguing blend of western and eastern art influences, creating a visually arresting tableau."}
+{"index": "diffusiondb6", "data": "A digital art piece depicting an anthropomorphic fox character with vibrant orange fur and emerald green eyes, wearing a sleek, silver-hued jacket and holding a neon-lit shopping bag, standing in the midst of a bustling, high-tech mall. The scene draws inspiration from Makoto Shinkai with its depth and use of light, the detailed and realistic texture akin to the works of James Gurney, while incorporating characteristic anime-style aesthetics. Surrounding the fox are other fantastical creatures, each rendered with the distinct, expressive touch reminiscent of Don Bluth's animation. Artists like Hibbary, Dark Natasha, and Goldenwolf could be credited with influencing the fox's lifelike fur and captivating poise. The image would be well-suited for the galleries of FurAffinity, known for its community of artists who celebrate anthropomorphic animals through their creative endeavors."}
+{"index": "diffusiondb7", "data": "An artistic fusion of styles reminiscent of the great masters like H.R. Giger's dark surrealism, Richard Schmid's refined realism, and Jeremy Lipking's contemporary impressionism comes to life in a striking full-length portrait painting. The expansive canvas is filled with loose, impressionistic brushstrokes that capture the haunting beauty of a Victorian-style giant spaceship. The vessel exists in an otherworldly space, its intricate metalwork framed by ethereal swaths of color, giving it an almost dreamlike presence amidst the undefined, loose genre of the painting's background."}
+{"index": "diffusiondb8", "data": "A captivating 1950s-style movie poster that radiates a retrofuturistic charm, adorned with the artistic influence of legends like Moebius, Brom, and Ian Miller. It features a striking character in the foreground, presenting to an audience of intrigued scientists who are seated in a spacious conference room. The poster is illuminated with moody, vibrant colors that bring the scene to life, and every intricate detail is rendered with exceptional clarity in a 4K resolution."}
+{"index": "diffusiondb9", "data": "A digital illustration of a girl features her with vibrant rainbow-colored hair that cascades smoothly down her shoulders. She has two spiraling unicorn horns emerging from her forehead, adding a fantastical element to the portrait. Fresh, vivid colors blend seamlessly in gradients across the composition, showcasing a high level of detail and skill. The image is in sharp focus, with rim lighting that highlights the contours of her face and creates a sense of depth against the softly blurred background."}
+{"index": "drawtext0", "data": "A visually striking isometric design spells out the word 'DRAW' using an array of artist pencils with softly rounded edges, demonstrating the principles of modular constructivism. Each pencil features a pastel color palette, blending harmoniously against a serene blue background. The entire composition benefits from soft, smooth lighting that accentuates the textures and forms, created with a physically based rendering technique that provides a realistic appearance, with the entire artwork centrally positioned within the frame, creating a trendy and aesthetically pleasing image."}
+{"index": "drawtext1", "data": "A visually striking digital art piece featuring the phrase \"It takes AI and rain to make a rainbow\" set against a deep black background. The text is displayed in vibrant, holographic neon colors that shimmer and change as if touched by light, creating an illusion of depth and movement. Surrounding the words are colorful, swirly patterns that give off a magical ripple effect, enhancing the 'bruh moment' of surreal wonder it aims to elicit. Intricate white and gold neon lines weave through the composition, adding an elegant contrast to the bold colors. The entire scene is crafted using 3D computer graphics, achieving a photorealistic quality that makes the elements seem tangible."}
+{"index": "drawtext10", "data": "A complex piece of generative art on a white background, featuring the words \"Time is temporary, everything is temporary\" emerging from a swirl of viscous smoke crafted from an intricate array of dots. It resembles flowing rivers, incorporating elements of graph design that give it an analytical yet abstract aesthetic. The typography has a fluidity that suggests impermanence and the fleeting nature of existence."}
+{"index": "drawtext11", "data": "An aerial view of Toronto's skyline dominated by the iconic CN Tower standing tall amongst the surrounding buildings. The image is taken from the window of an airplane, providing a clear, bird's-eye perspective of the urban landscape. Across the image, the words \"The CN Tower\" are prominently displayed in the playful Comic Sans font. The cluster of city structures is neatly bisected by the glistening blue ribbon of a river."}
+{"index": "drawtext12", "data": "a tranquil cityscape with high-rise buildings silhouetted against the evening sky. In the foreground, a large, fluffy, solitary cloud hovers subtly, its edges tinged with a golden hue from the setting sun. Below the cloud, in elegant, rounded cursive letters, the words 'contemplate the clouds' invite onlookers to pause and reflect amidst the urban environment."}
+{"index": "drawtext13", "data": "A robust, powerful vehicle with a matte black finish and oversized tires sits imposingly in the frame, its structure evidently designed for rugged terrains. Bold, white lettering on the side of the truck declares \"I'm a truck, not a car,\" asserting its identity with pride. Its exterior is characterized by reinforced bumpers and a raised suspension, showcasing its capability to tackle challenging off-road conditions."}
+{"index": "drawtext14", "data": "A beautifully aged antique book is positioned carefully for a studio close-up, revealing a rich, dark brown leather cover. The words \"Knowledge is Power\" are prominently featured in the center with thick, flowing brushstrokes, gleaming in opulent gold paint. Tiny flecks of the gold leaf can be seen scattered around the ornately scripted letters, showcasing the craftsmanship that went into its creation. The book is set against a plain, uncluttered background that focuses all attention on the intricate details of the cover's design."}
+{"index": "drawtext15", "data": "A detailed painting from the 17th century in the French Baroque style, featuring an imposing female lion with a thick golden mane and deep, expressive eyes. The majestic lion is depicted with soft, intricate brushstrokes against a backdrop of rolling hills and a cloudy sky. In a humorous contrast to the grandeur of the scene, a whimsical speech bubble emerges from the lion's mouth containing the word \"meow\" in elegant script."}
+{"index": "drawtext16", "data": "In the image, a sleek, metallic humanoid robot stands before a dusty chalkboard, on which it has carefully scrawled 'Representation Learning' in eloquent cursive. Surrounding the central text are several complex mathematical formulas and intricate diagrams, each contributing to a lecture on advanced concepts in machine learning. The robot's articulated fingers hold a piece of white chalk, leaving a trail of fine dust as it continues to write."}
+{"index": "drawtext17", "data": "A vibrant and playful three-dimensional rendering of the word 'fuzzy' made from an assortment of colorful furry spheres, each varying in size. The spheres are clumped together to form chunky, organic letter shapes that appear soft to the touch. The whimsical text is centered within the frame, casting a slight shadow on the plain background, enhancing its three-dimensional effect."}
+{"index": "drawtext18", "data": "an animated image depicting a green turtle with a perplexed expression on its face. The turtle is standing upright on two legs and has a large, transparent thought bubble above its head, filled with the paradoxical question, 'what if there was no such thing as a thought bubble?' Surrounding the turtle, there are simplistic drawings of grass and flowers, emphasizing the cartoonish nature of the image."}
+{"index": "drawtext19", "data": "a vibrant field of tall sunflowers standing against a clear blue sky, with one large, cheerful flower in the foreground about to be overwhelmed by an approaching green tractor. the sunflower's bright yellow petals and deep brown center contrast the cold metal of the tractor's body. the ominous caption 'after the sunflowers, they will come for you' overlays the image, giving an eerie foreshadowing to the scene."}
+{"index": "drawtext2", "data": "On a stark white wall, the phrase \"Art is never finished, only abandoned\" comes to life through an array of dynamic paint splatters. The mural, reminiscent of graffiti art, is infused with muddy colors that give it a textured, woodcut appearance. Surrounding the bold, handcrafted letters, a spectrum of colors blend and bleed into each other, creating a visual dance on the edge of abstraction. The artwork stands out as a vivid and beautiful example of spectral color usage, drawing the viewer's eye into a world of continuous creation."}
+{"index": "drawtext20", "data": "A digital image depicts an animated panda character, standing at a podium in a spacious conference room, giving a presentation to an invisible audience. The panda is adorned with a blue tie and is pointing towards a large projector screen that displays bold text stating 'Diffusion Models – in the style of van Gogh,' with swirling patterns reminiscent of the famous Dutch painter's work in the background. The room is lined with grand windows showing a clear day outside, and rows of empty black chairs face the presenting panda, suggesting an air of anticipation for the talk."}
+{"index": "drawtext21", "data": "An ornate representation of the Taj Mahal intricately positioned at the center of a gold leaf mandala, which showcases an array of symmetrical patterns and delicate filigree. Surrounding the central image, the mandala's design features accents of vibrant blues and reds alongside the gold. Below this striking visual, the words \"Place of Honor\" are inscribed in an elegant, bold script, centered meticulously at the bottom of the composition."}
+{"index": "drawtext22", "data": "A complex sculpture resembling a human brain, intricately woven from silver wires and sheets of off-white paper, strategically folded and molded. Each convolution of the brain is meticulously crafted, with the phrase 'deep thoughts' inscribed in a flowing script along the cerebral folds. This thought-provoking art piece is centrally displayed on a plain pedestal, highlighting its detailed craftsmanship and the profundity of its intended message."}
+{"index": "drawtext23", "data": "a vibrant garden filled with an array of colorful flowers meticulously arranged to spell out the word 'peace' on the lush green grass. The garden is enclosed by a white picket fence and surrounded by tall trees that sway gently in the breeze. Above, against the backdrop of a blue sky, whimsical clouds have been shaped to form the word 'tensions', contrasting with the tranquil scene below."}
+{"index": "drawtext24", "data": "An intricate display of grapevines artfully shaped to form the phrase \"open your mind,\" emerging from the top of a sculpted head. The head is adorned with colorful flowers where butterflies are delicately perched, adding life to the scene. The DSLR captured image showcases the crisp textures and vibrant hues against a softly blurred background, accentuating the central theme."}
+{"index": "drawtext25", "data": "A close-up view of a canvas where the word 'swirl' is artistically represented with muted pastel colors, such as light pink, baby blue, and soft yellow, mixed elegantly into a white background. The paint appears thick and tactile, giving it a 3D globular texture that seems almost liquid in form. The colors gently intertwine with each other, following the shape of the letters, creating an appealing visual of blending hues."}
+{"index": "drawtext26", "data": "A digitally rendered image of a whimsical toothpaste tube figurine that boasts a candy pastel color palette. The figurine is set against a soft, neutral background, enhancing its playful charm. On the body of the toothpaste tube, bold letters spell out the reminder 'brush your teeth,' inviting a sense of dental care responsibility. The tube cap is carefully designed to exhibit a realistic, shiny texture, creating a striking contrast with the matte finish of the tube itself."}
+{"index": "drawtext27", "data": "an image capturing an assortment of trees, ranging from young saplings to full-grown specimens with thick trunks and sprawling branches. The trees are pictured in a lush green forest, where each tree stands at a different height, signifying their unique stages of growth. The photo's emphatic caption, 'growth is a continuous process,' is etched at the bottom, inspiring a reflection on development and progress."}
+{"index": "drawtext28", "data": "Sitting at one end of a wooden park bench, the perspective is directed upwards towards a clear blue sky with a few fluffy clouds drifting by. In the expanse of the sky, the inspirational phrase 'imagine the outcome' appears, almost as if written by an airplane's smoke trail. The bench, with its weathered slats and cast-iron arms, provides a tranquil spot for contemplation within the grassy expanse of the park."}
+{"index": "drawtext29", "data": "An artist's rendition of the Hubble Space Telescope, bathed in the glimmer of distant stars set against the backdrop of the sprawling Milky Way galaxy. The image is rich with hues of blue and purple nebulas, and the telescope appears as a sophisticated structure with reflective solar panels. Overlaying this cosmic panorama is bold, inspirational text that reads 'The Universe is a Mystery, But We Are Here to Solve It', signifying the relentless human pursuit of knowledge beyond our world."}
+{"index": "drawtext3", "data": "an architectural blueprint displaying a simple house design, with clean lines indicating a large triangular roof atop square walls, and a rectangular base representing the floor. The drawing is executed with precise, thin blue lines on a white background, giving it a technical and minimalist appearance. Alongside the schematics, there's handwritten text that reads 'this house is built on the principles of abstraction', suggesting an artistic or philosophical approach to the building's design."}
+{"index": "drawtext30", "data": "A close-up image depicting two hands against a muted background, with one hand delicately gripping a bright red heart and the other securely holding a jagged yellow lightning bolt. Above them, bold lettering in a strong, contrasting color proclaims 'love is power'. The text is stylized with an energetic font that echoes the dynamic essence of the symbols cradled in the hands."}
+{"index": "drawtext31", "data": "A close-up of a folded newspaper with a bold headline 'Aliens Found in Space' sprawled across the top in large black letters. The subheading underneath reads, 'The Truth About Everything Now Challenged', printed in a smaller yet prominent font. The newspaper, with its slightly crumpled texture, lies on a wooden table surface, surrounded by a scattered assortment of coffee mugs and pens."}
+{"index": "drawtext32", "data": "An image of a newspaper lies flat, its bold headline 'Local pig eats prize pumpkin' emblazoned across the top in large lettering. Below the headline, there's a photograph capturing a pink pig with muddy spots, surrounded by the remnants of a once massive, bright orange pumpkin, now half-devoured. The paper appears slightly crumpled, emphasizing its texture, with the photograph and text clearly visible against the off-white background of the newsprint."}
+{"index": "drawtext33", "data": "A creative studio photograph featuring tactile text spelling 'hello' with vibrant, multicolored fur that stands out boldly against a pure white background. This playful image is showcased within a unique frame made of equally fluffy material, mimicking the texture of the centerpiece. The whimsical arrangement is perfectly centered, lending a friendly and inviting vibe to the viewer."}
+{"index": "drawtext34", "data": "A vibrant parrot perched confidently on the wooden railing of an old pirate ship, its feathers a bold mixture of greens, blues, and reds. It's donned a small, comically endearing pirate hat atop its head. The backdrop is filled with the taut ropes and billowing sails of the ship, and emblazoned across the image is a humorous caption declaring, \"I'm the captain now.\""}
+{"index": "drawtext35", "data": "a colorful bowl filled with milk and alphabet-shaped cereal pieces floating on the surface. Amongst the scattered letters, the word 'smackeroo' is carefully arranged in the center of the bowl. The bowl sits on a light-colored table, with a silver spoon resting beside it."}
+{"index": "drawtext36", "data": "A unique perspective of a large, irregular-shaped gray stone casting a long, imposing shadow over the ground, as seen from the vantage point of an ant. The sun's position gives the shadow a stretched appearance, trailing across the textured dirt surface speckled with tiny pebbles and grass blades. The caption 'look at that shadow!' humorously emphasizes the grandeur of the scene from the ant's perspective."}
+{"index": "drawtext37", "data": "a simple white background highlighting a hand-drawn black circle that encompasses the playful, cursive text 'infinity makes me happy'. The words are crafted to resemble a quick, personal brush script, giving it a casual and intimate feel. Just outside the circle, faint sketch marks are visible, implying a human touch in the creation of this design."}
+{"index": "drawtext38", "data": "a dense forest shrouded in darkness, illuminated only by a single light glowing faintly in the distance. the trees have thick, twisted trunks and are densely packed, their branches creating a canopy that shades the forest floor. clearly visible against this shadowy backdrop is the stark white text \"I've come to talk with you again,\" evoking a sense of solitude and mystery."}
+{"index": "drawtext39", "data": "a small green lizard basking on the white, pentagonal home plate of a baseball field, with the words 'made it safe' written in a cartoonish speech bubble above its head. surrounding the home plate is a well-trodden dusty red infield, and in the background, the neatly manicured outfield grass stretches out. beyond the field, a row of bleachers can be seen, hinting at the presence of an eager audience awaiting the next play."}
+{"index": "drawtext4", "data": "A vibrant, wide-angle studio shot capturing oversized, three-dimensional letters spelling out \"colorful\". Each letter is meticulously crafted from an assortment of fuzzy spheres, varying in size and hues, giving the text a rich, tactile appeal. These chunky elements are perfectly aligned in the center of a square canvas, creating a dynamic and visually engaging composition."}
+{"index": "drawtext5", "data": "A metallic robot with a rounded head and drooping shoulders stands on a stainless steel assembly line designed for butter packaging. Surrounding the robot are tubs of spread, some of which have spilled onto the conveyor belt, causing a minor disruption. The robot's digital face displays a frown, and above it, a flashing red light signals a malfunction in the process. Large, cartoonish speech bubbles emerge from the robot, playfully emblazoned with the words, \"I can't believe it's not butter!\" in bold, white letters."}
+{"index": "drawtext6", "data": "An artistic display featuring light magenta and blue paint streaks applied with a wide brush, creating a translucent effect as they overlap on a sheet of plastic configured into the shape of the letter 'F'. This creative piece is set against a pure white background, which accentuates the colors and the unique translucency of the materials. The edges of the paint strokes are soft and blend seamlessly into one another, giving the artwork a sense of fluidity and motion."}
+{"index": "drawtext7", "data": "an aerial vehicle with the inscription 'helicopter tours' emblazoned along its side is captured in the action of descending onto a circular helipad that's nestled in the midst of a verdant valley. The expansive landscape surrounding it includes a meandering river, clusters of dense trees, and majestic mountains standing tall under the clear skies. Sunlight shines off the helicopter's glossy exterior, accentuating its deep blue and white colors as it prepares for a gentle touchdown."}
+{"index": "drawtext8", "data": "An intricately designed sculpture of the letter W, rendered in three-dimensional form, is masterfully crafted from a thin wire with an iridescent light metallic chrome finish. The isometric perspective emphasizes the sculpture's geometric precision, showcasing its ultra-detailed structure. It stands prominently against a deep, dark background, further accentuating the reflective and shimmering quality of the wire material."}
+{"index": "drawtext9", "data": "A colorful scene at the shoreline with a red crab sitting on the golden sand, beside a vibrant turquoise surfboard. The sun, resembling a massive, glowing orange orb, hangs low in the sky, which is painted with a spectrum of a rainbow's hues. Thought bubbles appear above the crab, filled with the words 'you are all that matters', hinting at a whimsical, introspective moment on this picturesque beach."}
+{"index": "localized0", "data": "A vibrant outdoor field with lush green grass and neatly painted boundary lines. Numerous athletic men, donned in brightly colored sports attire, are energetically chasing after a spherical ball under the bright daylight. The background is a soft focus, enhancing the dynamic movement of the players in the foreground. Surrounding the playing area, there are scattered equipment and water bottles, indicating a serious game is in progress."}
+{"index": "localized1", "data": "A delectable meal is prepared on a round white plate, featuring a savory food item gently positioned next to a small ceramic bowl. In the foreground, a second plate is meticulously set with a silver fork resting atop its rim. To the right of this setting, a stainless steel knife shines beside an array of other dining utensils and accouterments arranged neatly on a polished wooden table."}
+{"index": "localized10", "data": "An intricately edited photograph captures a bustling road lined with an array of vehicles such as sleek cars, motorbikes, and bicycles. In the background, a tall concrete wall runs alongside the road, interrupted by a solitary lamppost that stands erect. Lush green trees can be seen peeking over the top of the wall, with bits of the sky and other structures subtly visible through the foliage. Various signs and billboards are also dotted along the periphery of the road, adding to the urban landscape of the scene."}
+{"index": "localized11", "data": "The image displays a vibrant array of multicolored flowers and lush green leaves clustered at the lower section. In the bottom right corner, a small, round, terracotta pot peeks into the frame, providing a contrast to the natural elements. The floral bouquet features petals ranging from deep purples to bright yellows, with a variety of leaf shapes and sizes nestled amongst them."}
+{"index": "localized12", "data": "A vivid green iguana is perched motionlessly atop a worn wooden log, its intricate scales exhibiting various shades of green and black. Behind the reptile, a rough-textured wall stands, painted in a faded color, which contrasts with the image's predominantly dark backdrop. The shadows envelop the surroundings, highlighting the iguana as the central focus of this composition."}
+{"index": "localized13", "data": "In the foreground, two birds with vibrant feathers are perched upon rugged grey rocks that jut out near a tranquil pond with lush green plants at the water's edge. In the midground, a rustic wooden fence creates a boundary line, subtly dividing the natural scene from the world beyond. The background extends into a vast expanse of soft blue sky dotted with tufts of white clouds, stretching far into the horizon."}
+{"index": "localized14", "data": "In the photograph, there's a round, brown plate that is neatly arranged with an assortment of vibrant, colorful food. Positioned against a stark white background, the plate's contents stand out, creating a striking contrast. The surface of the food gleams slightly, suggesting a freshly prepared meal waiting to be enjoyed."}
+{"index": "localized15", "data": "In the image, a sleek white car sits parked on a smooth concrete surface, its polished exterior reflecting the sunlight. Behind the vehicle, a tall, weathered wall stands, painted in a faded blue with peeling patches revealing its past layers. The car's position, about an arm's length away from the wall, creates a clear spatial separation between the two."}
+{"index": "localized16", "data": "In the foreground of the image, a variety of colorful fruits are scattered across a wooden table, with their fine details and textures in sharp focus. The background features a blurred arrangement of kitchenware and a pastel-colored wall, providing a soft contrast to the vivid sharpness of the fruits on the table. The diffused light gently illuminates the scene, highlighting the smooth skins of the fruits and casting subtle shadows upon the wooden surface."}
+{"index": "localized17", "data": "A closer look at the ground reveals a scattering of rocks in a mosaic of colors and shapes. Some are small, weathered pebbles tinged in earthy browns and soft grays, while others are larger, jagged stones with hues of deep red and speckled granite. The varied textures are evident, from smooth, water-worn surfaces to the coarse, granular feel of the larger boulders, all lying haphazardly on a bed of sandy soil."}
+{"index": "localized18", "data": "The photo captures a tranquil natural setting with an expanse of green grass, which leads to an assortment of vibrant plants at varying heights. Scattered throughout the scene, autumnal dried leaves are strewn among the grass, hinting at the change of seasons. Towering trees with diverse foliage create a textured canopy against the clear blue sky, offering a snapshot of a diverse ecosystem."}
+{"index": "localized19", "data": "A modern electronic device featuring a sleek grey panel, which is adorned with bright, illuminated text offering clear instructions. Surrounding the text, there's an array of tactile buttons in black and red, and several toggle switches with orange indicators that provide user control. The arrangement of buttons and switches appears organized, facilitating ease of use for various functionalities."}
+{"index": "localized2", "data": "A modern interior scene featuring a sleek, white pedestal standing squarely on a polished concrete floor. Atop the pedestal rests an avant-garde, mesh-like structure, its curves and interwoven lines resembling a futuristic sofa. Behind this unique piece of furniture, there is a crisp white wall, upon which hangs an assortment of small, framed pictures and decorative shelves holding minimalist ornaments."}
+{"index": "localized20", "data": "A textured concrete wall serves as a canvas for vibrant graffiti. The colorful artwork features a detailed portrait and bold lettering with various hues of blue, orange, and pink. Surrounding the graffiti, there are signs of age on the wall, including slight wear and chipped paint, highlighting the contrast between the old surface and the fresh paint."}
+{"index": "localized21", "data": "The expansive surface appears to be a wall constructed from weathered wooden planks, each exhibiting unique patterns of grain and knots. The rustic wooden barrier casts subtle shadows, hinting at its rough texture and the natural variations in its warm brown tones. No other objects are in view, putting the entire focus on the wooden wall's organic and sturdy character."}
+{"index": "localized3", "data": "In a grassy outdoor setting, several individuals are wearing protective helmets and standing near vivid blue and yellow inflatable pillars. In the foreground, there's a semblance of a game or event taking place amidst the greenery. Towards the back, multiple tents are set up alongside a tall bamboo structure, and colorful posters are visible, fluttering in the gentle breeze."}
+{"index": "localized4", "data": "In this monochromatic photograph, an array of vehicles, including cars and motorcycles, are captured against an urban backdrop. The background features an assortment of streetlights casting a soft glow, utility poles rising towards the sky, stacked logs waiting to be moved, and the silhouettes of various walls that add to the complexity of the scene. The foreground is dominated by a stretch of road that guides the viewer's eye through the image, paving a path amidst the diverse elements contained within this black and white tableau."}
+{"index": "localized5", "data": "Foregrounded in the image, a vibrant display of flowers, their petals ranging from a delicate pink to a deep magenta, bloom alongside green buds on slender stems. Rising majestically behind the botanical array are silhouettes of verdant trees, some houses with red-tiled roofs peeking out among them, and the faint outline of distant mountains. Above this serene landscape, the expansive sky is adorned with fluffy, white clouds drifting gently across a soft blue canvas."}
+{"index": "localized6", "data": "The photograph captures a wide expanse of sky with trees dotting the horizon, their leaves vibrant green against the blue backdrop. Below, neatly stacked boxes rest against a black metal railing, which runs parallel to the lush grass. Scattered around the vicinity, miscellaneous objects are haphazardly placed. The composition is framed with elegant black borders on both the left and right sides, giving the image a structured finish."}
+{"index": "localized7", "data": "The focal point of the image is a colorful Rubik's cube with a mix of unsolved red, blue, green, and yellow squares. Just beneath the cube, the texture of a soft, dark cloth can be observed, providing a contrast to the cube's vivid colors. The backdrop is enveloped in shadows, accentuating the cube's brightness. To the lower portion of the frame, a nondescript black object is partially visible, adding a sense of depth to the composition."}
+{"index": "localized8", "data": "In the visual, an object with a vivid orange hue and intricate patterns catches the eye. The bottom left portion of the object bears inscriptions in a darker tone, offering a clue to its purpose or origin. It stands out boldly against a stark white background, providing a striking contrast that accentuates its design and color."}
+{"index": "localized9", "data": "Centrally placed on a raised platform, a sleek red sports car gleams under bright lights. The platform is surrounded by railings, beyond which a small crowd of people are scattered, some intently observing the car, others engaged in conversation. Advertising banners and promotional stands can be glimpsed in the background, flanking a large white tent that is set up near a modern building structure with glass facades."}
+{"index": "midjourney0", "data": "An intricate tower crafted of white cloth and rope stands with an ethereal presence against the stark tundra backdrop, with windows and ramparts woven into its design. The structure appears to float just above the muted colors of the earth, with clouds swelling around it, adding a voluminous depth to the scene. The photograph boasts an impressive dynamic range, capturing the subtlest nuances from the brightest cloud to the deepest shadow. Reminiscent of a Studio Ghibli fantasy blended with the magical realism of a Michael Parkes painting, the image is strikingly detailed and remarkably photorealistic."}
+{"index": "midjourney1", "data": "An extraordinary rendition of Melbourne's Southern Cross Station presented from a bird's-eye view, encapsulated by the signature aesthetics akin to the works of Makoto Shinkai. The image boasts a resolution of 8K, delivering an ultra-detailed and sharply defined portrayal that captures even the subtlest of features. The station and its surroundings are bathed in epic lighting that casts dramatic shadows and projects vivid light refractions across the scene, offering a sense of hyperrealism that's enhanced by the ultra uplight effect. Each element within the composition is rendered with high fidelity, giving life to a photorealistic scene that is both captivating and intricately depicted."}
+{"index": "midjourney10", "data": "An epic scene unfolds, styled in the manner of Richard Schmid, where a fierce battle rages under a tempestuous sky pouring rain. Amidst the chaos, a colossal cave troll roars defiantly, towering over the Viking warriors who are locked in combat with their enemies. The backdrop is a city engulfed in flames, with thick smoke rising into the stormy atmosphere, all rendered in a vivid, cinematic matte painting. The gloomy weather and the city's destruction intensify the drama of the Viking army's relentless fight."}
+{"index": "midjourney11", "data": "a distorted, low-resolution video feed from a trail camera, displaying a bizarre scene of animated minion figurines moving erratically within a crop circle that appears cursed and sinister. the image is plagued with digital artifacts, creating a grainy texture that adds to the unsettling atmosphere. flashes of datamoshing cause the figures to appear fused and deformed, creating a chilling effect as they twist and contort in unnatural ways."}
+{"index": "midjourney12", "data": "In the midst of an enigmatic scene, a mystical cat sits enveloped by undulating curls of vibrant smoke in hues of purple, blue, and green. Its eyes shine luminously, emitting an aura of enigmatic chaos magic that seems to dance around its sleek, shadowy fur. At the forefront of this visual marvel, an emblematic 'all seeing eye' is prominently featured, adding to the arcane theme. The background fades into a blur, creating a shallow depth of field that places the focus squarely on the eerie yet captivating feline figure."}
+{"index": "midjourney13", "data": "A vivid and eclectic depiction of a fisherman's village drawn by Ralph Steadman, composed of dark velvet hues, contrasting with splashes of bright yellow and teal. In the foreground, a tall coconut tree sways, its silhouette framed against the backdrop of a small, bustling island. A sturdy jetty extends into the water, where several fishing boats rock gently with the waves."}
+{"index": "midjourney14", "data": "A mannequin dressed in a black latex maid's uniform stands prominently in a photography store, displaying a chic hat positioned at an angle and a fashionable hip skirt. The figure is surrounded by suspended groceries appearing as if frozen mid-air, creating a dynamic storefront arrangement. Around the mannequin are shelves stocked with well-known brands of photographic supplies, such as Kodak and Fuji film, while posters advertising the latest IMAX releases and redshift photography techniques adorn the walls, promoting a high level of photorealism in their imagery."}
+{"index": "midjourney15", "data": "An imaginative digital artwork that features a fantasy female character, stylized with the intricate detail and ethereal qualities reminiscent of Peter Mohrbacher's angelic designs. The scene is richly textured with the layered brushwork akin to Craig Mullins, while the character is positioned in a whimsical world that pays homage to Studio Ghibli's magical backdrops. The color palette and decorative patterns echo the opulent artistry of Gustav Klimt and the flowing elegance of Alphonse Mucha, while the bold contrast and shadowing are influenced by Mike Mignola's distinctive style. Frank Frazetta's dynamic forms and Boris Vallejo's muscular fantasy realism influence the figure's pose and musculature, creating a visually arresting tableau with extreme detail and depth."}
+{"index": "midjourney16", "data": "A digitally rendered image of the iconic Monalisa capturing a selfie, boasting 8K resolution and hyper-realistic details that highlight the delicate textures of her skin and the intricate fibers of her clothing. The scene is illuminated with cinematic lighting that casts soft shadows and enhances the depth of field, giving a three-dimensional quality to the image. The background is blurred artfully, drawing full attention to her enigmatic expression and the modern device in her hands, all created with the precision of an Octane render engine."}
+{"index": "midjourney17", "data": "An intricately designed airship, with sleek steel panels and ornate golden trims, hovers gracefully above a bustling port. The city skyline, a fantastical fusion of floating islands and elevated platforms, echoes the artistic vision of Ivan Shishkin's creations on ArtStation, reminiscent of the game Bioshock Infinite. Captured with the depth of field effect of a 35mm lens, the image exudes a cinematic quality, with the airship’s cables and anchors creating a stark contrast against the backdrop of the sky-high metropolis."}
+{"index": "midjourney18", "data": "An intricately detailed representation of the Marvel character Ghost Rider featuring a human skull, with flames licking around the contours of the skull and rising above it in a fierce expression of fiery vengeance. The skull, alight with bright orange and yellow tones, dominates the image with its full head in view, set against a stark, void-like background that accentuates its fierceness. The character concept art captures both the macabre essence and supernatural intensity of the spectral figure."}
+{"index": "midjourney19", "data": "Visually intriguing macro photography captures the essence of mystical waves, seemingly afloat and intertwined with delicate, hair-like particles. Shades from a psychedelic color palette entwine, creating a stunning and surreal scene reminiscent of twilight. These vibrant waves, rendered with high-octane graphical precision, offer a hyper-realistic glimpse into an otherworldly dimension."}
+{"index": "midjourney2", "data": "An anthropomorphic wolf, clad in a crisp white shirt and a patterned tie, sits elegantly on a subway train designed in the distinctive styles of Wes Anderson and Moebius. The setting of the New Yorker Magazine cover is depicted through a silkscreen print that showcases an image transfer technique, embodying a realistic comic illustration with intricate details. The wolf's poised demeanor contrasts with the bustling subway environment, where every texture and pattern speaks to the meticulous attention to detail characteristic of the illustrative work."}
+{"index": "midjourney20", "data": "A grand, sprawling landscape inspired by the iconic style of Hayao Miyazaki's \"Nausicaä of the Valley of the Wind\" and the \"Breath of the Wild\" from The Legend of Zelda series. The scene blends the fantastical elements of Studio Ghibli's post-apocalyptic setting with the vibrant, open-world aesthetic found in the game. Towering, ancient trees with twisted roots rise from the earth, while bioluminescent creatures add a touch of surreal luminance, hinting at an adventure awaiting at the edge of the world."}
+{"index": "midjourney21", "data": "a magnificent underwater sculpture depicting an angel with intricately designed wings that mimic the delicate structures of coral. The statue is positioned in a way that creates a majestic silhouette against the filtered light from above, evoking a sense of awe and mystery. This visual composition, with its dramatic upward angle and the interplay of shadow and light, brings to mind the stylistic elements associated with the cinematic aesthetics of the film \"Blade Runner 2049.\""}
+{"index": "midjourney22", "data": "Entwined in an enchanting dance, colorful whisps of light with a luminous glow twist and swirl against the backdrop of a deep, dark void. The edges of these curling tendrils shimmer with an occult-inspired essence, sparkling with what could be described as chaos magic. Each strand appears in sharp focus against a background that dissolves into the shallow depth of field, highlighting the ethereal play of light and shadow."}
+{"index": "midjourney23", "data": "An incredibly detailed digital artwork depicting an enormous skyscraper soaring into the sky, set against the background of an industrial power station as imagined by visionary artists Peter Elson, Chris Moore, and Jim Burns in 4K resolution. The skyscraper is adorned with intricate designs, reflective glass windows, and numerous protruding antennas. The power station at its base is a complex of pipes, wires, and glowing energy cores, showcasing a hyper-realistic portrayal of future technology."}
+{"index": "midjourney24", "data": "A meticulously crafted Art Nouveau screenprint featuring a dog's face, characterized by its remarkable symmetry and elaborate detailing. The canine visage, which is the central motif of the piece, exhibits intricate linework and stylized features typical of the Art Nouveau aesthetic. The artwork is deftly rendered in a harmonious palette, with each element of the design echoing the balanced and ornate nature of the style."}
+{"index": "midjourney25", "data": "A detailed 4K resolution portrait of a character concept art dubbed \"Under The Dreaming Tree,\" characterized by its symmetrical design and realistic texturing. The intricately drawn character is situated centrally in the composition, surrounded by an ethereal backdrop that features the namesake tree with its expansive, leaf-laden branches. The character's visage displays a serene expression, with eyes that seem to reflect a hidden world within."}
+{"index": "midjourney26", "data": "An expansive palace constructed from iridescent materials that shimmer with hues reminiscent of a vivid, Slime-like substance, majestically stands at the heart of a fantastical realm. Its towers twist skyward, defying conventional architecture with their organic, flowing forms. In the foreground, a field of exotic flowers blooms, each petal displaying an array of otherworldly colors that could have been plucked from a Lovecraftian spectrum, while overhead, a radiant sun bathes the surreal landscape in brilliant light."}
+{"index": "midjourney27", "data": "An aerial view captures the Chernobyl station in the gentle embrace of spring, framed in the distinct and meticulous symmetry reminiscent of a Wes Anderson tableau. The imposing structure is centered between the burgeoning greenery that flanks it, under the vast expanse of a midday sky punctuated by towering cumulonimbus clouds. The scene is a study in contrasts, with the stark industrial facade of the station juxtaposed against the whimsical patterns of nature reasserting itself."}
+{"index": "midjourney28", "data": "A vintage full-page schematic drawing of a virtual reality headset from the 1800s, rendered in precise, symmetrical line art occupies the gray paper. The intricate details of the design are captured in the fine lines and annotations that fill the page. Surrounding the central image, there are smaller diagrams and text that provide additional explanations and specifications for the headset's components."}
+{"index": "midjourney29", "data": "A voluminous library, bathed in soft, muted blue hues, stretches majestically across a spacious Victorian-style living room. Intricately carved wooden bookshelves, filled with an array of books of various sizes and colors, line the walls and a grand fireplace sits as the room's focal point. The vray render showcases hyperrealistic textures, from the plush, patterned rug beneath the polished wooden tables, to the delicate fabric of the antique armchairs scattered throughout the space."}
+{"index": "midjourney3", "data": "An evocative tintype photograph by Ansel Adams, capturing the chilling tableau of gaunt, skeletal figures with the mythical visage of Icarus, their bodies conjoined in a macabre ballet as they plummet through a midnight sky. The figures are adorned with an array of feathered wings that are sprawling from their emaciated frames in a desperate, though futile, attempt at salvation. High levels of detail reveal the intricate interplay of shadow and light, casting an otherworldly glow across the scene that harkens back to the haunting visual narratives of the 1800s."}
+{"index": "midjourney30", "data": "A rendered Celtic temple stands majestically in a digital landscape created using Unreal Engine 5, exhibiting high details that contribute to its photorealistic appearance. Intricate stone textures and realistic lighting effects are captured with the help of Octane rendering, enhancing the visual fidelity of the scene. The temple is surrounded by a lush environment, featuring detailed foliage that demonstrates the advanced capabilities of the rendering software."}
+{"index": "midjourney31", "data": "An intricately detailed character concept art piece presented in 4K resolution, portraying the concept of 'Blind Ambition.' The character is situated dead-center in a symmetrical portrait stance, their gaze obscured, suggesting the metaphorical blindness of ambition. The realism of the artwork is accentuated by subtle textures and shading that bring the character to life on a digital canvas."}
+{"index": "midjourney32", "data": "A gritty, realistic interpretation of the World War II Normandy invasion, painted in the grandiose and dramatic style reminiscent of Albert Bierstadt's sweeping landscapes. The scene integrates the atmospheric lighting and moody skies associated with Edward Hopper's work, while also incorporating the fine detail and dynamic compositions akin to Craig Mullins' digital artistry. The painting captures the raw emotion and chaotic intensity of the historic moment, portrayed through the use of deep, muted tones and the precise depiction of soldiers' expressions amidst the battle-torn beaches."}
+{"index": "midjourney33", "data": "A 35mm film still capturing a scene from David Lynch's reimagined version of 'The Wizard of Oz', with the setting transposed to the lush, green backdrop of the Pacific Northwest. In the frame, a Dorothy character dons a plaid dress reminiscent of flannel, common in the region, and her ruby slippers contrast with the verdant forest floor. Towering evergreens and a hazy mist encapsulate the background, providing an enigmatic touch true to Lynch's signature style."}
+{"index": "midjourney34", "data": "In the art piece, a realistically depicted young girl with flowing blonde hair gazes intently into the distance, her eyes reflecting the vibrant hues of a spring forest. The verdant greens and soft pastels of the budding trees are captured in subtle brushstrokes, giving the scene a serene and tranquil atmosphere. The minimalist composition focuses on the girl's expression of wonder and the lush woodland background, while the texture of the oil paint adds depth and richness to the canvas."}
+{"index": "midjourney35", "data": "A picturesque scene that is reminiscent of the Hudson River School style, with a whimsical twist incorporating dessert-themed elements. The river of rich, flowing chocolate curves through the landscape, while mountains in the distance resemble scoops of different ice cream flavors. Fluffy, pink cotton candy trees dot the banks of the river, adding a sweet playfulness to the traditional pastoral setting."}
+{"index": "midjourney36", "data": "An ancient-inspired agora stands amidst a dense palm farm, its robust columns crafted from the sturdy trunks of Sabal palms, rising into the sky. The structure's foundation and seating areas are composed of a unique oyster shell concrete, lending a textured, gritty feel to the smooth surfaces. The space between the palms is filled with the rustic beauty of this communal gathering spot, which is both inviting and impressive in its naturalistic design."}
+{"index": "midjourney37", "data": "A sleek, white laboratory designed with a blend of Matt Mahurin's moody aesthetic and Tsutomu Nihei's architectural sensibilities creates a stark, futuristic scene. The room features angular, geometric furniture with surfaces that have a smooth, matte finish, reflecting the dim, ambient lighting. Along the walls, various high-tech equipment and monitors display cryptic data, casting soft blue glows that contribute to the laboratory's enigmatic atmosphere."}
+{"index": "midjourney38", "data": "An endearing, fluffy creature with fur in various shades of lavender and teal, straight out of a Pixar film, illuminated by vibrant, volumetric lighting that provides a rich depth to the scene. The textured fur of the character reflects the high-quality 4k resolution, adding to its lifelike photorealistic appearance. Positioned next to the monster, a sparkling star accentuates its whimsical nature, set against a meticulously rendered background that showcases Pixar's attention to detail."}
+{"index": "midjourney39", "data": "A scene featuring several prominent items: a piece of matte black paper lies beneath a set of decorative zodiac-themed tarot cards, carefully placed in the center. Above this arrangement, two spherical ornaments designed to resemble moons hang suspended, with a silhouette of a mountain range providing a stark backdrop. Additionally, two white streaks, perhaps cords or decorative ribbons, run parallel to each other, intersecting the paper and the tarot cards, adding a dynamic element to the composition."}
+{"index": "midjourney4", "data": "A stage designed with a fusion of natural and industrial elements, featuring sleek silver glass panels reminiscent of a high-tech laboratory, situated alongside a soft, feathered carpet. The flooring transitions into an area of polished wood leading to a sturdy stone wall, adding an earthy touch to the synthetic ambiance. Overhead, raw concrete merges into the form of a salvia patens roof, juxtaposed with a tranquil sand-laden backdrop that suggests the gentle shores of a lake. In the center, an oval mirror with a slim steel frame reflects the composed space, including the transparent screen that adds a layer of modernity to this eclectic setting."}
+{"index": "midjourney5", "data": "An outdoor setting where an array of brightly painted rocks is meticulously arranged in a gridded pattern on the ground. Each stone is carefully spaced to maintain uniformity and starts with vibrant red rocks in the top right corner, progressing through the colors of the rainbow to end with deep violet ones in the lower left corner. The smooth surfaces of the stones glisten in the sunlight, enhancing the vividness of each hue."}
+{"index": "midjourney6", "data": "A retro-futuristic album cover that encapsulates the essence of the synthwave movement in 1985, crafted exclusively in varying shades of blue. The primary visual is an old-fashioned car emerging from the mouth of a dimly lit tunnel, its gleaming headlights cutting through the surrounding darkness. The artwork is cinematic in its composition, featuring vintage elements that suggest motion and speed, finished with a fine film grain texture that mimics the classic look of a 35mm photograph. The band's name, \"BRO,\" is emblazoned above the image in a bold, stylized font that complements the album's overall theme."}
+{"index": "midjourney7", "data": "Within a grand extraterrestrial place of worship, a group of alien beings is depicted in reverent prayer, illuminated by beams of light filtering through stained glass panels featuring cosmic motifs. The advanced alien architecture captures the imagination with soaring arches and embedded technology, rendered in striking yellow and blue tones that highlight the meticulous detail and realism of the scene. This masterpiece of science fiction, brought to life with the precision of Octane rendering software at an 8K resolution, showcases an awe-inspiring tableau, reminiscent of the most sophisticated pieces found on the ArtStation platform."}
+{"index": "midjourney8", "data": "A realistic human skeleton, its aged bones a stark off-white, stands erect in the center of an empty, dust-covered swimming pool. The skeleton clutches a golden spear, its tip gleaming even in the muted light, held aloft as if in a declaration of power. Red drapery is artfully hung around the skeleton's frame, creating a strong contrast with its pale bony structure and the dull grey of the pool's concrete."}
+{"index": "midjourney9", "data": "A photorealistic depiction of a transformation from a furry caterpillar to a demonic figure with lifelike textures and detail. The caterpillar's body is portrayed with soft, delicate hairs transitioning into a glistening, wet pupa stage. The final form emerges as a horror-inducing demon with a screaming visage, sharp fangs, and red, slimy hands that are deformed yet eerily precise in their 3-dimensional rendering."}
+{"index": "partiprompts0", "data": "A cheerful sloth, adorned with a black leather jacket and a brown cowboy hat, stands confidently on a patch of green grass. Its attire is completed with a red tartan kilt and a neatly tied bowtie. In one claw, it grasps a sturdy quarterstaff, while the other holds a large, ancient-looking book. Positioned a short distance behind the sloth, there is a gleaming Volkswagen van, its exterior decorated with vibrant flower patterns. The entire whimsical scene is captured through a wide-angle lens from a low vantage point, emphasizing the sloth's stature and the van's presence."}
+{"index": "partiprompts1", "data": "A vibrant oil painting depicts a glossy, turquoise VW van parked against the backdrop of a bustling city skyline. In the foreground, a contented sloth stands out with its unique attire consisting of a sleek leather jacket, a traditional tartan kilt, and a quirky bowtie, all topped off with a classic cowboy hat. The sloth grips a sturdy quarterstaff in one hand while balancing a large, leather-bound tome in the other, all set upon a lush patch of green grass that contrasts with the urban environment behind it."}
+{"index": "partiprompts10", "data": "A detailed sculpture of an ancient pharaoh, crafted from a lustrous bronze metal, stands imposingly against a plain backdrop. The statue is adorned with a pair of intricate steampunk glasses that rest on the bridge of its sculpted nose, and it wears a weathered leather jacket draped over a crisp white t-shirt. Emblazoned on the shirt is a detailed illustration of a space shuttle, adding a modern twist to the traditional regal attire."}
+{"index": "partiprompts100", "data": "An aerial perspective captures three individuals peering down at the bustling city streets from the edge of a skyscraper's rooftop. They are surrounded by a safety barrier, and the rooftop itself is adorned with gravel and small potted plants. Far below, the urban tapestry of roads, vehicles, and pedestrians unfolds in a miniature display, while other tall buildings rise up in the vicinity, creating a canyon of architectural marvels."}
+{"index": "partiprompts101", "data": "A classic Greek statue carved from white marble, depicting a muscular man with a stern expression as he gently comforts a cat with an unusually large head. The cat, also sculpted from marble, seems to be at ease in the man's strong arms. The statue stands on a stone pedestal, and the intricate details of the man's curly hair and the cat's fur texture are visible, highlighting the sculptor's skilled craftsmanship."}
+{"index": "partiprompts102", "data": "a reimagined version of the Mona Lisa, where the iconic figure is depicted with a brown cowboy hat tilted rakishly atop her head. In her hand, she grips a silver microphone, her mouth open as if caught mid-scream of a punk rock anthem. The background, once a serene landscape, is now a vibrant splash of colors that seem to echo the intensity of her performance."}
+{"index": "partiprompts103", "data": "a skilled barista in a white apron carefully pouring milk from a stainless steel pitcher into a white ceramic coffee cup. The milk swirls into the dark espresso, creating an intricate leaf pattern on the surface of the latte. The coffee cup sits on a saucer atop a polished wooden counter, surrounded by an array of coffee-making equipment."}
+{"index": "partiprompts104", "data": "A young boy with a joyful expression is perched high on the shoulders of a woman dressed in a long, flowing red dress. The woman's dress has intricate lace detailing along the hem and sleeves, and she stands with poise and grace. The boy, wearing a striped shirt and denim shorts, wraps his small hands around the woman's forehead for balance as they share a moment of connection."}
+{"index": "partiprompts105", "data": "a family of four, consisting of two adults and two children, strolls along a sandy beach with gentle waves lapping at their bare feet. The adults are holding hands, and the children are playfully skipping ahead. The sky above them is a clear blue, and seagulls can be seen flying near the water's edge. In the distance, a lighthouse stands tall on a rocky outcrop."}
+{"index": "partiprompts106", "data": "a striking portrait that captures the essence of Salvador Dalí, with one side of his face depicted in his iconic, surrealistic style, and the other half transformed into a metallic, robotic visage. The painting features a vivid array of colors, with the robotic side incorporating shades of silver and hints of circuitry, contrasting with the warm, flesh tones of Dalí's human side. The background is a simple, solid color to ensure the focus remains on the intricate details of Dalí's dual representation."}
+{"index": "partiprompts107", "data": "A focused young woman with round spectacles is deeply engrossed in a thick, leather-bound book that rests on a polished mahogany desk. The desk is neatly organized, with a brass desk lamp casting a warm glow over her reading material. To her left, there's a small potted plant with vibrant green leaves, adding a touch of nature to the scholarly setting."}
+{"index": "partiprompts108", "data": "a focused woman wielding a heavy sledgehammer, poised to strike an intricately carved ice sculpture of a goose. The sculpture glistens in the light, showcasing its detailed wings and feathers, standing on a pedestal of snow. Around her, shards of ice are scattered across the ground, evidence of her previous strikes."}
+{"index": "partiprompts109", "data": "A woman with long, flowing black hair and rich, dark skin stands elegantly in a pristine white dress that cascades to the floor. The dress features delicate lace detailing along the hem and sleeves, adding a touch of sophistication. She is positioned near a large window with sheer curtains, which allows soft natural light to accentuate the contrast of her dress against her skin."}
+{"index": "partiprompts11", "data": "On the barren, gray surface of the moon, two chestnut horses adorned with silver harnesses are depicted pulling an antique carriage. In the surreal background, the Statue of Liberty stands tall, its green patina contrasting with the sandy hues of the Great Pyramid nearby. Above this otherworldly scene, the Planet Earth hangs majestically in the dark lunar sky, its blues and greens vibrant against the starkness of space."}
+{"index": "partiprompts110", "data": "A man and a woman are seated at a small round table with a checkered tablecloth. The man is enjoying a golden-brown glazed donut, while the woman delicately forks a piece of rich chocolate cake with a glossy icing. Between them is a vase with a single red rose, and in the background, a cream-colored wall is adorned with framed pictures."}
+{"index": "partiprompts111", "data": "An elderly woman with silver hair and glasses sits comfortably in an armchair, a colorful picture book open in her lap. Beside her, a young boy and girl, her grandchildren, listen intently, the boy wearing a green-striped shirt and the girl in a yellow dress. They are surrounded by a cozy room with a plush beige carpet and a small wooden table stacked with more books."}
+{"index": "partiprompts112", "data": "A man and a woman are standing in the bed of a vintage pickup truck, which is painted a faded shade of red. The truck's bed is scratched and worn, indicating its age and use. The man is wearing a denim jacket and the woman a green sweater, both are casually leaning against the cab of the truck. Behind them, the backdrop is a field of tall grass, hinting at a rural setting."}
+{"index": "partiprompts113", "data": "an elderly woman with shoulder-length straight gray hair and round metal-rimmed glasses sits comfortably in a plush armchair. She is wearing a lavender cardigan over a white blouse, and a silver necklace can be seen around her neck. In her lap rests an open hardcover book, and beside her, a small wooden side table holds a ceramic teacup and saucer."}
+{"index": "partiprompts114", "data": "A joyful black Labrador retriever with a glossy coat leaping up towards a smiling woman clad in a bright red sweater. The woman is standing in a grassy backyard, her arms open to embrace the energetic dog. Behind them, a wooden fence can be seen, partially covered by climbing green ivy."}
+{"index": "partiprompts115", "data": "A detailed painting that features the iconic Mona Lisa, with her enigmatic smile, set against a bustling backdrop of New York City. The cityscape includes towering skyscrapers, a yellow taxi cab, and the faint outline of the Statue of Liberty in the distance. The painting merges the classic with the contemporary, as the Mona Lisa is depicted in her traditional attire, while the city behind her pulses with modern life."}
+{"index": "partiprompts116", "data": "A tall man with dark hair, wearing a gray suit and sunglasses, is carefully stooping down to enter a sleek, low-profile red sports car. The car's glossy paint reflects the sunlight, highlighting its aerodynamic design. The vehicle is parked on a clean, concrete driveway beside a neatly trimmed lawn."}
+{"index": "partiprompts117", "data": "A man with shoulder-length blonde hair and deep brown eyes stands casually in a room. He's wearing a simple white t-shirt that contrasts with his distressed blue jeans. Around him, the space is minimally furnished, with a single potted plant in the corner adding a touch of greenery."}
+{"index": "partiprompts118", "data": "a collection of individuals clad in bright ski gear against the contrasting backdrop of a vast beige sand dune. Each person is equipped with skis and poles, ready to ascend the gentle slope of the dune under a clear blue sky. Their colorful attire stands out vividly against the monochrome landscape of sand."}
+{"index": "partiprompts119", "data": "a group of four musicians performing live on a modestly-sized stage with bright spotlights casting shadows behind them. The lead singer, clad in a red leather jacket, grips the microphone stand, while the guitarist, dressed in a black shirt, strums his electric guitar with fervor. The drummer, situated at the back, is partially obscured by his drum kit, and the bassist sways to the rhythm. In front of the stage, a small but enthusiastic audience is gathered, some with raised hands, enjoying the live music."}
+{"index": "partiprompts12", "data": "A scenic train journey unfolds during the monsoon season in Kerala, with raindrops rhythmically tapping against the windowpane. Inside one of the train's carriages, a plush koala bear toy, donning a small, colorful hat, is propped up against the glass, peering out at the lush landscape. Outside, numerous tall coconut trees sway gently in the rain, their green fronds glistening with moisture, creating a verdant backdrop as the train meanders through the countryside."}
+{"index": "partiprompts120", "data": "A striking black and white image captures the whimsical scene of a panda, adorned with a pointed wizard's hat, perched atop a majestic horse. The panda appears engrossed in an open book it holds with its paws. The horse, with a glossy chestnut coat, stands motionless on an urban street, its hooves nestled among tufts of vibrant green grass peeking through the pavement cracks. In the background, a large expanse of a gray concrete wall serves as a canvas for a mural of vivid flowers in hues of red, yellow, and blue, with the bold letters spelling out \"PEACE\" adding a splash of color and a message of tranquility to the urban setting."}
+{"index": "partiprompts121", "data": "A plump wombat, adorned in a crisp white panama hat and a vibrant floral Hawaiian shirt, lounges comfortably in a bright yellow beach chair. In its paws, it delicately holds a martini glass, the drink precariously balanced atop the keys of an open laptop resting on its lap. Behind the relaxed marsupial, the silhouettes of palm trees sway gently, their forms blurred into the tropical backdrop."}
+{"index": "partiprompts122", "data": "A majestic granite statue of a wombat warrior, clad in intricately detailed armor, stands proudly in the center of a temple's cella. The statue, gripping a broad sword with both hands, is bathed in a soft beam of light that filters down from an unseen source above, highlighting the textures of its stony surface. It is perched upon an ornate pedestal, which adds to its imposing presence. The scene is captured through a wide-angle lens, giving it a grandiose and expansive feel, reminiscent of an anime-style oil painting with vibrant colors and dynamic shading."}
+{"index": "partiprompts123", "data": "An impressionistic painting depicts a vibrant blue cow standing serenely in a field of delicate white flowers. Adjacent to the cow, there is a robust tree with a canopy of red leaves and branches laden with yellow fruit. The brushstrokes suggest a gentle breeze moving through the scene, and the cow's shadow is cast softly on the green grass beneath it."}
+{"index": "partiprompts124", "data": "A cartoonish scene unfolds with a white rabbit, dressed in a snug blue jogging outfit, clutching its side in evident discomfort, its facial expression twisted in pain. Meanwhile, a confident turtle, sporting a bright red tank top, strides past the finish line with a triumphant smile. The background is a simple race track that curves out of sight, lined with cheering spectators composed of various animated creatures."}
+{"index": "partiprompts125", "data": "An intricate Chinese ink and wash painting that depicts a majestic tiger, its fur rendered in delicate brush strokes, wearing a traditional train conductor's hat atop its head. The tiger's piercing eyes gaze forward as it firmly grasps a skateboard, which features a prominent yin-yang symbol in its design, symbolizing balance. The background of the painting is a subtle wash of grays, suggesting a misty and timeless landscape."}
+{"index": "partiprompts126", "data": "An animated squirrel with a rebellious punk rock vibe, clad in a black leather jacket adorned with shiny metal studs, is captured mid-shout into a silver microphone. The rodent stands confidently on an old, rugged tree stump that serves as an impromptu stage, with one paw gripping a small brown beer bottle. The stage is dimly lit, with only a few spotlights highlighting the squirrel's dynamic performance."}
+{"index": "partiprompts127", "data": "A slow-moving sloth, with shaggy brown fur and a relaxed expression, is seated in a bright red go-kart on a winding race track. In its three-toed claw, it clutches a ripe yellow banana, seemingly undisturbed by the race. Just a few meters behind the kart, a single banana peel lies on the asphalt track, a potential hazard for the other racers."}
+{"index": "partiprompts128", "data": "A striking portrait photograph showcasing a fluffy, cream-colored hamster adorned with a vibrant orange beanie and oversized black sunglasses. The hamster is gripping a small white sign with bold black letters that proclaim \"Let's PAINT!\" The background is a simple, blurred shade of grey, ensuring the hamster remains the focal point of the image."}
+{"index": "partiprompts129", "data": "A contented sloth, with a wide grin on its face, is decked out in an eclectic ensemble featuring a sleek black leather jacket and a brown cowboy hat atop its head. It's also sporting a traditional tartan kilt paired with a smart red bowtie around its neck. In one claw, the sloth firmly grips a wooden quarterstaff, while the other supports a large, thick book with a leather-bound cover."}
+{"index": "partiprompts13", "data": "An imaginative aerial panorama of downtown Manhattan, where the iconic Millennium Wheel stands out, juxtaposed against the backdrop of the Statue of Liberty, both bathed in the golden hues of the afternoon sun. The skyline is transformed by the surreal addition of the Great Pyramid, which sits majestically on a sandy island, surrounded by the steel and glass skyscrapers. The Hudson River meanders around the cityscape, reflecting the unusual yet striking architectural ensemble."}
+{"index": "partiprompts130", "data": "An anime-style illustration depicts a muscular, metallic tiger with sharp, angular features, standing on a rooftop. The tiger is in a dynamic pose, gripping a sleek, red electric guitar, and its mouth is open wide as if caught in the midst of a powerful roar or song. Above the tiger, a bright spotlight casts a dramatic beam of light, illuminating the scene and creating stark shadows on the surrounding rooftop features."}
+{"index": "partiprompts131", "data": "A whimsical child's crayon drawing depicting a green gecko adorned with a blue and white striped train conductor's hat. The gecko, with a playful smile, is holding a small flag featuring a black and white yin-yang symbol. The drawing is characterized by bold, colorful strokes and is placed on a fridge door, held by a variety of magnets."}
+{"index": "partiprompts132", "data": "In a whimsical scene, a gray donkey with a determined expression is engaged in a game of tug-of-war with a large, purple octopus. The donkey has the rope firmly gripped between its teeth, while the octopus's tentacles are wrapped around the other end of the rope, creating a stark contrast in textures. In the midst of their playful battle, a nimble orange cat is captured mid-leap over the taut rope, adding a dynamic element to the unusual tableau."}
+{"index": "partiprompts133", "data": "An anthropomorphic beaver, exuding an air of sophistication, stands upright beside a towering pile of books in a library setting. The beaver is adorned with a pair of round, wire-rimmed glasses perched on its nose, a neatly buttoned vest, and a vibrant necktie featuring an array of geometric patterns. The surrounding shelves are filled with an assortment of leather-bound and hardcover books, some with gold lettering on their spines, creating a backdrop of literary abundance."}
+{"index": "partiprompts134", "data": "A whimsical image captures a green frog with a contemplative expression, sitting comfortably on a lily pad while holding a newspaper. The newspaper, humorously titled \"Toaday,\" features a bold headline and an illustration of another frog on its front page. The frog's webbed fingers are spread across the paper, which is slightly crumpled from being held."}
+{"index": "partiprompts135", "data": "A tall giraffe, adorned in a whimsical white bathing suit with polka dots, is cautiously approaching the edge of a wooden diving board. The diving board extends over a large, clear blue swimming pool, reflecting the bright sunlight. The giraffe's long legs move gingerly, displaying a mix of hesitation and grace as it prepares to execute its dive."}
+{"index": "partiprompts136", "data": "A whimsical scene featuring a small elf with pointed ears and a green hat, sipping orange juice through a long straw from a disproportionately large orange. Next to the elf, a curious squirrel perches on its hind legs, while an owl with wide, observant eyes watches intently from a branch overhead. The orange's vibrant color contrasts with the muted browns and greens of the surrounding forest foliage."}
+{"index": "partiprompts137", "data": "A glossy black dog with a shiny coat is comfortably seated on a rustic wooden chair, its tail gently resting on the seat. Beside the chair, a white cat with distinctive black ears and bright green eyes is standing on its hind legs, with its front paws placed on the edge of the chair, as if engaging in a silent conversation with the dog. The chair sits on a smooth, terracotta-tiled floor, and behind them, a potted plant adds a touch of greenery to the scene."}
+{"index": "partiprompts138", "data": "A vibrant scene featuring a punk rock platypus, its webbed feet firmly planted on an old tree stump. The creature is clad in a black leather jacket, embellished with shiny metal studs, and it's passionately shouting into a silver microphone. Around its neck hangs a bright red bandana, and the stump is situated in a small clearing surrounded by tall, green grass."}
+{"index": "partiprompts139", "data": "A whimsical scene where a llama, adorned with a pair of oversized, round sunglasses, stands confidently on the metallic deck of a spacecraft. The deck beneath the llama's hooves gleams with a polished silver finish, reflecting the starry cosmos that surrounds the vessel. In the vast backdrop, the Earth looms large, a swirl of blue oceans and white clouds, providing a stunning contrast to the spaceship's sleek, futuristic design."}
+{"index": "partiprompts14", "data": "A surreal lunar landscape unfolds with the iconic Great Pyramids and the Sphinx, all replicated in meticulous detail on the moon's dusty, grey surface. In the foreground, the silhouette of an astronaut, clad in a pearly white spacesuit, is captured from behind, gazing upon the ancient wonders. Above this otherworldly scene, the Earth hangs majestically in the dark expanse of space, its blue and white visage a stark contrast to the barren moonscape."}
+{"index": "partiprompts140", "data": "A vibrant pink flamingo stands attentively, its beak nestled within the pages of an oversized book laid open on a grassy patch. To the side of this curious scene, a towering stack of hardcover books leans slightly, as if mirroring the flamingo's posture. The entire whimsical setup is captured in a high-resolution DSLR photograph, showcasing the intricate details of the flamingo's feathers and the colorful book spines."}
+{"index": "partiprompts141", "data": "A towering Egyptian obelisk stands under a clear sky, its ancient hieroglyphs barely visible in the sunlight. Atop this stone pillar, a menacing black dragon with scales like onyx crouches, its wings unfurled and its fiery breath directed towards a solitary knight below. The knight, clad in shining silver armor, holds up a shield in a futile attempt to deflect the intense heat, while the ground around him is scorched and illuminated by the dragon's flames."}
+{"index": "partiprompts142", "data": "An animated frog with a rebellious punk rock style, clad in a black leather jacket adorned with shiny metal studs, is energetically shouting into a silver microphone. The frog's vibrant green skin contrasts with the dark jacket, and it stands confidently on a large green lily pad floating on a pond's surface. Around the lily pad, the water is calm, and other pads are scattered nearby, some with blooming pink flowers."}
+{"index": "partiprompts143", "data": "Two sea creatures, a swordfish with its elongated, pointed bill and a narwhal with its iconic spiraled tusk, are engaged in an underwater duel on a sandy seabed that resembles an arena. The swordfish's sleek, silvery body contrasts with the mottled gray and white of the narwhal as they circle each other. On the sidelines, a bright red crab and a dark-shelled lobster wave their claws enthusiastically, as if cheering on the competitors amidst the swaying sea plants."}
+{"index": "partiprompts144", "data": "A whimsical image capturing a squirrel standing upright on a patch of green grass. The squirrel, with its bushy tail and brown fur, is holding a wooden arrow above its head with its tiny paws, as if it's just won a battle. In its left hand, it grips a miniature longbow, and around it, fallen autumn leaves add a touch of natural decor to the scene."}
+{"index": "partiprompts145", "data": "A majestic brown horse stands in profile, its coat gleaming in the sunlight, with a black saddle securely fastened on its back. The number 55 is prominently displayed in white on the horse's rear flank, indicating its identification or racing number. The horse's mane is neatly combed, and it appears calm and well-trained, ready for a ride or competition."}
+{"index": "partiprompts146", "data": "A whimsical scene featuring a large red dragon, its scales glistening with a fiery hue, donned in a sleek black tuxedo with a crisp white shirt and a bow tie. The dragon is seated at a wooden table, intently focused on a chessboard where the pieces are intricately designed to resemble miniature robots, each with its own unique metallic finish. The game is set against a backdrop of a stone-walled room, with the dragon's tail casually draped over the edge of its chair."}
+{"index": "partiprompts147", "data": "A glossy black dog with a curious expression sits snugly between a lush green bush and an unusual pair of green pants that stand upright as if an invisible person were inside them. The pants are supported by a hidden frame, creating a whimsical garden display. The dog's shiny coat contrasts with the matte texture of the foliage and the fabric of the pants."}
+{"index": "partiprompts148", "data": "A panoramic view of a sprawling field blanketed with vibrant wildflowers, where a tall giraffe and a striped zebra stand side by side. The giraffe's long neck stretches towards the blue sky, while the zebra's black and white stripes provide a stark contrast against the multicolored floral backdrop. In the distance, a line of acacia trees can be seen under the bright sunlight, completing this picturesque savanna scene."}
+{"index": "partiprompts149", "data": "A fluffy white poodle, adorned with a red baseball cap, stands on its hind legs in front of a green chalkboard. In one paw, it holds a large, open dictionary, while the other paw is used to scrawl the word \"bonez\" in white chalk. The floor beneath the poodle is scattered with colorful pieces of chalk, and the room is filled with small wooden desks and chairs, suggesting a classroom setting."}
+{"index": "partiprompts15", "data": "A towering statue of Abraham Lincoln, cast in a silvery-gray stone, is adorned with a gleaming, opaque astronaut's helmet that reflects the barren lunar landscape. The statue is positioned on the moon's surface, with its craters and dust visible around the base. In the dark sky above, the planet Earth looms large, a swirl of blue and white against the blackness of space."}
+{"index": "partiprompts150", "data": "An oil-on-canvas masterpiece that captures the dynamic essence of a blue night sky, bursting with the energy of swirling azure hues and speckled with stars that seem to explode in shades of yellow. Dominating the celestial scene is a luminous, fuzzy-edged yellow crescent moon, casting its soft glow from the upper portion of the canvas. Below this cosmic display, a tranquil village is depicted to the right, its quaint houses huddled in repose. On the left, a towering cypress tree stretches upwards, its branches undulating like flames, creating a stark contrast against the serene sky. In the distance, amidst the gentle roll of blue hills, the spire of a church stands tall, a silent sentinel overlooking the sleepy hamlet."}
+{"index": "partiprompts151", "data": "A vivid oil painting that depicts a surreal and dreamlike seascape, where time seems to have lost all meaning. The canvas is filled with an array of clocks and watches, all of which are distorted and melting across the barren terrain, their shapes soft and elongated in a manner that defies reality. On the left side of the painting, a small wooden table stands, its surface dominated by a golden pocket watch that has attracted a swarm of tiny ants. At the heart of the scene, there is an odd, flesh-like figure draped over a lifeless tree, adding to the enigmatic and otherworldly atmosphere of the artwork."}
+{"index": "partiprompts152", "data": "A lone figure, cloaked in a heavy mist, stands on an ancient cobblestone street, gazing upwards at the towering, dark gothic architecture that looms ominously above. The soft, golden glow of an old-fashioned street lamp casts a warm light nearby, contrasting with the cool, grey stones underfoot. This scene, reminiscent of a bygone era, is captured in the textured brushstrokes of an oil painting, giving it a timeless quality."}
+{"index": "partiprompts153", "data": "A haunting painting depicts a ghastly, pale creature with an expression of terror, its body bearing a resemblance to both a lifeless corpse and the nascent form of a sperm or fetus. The creature's twisted outline is mirrored in the tumultuous, swirling patterns that dominate the blood-red sky above. The entire scene is set against a backdrop that evokes a sense of foreboding, with the stark red tones and fluid lines creating a sense of motion and unease."}
+{"index": "partiprompts154", "data": "An intricately detailed oil painting depicts a raccoon dressed in a black suit with a crisp white shirt and a red bow tie. The raccoon stands upright, donning a black top hat and gripping a wooden cane with a silver handle in one paw, while the other paw clutches a dark garbage bag. The background of the painting features soft, brush-stroked trees and mountains, reminiscent of traditional Chinese landscapes, with a delicate mist enveloping the scene."}
+{"index": "partiprompts155", "data": "A detailed oil painting captures the intricate fur of a young badger as it gently sniffs at a bright yellow rose. The scene is set against the rough texture of a large tree trunk, with the badger's claws slightly digging into the bark. In the softly painted background, a tranquil waterfall cascades down, its waters a shimmering blue amidst the greenery."}
+{"index": "partiprompts156", "data": "An oil painting depicting an abstract anime landscape, where a vibrant door stands out amidst a backdrop of shadowy hues. The door, painted in bright, luminescent colors, appears to be a gateway, cutting through the surrounding darkness with its inviting glow. Swirls of contrasting colors give the impression of a mystical portal, beckoning viewers to step through into a realm of knowledge and discovery."}
+{"index": "partiprompts157", "data": "An intricate oil painting that captures two rabbits standing upright in a pose reminiscent of the iconic American Gothic portrait. The rabbits are anthropomorphized, donning early 20th-century rural clothing with the male rabbit wearing a black jacket and the female in a colonial print apron. The background features a wooden farmhouse with a gothic window, emulating the style and composition of the original artwork."}
+{"index": "partiprompts158", "data": "A detailed painting depicts an ancient, ornate treasure chest with intricate carvings and a heavy, rusted metal clasp, nestled in the shadows of a dark, damp cave. Beside the chest, a broad sword with an embellished hilt stands propped up, its blade catching the faint, eerie glow emanating from within the chest. The rough texture of the cave walls is accentuated by the contrast of the sword's gleam and the chest's subtle sheen."}
+{"index": "partiprompts159", "data": "An abstract oil painting that radiates a blend of bright, joyful colors, with swirls of yellow and white that suggest a sense of light and positivity. The canvas is filled with dynamic strokes and splashes of color that seem to dance and spread across the surface, reaching out as if to touch every edge of the frame. The painting gives the impression of happiness diffusing into the space around it, with no single point of focus but rather a harmonious amalgamation of uplifting hues."}
+{"index": "partiprompts16", "data": "A plush teddy bear, adorned with a shiny black motorcycle helmet and a flowing red cape, is perched confidently on a miniature red motorcycle. The toy bike and its adventurous rider are positioned against the bustling backdrop of Rio de Janeiro, with the iconic Dois Irmãos mountain peaks rising majestically in the distance. The scene captures the playful contrast between the soft texture of the teddy bear and the sleek metal of the motorcycle, all under the bright Brazilian sun."}
+{"index": "partiprompts160", "data": "An abstract cubist painting depicts a chaotic scene where a tornado, composed of angular shark figures, collides with the geometric forms of a skyscraper. The skyscraper is rendered in a series of fragmented, multi-colored planes, suggesting the reflective surfaces of glass and steel. The sharks, with their sharp edges and varying shades of grey and blue, swirl around in a tumultuous dance of destruction within the painting's frame."}
+{"index": "partiprompts161", "data": "a detailed oil painting that captures the essence of an elderly raccoon adorned with a distinguished black top hat. The raccoon's fur is depicted with textured, swirling strokes reminiscent of Van Gogh's signature style, and it clutches a bright red apple in its paws. The background swirls with vibrant colors, giving the impression of movement around the still figure of the raccoon."}
+{"index": "partiprompts162", "data": "a surrealistic painting that depicts a vibrant red sports car with its form melting over the curved edges of a large, ornate golden clock. The background of the painting is a desolate landscape with a clear blue sky, reminiscent of Salvador Dali's distinctive style. The car's glossy paint appears to be dripping like liquid, blending the concepts of time and motion in a dreamlike tableau."}
+{"index": "partiprompts163", "data": "a vivid and bizarre oil painting by Salvador Dalí that depicts a cat with exaggerated features engaged in a game of checkers. The checkerboard is floating in an undefined space, with pieces that seem to melt over the edges. The background swirls with a mix of warm and cool colors, adding to the dream-like quality of the scene."}
+{"index": "partiprompts164", "data": "An intricately detailed oil painting captures the essence of a young badger, its fur rendered with fine brushstrokes that give it a tactile quality. The badger's snout is gently poised above a vibrant yellow rose, which stands out against a backdrop of muted green foliage. The contrast between the animal's coarse fur and the delicate petals of the rose is emphasized through the artist's skillful use of texture and color."}
+{"index": "partiprompts165", "data": "an abstract oil painting dominated by deep red and black hues, with thick, textured patches of white creating a stark contrast. The canvas is stretched over a sturdy wooden frame, and the paint appears to be applied with vigorous, impasto techniques. This piece of art is hung on a neutral-colored wall, allowing the vibrant colors and bold textures to stand out prominently."}
+{"index": "partiprompts166", "data": "An exquisite oil painting that captures a raccoon with an almost human-like poise, dressed in attire reminiscent of the 17th century. The raccoon's fur is rendered in rich, textured strokes of brown and gray, and it wears a white ruffled collar and a deep red velvet coat that would befit a noble of Rembrandt's era. The background of the painting is a muted blend of dark, warm tones, creating a subtle contrast that draws attention to the subject's detailed and expressive face."}
+{"index": "partiprompts167", "data": "An abstract oil painting that depicts a chaotic blend of vibrant colors and swirling patterns, giving the impression of a vast, disorienting landscape. The canvas is filled with bold strokes of reds, blues, and yellows that seem to clash and compete for space, symbolizing the complexity and confusion of navigating through life. Amidst the turmoil, a small, indistinct figure appears to be wandering, searching for direction in the overwhelming expanse."}
+{"index": "partiprompts168", "data": "a vibrant abstract painting featuring three geometric shapes: a large blue triangle, a smaller yellow triangle, and a red triangle, all against a white background. The blue triangle has a smooth texture, while the yellow and red triangles have a more textured, almost brush-stroked appearance. The shapes are arranged in a way that suggests dynamic movement and balance within the composition."}
+{"index": "partiprompts169", "data": "a detailed watercolor painting that captures a majestic snowy owl with its pristine white feathers standing in the midst of a lush green field. The owl's bright yellow eyes are a stark contrast to the soft hues of the grass, and its feathers are intricately detailed, giving a sense of texture to the artwork. The field is dotted with wildflowers and the occasional blade of grass that sways gently, suggesting a light breeze in this tranquil scene."}
+{"index": "partiprompts17", "data": "An altered image of the Space Shuttle Endeavor, its exterior painted a bright yellow, as it soars high above the Earth's atmosphere. The vast expanse of South America is prominently visible below, with the deep blue of the surrounding ocean contrasting against the yellow of the shuttle. The curvature of the Earth can be seen at the edges of the photo, highlighting the shuttle's altitude."}
+{"index": "partiprompts170", "data": "An intricately detailed oil painting that showcases a vibrant and whimsical creature, a fusion of a hamster and a dragon, set against a swirling backdrop of psychedelic colors. The creature's fur is a kaleidoscope of hues, with scales that shimmer in iridescent tones, and it's depicted with a playful yet majestic pose. The artwork is framed in an ornate, golden frame that complements the fantastical theme of the painting."}
+{"index": "partiprompts171", "data": "a detailed 17th-century Dutch Baroque painting depicting a chestnut horse standing amidst a vibrant field of tulips and daisies. The horse's mane and tail are elegantly captured by the artist, flowing with the gentle breeze that seems to animate the scene. In the background, a traditional windmill sits under a partly cloudy sky, completing this pastoral landscape."}
+{"index": "partiprompts172", "data": "An intricately detailed oil painting that captures the whimsical essence of a feline super math wizard. The cat, adorned with a wizard's hat and cape, is surrounded by floating mathematical symbols and equations. The rich textures of the brush strokes give depth to the cat's fur and the magical elements, creating a vivid and captivating scene."}
+{"index": "partiprompts173", "data": "a detailed sketch of a space shuttle, rendered in the intricate, technical style reminiscent of Leonardo da Vinci's famous drawings. The shuttle is depicted with numerous annotations and measurements, showcasing its complex design and structure. The paper on which it is drawn has an aged, yellowed appearance, adding to the historical feel of the artwork."}
+{"index": "partiprompts174", "data": "an impressionistic painting that features a vibrant array of colors, depicting a tree with a swirl of green and yellow leaves next to an old stone building with a red-tiled roof. The brush strokes are thick and visible, giving the painting a textured look, and the tree's branches seem to dance around the building's edges. The sky in the background is a mix of blues and purples, suggesting either dawn or dusk."}
+{"index": "partiprompts175", "data": "a detailed oil painting depicting a ginger cat with green eyes, intently focused on a game of checkers. The cat is seated at a small wooden table, with the checkerboard laid out in front of it, pieces strategically placed. The painting captures the texture of the cat's fur and the wood grain of the table, set against a soft, neutral background that draws attention to the subject."}
+{"index": "partiprompts176", "data": "An intricate piece of graffiti art depicting a robot with hues of blue and silver sprawls across a weathered brick wall. Bold, stylized letters spell out \"Fly an airplane\" just above the robot, adding a whimsical touch to the urban canvas. In front of the mural, a concrete sidewalk stretches out, with tufts of green grass stubbornly sprouting from the cracks, hinting at nature's resilience in the cityscape."}
+{"index": "partiprompts177", "data": "A vibrant depiction of a robot, spray-painted in hues of blue and silver, adorns an aged brick wall. The sidewalk in front of the wall, made of weathered concrete slabs, is interrupted by tufts of green grass sprouting from the cracks. The artwork casts a shadow on the uneven ground, hinting at the late afternoon sun."}
+{"index": "partiprompts178", "data": "A vibrant and colorful graffiti mural on a textured concrete wall, featuring the phrase \"BE EXCELLENT TO EACH OTHER\" in bold, stylized lettering. Next to the text, there's an image of a whimsical green alien, donning a sleek black tuxedo with a bright red bow tie. The alien's playful expression adds a touch of humor to the otherwise rough urban canvas."}
+{"index": "partiprompts179", "data": "A grand city fountain situated in the center of a bustling square, with a creamy white liquid cascading down its tiers instead of water. The fountain's base is surrounded by numerous cats of various colors and sizes, eagerly lapping up the milk. The stone structure of the fountain is intricately carved, and despite the unusual substitution of milk, it stands as an impressive focal point in the urban landscape."}
+{"index": "partiprompts18", "data": "A close-up image captures a furry wombat with a curious expression, donning a bright red backpack snugly fitted on its back. The adorable marsupial stands on its hind legs, with both arms raised in a triumphant or playful gesture. In the distance, the iconic Mount Rushmore serves as a majestic backdrop, with the carved faces of the four presidents etched into the mountainside."}
+{"index": "partiprompts180", "data": "A vibrant watercolor mural depicting a group of foxes playing jazz instruments adorns a large wall on a bustling city street. The foxes are painted in various shades of orange and red, with one playing a saxophone and another on the drums, set against a backdrop of colorful musical notes. The wall itself is part of a row of buildings, with pedestrians passing by and occasionally stopping to admire the artwork."}
+{"index": "partiprompts181", "data": "A quaint fairy cottage nestled in a lush garden, with delicate smoke wisps rising from its stone chimney. The cottage features a thatched roof and walls covered in climbing ivy. A curious squirrel peers out from a small, round window, framed by wooden shutters."}
+{"index": "partiprompts182", "data": "a close-up image of a sandy beach, where a bright red bucket lies on its side, surrounded by an array of colorful seashells scattered across the fine grains. there are no birds in sight, particularly no sandpipers, which often frequent such shores. the gentle waves in the background suggest the proximity to the water's edge."}
+{"index": "partiprompts183", "data": "a concrete sidewalk running alongside a weathered wooden post, which has a bright blue '5' prominently painted on its flat top surface. The post is planted firmly in the ground, with patches of green grass sprouting around its base. To the side of the sidewalk, there's a neatly trimmed hedge that stretches out of view."}
+{"index": "partiprompts184", "data": "a tranquil lakeside setting where a herd of sauropods is seen gracefully traversing the water's edge. The gentle giants, with their long necks and tails, are silhouetted against the backdrop of a dense forest. The calm lake reflects the soft hues of the sky as the prehistoric creatures continue their age-old migration."}
+{"index": "partiprompts185", "data": "A picturesque painting depicting a charming white country home with a spacious wrap-around porch adorned with hanging flower baskets. The house is set against a backdrop of lush greenery, with a cobblestone pathway leading to its welcoming front steps. The porch railing is intricately designed, and the home's windows boast traditional shutters, adding to the quaint aesthetic of the scene."}
+{"index": "partiprompts186", "data": "A pristine white bird with a long neck and elegant feathers stands in the foreground, with a towering dinosaur sculpture positioned behind it among a grove of trees. The dinosaur, a deep green in color with textured skin, contrasts sharply with the smooth plumage of the bird. The trees cast dappled shadows on the scene, highlighting the intricate details of both the bird and the prehistoric figure."}
+{"index": "partiprompts187", "data": "a breathtaking photograph capturing the vibrant hues of a sunset with streaks of pink and orange painting the sky behind the majestic Grand Canyon. The canyon's intricate rock formations are silhouetted against the illuminated backdrop, showcasing the deep crevices and towering spires. In the foreground, the Colorado River can be glimpsed winding its way through the ancient geological marvel."}
+{"index": "partiprompts188", "data": "a spectacular display of fireworks illuminates the night sky with bursts of red, white, and blue. the vibrant colors reflect off a nearby lake, creating a mirror image of the aerial spectacle. in the foreground, silhouettes of a small crowd can be seen gathered to watch the show, with some individuals pointing upwards in awe."}
+{"index": "partiprompts189", "data": "a modern storefront with large glass windows and a bold sign above the entrance that reads 'openai' in sleek, white lettering. The facade is painted in a muted gray, complementing the contemporary design. Inside, through the transparent windows, one can see rows of neatly arranged products and a few customers browsing."}
+{"index": "partiprompts19", "data": "Inside a temple, a unique wall painting captures the whimsical image of pandas engaged in a game of tennis, rendered in a style reminiscent of ancient Egyptian hieroglyphics. The pandas are depicted with exaggerated, playful expressions, using rackets that are stylistically simplified. The wall on which the painting is displayed has a textured, sandy hue, providing a stark contrast to the dark, bold lines of the painted figures."}
+{"index": "partiprompts190", "data": "A bold, white sign with the words 'KEEP OFF THE GRASS' stands prominently next to a lush, green lawn. The sign, with its stark black lettering, is mounted on a metal pole and positioned at the edge of the neatly trimmed grass. Surrounding the lawn are small flowering plants, adding a touch of color to the scene."}
+{"index": "partiprompts191", "data": "A monochromatic photograph capturing the stark contrast of a solitary black tree against a white, snowy landscape. The tree's intricate branches are devoid of leaves, creating a network of dark lines that stand out sharply. In the background, the horizon is barely distinguishable, with the white of the snow blending into the overcast sky above."}
+{"index": "partiprompts192", "data": "a standard green tennis court with white boundary lines, where numerous bright yellow tennis balls are strewn across the playing surface. Surrounding the court is a tall chain-link fence, and there is a player's bench off to the side with a couple of rackets resting on it. The net is taut and stands prominently in the center of the court, casting a faint shadow on the ground."}
+{"index": "partiprompts193", "data": "A clear night sky where a slender crescent moon hangs delicately between the silhouetted branches of tall trees. The moon's pale light casts a soft glow on the intricate patterns of the branches, creating a contrast against the dark blue of the night sky. No other celestial bodies are visible in the small patch of sky framed by the intertwining tree limbs."}
+{"index": "partiprompts194", "data": "a picturesque autumn scene where a quaint cottage with a thatched roof sits beside a tranquil lake, surrounded by trees with leaves in vibrant shades of orange, red, and yellow. The cottage's wooden exterior is complemented by white-framed windows, and a stone chimney rises above the roofline. The lake reflects the warm fall colors, creating a mirror image of the foliage and the small structure on its calm surface."}
+{"index": "partiprompts195", "data": "a view through a window pane speckled with raindrops, showcasing a cityscape of tall buildings with reflective glass facades. the gray overcast sky looms above the urban skyline, and the raindrops create a blurred effect on the structures in the distance. the window's frame is a stark white, contrasting with the muted colors of the city beyond."}
+{"index": "partiprompts196", "data": "a standard-sized basketball hoop with a net that has seen better days, mounted against a faded red brick wall. wedged firmly within the hoop is an oversized blue rubber ball, too large to pass through the net. the ground below is a cracked concrete surface, with faded court lines barely visible."}
+{"index": "partiprompts197", "data": "In the center of a bustling intersection, a large tree with a thick trunk and sprawling branches stands out amidst the concrete. Its green leaves contrast sharply with the grey asphalt roads that converge around it. Traffic lights and street signs are positioned awkwardly around the tree's base, creating an unusual juxtaposition of nature and urban infrastructure."}
+{"index": "partiprompts198", "data": "A clear blue sky serves as the backdrop for a white airplane, leaving behind a long, linear chemtrail that stretches across the expanse. The trail is a stark white against the deep blue, and it slowly diffuses at the edges as it drifts away from the plane. Below, the landscape is dotted with the occasional fluffy white cloud, providing a serene contrast to the straight line above."}
+{"index": "partiprompts199", "data": "a brightly colored storefront with large, bold letters spelling out 'AwesomePurchase' above the entrance. The shop's window displays are neatly arranged with an array of products, and a small, potted plant sits to the left of the door. The facade of the building is a clean, modern white, contrasting with the vibrant signage."}
+{"index": "partiprompts2", "data": "A high-resolution DSLR image captures a glossy Volkswagen van, its exterior artistically adorned with a vibrant mural depicting a bustling cityscape. Parked on a patch of lush green grass, the van serves as a backdrop to an unusual sight: a contented sloth, standing upright. The sloth is dressed in an eclectic ensemble consisting of a sleek leather jacket, a wide-brimmed cowboy hat, a traditional tartan kilt, and a neatly tied bowtie. In its clawed hands, it firmly grasps a wooden quarterstaff and an oversized, leather-bound book."}
+{"index": "partiprompts20", "data": "A ceramic Athenian vase with a smooth, matte black finish stands prominently against a plain white background. The vase is adorned with a unique painting that depicts pangolins engaged in a game of basketball, rendered in a style reminiscent of ancient Egyptian hieroglyphics. The intricate design features a combination of earthy tones and bold outlines, giving the artwork a sense of dynamic movement and historical depth."}
+{"index": "partiprompts200", "data": "A classic wooden rocking chair with a smooth, varnished finish sits adjacent to the green, chain-link fence of a tennis court. The court's surface is a vibrant blue with white boundary lines, and a few tennis balls can be seen resting near the net. In the background, there's a tall fence that separates the court from a row of dense, neatly trimmed hedges."}
+{"index": "partiprompts201", "data": "a picturesque mountain stream with crystal clear water flowing over smooth, rounded stones. Several salmon, their scales glistening with a pinkish hue, can be seen leaping energetically from the water, attempting to navigate upstream. The banks of the stream are lined with lush green vegetation, and small wildflowers peek out from the foliage."}
+{"index": "partiprompts202", "data": "a rustic windmill with weathered wooden blades stands amidst a field of vibrant wildflowers. the structure, painted in faded red and white, towers over the colorful blooms, which include poppies and daisies. the windmill's base is encircled by a low stone wall, and in the background, the clear sky stretches above the pastoral scene."}
+{"index": "partiprompts203", "data": "a massive tornado with a swirling gray funnel cloud, tearing through the landscape with debris flying around its base. At the top of the vortex, a small wooden house, with its windows shattered and roof partially torn off, is being lifted into the air. The tornado is moving across an open field, and the sky above is a menacing shade of dark gray."}
+{"index": "partiprompts204", "data": "A glossy black grand piano stands majestically adjacent to the green mesh of a tennis court net. The piano's open lid reveals its intricate golden interior, contrasting with the stark white lines of the court. Nearby, a collection of tennis balls is scattered on the ground, hinting at a recent game."}
+{"index": "partiprompts205", "data": "the word 'START' is boldly written in white chalk on a gray concrete sidewalk. the letters are large and slightly smudged at the edges, indicating recent use. to the side of the word, there's a small pile of colorful chalk pieces, and the sidewalk extends into the distance, bordered by a neatly trimmed green lawn."}
+{"index": "partiprompts206", "data": "A dilapidated spaceship, covered in patches of rust and with peeling paint, is depicted in the midst of a powerful blast-off, leaving a trail of smoke and fire behind. In the distance, a sprawling cityscape with towering skyscrapers stretches towards the horizon, where it meets the calm expanse of the ocean. Beyond the city, a range of majestic mountains rises up, and above it all, a large, dark moon dominates the sky, casting a mysterious glow. The entire scene is rendered in a high-contrast anime style, with sharp lines and dramatic shading that give the illustration a dynamic and intense feel."}
+{"index": "partiprompts207", "data": "Inside a subway train, the seats are occupied by several red pandas with soft, reddish-brown fur. One particularly curious red panda is engrossed in reading a newspaper, holding it with its black and white paws. Through the train's windows, a dense jungle with lush green foliage can be seen passing by in the background. The interior of the train is a mix of metallic grays and bright artificial lighting, contrasting with the natural colors visible outside."}
+{"index": "partiprompts208", "data": "A vibrant yellow 2017 Porsche 911 is captured in motion, navigating a winding mountain road with its sleek body hugging the curve. The sports car's headlights are piercing through the overcast weather, illuminating the path ahead. In the background, a lush green valley stretches out beneath a sky filled with grey clouds, hinting at the vast expanse beyond the road's edge."}
+{"index": "partiprompts209", "data": "A striking blue semi-truck with a matching trailer is captured mid-air, soaring over a neatly aligned row of motorcycles. The motorcycles are positioned between two sturdy metal ramps, which facilitated the truck's impressive jump. The scene unfolds in an open area with a clear sky overhead, emphasizing the boldness of the blue truck as it defies gravity in this daring stunt."}
+{"index": "partiprompts21", "data": "A detailed photograph captures the image of a statue with the likeness of an ancient pharaoh, unexpectedly accessorized with a pair of bronze steampunk goggles resting atop its head. The statue is dressed in an anachronistic fashion, featuring a crisp white t-shirt and a fitted black leather jacket that contrasts with its traditional headdress. The background is a simple, solid color that accentuates the statue's unconventional attire and the intricate details of the steampunk eyewear."}
+{"index": "partiprompts210", "data": "A vibrant yellow dump truck, its bed brimming with black and white soccer balls, navigates through the colorful terrain of a coral reef. The surrounding waters are a clear blue, teeming with marine life, and in the distance, a massive blue whale glides gracefully. The coral formations exhibit a multitude of hues and intricate textures, providing a stark contrast to the industrial machine in their midst."}
+{"index": "partiprompts211", "data": "A vibrant hot air balloon adorned with a colorful chameleon logo floats gracefully against a bright blue sky. The sun casts a warm glow on the scene, highlighting the balloon's rainbow hues and the fluffy white clouds that dot the horizon. Below the balloon, a patchwork of green fields and small houses can be seen from this aerial vantage point."}
+{"index": "partiprompts212", "data": "An old red pickup truck, its body covered in patches of rust, sits abandoned in an open field. The truck's white doors stand in stark contrast to the faded red paint, and the windshield is shattered, with spiderweb cracks running across the glass. The vehicle's bed is empty, and the tires are worn, hinting at many years of service and neglect."}
+{"index": "partiprompts213", "data": "An old-fashioned covered wagon with a weathered canvas top and wooden spokes on its wheels is captured from behind. Peeking out from the flap of the canvas, a large polar bear's head emerges, its fur a stark contrast to the beige fabric of the wagon. The wagon itself is parked on a gravel path, surrounded by tufts of grass and small shrubs."}
+{"index": "partiprompts214", "data": "A sleek motorcycle with the word \"BUZZ\" emblazoned in bold letters on its side is parked inside an ornate bank lobby. The lobby features marble floors and intricate gold trimmings along the walls. The motorcycle's chrome accents gleam under the soft glow of the chandelier overhead, contrasting with the rich, dark wood of the bank's counters."}
+{"index": "partiprompts215", "data": "a vintage blue Porsche 356 is captured mid-turn on a winding asphalt road, its polished surface reflecting the bright sunlight. The car's classic design is accentuated by its rounded headlights and sleek bodywork. Alongside the road, a low stone wall partially covered in moss provides a contrast to the well-maintained vehicle."}
+{"index": "partiprompts216", "data": "A vibrant underwater scene where a yellow dump truck, brimming with black and white soccer balls, is whimsically placed among the colorful coral reef. The truck appears to be 'scuba diving' in the clear blue waters, surrounded by schools of tropical fish and intricate sea life. The coral formations exhibit a multitude of hues, from bright pinks to deep purples, creating a playful contrast with the artificiality of the submerged vehicle."}
+{"index": "partiprompts217", "data": "A dusty red pickup truck parked on a gravel path, with a chestnut horse standing calmly to its left, its mane gently blowing in the breeze. On the right side of the truck, two dogs, one with a golden coat and the other with black and white spots, sit attentively. The truck's bed is filled with hay bales, hinting at a day's work on the farm."}
+{"index": "partiprompts218", "data": "an empty metal bike rack with a silver finish, positioned on a concrete sidewalk. several colorful bike locks are attached to it, indicating the frequent use of the rack by cyclists. the absence of bicycles gives the rack a forlorn appearance against the backdrop of a quiet street."}
+{"index": "partiprompts219", "data": "An aerial perspective showcases a red pickup truck with a silver flatbed, loaded with neatly stacked brown cardboard boxes. The truck is parked on a gray concrete driveway, adjacent to a well-manicured green lawn. Surrounding the vehicle, a few orange traffic cones are placed, indicating a temporary work zone or moving area."}
+{"index": "partiprompts22", "data": "A stuffed toy monkey, with a soft brown texture, is precariously balanced on a floating log in the middle of the Charles River. It's adorned with a distinctive red and white Boston Red Sox baseball cap. In the background, the prominent buildings of the Massachusetts Institute of Technology (MIT) can be seen, creating a juxtaposition of playful whimsy and academic prestige."}
+{"index": "partiprompts220", "data": "A large blue airplane with white accents is taxiing on a vast concrete runway, its engines humming steadily. The sun is setting behind it, casting a warm glow and elongating the shadow of the plane on the ground. The runway is marked with white lines and numbers, indicating the designated path for the aircraft."}
+{"index": "partiprompts221", "data": "a large commercial airplane with a white and blue fuselage soaring towards a towering cumulus cloud that resembles a monstrous face. The cloud's formation gives the impression of gaping jaws and hollow eyes, casting a whimsical shadow over the landscape below. The airplane's wings reflect the sunlight, creating a stark contrast against the darkening sky around the cloud formation."}
+{"index": "partiprompts222", "data": "a sleek red convertible sports car with its top down is navigating a sharp bend on a coastal road. the car's polished chrome rims catch the sunlight as it speeds along the asphalt, hugging the curve tightly. on the passenger side, the endless expanse of the ocean can be seen, with waves gently crashing against the rocky shore."}
+{"index": "partiprompts223", "data": "a busy intersection where a red sedan and a large white delivery truck are stopped side by side, waiting for the traffic light to turn green. The traffic light is mounted on a metal pole on the corner of the crosswalk. The road is marked with white lines and arrows indicating lanes for turning and going straight."}
+{"index": "partiprompts224", "data": "a bright blue pickup truck parked on a gravel road, its flatbed occupied by a large rhinoceros. The animal's rough, gray skin contrasts sharply with the vehicle's smooth, metallic paint. Nearby, a few scattered acacia trees provide a sparse canopy in the otherwise open savannah landscape."}
+{"index": "partiprompts225", "data": "A vibrant orange pickup truck parked beside a sleek yellow Porsche 911 on a smooth gray asphalt road. The Porsche's polished surface reflects the sunlight, highlighting its aerodynamic shape and the truck's sturdy, boxy frame. Between the two vehicles, the contrasting sizes and designs are evident, with the pickup's raised suspension towering over the low-profile sports car."}
+{"index": "partiprompts226", "data": "a futuristic spaceship with a design reminiscent of the iconic Sydney Opera House, featuring multiple white, shell-like structures that form its hull. The vessel hovers above the ground, with a slight iridescent sheen on its surface that reflects the light of a distant sun. It is surrounded by a barren landscape, which contrasts sharply with the spaceship's smooth, curved architecture."}
+{"index": "partiprompts227", "data": "a pristine white yacht floats serenely in the tranquil waters of a secluded bay, its sleek hull reflecting the bright sunshine. The expansive deck is lined with polished wooden railings, and several deck chairs are arranged facing the open water. In the distance, the gentle hills surrounding the bay are dotted with green foliage, providing a picturesque backdrop under the clear blue sky."}
+{"index": "partiprompts228", "data": "a sleek black Harley-Davidson motorcycle, its chrome accents gleaming in the light, adorned with an intricate flame decal in hues of red and orange. The motorcycle is parked on a smooth concrete surface, and its polished wheels reflect the surrounding environment. The handlebars are equipped with leather grips, and the seat is crafted from a rich, black leather that looks both comfortable and stylish."}
+{"index": "partiprompts229", "data": "An old, rusty red pickup truck stands out with its white wheel rims, parked on a stretch of gravel road. The truck's paint is faded and peeling in places, revealing the passage of time on its body. In the truck bed, there's a collection of used tools and a couple of wooden crates, hinting at its utilitarian past."}
+{"index": "partiprompts23", "data": "Two ceramic cups sit side by side on a wooden table, one filled with a steaming latte that has a detailed foam art depiction of the United States map on its surface. The other cup, equally warm, showcases an intricate latte art representation of the African continent. Both cups have a glossy finish, and the table reflects the soft glow of the overhead lighting."}
+{"index": "partiprompts230", "data": "a patriotic-themed chopper motorcycle, its body emblazoned with the iconic red, white, and blue of the Stars and Stripes. The bike's gleaming chrome accents catch the light, highlighting its meticulous craftsmanship. Parked on a stretch of open road, the motorcycle's American flag motif stands out boldly against the asphalt."}
+{"index": "partiprompts231", "data": "a precarious stack of three pickup trucks with the bottom one painted red, the middle one a faded blue, and the top one a dusty white. Each truck is dented and scratched, suggesting they've been through rough conditions. The trucks are set against a backdrop of a clear sky, and they cast long shadows on the gravel lot where they are stacked."}
+{"index": "partiprompts232", "data": "a rusted submarine lying on the sandy ocean floor, its once sleek black exterior now mottled with patches of corrosion and marine growth. The submarine's hatch is partially buried in the sediment, and schools of small, colorful fish swim in and out of the broken portholes. The surrounding waters are a deep, murky blue, with shafts of sunlight filtering down from the surface, illuminating the submarine's ghostly silhouette."}
+{"index": "partiprompts233", "data": "Three commercial airliners, with their distinctive liveries, are lined up side by side at an airport terminal. The planes, with their white fuselages and colored tails, are connected to the jet bridges, which are bustling with activity. In the foreground, the tarmac is marked with guiding lines and ground service equipment can be seen servicing the aircraft."}
+{"index": "partiprompts234", "data": "A sleek white boat with the words 'BLUE GROOVE' emblazoned in bold blue letters along its hull. The boat is moored to a wooden dock with ropes coiled neatly on the planks. The vessel's polished surface reflects the bright sunlight, and a few life jackets can be seen piled inside the boat's interior."}
+{"index": "partiprompts235", "data": "a powerful steam locomotive with a black and red exterior, billowing white steam as it speeds along the tracks through a vast, sandy desert landscape. The locomotive's wheels kick up small clouds of sand, and the clear blue sky stretches endlessly above. No other vehicles or structures are in sight, just the occasional cactus dotting the horizon."}
+{"index": "partiprompts236", "data": "A grand wall within a royal castle, adorned with two large, ornate frames. The painting on the left showcases a regal raccoon king, depicted in vibrant oil colors with meticulous attention to his fur and crown. Opposite to it, the right painting is an equally detailed portrayal of the royal raccoon queen, her gaze dignified and attire resplendent. At the foot of these majestic artworks, a small, fluffy dog with a curious expression stands, clutching a handwritten sign in its mouth that pleads, \"plz conserve,\" adding a touch of whimsy to the stately scene."}
+{"index": "partiprompts237", "data": "A whimsical scene unfolds in a lecture hall where a donkey, adorned in a vibrant clown costume complete with a ruffled collar and a pointed hat, stands confidently at the podium. The donkey is captured in a high-resolution photo, addressing an audience of attentive students seated in rows of wooden desks. Behind the donkey, a large blackboard is filled with complex mathematical equations, hinting at the serious nature of the lecture juxtaposed with the humorous attire of the lecturer."}
+{"index": "partiprompts238", "data": "A warm and inviting living room featuring a plush beige couch with a colorful throw pillow. Above the couch hangs a whimsical painting of a corgi, framed in a simple black frame that contrasts with the light-colored wall. In front of the couch, a round wooden coffee table holds a clear vase filled with fresh flowers, adding a touch of nature to the space."}
+{"index": "partiprompts239", "data": "A spacious room with a high ceiling, where a single, narrow beam of natural light streams down from a small skylight above. This focused beam of light casts a warm glow upon an easel standing directly beneath it. Mounted on the easel is a detailed Rembrandt-style painting, depicting the intricate features of a raccoon's face, which is highlighted by the light against the otherwise dim surroundings."}
+{"index": "partiprompts24", "data": "A surreal landscape where the iconic skyline of downtown Manhattan, with its towering skyscrapers and bustling streets, is juxtaposed against the majestic backdrop of Mount Everest, its peak shrouded in snow and clouds. In the foreground, the ancient Great Pyramid of Giza stands solitary, its limestone blocks weathered to a golden hue, casting a long shadow in the direction of the urban architecture. This unusual combination of human-made wonders spans different continents and eras, creating a striking and imaginative scene."}
+{"index": "partiprompts240", "data": "a gathering of several cats with various coat patterns sitting attentively around a small table. A whiteboard stands in the background with the phrase \"stack more layers\" scrawled across it in bold, black marker. The room is filled with miniature chairs and a tiny coffee mug placed at the center of the table, suggesting an organized feline meeting."}
+{"index": "partiprompts241", "data": "A living room scene featuring an overturned glass of red wine on a beige fabric couch. The spilled wine has created an accidental pattern on the couch's surface, whimsically resembling the word \"OOPS\". The couch is positioned next to a small wooden side table, upon which rests an open book and a remote control, untouched by the spill."}
+{"index": "partiprompts242", "data": "A spacious living room features an unlit fireplace with a sleek, flat-screen television mounted above it. The television screen displays a heartwarming scene of a lion embracing a giraffe in a cartoon animation. The mantle of the fireplace is adorned with decorative items, including a small clock and a couple of framed photographs."}
+{"index": "partiprompts243", "data": "a sizable gift box wrapped in shimmering silver paper and secured with a glossy red ribbon, positioned to the left of a lush green Christmas tree adorned with twinkling lights and golden ornaments. The tree stands on a soft, white tree skirt that contrasts with the dark wooden floor, and scattered around are smaller presents adding to the festive scene."}
+{"index": "partiprompts244", "data": "a spacious room featuring a deep blue wall that serves as the backdrop for an expansive framed watercolor painting. The artwork depicts a serene mountain landscape with subtle hues of green and blue, suggesting a peaceful natural setting. Below the painting, a small wooden table holds a decorative vase with dried flowers, complementing the room's aesthetic."}
+{"index": "partiprompts245", "data": "a spacious room featuring a large painting of the Statue of Liberty prominently displayed on the main wall. Two modern chairs with sleek, silver frames and black cushions are positioned facing the artwork, creating a simple yet elegant seating area. The floor beneath is a polished hardwood, reflecting the soft light that filters into the room from a nearby window."}
+{"index": "partiprompts246", "data": "a spacious living room featuring a towering Egyptian statue situated in the corner, its surface a blend of gold and azure hues. The room is furnished with a plush beige sofa and a glass coffee table in the center. Along the walls, bookshelves filled with an array of books add a cozy feel to the space."}
+{"index": "partiprompts247", "data": "a bright yellow wall serves as the backdrop for an impressive, large framed oil painting, which vividly depicts a vintage car in shades of red and chrome. The painting is positioned at eye level and is flanked by two small wall-mounted lamps that cast a soft glow, accentuating the colors and details of the artwork. Below the painting, a narrow console table with a glossy finish reflects the light, enhancing the overall visual impact of the display."}
+{"index": "partiprompts248", "data": "A large, bold sign with the words 'KEEP OFF THE GRASS' is painted in white letters on a weathered brick wall. The wall has a rough texture and shows signs of age with some bricks slightly chipped and discolored. In front of the wall, there's a small strip of green grass that appears well-maintained, contrasting with the urban backdrop."}
+{"index": "partiprompts249", "data": "A sparse kitchen interior featuring natural wood cabinets and pristine white appliances, including a refrigerator and an oven. The space is characterized by clean lines and a minimalist aesthetic, with a light-colored tile backsplash complementing the simplicity of the room. The wooden cabinets have sleek, modern handles, and the absence of clutter on the countertops accentuates the room's spacious feel."}
+{"index": "partiprompts25", "data": "An imaginative scene where the iconic Sydney Opera House, with its white sail-like shells, sits prominently on the left. To the right, the Eiffel Tower, constructed of intricate iron lattice work, towers over the landscape. Behind both landmarks, the majestic Mount Everest looms, its snow-capped peak piercing the sky."}
+{"index": "partiprompts250", "data": "a festive array of red and yellow balloons tied with curling ribbons, gently bobbing from the breeze of a spinning ceiling fan. The fan has wooden blades and a brass finish, which contrasts with the bright colors of the balloons. The balloons are clustered in a joyful bunch, casting soft shadows on the ceiling above."}
+{"index": "partiprompts251", "data": "A grand piano with a glossy black finish, its lid propped open to reveal the intricate inner workings. Resting on the music stand is an open songbook, its pages filled with musical notations above the ivory keys. The piano sits in a room with hardwood floors and a high ceiling, which enhances the instrument's rich sound."}
+{"index": "partiprompts252", "data": "a rustic wood cabin nestled in a clearing, its dark brown logs contrasting with the bright green of the surrounding grass. In front of the cabin, there's a circular fire pit made of stacked stones, with a few wooden benches arranged around it. The fire pit is currently unlit, and a stack of chopped firewood is neatly piled to one side, ready for use."}
+{"index": "partiprompts253", "data": "A spacious living area featuring a large flat-screen television mounted on a pale wall, directly above a sleek, black entertainment console. In front of the television, there is a rectangular coffee table made of dark wood, surrounded by a comfortable beige sofa and two matching armchairs. The table is adorned with a small potted plant and a few magazines neatly arranged to one side."}
+{"index": "partiprompts254", "data": "An elegant ceiling fan with intricately designed blades, suspended from a high ceiling. The fan is equipped with an ornate light fixture that features delicate glass panels and brass accents. The light casts a warm glow that reflects off the polished wooden floor below."}
+{"index": "partiprompts255", "data": "a grand piano, its glossy black surface partially obscured by the warm glow of multicolored Christmas lights draped across it. The piano stands in a room with hardwood floors and a high ceiling, giving it an elegant presence. Nearby, a green potted plant adds a touch of life to the scene, contrasting with the piano's ebony finish."}
+{"index": "partiprompts256", "data": "a quaint kitchen space with a small white goat standing amidst the appliances and cabinetry. The cabinets are a pale shade of wood, and the countertops are a matching white, cluttered with various kitchen utensils and a bowl of fresh vegetables. Sunlight streams in through a window above the sink, casting a warm glow on the goat and the tiled flooring."}
+{"index": "partiprompts257", "data": "a rectangular wooden coffee table situated in the center of a living room, with a glossy lifestyle magazine spread open on its surface. The table also features a small potted plant with vibrant green leaves, adding a touch of nature to the setting. Around the table, a plush beige carpet can be seen, complementing the warm tones of the room's decor."}
+{"index": "partiprompts258", "data": "a spacious kitchen featuring a large stainless steel refrigerator with a double-door design. The refrigerator stands next to sleek, dark wooden cabinets that reach up to the ceiling. In front of the refrigerator, there is a kitchen island with a white marble countertop, and hanging above are three modern pendant lights with a brushed metal finish."}
+{"index": "partiprompts259", "data": "a collection of various laptops with different sizes and colors, stacked haphazardly on a plush beige sofa. The sofa is positioned against a white wall, and the laptops appear to be in a state of disuse with some open and others closed. Nearby, a small wooden coffee table holds a single potted plant, adding a touch of greenery to the scene."}
+{"index": "partiprompts26", "data": "A surreal and unlikely scene where the iconic Millennium Wheel, with its towering white and blue structure, stands adjacent to the green-hued Statue of Liberty, which raises its torch high into the sky. In the background, the intricate spires of the Sagrada Familia church rise majestically, showcasing its unique architectural blend of Gothic and Art Nouveau styles. The three landmarks are juxtaposed in a way that defies their geographical realities, creating a fantastical skyline."}
+{"index": "partiprompts260", "data": "three violins with a glossy finish and delicate strings lying side by side on a polished hardwood floor. the instruments, each with a rich brown color and unique wood grain patterns, are positioned near a black music stand. natural light from a nearby window casts soft shadows around the violins, highlighting their elegant curves."}
+{"index": "partiprompts261", "data": "A sleek black grand piano sits majestically in the center of the room, its polished surface reflecting the overhead lights. Adjacent to it is a white bench with curved legs, positioned perfectly for a pianist to sit and play. The piano's open lid reveals the intricate strings and hammers inside, ready to produce melodious sounds."}
+{"index": "partiprompts262", "data": "A sleek, silver robot with articulated arms is standing in a modern kitchen, surrounded by stainless steel appliances. It is carefully stirring a pot on the stove, which is filled with a colorful mixture of vegetables. The countertops are neatly arranged with various cooking utensils and ingredients, including a cutting board with freshly chopped herbs."}
+{"index": "partiprompts263", "data": "two grand pianos with glossy black finishes are positioned adjacent to each other in a spacious room with high ceilings. each piano is open, revealing the intricate strings and hammers inside, and the polished wood reflects the overhead lighting. between the pianos, there's a narrow walkway, and a red velvet curtain can be seen in the background, suggesting a performance area or a music hall setting."}
+{"index": "partiprompts264", "data": "Two ceramic cups filled with steaming coffee are placed on a wooden table with a natural grain finish. The cup on the left showcases intricate latte art spelling out the word \"LOVE\" with a heart-shaped design, while the cup on the right has the word \"PEACE\" beautifully crafted atop its frothy surface. Both cups have a glossy finish, and the warm lighting accentuates the creamy texture of the latte art."}
+{"index": "partiprompts265", "data": "a metallic humanoid robot with a sleek silver finish is captured in mid-air, its limbs splayed out in a dramatic fashion. the robot is surrounded by a vibrant array of Easter eggs, each painted in bright, glossy hues of pink, blue, yellow, and green. the eggs are scattered haphazardly on the lush green grass beneath the robot, creating a stark contrast with the android's chrome appearance."}
+{"index": "partiprompts266", "data": "A surreal image of a clear glass light bulb adrift in the vast expanse of outer space, its filament replaced by a miniature sailing boat with white sails unfurled. The light bulb, seemingly floating among the stars and nebulae, casts a subtle glow that illuminates the delicate features of the boat within. Surrounding the bulb, the cosmos stretches out in shades of deep blues and purples, speckled with the twinkling of distant stars."}
+{"index": "partiprompts267", "data": "A towering robot with a head shaped like an oversized coffee cup looms over a city street, its metallic body reflecting the sunlight. One of its colossal feet is planted firmly on a crushed red sedan, crumpling the vehicle beneath its weight. The robot's arms, equipped with articulated joints, are poised as if ready for action, while around it, the street is scattered with debris from the chaos."}
+{"index": "partiprompts268", "data": "a striking propaganda poster featuring a cat with a sly expression, dressed in an elaborate costume reminiscent of French Emperor Napoleon Bonaparte. The feline figure is holding a large, yellow wedge of cheese as if it were a precious treasure. The background of the poster is a bold red, with ornate golden details that give it an air of regal authority."}
+{"index": "partiprompts269", "data": "Looking up from a low angle, a tall white ladder with a single rung is propped against a textured yellow brick wall. The ladder's shadow casts a long, dark line across the bricks, hinting at the late afternoon sun. The wall shows signs of age with some bricks slightly eroded, giving it a rugged appearance."}
+{"index": "partiprompts27", "data": "A detailed Persian metal engraving vase, showcasing intricate patterns and designs, rests to the left side of a vibrant bouquet of orange flowers. The vase, with its silver sheen and traditional craftsmanship, sits on a polished wooden table. The orange petals of the flowers provide a stark contrast to the metallic tones of the vase, creating a visually appealing composition."}
+{"index": "partiprompts270", "data": "A row of waste disposal bins neatly aligned against a concrete wall, with a brown trash bin in the center. To its left, there is a green compost bin marked with recycling symbols, and to its right stands a blue recycling bin designated for paper and plastics. The bins are situated on a gray pavement, and each has a label indicating its specific use for waste segregation."}
+{"index": "partiprompts271", "data": "a neatly printed quote, 'Do unto others as they would do unto you,' in a simple black font centered on a pristine white canvas. The text is surrounded by a thin black border, and the canvas is positioned against a light grey wall. Just below the quote, in smaller letters, the source of the saying is attributed, adding a touch of elegance to the presentation."}
+{"index": "partiprompts272", "data": "A massive, red-striped ball made of lightweight styrofoam, careened into a small, round wooden table, causing it to collapse under the unexpected force. The table, previously adorned with a delicate lace tablecloth and a ceramic vase, now lies in disarray with the broken vase and scattered flowers beside the remnants of the table. The ball, seemingly unscathed by the collision, rests awkwardly against the splintered wood of the shattered table."}
+{"index": "partiprompts273", "data": "A smooth, rectangular bar of dark chocolate lying on a white marble surface, with the ironic word \"WRAPPER\" embossed on its surface in bold letters. The chocolate bar, with its neatly segmented squares, is unwrapped and ready to be broken apart and enjoyed. Surrounding the bar, there are faint traces of its former gold foil wrapper, crinkled and pushed aside."}
+{"index": "partiprompts274", "data": "A unique, high-contrast painting depicting an espresso machine with a dark, almost sinister appearance, as if it were crafted from shadows and whispers. The machine stands out against a stark white background, its spouts resembling outstretched hands, ready to brew a concoction from the very essence of human souls. The artwork conveys a surreal and eerie atmosphere, where the machine becomes an otherworldly artifact with a power beyond mere coffee making."}
+{"index": "partiprompts275", "data": "An innovative water sculpture shaped like a flat-screen television stands in the center of a dimly lit room. The liquid crystal display, made entirely of cascading water, presents a luminous cityscape at night, with twinkling lights from skyscrapers and a reflection of the moon on the water's surface. Surrounding the unique installation, the floor is tiled in dark hues, accentuating the ethereal glow of the projected urban scene."}
+{"index": "partiprompts276", "data": "Scattered across the dark wooden floor are long, jagged shards of a shattered mirror, each piece reflecting the intense, yellow eyes of a great horned owl perched nearby. The owl's feathers are a mix of deep browns and soft grays, and it sits stoically on the branch of an indoor plant. The room is dimly lit, casting a moody glow that enhances the sharpness of the mirror fragments and the piercing gaze of the owl."}
+{"index": "partiprompts277", "data": "A striking piece of street art depicting a white robot with a vibrant red mohawk, painted against the rough texture of a red brick wall. The robot's eyes are detailed with a piercing blue, contrasting sharply with the warm tones of the bricks. Around the robot, the wall is adorned with various tags and smaller pieces of graffiti, adding to the urban tapestry of the scene."}
+{"index": "partiprompts278", "data": "Two violins with rich, brown varnish are standing upright, their necks leaning against a light-colored wooden chair. The bows are placed carefully on the ground in front of them, with the horsehair facing upwards. The instruments cast soft shadows on the polished floor, hinting at the natural light entering the room."}
+{"index": "partiprompts279", "data": "A gleaming golden trophy with intricate engravings stands too tall to fit inside a small, worn brown leather suitcase that lies open on the floor. The suitcase, cluttered with clothes and other personal items, cannot accommodate the trophy's wide base and elongated handles. The surrounding space is cramped, with other travel accessories and a pair of shoes scattered nearby, indicating a packing process interrupted by the realization that the trophy must be transported separately."}
+{"index": "partiprompts28", "data": "An intricate culinary creation, a map of the United States crafted entirely out of assorted sushi pieces, is displayed on a large, round white plate. The plate sits on a dark wooden table, accompanied by a tall glass of red wine to its right, casting a slight shadow on the polished surface. Each state is represented by a different type of sushi, offering a colorful and textured mosaic of rice, seaweed, and various fish."}
+{"index": "partiprompts280", "data": "a close-up image of a wooden fiddle with fine grain details, resting beside an orange basketball on the green surface of a ping pong table. The table has white boundary lines and a net stretched taut in the center. In the background, there's a blurred view of a room with a gray floor and a stack of folded chairs against the wall."}
+{"index": "partiprompts281", "data": "An outdoor scene featuring a textured gravel driveway where a bright orange basketball rests to the left of two black and white soccer balls. The driveway is bordered by a patch of green grass, and the balls cast soft shadows on the ground due to the overhead sunlight. In the background, there is a glimpse of a wooden fence that marks the boundary of the property."}
+{"index": "partiprompts282", "data": "a vibrant lavender backpack with a plush triceratops head peeking out from the top, its green eyes and three distinct horns adding a playful touch. The backpack is made of a soft, velvety material and is resting against a pale wooden bench. Around it, there are scattered crayons and a few sheets of paper with childlike drawings."}
+{"index": "partiprompts283", "data": "An intricate oil painting depicting a colossal robot constructed from an assortment of sushi pieces, wielding a pair of oversized wooden chopsticks. The robot stands against a backdrop of a futuristic cityscape, with its sushi components meticulously detailed to show the texture of rice and seaweed. The chopsticks in the robot's grasp are elegantly positioned as if ready to pluck a piece of sushi from a hovering platter."}
+{"index": "partiprompts284", "data": "Three robots of varying hues are positioned in a row, each with a distinct and sleek design. The white robot on the left boasts a glossy finish and rounded edges, while the central red robot has a more angular form with matte surfaces. To the right, the black robot stands with a metallic sheen, its articulated joints suggesting advanced mobility. They are all placed on a smooth, gray concrete floor, and behind them is a plain wall with a few cables hanging neatly arranged."}
+{"index": "partiprompts285", "data": "A delicate, spherical glass sculpture with intricate blue and green patterns, previously perched on a wooden shelf, has tumbled to the floor due to the lack of secure anchoring. The shelf, lined with various other art pieces, stands against a pale yellow wall. The fallen sculpture now lies near a potted fern, its position suggesting the quiet aftermath of the incident."}
+{"index": "partiprompts286", "data": "A bright yellow, diamond-shaped traffic sign stands out with a black silhouette of a wooly mammoth prominently displayed in its center. The sign's edges are slightly worn, indicating it has withstood the elements over time. It is mounted on a metal pole beside a road that meanders through a grassy area with a few scattered trees."}
+{"index": "partiprompts287", "data": "Two well-worn baseballs, their white leather stained and scuffed from use, rest on a wooden floor on either side of a vibrant yellow basketball. The basketball, with its pebbled texture and black lines, stands out in contrast to the muted tones of the baseballs. The trio of sports equipment is casually arranged, suggesting a recent game or practice session."}
+{"index": "partiprompts288", "data": "A spacious, open book lies flat on a wooden table, its pages filled with blocks of text and a large, detailed illustration of a cat on the right side. The illustration depicts a gray feline with intricate patterns, lounging amidst a backdrop of sketched furniture. The left page is densely packed with small, black font, narrating a story that accompanies the image, and the edges of the book's pages show signs of frequent use."}
+{"index": "partiprompts289", "data": "An exquisite mahogany chair with intricate carvings stands prominently in the room. The chair features a high back with elaborate designs and a plush red cushion that provides a stark contrast to the dark wood. Its legs are elegantly curved, adding to the overall sophistication of the piece. The texture of the cushion looks soft and inviting, beckoning one to sit and enjoy the comfort it offers."}
+{"index": "partiprompts29", "data": "a traditional kachina doll stands with its intricate feathers adorning its head, creating a vibrant crown of colors. It is dressed in a ceremonial white dress with detailed patterns, and its feet are clad in meticulously crafted brown boots. The doll is positioned against a plain backdrop, which accentuates its detailed craftsmanship and the rich cultural heritage it represents."}
+{"index": "partiprompts290", "data": "a spacious square where a large, weathered stone pedestal stands prominently at the center, devoid of the horse statue that once adorned it. The pedestal's surface is rough and covered with patches of moss, indicating its age and exposure to the elements. Around the base, cobblestones are laid in a pattern that radiates outward, suggesting the importance of the now-absent statue."}
+{"index": "partiprompts291", "data": "Two flags are flying side by side against a clear sky; the first is a white flag emblazoned with a bold red circle in its center, while the second is a solid blue flag with no markings. The flags are attached to grey metal poles that stand in close proximity to each other, creating a striking contrast of colors. The fabric of the flags ripples gently in the breeze, highlighting their distinct and vibrant hues."}
+{"index": "partiprompts292", "data": "A sleek, metallic robot with articulated joints and glowing blue eyes stands holding a large, rectangular sign with bold, colorful letters that spell out \"Let's PAINT!\" The robot's silver surface reflects the bright lights of the room, and it is positioned next to an array of paint cans in vibrant hues, neatly arranged on a tarp-covered floor. Behind the robot, a blank canvas on an easel awaits the first stroke of creativity."}
+{"index": "partiprompts293", "data": "a small brown football rests on a green surface, positioned in front of a trio of bright yellow tennis balls that are neatly aligned. The football's laces are prominently displayed, contrasting with the smooth, fuzzy texture of the tennis balls. In the background, there is a blurred net, suggesting the proximity of a sports field or court."}
+{"index": "partiprompts294", "data": "a whimsical illustration of a small, white baby daikon radish with rosy cheeks and green shoots atop its head, donning a pink, frilly tutu. It is walking a brown, fluffy dog on a red leash, which looks up at the radish with a playful expression. The background features a simple, pastel-colored path that winds through a grassy field."}
+{"index": "partiprompts295", "data": "a vibrant flower with large, crimson petals and a bright yellow center, standing in stark contrast to the moon's barren, grey surface. The flower's delicate texture is a surprising sight amidst the moon's craters and dust. In the background, the Earth can be seen rising, a swirl of blue and white, providing a breathtaking backdrop to this surreal lunar scene."}
+{"index": "partiprompts296", "data": "A peculiar tree with a trunk that twists slightly as it rises stands in the center of a garden. Its branches are adorned with square-shaped, blue apples that hang amidst circular, bright yellow leaves. The contrast between the unconventional fruit and the vibrant foliage creates a striking visual against the backdrop of a clear sky."}
+{"index": "partiprompts297", "data": "A vibrant apple tree with a sturdy brown trunk, its branches laden with bright red, square-shaped apples, each one distinct in its geometric form. The tree is surrounded by a lush canopy of circular, green leaves that contrast sharply with the unusual shape of the fruit. Sunlight filters through the foliage, casting dappled shadows on the ground below."}
+{"index": "partiprompts298", "data": "A unique tree stands with its branches adorned with leaves that resemble vibrant purple balloons, glistening in the sunlight. The tree's trunk is a deep brown with a rough texture, contrasting sharply with the smooth, balloon-like foliage. Around the base of the tree, a bed of green grass provides a natural carpet, setting off the whimsical appearance of the tree's unusual leaves."}
+{"index": "partiprompts299", "data": "A tall, leafy tree casting its reflection on the glass sunroof of a parked blue sedan. The car's glossy paintwork accentuates the tree's intricate silhouette, and the sunroof provides a clear, mirror-like surface that captures the sky and branches above. Around the vehicle, the pavement is speckled with small shadows from the tree's leaves, hinting at the calm, breezy day."}
+{"index": "partiprompts3", "data": "An anime-style illustration depicts a whimsical scene where a kangaroo, rendered in vibrant shades of brown and tan, clutches a rectangular sign with the words \"Starry Night\" scrawled across it in bold, whimsical lettering. The kangaroo is seated comfortably in front of the iconic Sydney Opera House, its distinctive white sail-like structures contrasting sharply with the deep blue of the night sky. To the kangaroo's side, the Eiffel Tower looms, its iron lattice silhouette adding to the surreal composition. Above, the sky is alive with dynamic swirls of blue and bursts of radiant yellow stars, capturing the essence of a dreamlike cosmic event."}
+{"index": "partiprompts30", "data": "An intricately arranged wooden charcuterie board is adorned with an assortment of farm animal figurines, each skillfully crafted from various types of cheese and slices of ham. The cows, sheep, and pigs are positioned amidst a landscape of crackers and grapes, creating a playful barnyard scene. In the background, a brown dog with perked ears and a glossy coat sits attentively, its gaze fixed on the edible menagerie with an unmistakable look of longing."}
+{"index": "partiprompts300", "data": "a vibrant arrangement of blue and yellow flowers, with delicate petals and lush green stems, placed in a clear glass vase. The vase is situated on a polished wooden table, which reflects the soft light illuminating the room. Around the vase, there are a few scattered leaves, adding a touch of natural charm to the setting."}
+{"index": "partiprompts301", "data": "a potted plant with delicate small flowers featuring vibrant purple petals sits on a wooden windowsill. the plant's green leaves are lush and interspersed among the flowers, creating a contrast against the light-colored wall behind it. sunlight filters through the window, casting a soft glow on the plant's foliage and petals."}
+{"index": "partiprompts302", "data": "a vibrant plant with star-shaped orange flowers blooming amidst lush green leaves. The plant is potted in a terracotta container that sits on a wooden shelf. Around it, other houseplants contribute to a small indoor garden atmosphere."}
+{"index": "partiprompts303", "data": "A whimsical illustration featuring a small, white baby daikon radish with little green shoots on top, dressed in a pink tutu. The radish character is anthropomorphized with tiny arms and legs, walking a brown dog on a red leash. The dog appears to be a friendly, medium-sized breed with a wagging tail, and they are both on a gray sidewalk next to a grassy area."}
+{"index": "partiprompts304", "data": "A close-up image of a unique four-leaf clover, intricately formed from droplets of water on a smooth, reflective surface. Each leaf of the clover is perfectly shaped, with the water's surface tension creating a delicate and symmetrical appearance. The background is a soft blur, emphasizing the clarity and detail of the water-formed clover in the foreground."}
+{"index": "partiprompts305", "data": "A quaint garden space featuring a robust apple tree with a sturdy trunk and branches laden with ripe red apples. The garden itself is bordered by a low, stone wall and contains a variety of flowering plants and shrubs. In the foreground, a small wooden bench invites visitors to sit and enjoy the peaceful surroundings."}
+{"index": "partiprompts306", "data": "A lush green ivy plant with broad leaves, creeping up the side of a weathered red brick wall. The wall has a rough texture, and the plant's tendrils are firmly attached, showcasing the contrast between the organic growth and the man-made structure. Small patches of moss can also be seen interspersed between the bricks, adding to the visual diversity of the scene."}
+{"index": "partiprompts307", "data": "a vibrant green plant with broad leaves and a sturdy stem, situated at the bottom of a clear, gently flowing stream. The stream's bed is lined with smooth, multicolored pebbles that glisten under the water. Sunlight filters through the water, casting dappled patterns on the plant and the surrounding rocks."}
+{"index": "partiprompts308", "data": "a peculiar sight of a tree with vibrant yellow leaves, each leaf delicately edged with hints of autumnal orange. Among the branches hang unusual blue apples, their smooth surfaces reflecting the soft sunlight. The tree stands alone in a field, its roots sprawling across the rich, brown earth."}
+{"index": "partiprompts309", "data": "a unique flower with delicate pink petals arranged in a circular pattern, at the center of which is an intricate design resembling the face of a cat with green eyes. The flower's texture appears soft and velvety, and it is situated among a bed of green leaves that provide a contrasting backdrop to its whimsical feature."}
+{"index": "partiprompts31", "data": "A vibrant and detailed image showcasing a large ceramic bowl filled with steaming ramen, the noodles and broth glistening under the light. Floating amidst the savory dish are several origami boats, each crafted from paper in hues of red, blue, and yellow, adding a whimsical touch to the culinary scene. The bowl sits on a dark wooden table, contrasting with the bright colors of the paper crafts and the rich tones of the ramen ingredients."}
+{"index": "partiprompts310", "data": "A close-up image of an intricately designed lotus flower, which appears to be crafted entirely from crystal-clear water droplets. The flower is set against a backdrop of soft green lily pads floating on a tranquil pond. Sunlight filters through the scene, highlighting the delicate texture and the shimmering surface of the water-formed petals."}
+{"index": "partiprompts311", "data": "A vibrant garden scene where two red tulips stand out with their bright petals, surrounded by three white daisies with yellow centers. The flowers are nestled among green leaves and stems, with the red tulips slightly taller than the daisies. The arrangement is set against a backdrop of dark soil, hinting at a well-tended garden bed."}
+{"index": "partiprompts312", "data": "a vibrant apple tree with a multitude of shiny red apples nestled among lush green leaves. the tree's branches are spread wide, with the sunlight filtering through the foliage and casting dappled shadows on the ground below. some apples hang low enough to be within arm's reach, while others are perched higher up, peeking out from the leafy canopy."}
+{"index": "partiprompts313", "data": "a picturesque scene featuring a small tree, its branches laden with delicate white blossoms, standing in the center of a lush green lawn. the tree's rounded shape is accentuated by the contrast of the vibrant green leaves against the pure white petals. surrounding the tree, a variety of colorful flowers can be seen, adding to the charm of the tranquil setting."}
+{"index": "partiprompts314", "data": "A close-up image capturing the intricate details of a maple leaf, which is composed entirely of clear, sparkling water droplets. The leaf is set against a smooth, dark background that accentuates its delicate water structure. The droplets glisten as they cling to the invisible veins of the leaf, creating a natural yet surreal piece of art."}
+{"index": "partiprompts315", "data": "a colorful butterfly-shaped kite entangled among the branches of a tall oak tree. the kite's wings are a vibrant mix of blue and yellow, contrasting with the green leaves. the tree's rough bark and the kite's silky texture are juxtaposed as the kite flutters gently in the breeze."}
+{"index": "partiprompts316", "data": "A creative image showcasing a palm tree that appears to be crafted entirely out of water, with droplets glistening as they form the shape of the fronds and trunk. The tree stands against a clear blue sky, and the sun's rays seem to dance off the watery surface, giving the illusion of movement. The water-palm is positioned on the left side of the frame, with its reflection subtly visible on the wet sand beneath it."}
+{"index": "partiprompts317", "data": "a cheerful yellow banana with a wide, drawn-on smile, donning a red bandana with white paisley patterns. It's placed against a backdrop of assorted fruits on a light wooden table. The banana's peel is partially opened, revealing its ripe, edible interior."}
+{"index": "partiprompts318", "data": "A solitary tree stands in the midst of a grassy field, its branches reaching out in all directions without a single leaf to be seen. The bark of the tree is a rough, gray texture, contrasting with the vibrant green of the grass. Despite the warm season, the tree's bare limbs give it a stark, sculptural appearance against the clear blue sky."}
+{"index": "partiprompts319", "data": "An up-close image showcasing the intricate interior of a walnut, split cleanly down the middle to reveal its textured, brain-like halves. The walnut's shell exhibits a rich brown color with darker lines marking the divisions, while the nut inside contrasts with its lighter, creamy hue. The cross-section rests against a backdrop of a wooden table with visible grain patterns, highlighting the natural origin of the walnut."}
+{"index": "partiprompts32", "data": "Two ceramic cups sit side by side on a marble countertop, one featuring an intricate latte art design of the Eiffel Tower, the other boasting a foamy depiction of the Statue of Liberty. Both cups have a glossy finish, with the Eiffel Tower cup having a delicate, pastel blue hue, while the Statue of Liberty cup is a soft shade of pink. The steam rising from the cups suggests the coffee is freshly brewed, and a silver spoon rests beside each cup, positioned on a small, round saucer."}
+{"index": "partiprompts320", "data": "a whimsical scene featuring a bright orange fruit donning a miniature brown cowboy hat with intricate stitching. The orange sits atop a wooden table, its textured peel contrasting with the smooth surface beneath. To the side of the orange, there's a small cactus in a terracotta pot, completing the playful western theme."}
+{"index": "partiprompts321", "data": "a sturdy tree with a thick trunk and sprawling branches that have intertwined with the metal links of a silver chain-link fence. the fence is partially enveloped by the tree's roots and bark, creating a unique blend of natural and man-made elements. the surrounding area is dotted with green grass and a few scattered wildflowers."}
+{"index": "partiprompts322", "data": "A whimsical scene at the beach where a pineapple, complete with its spiky green leaves, is balanced atop a vibrant blue wave as if it were surfing. The pineapple's textured, golden-brown skin glistens with droplets of ocean water. In the background, the sandy shore is dotted with colorful beach umbrellas and sunbathers enjoying the sunny day."}
+{"index": "partiprompts323", "data": "a large, round orange pumpkin carved with a smiling face, sitting on a wooden table. Inside the hollowed-out pumpkin, a small flickering candle casts a warm glow through the cut-out eyes and mouth. The pumpkin is surrounded by a scattering of fallen autumn leaves."}
+{"index": "partiprompts33", "data": "An imaginative scene unfolds with a castle intricately constructed from golden tortilla chips, its towers and walls standing tall amidst a flowing river of vibrant red salsa. Surrounding the edible fortress, tiny burritos, wrapped in soft tortillas with visible fillings, appear to be animated and meandering along the banks of the salsa river. The entire whimsical landscape is set upon a large plate, suggesting a playful, culinary creation."}
+{"index": "partiprompts34", "data": "An animated scene where a hamburger with boxing gloves is aggressively confronting a weary hot dog in a miniature boxing ring. The hot dog, adorned with a squiggle of mustard, appears to be on the verge of defeat, leaning heavily against the ropes of the ring. Surrounding the ring are enthusiastic condiment bottles and a cheering crowd of assorted snack foods."}
+{"index": "partiprompts35", "data": "A dining table is laden with an array of Singaporean dishes, featuring a plate of fragrant chicken rice with golden-brown skin, a bowl of bak chor mee with minced meat and springy noodles, and a steaming bowl of spicy laksa with prawns and fish cakes. The tablecloth is a vibrant red, complementing the colorful dishes, and chopsticks rest beside each bowl. In the background, a pitcher of iced water and glasses are neatly arranged, ready for serving."}
+{"index": "partiprompts36", "data": "A close-up view of a juicy burger patty resting on a soft, lightly toasted bottom bun. Fresh green lettuce and bright red tomato slices are neatly placed on top of the patty. Bold yellow letters spelling out \"COFFEE\" are artistically drizzled across the burger in a smooth mustard script. The burger sits on a plain white plate, with a few mustard droplets scattered around the edges."}
+{"index": "partiprompts37", "data": "A clean white plate sits empty on a polished wooden table, with no bananas in sight. Beside it, a clear glass stands, also devoid of any orange juice, reflecting the light from the room. The table surface is smooth and the area around the plate and glass is uncluttered, emphasizing their emptiness."}
+{"index": "partiprompts38", "data": "A clear glass filled with vibrant orange juice sits to the right of a white ceramic plate, which cradles two slices of golden-brown toast generously spread with melting butter. The plate and glass are positioned on a light wooden table, with a patterned napkin folded neatly beside them. Sunlight filters through a nearby window, casting a warm glow on the breakfast setup and highlighting the texture of the freshly baked toast."}
+{"index": "partiprompts39", "data": "A whimsical bowl of soup sits on a wooden table, its broth a vibrant green with chunks of tofu arranged to resemble the eyes and mouth of a playful monster. The words \"deep learning\" are spelled out in thin slices of carrot, floating on the surface of the soup. The bowl itself is a deep blue ceramic, providing a striking contrast to the lighter colors of the soup and its garnishes."}
+{"index": "partiprompts4", "data": "A towering Gundam robot, painted in a striking combination of white, blue, and red, stands with its gleaming sword raised high against the backdrop of a sprawling metropolis. The cityscape features an array of tall, glass skyscrapers that reflect the fading light of the sunset. Beyond the urban expanse, a majestic mountain range looms, leading to the tranquil expanse of the ocean, while above, a dark, ominous moon dominates the twilight sky. The entire scene is rendered in a vivid, high-contrast style reminiscent of a detailed anime illustration."}
+{"index": "partiprompts40", "data": "A cold bottle of amber-colored beer with droplets of condensation sits beside a ceramic ashtray. The ashtray contains a half-smoked cigarette, its gray ash contrasting with the ashtray's dark hue. Nearby, a small pile of bottle caps and a lighter can be seen on the wooden surface of the table."}
+{"index": "partiprompts41", "data": "A ripe, golden pineapple sits centered on a light wooden table, with a single green-bottled beer to its left and a pair of identical bottles to its right. The beers have droplets of condensation on their surfaces, indicating they are chilled. The pineapple's spiky green leaves contrast with the smooth, cylindrical shape of the beer bottles."}
+{"index": "partiprompts42", "data": "A chilled, transparent bottle of light beer with condensation beads forming on its surface, placed on a wooden table. A bright yellow slice of lemon is wedged onto the rim of the bottle, adding a citrus accent to the beverage. The bottle label is partially obscured by the moisture, but hints of green and gold are visible on the design."}
+{"index": "partiprompts43", "data": "A large, clear glass pitcher filled to the brim with golden beer, the frothy head spilling slightly over the edge. An elephant's trunk, textured and wrinkled, is playfully dipped into the pitcher, disrupting the liquid's surface. The pitcher sits on a wooden table, with a few scattered peanuts nearby, hinting at a bar-like setting."}
+{"index": "partiprompts44", "data": "A ceramic plate filled with fluffy white rice, crowned with a colorful medley of sautéed vegetables including bright green broccoli, red bell peppers, and golden corn kernels. The plate is set upon a dark wooden dining table, contrasting with the vibrant hues of the food. Beside the plate, there's a set of silverware neatly placed on a navy blue napkin."}
+{"index": "partiprompts45", "data": "a vintage wine bottle with a tapered neck, its label partially peeled off, serving as a makeshift candle holder. A white candle, with wax drippings along the bottle's sides, is firmly stuck in the spout, casting a soft glow. The bottle is set upon a rustic wooden table, surrounded by a scattering of wax remnants and a few scattered wine glasses."}
+{"index": "partiprompts46", "data": "On a wooden cutting board, there are five vibrant green bell peppers neatly arranged to the right of two glossy red onions. The vegetables are freshly washed, with droplets of water accentuating their colors. The cutting board is set against a backdrop of a light gray kitchen countertop, which contrasts with the bright hues of the produce."}
+{"index": "partiprompts47", "data": "A tall, transparent glass filled with the amber-colored Long Island Iced Tea cocktail, garnished with a slice of lemon on the rim. Beside the glass, there's a white napkin folded neatly on a dark wooden bar top. The drink is accompanied by a thin, black straw, and in the background, there are bottles of various liquors lined up against a mirrored wall."}
+{"index": "partiprompts48", "data": "A steaming bowl of Pho, with a rich, clear broth and a generous topping of fresh bean sprouts. The bowl sits on a dark wooden table, accompanied by a side plate of lime wedges and basil leaves. The noodles are submerged beneath the broth, and thin slices of beef float on the surface, partially obscured by the green sprouts."}
+{"index": "partiprompts49", "data": "a close-up view of a chilled Long Island Iced Tea cocktail, served in a tall glass with a lemon wedge perched on the rim. The drink is a blend of light and dark liquids, giving it a gradient appearance, and it's garnished with a small, colorful paper umbrella. Condensation beads on the outside of the glass, indicating its refreshing temperature."}
+{"index": "partiprompts5", "data": "A whimsical scene captured in a portrait photo featuring a kangaroo donned in a vibrant orange hoodie and stylish blue sunglasses. The kangaroo stands confidently on the lush green grass, with the iconic Sydney Opera House forming an impressive backdrop. Clutched against its chest is a white sign with bold black letters that warmly proclaim \"Welcome Friends!\" to all who gaze upon the image."}
+{"index": "partiprompts50", "data": "a classic old-fashioned cocktail sits on a polished wooden bar top, its amber liquid gently hugging a large, clear ice cube. next to the glass, a white linen napkin is neatly folded, with a silver cocktail spoon resting atop. the drink is garnished with a vibrant orange peel, adding a pop of color and a hint of citrus aroma to the scene."}
+{"index": "partiprompts51", "data": "a golden-brown roast turkey being carefully taken out of a stainless steel oven by someone wearing oven mitts. the kitchen is filled with the aroma of the cooked turkey, and the counter nearby is set with various dishes and utensils needed for the meal preparation. the oven light casts a warm glow on the surrounding cream-colored kitchen tiles and the dark granite countertop."}
+{"index": "partiprompts52", "data": "A vibrant green bell pepper rests to the left of a glossy red bell pepper on a wooden cutting board. The two peppers are positioned against a backdrop of a well-stocked kitchen, with an array of spices and cooking utensils in the background. The contrasting colors of the peppers create a visually appealing composition on the neutral-toned surface of the board."}
+{"index": "partiprompts53", "data": "A tall glass filled with a vibrant red Bloody Mary cocktail, garnished with a green celery stick and a skewered olive. The glass is beaded with condensation and sits on a small white plate beside a folded white napkin. The cocktail is placed on a dark wooden bar counter, illuminated by warm overhead lighting."}
+{"index": "partiprompts54", "data": "A couple is seated at a small round table in a cozy café, with the man enjoying a warm latte from a tall white mug. The woman, casually dressed in a green sweater, is sipping a cold beer from a clear pint glass. Between them is a small vase with a single yellow tulip, adding a touch of color to the scene."}
+{"index": "partiprompts55", "data": "A stream of white milk flowing gracefully from a clear glass pitcher into a deep blue ceramic bowl. The bowl is sitting on a marble countertop, surrounded by a scattering of fresh strawberries and a box of cereal. Sunlight filters through a nearby window, casting a warm glow on the scene and highlighting the smooth texture of the milk's surface."}
+{"index": "partiprompts56", "data": "a piece of golden-brown toast resting on a white ceramic plate, topped with bright green, creamy slices of avocado arranged neatly. next to the plate, there's a silver knife with some avocado residue on the blade, and a sprinkle of red pepper flakes over the avocado slices adds a pop of color."}
+{"index": "partiprompts57", "data": "A close-up image of a ceramic plate filled with a colorful assortment of food, including slices of grilled chicken, a mix of steamed vegetables, and a scoop of mashed potatoes garnished with a sprig of parsley. The plate is set on a dark wooden dining table, and beside it lies a set of silverware wrapped neatly in a cloth napkin. The food is arranged in an appetizing display, showcasing a variety of textures from the crisp vegetables to the creamy potatoes."}
+{"index": "partiprompts58", "data": "A piece of golden-brown toast resting on a white ceramic plate, topped with bright yellow, freshly sliced mango. The mango slices are arranged in a fan-like pattern, and the plate sits on a light wooden table with a few crumbs scattered around. The texture of the toast contrasts with the soft, juicy mango pieces, creating an appetizing snack."}
+{"index": "partiprompts59", "data": "A whimsical dessert creation, an orange jello molded into the shape of a small man, stands proudly on a white ceramic plate. The jello figure is translucent with a glossy sheen, capturing the light in its wobbly form. Around it, there are scattered mint leaves for decoration, providing a contrast to the vibrant orange color of the gelatin."}
+{"index": "partiprompts6", "data": "A striking statue of the Egyptian god Anubis, depicted with a jackal's head, is dressed unconventionally in modern attire consisting of a crisp white t-shirt, a black leather jacket, and a pair of aviator goggles resting atop its head. The statue stands in sharp contrast against the backdrop of a full moon that illuminates the night sky over the sprawling cityscape of Los Angeles. The city lights twinkle in the distance, creating a mosaic of urban life behind the ancient deity's contemporary ensemble."}
+{"index": "partiprompts60", "data": "A futuristic robot, sporting a sleek silver body and a black visor, stands with its chest puffed out, displaying the bold number 42 emblazoned in white. It towers confidently in front of a vibrant red F1 race car, which is parked on an asphalt track. In the background, the silhouette of a cityscape is visible against the orange and pink hues of a setting sun, all captured in a dramatic wide-angle perspective reminiscent of a dynamic comic book illustration."}
+{"index": "partiprompts61", "data": "A vibrant mural on a red brick wall features the inspirational phrase \"BE EXCELLENT TO EACH OTHER\" in bold, black lettering. Next to the text, there's a whimsical graffiti depiction of a green alien donning a sleek black tuxedo, complete with a bow tie. In the foreground, a bright yellow fire hydrant stands out on the gray concrete sidewalk, adding a pop of color to the urban scene."}
+{"index": "partiprompts62", "data": "An intricately designed robot with a polished metallic surface, donning a vibrant red and white race car suit, stands with a confident posture in front of a sleek F1 race car. The robot's black visor reflects the brilliant hues of the setting sun, which casts a warm glow over the futuristic cityscape depicted in the background. The illustration, reminiscent of a scene from a dynamic comic book, captures the essence of speed and technology."}
+{"index": "partiprompts63", "data": "A playful collection of 2x2 emoji icons, each resembling a vibrant macaron with a distinct facial expression. The top left macaron is a sunny yellow with a beaming smile, while the top right is a fiery red with furrowed brows and an angry scowl. Below them, the bottom left is a bright blue with wide, surprised eyes, and the bottom right is a soft lavender with a tearful, sobbing face. Each of the macaron emojis is whimsically topped with a miniature brown cowboy hat, adding a touch of whimsy to their appearance."}
+{"index": "partiprompts64", "data": "An aerial view of a coastal French city, captured in a satellite image, reveals a sprawling green park on the western edge, bordered by neatly arranged streets. To the north, a majestic mountain looms over the urban landscape, its peak just touching the edges of a drifting cloud that partially obscures the view. The intricate patterns of the city's layout are visible, with residential areas, winding roads, and the shimmering coastline all distinctly marked from this bird's-eye perspective."}
+{"index": "partiprompts65", "data": "A vibrant graffiti artwork displaying the word \"WOMBAT\" in bold, multicolored letters, each character outlined in black to create a striking contrast against the stark white wall. The letters are embellished with various shades of blue, green, red, and yellow, with dramatic splashes of paint scattered around the composition. The texture of the dripping paint adds a dynamic and tactile quality to the mural."}
+{"index": "partiprompts66", "data": "A vibrant display of graffiti showcasing the word \"GIGGLE\" in thick, colorful letters splashed across a weathered red brick wall. Each letter is painted in a different hue, creating a rainbow effect that stands out against the faded background. Just beside the lettering, there's a large splotch of white paint that looks as if it has exploded from behind the word, adding a dynamic sense of movement to the static wall."}
+{"index": "partiprompts67", "data": "A minimalist vector art logo, where the letters P, A, and X are creatively arranged to form the simple outline of an elephant facing left. The elephant silhouette is depicted in a vibrant orange hue against a clean, white background. The design is sleek and modern, with the negative space around the letters contributing to the overall elephant shape."}
+{"index": "partiprompts68", "data": "In the center of the composition, there is a neatly arranged stack of three vibrant red cubes, each with a smooth, glossy finish that reflects the ambient light. To the right of this stack, there is a deep blue sphere with a matte texture, providing a stark contrast to the geometric sharpness of the cubes. On the left side, two emerald green cones with a slightly textured surface are positioned, their pointed tips directed upwards, creating a symmetrical balance in the arrangement."}
+{"index": "partiprompts69", "data": "a digital illustration of an adorable baby penguin emoji, sporting a vibrant blue hat on its head and snug red gloves on its flippers. The penguin is dressed in a bright green shirt that contrasts with its sleek black and white feathers, and it's wearing cheerful yellow pants that add a pop of color. The emoji is set against a clean, white background, making the colorful attire of the penguin stand out even more."}
+{"index": "partiprompts7", "data": "A surreal composite image showcasing the iconic Sydney Opera House with its distinctive white sail-like structures, positioned improbably beside the towering Eiffel Tower, its iron lattice work silhouetted against the night. The backdrop is a vibrant blue sky, pulsating with dynamic energy, where yellow stars burst forth in a dazzling display, and swirls of deeper blue spiral outward. The scene is bathed in an ethereal light that highlights the contrasting textures of the smooth, shell-like tiles of the Opera House and the intricate metalwork of the Eiffel Tower."}
+{"index": "partiprompts70", "data": "a geometric pattern consisting of multiple squares, each progressively smaller and nested within the other. The outermost square is a bright yellow, with each subsequent square transitioning smoothly into a deeper shade of orange as they move inward. The squares are evenly spaced, creating a gradient effect that draws the eye toward the center, where the deepest hue of orange resides."}
+{"index": "partiprompts71", "data": "A colorful children's book cover featuring a whimsical illustration of a fluffy white dog with a playful expression, wearing a green bandana, driving a bright red pickup truck. The truck is adorned with yellow stripes and is set against a backdrop of rolling green hills under a clear blue sky. The dog's paws are on the wheel, and the truck seems to be bouncing along a dirt path leading towards a cartoonish city skyline in the distance."}
+{"index": "partiprompts72", "data": "On a flat surface, there are two small, white circles positioned to the left side of a large, red triangle. The triangle is centrally placed on a bright green rectangular mat. The circles appear to be made of a smooth material, while the triangle has a slightly textured surface, creating a contrast in both color and texture."}
+{"index": "partiprompts73", "data": "A striking yin-yang symbol where the traditional circles are replaced by the fierce heads of a tiger, one black and one orange. The symbol is set against a plain background that accentuates its bold colors and intricate details. The tiger heads are detailed with stripes that seamlessly blend into the swirling design of the yin-yang."}
+{"index": "partiprompts74", "data": "a detailed pen-and-ink drawing that features a meticulously crosshatched sphere resting on a flat surface. The sphere has a prominent dark square etched onto its surface, creating a stark contrast with the rest of the shaded areas. The texture of the crosshatching gives the illusion of depth and dimension to the drawing."}
+{"index": "partiprompts75", "data": "a whimsical drawing of a brown and white horned owl, with bright yellow eyes, wearing a small black graduation cap atop its head. The owl is clutching a tiny rolled-up diploma tied with a red ribbon in its talons. The illustration has a light blue background, and the owl is perched on a stack of colorful books with golden titles etched on the spines."}
+{"index": "partiprompts76", "data": "A graphic image featuring a stark black background that serves as a canvas for a large, vibrant yellow circle positioned centrally. Below and to the right of the circle, there's a small, matte red square, creating a stark contrast in both color and shape. The simplicity of the composition draws attention to the geometric figures and their bold colors."}
+{"index": "partiprompts77", "data": "A minimalist graphic with a stark white background featuring a large, vibrant blue circle dominating the center. Below and to the right of the circle, there's a small, emerald green square, providing a stark contrast in both color and shape. The smooth textures of the shapes give the image a clean and modern aesthetic."}
+{"index": "partiprompts78", "data": "a striking painting dominated by shades of black and white, creating a stark contrast on the canvas. In the right corner, a vivid red flower stands out, adding a pop of color to the monochromatic background. The texture of the brush strokes is visible, giving the painting a dynamic and tactile quality."}
+{"index": "partiprompts79", "data": "An artistic representation of the planet Earth, with a swirl of musical notes in black ink encircling the globe. The Earth is depicted in vibrant blues and greens, indicating the oceans and continents, while the musical notes appear to dance around the planet's surface. The background of the drawing is a stark white, emphasizing the contrast and the harmony between music and the world."}
+{"index": "partiprompts8", "data": "An animated warrior wombat, clad in silver armor, stands boldly in a fighting stance with a gleaming sword in one hand and a sturdy round shield in the other. The iconic Arc de Triomphe looms in the background, partially veiled by a thin layer of mist that softens the contours of the monument. Despite the mist, the sun is positioned high above, casting a subtle glow on the scene and highlighting the wombat's determined expression."}
+{"index": "partiprompts80", "data": "A shiny metallic blue sphere rests to the left of a vibrant yellow felt box on a smooth, gray surface. The sphere reflects the light, creating a gleaming highlight on its curved surface, while the felt box has a soft, matte texture that contrasts with the sphere's reflective finish. The box appears slightly larger than the sphere, and both objects are positioned against a neutral background that emphasizes their colors and textures."}
+{"index": "partiprompts81", "data": "a flag with three distinct vertical stripes prominently displayed against a clear sky. The leftmost stripe is a deep blue, the middle is a crisp white, and the rightmost stripe is a vibrant red. The flag is attached to a silver pole that is mounted on a grey building, fluttering gently in the breeze."}
+{"index": "partiprompts82", "data": "A geometric composition featuring a large yellow triangle positioned above a green square and a red rectangle. The shapes are arranged against a plain background, creating a stark contrast in colors. The yellow triangle has a smooth texture, while the green square and red rectangle appear to have a matte finish."}
+{"index": "partiprompts83", "data": "An intricately designed digital emoji showcasing a whimsical cup of boba tea, its surface a glistening shade of pastel pink. The cup is adorned with a pair of sparkling, heart-shaped eyes and a curved, endearing smile, exuding an aura of being lovestruck. Above the cup, a playful animation of tiny pink hearts floats, enhancing the emoji's charming appeal."}
+{"index": "partiprompts84", "data": "The book cover features a sleek, modern design with a gradient of blue to purple hues, symbolizing the concept of 'Backpropaganda' by the author I.C. Gradients. The title is emblazoned in bold, white font across the top, with the author's name neatly printed at the bottom. The cover also displays an abstract illustration that hints at the theme of artificial intelligence, with interconnected nodes and lines subtly forming a network pattern."}
+{"index": "partiprompts85", "data": "a vibrant stained glass window featuring a depiction of a tyrannosaurus rex in hues of green and blue, set against a backdrop of a peaceful prehistoric landscape. The dinosaur's posture suggests a moment of rest, with its massive tail curling alongside its body. Sunlight filters through the textured glass, casting colorful patterns on the interior walls of the room."}
+{"index": "partiprompts86", "data": "A sizable blue box with a smooth, matte finish sits prominently in the center of the room. Atop its surface rest three small, vibrant yellow boxes, each with a glossy texture and sharp, clean edges. The blue box's substantial size dwarfs the trio of yellow boxes, creating a striking contrast in both color and scale."}
+{"index": "partiprompts87", "data": "An array of geometric shapes arranged on a dark, matte black surface. There are ten triangles, each with a different hue ranging from vibrant red to deep blue, and five squares that are uniformly white. The shapes are scattered in no particular order, creating a visually striking contrast against the black background."}
+{"index": "partiprompts88", "data": "a detailed background pattern featuring a repeating sequence of red roses with lush green leaves and white skulls with subtle gray shading. The roses are in full bloom, showcasing their intricate petals, while the skulls have a smooth texture and hollow eye sockets. The pattern is set against a neutral-toned backdrop, creating a striking contrast between the elements of life and death."}
+{"index": "partiprompts89", "data": "A vibrant blue wooden pyramid sits atop a glossy red plastic box, which appears sturdy and capable of supporting the weight of the pyramid. The box's surface is smooth, contrasting with the textured grain of the wooden pyramid. The objects are placed on a beige carpet, and the pyramid's sharp edges cast a slight shadow on the box, emphasizing their geometric shapes."}
+{"index": "partiprompts9", "data": "An imaginative anime-style illustration that features the iconic Sydney Opera House with its distinctive white sail-like shells, sitting adjacent to the towering Eiffel Tower with its intricate iron lattice work. Both structures are set against a vibrant blue night sky, pulsating with dynamic energy, where yellow stars burst forth amidst swirling patterns of electric blue. The fantastical scene is further accentuated by the exaggerated proportions and stylized elements typical of anime art, creating a surreal and whimsical landscape."}
+{"index": "partiprompts90", "data": "In the detailed Renaissance paintings, the Virgin Mary is depicted seated gracefully within a stone loggia, her robes a rich blend of blues and reds, with delicate folds that suggest softness and depth. The background features a dreamlike, hazy landscape, rendered in muted tones through the use of the sfumato technique, which blends colors and tones subtly together. This landscape seems to stretch endlessly into the distance, with faint outlines of trees and hills, giving the impression of a serene and untouched wilderness."}
+{"index": "partiprompts91", "data": "A visually striking mixed media piece featuring a central photograph of a woman with flowing orange hair that cascades over her shoulders. The background contrasts with a monochromatic sketch of a bustling city skyline, complete with towering skyscrapers and intricate architectural details. The woman's piercing gaze seems to transcend the two-dimensional space, creating a dynamic interplay between the realism of the photograph and the abstract nature of the sketched cityscape."}
+{"index": "partiprompts92", "data": "A detailed photograph captures the intricate features of a pharaoh statue adorned with unconventional accessories. The statue is wearing steampunk glasses that have intricate bronze gears and round, reflective lenses. It is also dressed in a stark white t-shirt that contrasts with a dark, textured leather jacket draped over its shoulders. The image is taken with a high-quality DSLR camera, ensuring that the textures and colors of the statue and its attire are vivid and sharp. The background is a simple, unobtrusive blur, drawing all attention to the anachronistic ensemble of the pharaoh."}
+{"index": "partiprompts93", "data": "An imaginative depiction of the Mona Lisa, portrayed in a modern setting where she is seated at a wooden breakfast table. In her hands, she holds a delicate white porcelain cup from which she sips coffee. Before her lies a plate with a fluffy omelette and a golden-brown croissant, accompanied by a small vase containing a single red rose. The scene is illuminated by natural light that filters through a nearby window, casting a soft glow on the table's surface."}
+{"index": "partiprompts94", "data": "A surreal image capturing an astronaut in a white space suit, mounted on a chestnut brown horse amidst the dense greenery of a forest. The horse stands at the edge of a tranquil river, its surface adorned with floating water lilies. Sunlight filters through the canopy, casting dappled shadows on the scene."}
+{"index": "partiprompts95", "data": "A detailed oil painting that captures the essence of a smiling businesswoman, her expression warm and inviting. She is depicted holding a sleek, modern cell phone in her right hand, which contrasts with the classical style of the artwork reminiscent of Rembrandt's technique. The rich, golden light highlights the textures of her suit and the soft curls in her hair, while the deep, warm background tones complement her confident stance."}
+{"index": "partiprompts96", "data": "A large Saint Bernard dog with a reddish-brown and white coat stands on its hind legs, its front paws reaching up into the air. Seated on the dog's broad shoulders is a young girl with curly hair, wearing a pink dress and a wide smile. The scene takes place in a spacious backyard, where a wooden fence can be seen in the background, partially covered by climbing ivy."}
+{"index": "partiprompts97", "data": "An empty space where an invisible man would be, with a pair of horn-rimmed glasses seemingly floating in mid-air, and a pearl bead necklace draped in the space below them. In the space where his hands would be, a smartphone is held, as if being operated by the unseen figure. Around this curious scene, the room appears ordinary, with a couch and a coffee table nearby, upon which rests a scattering of magazines and a remote control."}
+{"index": "partiprompts98", "data": "a politician stands on a wooden stage, dressed in a bright red soccer jersey emblazoned with a white number '10'. In one hand, they hold a yellow and blue volleyball, gesturing with the other as they address the crowd. Behind them, a large banner hangs, displaying the emblem of a local sports team, and to their right, a podium with microphones stands ready to amplify their speech."}
+{"index": "partiprompts99", "data": "a man dressed in a dark business suit and tie is carefully ascending a silver aluminum ladder. The ladder is propped securely against the side of a pristine white house with beige trim around the windows. Sunlight reflects off the house's clean siding, highlighting the contrast between the man's formal attire and the manual task at hand."}
+{"index": "posescript0", "data": "You are balancing on your right foot, your left leg is raised and bent at the knee, which is now elevated above your waist level. Your body is turned towards the left, and you're gracefully leaning back, creating a slight arch in your spine. Your right arm is bent at the elbow, with your hand extended forward, hovering just in front of your chest, as if reaching for something just out of grasp. The position appears to be a dynamic, possibly yoga-inspired pose, illustrating both strength and flexibility."}
+{"index": "posescript1", "data": "A figure is posed dynamically, showcasing a particular stance where their left leg is bent forward, creating a powerful line, while the right leg extends straight back, as if poised for movement. The right shoulder dips forward, complementing the overall forward inclination of the torso, suggesting a sense of momentum. Both arms are positioned at angles, trailing down and back from the body to balance the posture, while the head is subtly canted to the right, adding a touch of grace to the disposition."}
+{"index": "posescript10", "data": "A person is positioned on a cushioned grey floor mat, with their knees drawn closely to their chest and their bare feet slightly tucked underneath. Their arms are extended wide, fingers splayed upon the cool floor surface, creating a sense of balance. The individual's gaze is directed intently downwards, focused on the space just above their folded legs, as if in deep contemplation or in the midst of a stretching routine."}
+{"index": "posescript11", "data": "A person is captured in a dynamic pose, with both arms raised to head level and extended toward the right, careful not to touch the head. The individual's gaze is directed attentively to the right, as if focusing on something in the distance. Their stance exudes balance and intention, with the left foot placed forward and the right foot behind, creating a strong, stable base for the graceful positioning of the limbs."}
+{"index": "posescript12", "data": "In a spacious room with a polished wooden floor, a person is captured in a dynamic pose, exhibiting an exaggerated stride. Their left leg is extended far in front, the foot planted firmly on the ground, while the right leg is stretched far behind. The right arm is bent at the elbow and directed downwards, while the left arm is held straight and swept backward. The individual's body leans heavily to the right, conveying a sense of motion and balance."}
+{"index": "posescript13", "data": "A figure balances gracefully on their right foot, with their left leg extended straight behind them, parallel to the ground. The torso of the person is pitched forward in a dynamic pose, creating a straight line from head to the elevated heel. The left arm mirrors the extension of the leg as it reaches slightly forward, while the right arm is bent at the elbow and opens to the right, adding a sense of movement and balance to the overall posture."}
+{"index": "posescript14", "data": "A person practicing a yoga pose on a light grey mat in a spacious room with white walls. He stands firmly, balancing on his left leg while his right knee is bent upwards, and his right hand gently holds his right ankle. His left arm is extended back, enhancing his poise, and his gaze is intently fixed towards the left corner of the room, displaying a sense of concentration and balance."}
+{"index": "posescript15", "data": "A humanoid robot toy is positioned in a dynamic stance on a smooth, gray concrete floor. Its legs are angled with its toy feet firmly pointing towards the ground and its knees subtly bent forward, creating an impression of being ready to spring into action. The torso of the toy remains erect, with its head oriented straight ahead as if surveying the horizon, while its arms are bent and held slightly away from its body, the right arm retracted a fraction more than the left, giving a sense of asymmetrical balance to the overall posture."}
+{"index": "posescript16", "data": "A dynamic athletic stance where the individual's left leg is raised just off the ground, bent at a sharp angle at the knee, demonstrating balance and readiness. The right arm extends forward emphatically, elbow bent upwards, suggesting a poised gesture or possibly the midst of an action. Meanwhile, the left arm angles down and to the front, balancing the posture, as the head tilts slightly forward, gaze lifted upward with an air of determination or focus."}
+{"index": "posescript17", "data": "A figure captured mid-stride presents a dynamic stance wherein the left leg is extended backward, toe grazing the ground, while the right leg is propelled forward, indicative of a purposeful step. Their left arm mirrors the forward momentum, bent at the elbow and directed ahead, while the right arm stretches straight behind them, creating a strong sense of motion. The silhouette of this person suggests a brisk walk or the beginning of a run, accentuated by the precise positioning of limbs that conveys both balance and speed."}
+{"index": "posescript18", "data": "An individual is poised in a challenging yoga pose on a seafoam green exercise mat. Their left leg is anchored firmly on the ground, providing support as their right leg extends directly in front of them, parallel to the floor. The left arm reaches gracefully towards the rear, while the right arm creates a horizontal line, extending outwards from the shoulder. The subject's head is tilted back ever so slightly, showing a serene expression as light from a nearby window reflects softly off their skin."}
+{"index": "posescript19", "data": "A figure depicted with a dynamic pose: its left arm extends forward while the right arm is elongated back, parallel with its spine. Both legs are positioned closely together in a near vertical line, with the right leg gracefully stacked atop the left. The silhouette reflects a ballet dancer's poise during a challenging balancing act."}
+{"index": "posescript2", "data": "An individual is captured in a dynamic pose reminiscent of a reverse bridge. Suspended with their feet hovering above the ground, they balance their weight primarily on their arms, which show a slight bend at the elbows. Their gaze is intently focused down toward their hands, indicating concentration and bodily awareness."}
+{"index": "posescript20", "data": "The individual in question presents a dynamic stance, with the left foot stepping forward onto a beige, textured rug that covers the wooden floor. They are extending their right arm upward toward the ceiling, which has a smooth, white finish. Simultaneously, the left arm stretches back, contrasting against the pale yellow wall behind them, creating an image of motion within the room."}
+{"index": "posescript21", "data": "a figure in a dynamic pose, with their upper body leaning towards the right, arms raised high above their head forming straight angles at the elbows. The right knee of the subject is sharply bent, supporting their weight, while the left leg extends gracefully back, enhancing the sense of motion in the posture. The individual's head is angled downwards, focusing intently on something out of view, giving the entire pose an air of concentration and balance."}
+{"index": "posescript22", "data": "A figure in a dynamic pose, standing on their right foot with the left leg lifted gracefully behind the body, knee bent at a 90-degree angle. Their left arm extends horizontally at the level of their face, fingers elegantly pointed, while the right arm is raised with the hand near the chin level, palm facing inward. The head is gently tilted back, eyes gazing upward, creating a line of sight that follows the direction of the extended left arm."}
+{"index": "posescript23", "data": "A young athlete in a white tank top and black shorts positioned with both of his hands grasping an invisible object near his waist on the right side. His posture is dynamic, with both knees bent, indicating readiness for movement, and his torso tilted forward, suggesting a sense of forward momentum. His gaze is locked straight ahead, showing focus and determination, as if he's visualizing his next action in a sport or dance routine."}
+{"index": "posescript24", "data": "A figure is in a dynamic crouched position on a geometric patterned floor, with one knee supporting the pose and the other leg extended slightly to the side. The person's left arm reaches toward the ground for balance, palm facing down, while the right arm is bent, with the hand placed near the face. The individual's head is tilted upward, possibly in concentration, complementing the fluid lines of the body's pose."}
+{"index": "posescript25", "data": "A dynamic sculpture of an athlete in the midst of a sprint, with the form capturing the essence of motion. The head of the figure is tilted upwards, suggesting determination and focus, while the hands are strategically positioned close to the body; the left arm is tucked below the right, which extends outward to emulate the action of an intense run. The muscles are sculpted in a way to depict tension and energy, complementing the overall impression of speed and agility."}
+{"index": "posescript26", "data": "In a room with beige carpeting, an individual is in the midst of a sit-up exercise, positioned on the floor with their body slightly raised. Their left hand is pressed against their cheek to support their head, while their left leg is bent inward, creating an angle at the knee. The space around them is sparse, and to their side, a folded yoga mat and a pair of dumbbells lie within reach."}
+{"index": "posescript27", "data": "An individual is standing in front of a gently curved white wall, embodying a sense of tranquility through their posture. Their arms hang loosely at their sides, contributing to the relaxed aura they exude. The left leg is gracefully bent at the knee such that the left foot is nestled against the right inner thigh, displaying the classic pose of a tree in yoga practice. The person's attire is minimal and provides a contrast to the simplicity of the backdrop. Their focus appears to be directed inward, further emphasizing the meditative state suggested by their stance."}
+{"index": "posescript28", "data": "An individual is captured in a dynamic pose with their body leaning forward, their right arm bent at a 90-degree angle in front of them, and their left arm extended behind. They wear a bright yellow short-sleeved shirt which contrasts with their dark blue trousers. Their gaze is intently focused towards their right, possibly fixed on something or someone out of view, giving the impression of movement or anticipation."}
+{"index": "posescript29", "data": "An individual is seated on a smooth, light-colored floor with a relaxed posture, hands lifted in the air, palms facing upwards. Their head is tilted back, eyes likely gazing towards the ceiling, while their legs extend forward, creating a subtle V-shape with a small gap between the feet. The surrounding space appears calm and uncluttered, allowing the person's posture to stand out in the environment."}
+{"index": "posescript3", "data": "A figure is captured mid-motion, their left foot firmly planted on the gray asphalt, displaying a well-worn sneaker. Their right leg is extended forward with precision, the muscles taut and the silhouette of the leg cuts a straight line through the air. Both arms are angled, elbows bent in an almost geometric formation, with one arm stacked neatly over the other in front of the chest, hinting at a martial arts stance or a dancer's poise."}
+{"index": "posescript30", "data": "An individual is captured in a dynamic exercise pose on a gray fitness mat, their body nearly aligned for a push up. However, the hips are notably rotated to the right, creating a twist through their torso. The person's gaze is directed to the right, maintaining the line of the twist, with both arms extended straight and palms pressed firmly against the ground."}
+{"index": "posescript31", "data": "A focused individual captured in a dynamic martial arts pose, with both legs bent at the knees in a strong and stable crouch. The person's body is hunched purposefully over, conveying readiness and balance. Each arm is extended forward, bent at the elbow, and the wrists are also bent in meticulous form, displaying the precision and discipline of the martial arts stance."}
+{"index": "posescript32", "data": "A figure can be seen balancing on both feet, their torso tilted sharply to the left, creating a dynamic angle. Their arms are stretched out on either side of their body for balance, with each elbow forming a precise 90-degree bend. The person may seem to be mid-exercise or in the middle of a stretching routine, possibly wearing comfortable athletic attire suitable for the activity."}
+{"index": "posescript33", "data": "A figure is poised in an active stance, with their buttocks pushed back and their torso inclined forwards, suggesting a sense of readiness or engagement in an activity. Their arms are extended downwards, positioned close to the body, with elbows bent and hands reaching straight out in front, as if ready to grasp or manipulate something. The person's head is subtly tilted back, indicating a focus towards the ceiling or sky, possibly observing something of interest or intent above them."}
+{"index": "posescript34", "data": "A dynamic posture captured mid-action with the left knee bent and the right leg lifted off the ground, extended slightly forward. The right arm remains relaxed, hanging down along the side, while the left arm is energetically bent, elbow out and hand raised upward. The individual's head is tilted forward, focused as if preparing for a swift movement or to maintain balance during a physical activity."}
+{"index": "posescript35", "data": "a figure in a dynamic pose with their right leg slightly bent, standing firmly on the ground, while their left leg is bent and positioned behind the right, suggesting motion or a dance step. The right arm is raised up and curved over the head in an elegant arc, while the left arm extends horizontally out to the side and slightly to the rear, enhancing the sense of balance and stretch. The individual's stance is wide, with a considerable gap between the feet, adding a powerful presence to the overall posture."}
+{"index": "posescript36", "data": "In an expansive room, the subject is captured in a dynamic squatting position with their torso angled towards the left. The left arm is firmly planted on the floor, supporting the weight of the body, while the right arm extends directly forward, fingers outstretched and intent. The head tilts in the same direction as the torso, with eyes fixed forward, expressing concentration and balance."}
+{"index": "posescript37", "data": "A dynamic stance captured in a moment of intense action shows an individual with their legs spread apart for balance. Their right arm is drawn back, poised in a throwing position, with their hand just below the level of their head, ready to launch. The left arm is relaxed and lowered, the elbow bent, and the hand gently resting on the stomach area, creating a counterbalance to the tension in the right arm."}
+{"index": "posescript38", "data": "A person stands in a poised stance with their legs straight, making contact with each other down to the feet, which are clad in black, lace-up shoes. Their back maintains a subtle lean forward, suggesting a stance of attentiveness or mild anticipation. Their arms rest casually along the sides of their body, with hands gently relaxed. The person's head is tilted just a fraction to the left, their gaze perhaps fixed on an unseen object of interest in their immediate vicinity."}
+{"index": "posescript39", "data": "A person is positioned in a dynamic, asymmetrical pose within a spacious room. Their legs are spread slightly wider than shoulder-width apart, almost as if they are about to sit into an invisible chair, reflecting an athletic stance. The individual's right arm hangs casually by their side while their left arm extends outward and upward, creating a feeling of movement. Their head is subtly turned to the right, giving the impression that their attention is fixed on something outside of the immediate view."}
+{"index": "posescript4", "data": "A depiction of a human figure demonstrating a specific stance: the individual's legs are positioned beyond shoulder width apart, firmly planted on a textured grey mat. The torso is inclined forward modestly, as if preparing to engage in a gentle stretch or a physical activity. The subject's head is courteously tilted to the right, conveying a sense of thoughtfulness or focus, while both arms curve gracefully downward before the body, with hands nearing each other but not quite making contact, hovering just inches apart."}
+{"index": "posescript5", "data": "A figure is positioned in a spacious room, demonstrating a wide stance squat. Their right hand is gently placed on their left hip, while their left hand rests below the right, accentuating the curve of their waist. The person's head is turned to the right, possibly focusing on an object or point in that direction, creating a strong and balanced posture."}
+{"index": "posescript6", "data": "A person in an athletic stance with a focused expression, balancing on their left leg that is extended straight beneath them, touching the ground. Their right leg is bent at the knee and lifted behind their body, muscles tensed in a dynamic posture. Both arms are stretched out in front, parallel to the ground, with hands facing down as if they are about to begin a sprint."}
+{"index": "posescript7", "data": "A slender figure is maintaining a poised stance, balancing skillfully on their right leg while the left leg extends outward as if caught mid-motion. The right arm maintains a semi-rigid extension, giving a sense of dynamic tension while the left arm is bent, the hand gently cupping upwards like it's cradling an invisible sphere. Their torso remains erect and proud, with the head elegantly tilted upwards and to the left, suggesting a gaze directed towards something intriguing just out of view."}
+{"index": "posescript8", "data": "In this indoor scene, a person is seated directly on a hardwood floor with their legs extended out and slightly bent. The left leg is folded at a more acute angle than the right. Their gaze is directed upwards, towards the ceiling, with a look of concentration. Arms are extended behind, with fingers outstretched, touching the floor for support, and palms turned outward, suggesting an open and relaxed posture. The surrounding space is free from obstructions, offering room for comfort and movement."}
+{"index": "posescript9", "data": "In a well-lit room with a soft beige carpet, an individual is practicing a forward bend yoga stretch. Their torso is folded over, head nestled between their arms, with their hands lightly meeting at their feet, showing a relaxed and mindful posture. Although the knees maintain a slight bend to protect the joints, the legs remain parallel to each other, with only a small gap separating them, creating a harmonious balance in the pose."}
+{"index": "stanford0", "data": "A solitary figure stands on the sandy expanse of the beach, identifiable as a lifeguard by the bright red shorts and white tank top they are wearing. The sky above is a blanket of grey, hinting at the overcast weather as gentle waves rhythmically roll onto the shore. Beside the lifeguard, a vivid yellow and black rescue board leans against a wooden post, its surface speckled with droplets from the sea's mist. Nearby, the shoreline extends into the distance, marked by the sporadic presence of seagulls and scattered shells."}
+{"index": "stanford1", "data": "A lively playground scene unfolds with a bustling atmosphere of children and adults engaging in various activities on a surface of soft beige sand. In the midst of the joyful chaos, a young boy wearing a vivid blue shirt and coordinating blue pants is energetically climbing a metallic pole, his hands gripping firmly as he ascends. Nearby, a woman clad in a brown dress stands with a matching brown umbrella held aloft, providing her with shade from the sun. As the sun casts its light, the shapes of people's shadows mingle on the sandy ground, creating a playful tapestry of silhouettes."}
+{"index": "stanford10", "data": "Gently bobbing on the tranquil water, a white boat is moored beside a slender wooden dock that stretches out from the shore. Beyond the dock, the body of water is smooth as glass, reflecting the silhouettes of distant homes nestled against a backdrop of lush greenery. Towering trees with robust canopies rise behind the houses, completing the serene lakeside landscape."}
+{"index": "stanford11", "data": "A delectable square slice of pizza with a golden-brown crust sits at an angle on a round, emerald-green plate. Toppings of earthy mushrooms, glossy black olives, and juicy red tomato chunks are scattered across the melted cheese. Resting beside the pizza slice, shredded purple and crisp white cabbage add a vibrant contrast to the plate, creating an appealing mix of shapes and colors."}
+{"index": "stanford12", "data": "A vibrant array of very green broccoli florets rests on a large square plate, accompanied by a scattering of slender yellow beans. Both vegetables are lightly coated in a pale, creamy sauce that adds a subtle sheen to the dish. The plate itself, with its expansive, flat surface, provides an ample canvas for the vividly colored, healthy meal."}
+{"index": "stanford13", "data": "A majestic bird of prey with white breast and belly contrasted by brown wings is perched confidently on the sturdy branch of a leafy green tree. Its sharp gaze, highlighted by a striking yellow eye, reveals a sense of purpose as it holds a shimmering silver fish with dark spots in its firm beak. The tree, bathed in sunlight, provides a natural backdrop, accentuating the textures of the bird's feathers and the subtle sheen of the fish's scales."}
+{"index": "stanford14", "data": "A traveler stands in a busy train station, clad in a tan jacket that falls to their mid-thigh, paired with a snugly fitted gray scarf around their neck. On their back, a worn brown backpack suggests a journey in progress. They are intently gazing at the cell phone in their hands, perhaps checking a schedule or reading a message. Before them, a sleek silver train rests on the track, its doors poised to open and welcome passengers to the next leg of their travels."}
+{"index": "stanford15", "data": "A skier, clad in a bright yellow snowsuit that stands out against the white snow, swiftly descends a snowy slope. A cloud of freshly stirred powder trails behind them, evidence of an exhilarating jump just taken. In their gloved hands, they firmly grip two black ski poles that cut through the powdery snow with each focused movement. The vast expanse of the mountain can be seen around them, adorned with snow-laden conifers and the distant peaks shrouded in mist."}
+{"index": "stanford16", "data": "A woman strides confidently down a bustling city sidewalk, her hand pressing a sleek black smartphone to her ear. Above her brow, a stylish pair of sunglasses rests, poised on her head amidst locks of hair. Around her, the sidewalk teems with a diverse crowd of pedestrians, each absorbed in the rush of their own daily routines."}
+{"index": "stanford17", "data": "Two young girls are trekking on a dirt trail that meanders through a dense forest on a large, imposing mountain. The trees enveloping the path are lush and varying shades of green. Each girl is holding a brightly colored umbrella to shield themselves from the elements. The mountain's peak looms in the distance, partially obscured by the canopy of towering trees."}
+{"index": "stanford18", "data": "A delicious pink smoothie sits in the center of a clean, white ceramic plate, with a sprinkle of cinnamon dusting the surface, giving it an aromatic allure. Beside the smoothie, a ripe, yellow banana lies slightly curved, its peel gently touched by the morning light. Close to the banana, a small cluster of plump blueberries adds a vibrant pop of deep blue to the arrangement, contrasting with the soft pink hue of the smoothie and the pristine white of the plate."}
+{"index": "stanford19", "data": "As the evening progresses, the sky transitions into a mesmerizing canvas with the sun dipping below the horizon, casting a warm orange glow across the scenery. The blue sky, now infused with hues of pink and purple, serves as a backdrop for the scattered, fluffy clouds. Below this enchanting sky, the city comes to life with streets bustling with the rhythmic flow of traffic, illuminated by the flickering of numerous traffic lights."}
+{"index": "stanford2", "data": "A vibrant blue train is parked at a bustling train station, its red doors sliding open to welcome passengers. The train, adorned with white stripes, stands out against the grey concrete of the platform. A long yellow line stretches across the platform, marking the boundary for commuters. The station is buzzing with activity, with passengers rushing, while the train awaits its next departure."}
+{"index": "stanford20", "data": "A cozy corner featuring a brown wooden shelf laden with assorted electronics, including silver DVD players and an outdated radio, their cords neatly arranged along the side. Occupying a prime spot on one of the shelves is a ginger tabby cat, curled up and napping amidst the items. Below, on the floor, rest two small, shiny silver barbells, catching the light from the overhead fixture. Right beside the shelf stands an impressive large bottle encased in a carved, dark-stained wooden crate, adding a touch of rustic elegance to the setting."}
+{"index": "stanford21", "data": "Two vibrant yellow pendant lights dangle gracefully from a thick, black wire, illuminating the area beneath them. Below the glow of the lights stand two lush green trees, their leaves rustling slightly in a gentle breeze. Beside these trees rests a tall tan building, its walls smooth and unassuming in the soft light. Scattered next to the building are various signs, displaying bold letters and symbols, directing passersby and adding a splash of color to the urban landscape."}
+{"index": "stanford22", "data": "A man clad in a denim jacket and jeans, topped with a pristine white cowboy hat, gently strokes the mane of his equally white horse. The horse stands calmly by a wooden fence that outlines a sprawling field. Naked trees form a stark silhouette against the dimming sky, indicating the approach of evening."}
+{"index": "stanford23", "data": "A tricolored calico cat with a mixture of brown, black, and white fur comfortably sits on top of a modern flat-screen television. The TV is currently on, displaying a colorful nature documentary, providing a dynamic contrast to the cat's serene posture. Surrounding the television is a backdrop of brown and beige floral wallpaper, contributing to a warm and homely aesthetic within the room."}
+{"index": "stanford24", "data": "An antique typewriter with prominent round keys that protrude upwards, indicative of its vintage design, is displayed on a sturdy wooden table. The typewriter's deep black hue contrasts with the stark white labeling on each button, offering a classic, monochromatic aesthetic. Around the typewriter, the wooden table shows signs of use, adding to the object's historical character."}
+{"index": "stanford25", "data": "A close-up image reveals a hand grasping a sleek, dark necktie, its fabric slightly sheen in the light. The person's hand, adorned with a shiny wedding ring on the ring finger, emerges from the cuff of a gray collared shirt. Subtly peppered across the skin of the person's neck are a smattering of light brown freckles, offering a glimpse of their complexion beneath."}
+{"index": "stanford26", "data": "A peaceful scene with an infant peacefully napping on a soft mattress covered with a vibrant, patterned fabric. The little one is dressed in a cozy shirt featuring black, white, and blue stripes. Close to the slumbering baby, there is a cuddly toy doll dressed in a whimsical blue and purple hoody, suggesting a playful atmosphere in the child's sleeping area."}
+{"index": "stanford27", "data": "An imposing large building with gray stone walls and tall windows secured by dark iron bars, casting shadows on the facade. In front of this edifice, a diverse group of people walk by on the wide concrete sidewalk, some in casual attire, others in business suits, all going about their daily routines. A bustling street runs parallel to the sidewalk, filled with an array of vehicles including yellow taxis, red buses, and private cars of various makes and colors, creating a vibrant urban scene."}
+{"index": "stanford28", "data": "A young boy, sporting a black sweater and a pair of blue jeans, casually strolls alongside a friendly-looking dark brown horse. The boy also has a vibrant blue shirt loosely tied around his waist, adding a pop of color to his outfit. The horse, with its shiny coat and gentle eyes, walks calmly next to him on a gravel path that's lined with wooden fences on both sides."}
+{"index": "stanford29", "data": "A classic vintage airplane, predominantly white with red and black accents, is parked on a clear area adjacent to a tranquil body of water. The sky above is densely populated with fluffy, white clouds, hinting at the possibility of changeable weather. Nearby, a small, unassuming box sits on the ground, its purpose unclear, yet it appears inconspicuous against the grandeur of the aircraft."}
+{"index": "stanford3", "data": "A bundled-up individual trekking through freshly fallen snow, wearing a thick black jacket, fitted blue jeans, and sturdy boots designed for winter weather. This person has their head covered with a warm hat and is carefully navigating in front of a muted-color building that shows the subtle signs of weathering from the cold season. Nearby, a solitary parking meter stands encased in a layer of snow, its coin slot obscured by the icy accumulation."}
+{"index": "stanford30", "data": "An elegant three-tiered cake, with layers alternating between chocolate brown and creamy white icing, stands as the centerpiece on a polished wooden table. Scattered gracefully around the base of the cake are delicate red and yellow flower petals, adding a touch of color to the presentation. Beside the cake rests a pristine white plate, upon which lies a silver fork with an ornate handle, ready for the first slice to be served."}
+{"index": "stanford31", "data": "A middle-aged man is perched atop a majestic gray elephant, slowly making their way through a lush, waist-high swamp. The man is wearing a wide-brimmed hat and khaki clothing, blending with the natural surroundings. This tranquil scene is framed by two thick, moss-covered logs from an ancient tree, hinting at the dense forest beyond."}
+{"index": "stanford32", "data": "A gentleman stands behind a white linen-clad table, donning a sleek black jacket paired with a vibrant green tie. Atop the table lies an elaborately decorated cake with white icing and colorful sprinkles. The man, whose black hat adds a touch of sophistication, carefully holds a silver knife with an ornate handle, poised to slice the festive confection."}
+{"index": "stanford33", "data": "In a clear, spacious field, two towering giraffes can be seen leisurely standing behind a tall, silver metallic fence that glints in the sunlight. Directly in front of this barrier, there is a man draped in a vivid blue shirt adorned with a tropical floral print, which contrasts sharply with the plain backdrop. On his head, he sports a sleek black cap, and he appears to be gazing intently at the majestic creatures as they graze in the background."}
+{"index": "stanford34", "data": "A beach scene captures a man, clad in blue and white striped swim shorts, standing barefoot on the warm, golden sand. To his side, a playful black and white dog, with its gaze fixed on an object in the sky, waits in anticipation. Suspended in the air above them is a spinning white frisbee, creating a dynamic moment of play and excitement just off the coast, where the gentle waves lap at the shore."}
+{"index": "stanford35", "data": "Two athletes are engaged in a spirited game of tennis on a court with a light brown clay surface. Both are dressed in crisp white tennis uniforms, with the female player sporting a classic white skort. She's in the midst of a powerful swing, her racket slicing through the air with precision. Her male counterpart waits intently across the court, preparing for his return. The white boundary lines of the court are stark against the brown of the clay, emphasizing the competitive space they share."}
+{"index": "stanford36", "data": "A cozy scene with a vibrant orange cat peacefully curled up on a rich blue fleece blanket. Upon the cat's back rests a whimsical red and white striped hat, perched as though donned in mid-play. Directly in front of the feline, a soft blue pillow lies slightly askew, offering a comfortable resting spot. The cat's one eye is partially open, giving a glimpse of its golden iris, surveying its surroundings with a languid gaze."}
+{"index": "stanford37", "data": "A large, circular clock with a green face is affixed to the exterior of a brick building, the brickwork displaying a variety of warm, earthy tones. The clock features Roman numerals in an elegant font, and each numeral is accented with subtle gold embellishments that catch the light. Directly beside the clock, there is a single arched window with a lattice design, providing a glimpse into the building's interior."}
+{"index": "stanford38", "data": "A man with short, brown hair is standing in an open field, dressed in a crisp white shirt and black athletic shorts. In his hand, he grips a vibrant green frisbee that is decorated with intricate black graphics. Around him, the grass is slightly overgrown and sways gently in the breeze."}
+{"index": "stanford39", "data": "A rustic wooden table is set against a backdrop of soft, beige walls, holding a weathered brown piece of butcher paper. Sprawled across the paper is an array of colorful vegetables, from vivid red tomatoes to deep green cucumbers and bright orange carrots. In front of this cornucopia of produce, a small, worn yellow notebook lies open, its pages filled with neat handwriting."}
+{"index": "stanford4", "data": "A young boy, donned in a vibrant blue shirt and black trousers, is energetically swinging a cobalt blue baseball bat at a neighborhood park. Beyond the chain-link fence, a group of his peers can be seen, their expressions a mix of anticipation and excitement as they observe the game. Nearby, an adult male, potentially a coach or a parent, stands in stark contrast, attired formally in a crisp white shirt and dark tie, his eyes fixed on the boy's technique."}
+{"index": "stanford5", "data": "Eight white hands, all gripped around the edges of sleek smartphones, are positioned at the bottom of the frame. The phones display colorful screens visible against the backdrop of a light-textured wall. Above two of the hands, the lower halves of faces are in view; one features the bottom of a male's face with an unshaven chin, likely in his mid-thirties, wearing a light green canvas jacket, and the other, a female with sun-kissed blond hair, is dressed in a classic blue denim jacket. Both appear to be engrossed in the content on their devices."}
+{"index": "stanford6", "data": "An almost empty airport tarmac bathed in the soft glow of the early morning sun. A few scattered luggage carts stand idle near the terminal, while two commercial airplanes sit parked on the apron, their boarding stairs down, anticipating the arrival of passengers. In the distance, by the edge of the main airport building, stand two tall poles, painted in alternating bands of red and white, standing out against the clear blue sky."}
+{"index": "stanford7", "data": "A man in a casual gray shirt and faded blue jeans is captured mid-air above a sleek black skateboard, executing a skillful jump. To the side of him lies a large black skateboard ramp with visible scuff marks from frequent use. In the expansive blue sky overhead, a few wispy clouds drift lazily by, adding a sense of height to his aerial feat."}
+{"index": "stanford8", "data": "A vibrant green door stands out against the stark contrast of its surrounding white walls, which are visibly marred with smudges and streaks of accumulated grime. Next to the door rests a large black bicycle, featuring a well-worn leather seat, a sign of frequent use. The bicycle's matching black handles complement the overall stark monochrome aesthetic, making it a prominent feature within this urban scene."}
+{"index": "stanford9", "data": "In the distance, towering black mountains with their peaks blanketed in thick layers of snow stand majestically. Against this dramatic backdrop, a flock of black birds is captured in their dynamic mid-flight, crisscrossing the scene with elegance and energy. Above them, the sky is a tapestry of deep grays clashing with the remnants of serene blue, creating a striking contrast that defines the horizon."}
+{"index": "vrd0", "data": "A bustling urban street teeming with vehicles; in the forefront, a sleek silver sedan with a focused driver at the wheel navigating cautiously. Adjacent to it, a bright red compact car with its driver's side mirror just inches away. Just ahead, a bulky white van looms, its boxy shape dominating the view of the sedan's driver. To the rear, the silver car is trailed by a navy blue hatchback keeping a safe distance, while on the far side, it is flanked by a green coupe, all of them partaking in the steady flow of traffic. Bringing up the rear, a large city bus looms, casting a shadow over the cars ahead, as it waits for the right moment to continue its designated route."}
+{"index": "vrd1", "data": "A modern, ergonomic workspace, featuring an array of electronic devices laid out on a spacious wooden desk. There are multiple monitors side by side, with the sleek edges of each screen nearly touching. A laptop is placed to the right end of this row, its screen raised to meet the height of its larger companions. Nestled comfortably under the desk is a black, adjustable swivel chair. Various personal items are scattered about the desk, including a slim keyboard, a mobile phone to the left, and a pair of glasses lying in proximity to a pile of papers, whilst a person is seated at the desk, engrossed in work."}
+{"index": "vrd10", "data": "Two individuals are seated next to each other at a wooden dining table. One person, sporting a pair of dark sunglasses, has them resting casually on their crisp, white shirt collar. They are dressed casually with denim jeans, while their companion is also in a relaxed shirt and jeans ensemble. In front of them, a ceramic plate is set on the table, indicating a meal might soon be enjoyed. The person wearing the sunglasses exudes a laid-back vibe, with the accessory complementing their casual attire."}
+{"index": "vrd11", "data": "A picturesque scene where the expansive blue sky arches over a stationary train, which sits on a set of tracks that cut a straight line through the landscape. The train engine is an unusual sight, covered in tufts of green grass, creating a contrast between machinery and nature. In the distance, a mountain looms over the train, imposing its grandeur on the scene. Below the grass-adorned engine, the hard gravel of the track meets the edge of an asphalt road that runs parallel, with more trains in the distance continuing their journey under the vast sky."}
+{"index": "vrd12", "data": "A cheerful scene unfolds under a clear blue sky, where a group of people are enjoying a kite-flying session. One individual stands out, holding the strings of a brightly colored kite with geometric patterns, which dances just below the wisps of white clouds. Other individuals are scattered nearby, some with their own kites adding bursts of color to the sky. The kites soar and dip gracefully, with the warm breeze determining their aerial ballet. Overhead, the vast expanse of the sky serves as a canvas for this vibrant display, uniting the people in a shared moment of simple joy."}
+{"index": "vrd13", "data": "A quaint urban scene where a cardboard box rests atop a wooden bench, its corners lightly frayed. The street unfurls directly beneath the sturdy bench and continues onward beneath the overhanging box. A lush tree, its leaves whispering with the breeze, stands tall next to the bench, its branches extending over the street and casting dappled shadows on the ground. Nearby, a person dressed in a casual jacket and comfortable pants is in repose, their bag placed nonchalantly beside them, completing this snapshot of everyday city life."}
+{"index": "vrd14", "data": "A city scene where multiple buses are lined up on a busy street. The foremost bus, painted in a bright yellow, stands out with its large windows and advertisement banners on its sides, as a queue of similar buses parks diligently behind it. Each bus features standard black wheels and a sturdy rooftop that gleams under the sun. Amidst the lineup, passengers board the lead bus, while a row of tall green trees provides a natural backdrop to this urban tableau."}
+{"index": "vrd15", "data": "A group of individuals standing side by side, each dressed in casual attire. The first person is wearing beige chinos and a green shirt, while the next has on a pair of blue jeans and a white t-shirt. Each person has footwear, with one showcasing bright red sneakers and another in brown leather boots. Their proximity suggests they are together, perhaps friends or family, with their attire indicating a relaxed, informal gathering."}
+{"index": "vrd16", "data": "A friendly dog with a golden coat sits peacefully in a grassy area, surrounded by a copse of tall green trees. The clear blue sky above offers a bright canopy over the scene. Nearby, a person wearing a wide-brimmed straw hat stands gazing toward the horizon, with the trees providing a lush green backdrop. To their side, a wooden post stands firmly planted in the ground under the open sky, also in front of the trees, tying the elements of the setting together."}
+{"index": "vrd17", "data": "A scene inside a cozy room, where an individual with discernible facial features calmly sits at a wooden table. They are dressed in a neatly buttoned, plaid shirt, and round-framed glasses rest comfortably on their nose. Their hands are occupied with a sleek, modern smartphone which they are intently gazing at. To the side of the table, there is a matching wooden chair, subtly indicating that the space is set for more than one occupant."}
+{"index": "vrd18", "data": "On a wooden table outdoors, a small, portable traffic light stands beside a potted plant. A person is seated nearby, wearing reflective aviator sunglasses, a casual striped shirt, and khaki shorts. Above them, the expansive blue sky sprawls with a few wispy clouds, casting a soft light over the scene. The sunlight reflects off the lenses of the sunglasses and the surface of the table, creating a play of light and shadow."}
+{"index": "vrd19", "data": "A person stands on a bustling city street, gazing ahead with a contemplative expression, dressed in a striped shirt with sleeves rolled up to the elbows. To their left and right, a row of identical metallic chairs is arranged neatly in a line, with one chair adjacent to the person. The shiny surface of the chairs reflects the sunlight, hinting at their smooth texture. Above the street, a bright-colored ball is suspended in mid-air, adding a playful contrast against the gray urban backdrop."}
+{"index": "vrd2", "data": "A person is seen wearing a helmet and a jacket while standing on a grassy field. The helmet appears to be sturdy, possibly for biking or other extreme sports. The jacket the person is wearing is a vibrant color, contrasting with the green grass beneath their feet. They are also wearing a shirt underneath the jacket, which peeks out from the collar."}
+{"index": "vrd20", "data": "A sleek silver laptop sits on a smooth wooden table, with its black keyboard visible. In front of the laptop lies a black optical mouse positioned neatly on a mouse pad. A piece of white paper, slightly crumpled, rests on the laptop's lower half, depicting a sketch of a person. The person in the drawing is shown wearing dark trousers. The table under the laptop also has faint scratches, hinting at frequent use."}
+{"index": "vrd21", "data": "A bright red umbrella hovers protectively above a pair of sleek black-framed glasses which rest on a gloss-finished wooden table. Beneath the umbrella, the glasses are shielded from the ambient light. A verdant green plant with lush leaves springs from within a terracotta pot, which sits to the right of the glasses, adding a touch of nature to the composition. To the left of the scene, a person clad in a striped shirt stands shoulder-to-shoulder with another individual, engrossed in cheerful conversation. In the background, a leafy tree gracefully arches over the umbrella, casting a softened shadow over the entire tranquil setting."}
+{"index": "vrd22", "data": "An image showcasing an individual with a smartphone in their left hand, positioned right next to a pair of black-framed glasses resting upon a table. The person in the scene, clearly engrossed in their device, is wearing a textured dark jacket, suitable for chilly weather. Above them, a clear blue sky stretches expansively, uninterrupted by any visible obstacles or elements."}
+{"index": "vrd23", "data": "A sleek silver car with a visible license plate is driving on an asphalt road. Above the road, there's a vast blue sky dotted with a few wispy clouds. Further down the road, stands a tall lamp post peeking out from behind a lush green bush. The bush is planted beside a brick building with large windows. On the sidewalk, there's an individual clad in blue jeans and a black leather jacket, casually strolling by."}
+{"index": "vrd24", "data": "A small, vibrant green tree sits snugly within a terracotta pot that features intricate patterns etched into its surface. The pot is placed to the left of a simple white ceramic cup with a delicate handle, both resting on a wooden countertop. To the side of the cup is a chair with a woven seat, and the tree in the pot shares this proximity with the chair as well. Perched precariously on the edge of the chair is a crumpled piece of paper, the handwriting upon it partially visible, creating a tableau of everyday items in close association."}
+{"index": "vrd25", "data": "A bustling highway scene with a red semi-truck moving along a two-lane road, closely followed by a blue pickup truck. The road itself is carved through a dense forest, with tall evergreens bordering each side. Behind this verdant stretch, a majestic mountain looms, its peak piercing the sky. The mountain's rocky facade is dotted with patches of green where trees cling to its slopes. In the distance, the mountain range extends, creating a dramatic backdrop for the vehicles on their journey."}
+{"index": "vrd26", "data": "On a sandy beach, a person stands clad in colorful board shorts beside a bright yellow umbrella that's anchored into the sand. Near them, another individual relaxes in a navy blue beach chair, enjoying the shade provided by the large umbrella. The first person is also standing close to the other, indicating companionship as they both appear to be enjoying their beachside retreat."}
+{"index": "vrd27", "data": "In a tidy office setting, an individual dressed in a crisp white shirt stands beside a sleek black desk. A neatly hung grey coat can be seen on a coat stand to the person's left. Underneath this desk, a brown leather bag rests against one of the desk's steel legs. This person is also wearing clear glasses with a silver frame, which complement the professional attire."}
+{"index": "vrd28", "data": "An urban scene where a public bus painted in bright colors navigates through the street, passing by a pedestrian waiting patiently beside a tall traffic light. To the side, a car with shiny alloy wheels is parked, with the backdrop of a modern grey building towering over it. Just beyond the car, the unique feature of green grass can be seen growing atop the roof of a nearby eco-friendly structure."}
+{"index": "vrd29", "data": "A modern living room arrangement featuring a sleek, flat-screen monitor placed squarely on a low wooden table, which is positioned close to a plush gray sofa. Adjacent to the sofa, there is a matching chair, creating a cozy seating area. The table, nestled between the chair and the sofa, supports the monitor and is also within reach for those seated. Above the monitor, a contemporary lamp hangs, providing ample light and adding a touch of elegance to the setup. Near the chair, the sofa presents a perfect space for relaxation, while maintaining a spatial harmony with the rest of the furniture."}
+{"index": "vrd3", "data": "A skier, dressed in a vibrant red jacket and black pants, stands firmly on a pair of silver skis with hints of blue along the edges. They are wearing a white helmet that covers their head securely, with matching white ski goggles resting just above the brim of the helmet. The skier is positioned in the foreground of the scene, with a clear expanse of snowy terrain stretching out behind them, and the tips of their skis are just visible below the frame of the image."}
+{"index": "vrd30", "data": "A functional kitchen setup featuring a stainless steel stove seamlessly integrated into wooden cabinetry. Beside it, a tall white refrigerator stands, its surface dotted with an array of colorful magnets. Above the nearby sink, a sleek chrome faucet curves elegantly. The refrigerator and stove are positioned in close proximity, creating a convenient cooking triangle. On the countertop, a ceramic bowl rests atop a matching plate, suggesting a meal recently prepared or soon to be enjoyed."}
+{"index": "vrd31", "data": "Multiple pairs of skis and a single snowboard are arranged near a vehicle: several pairs of skis are neatly packed in a long box that rests directly on the snowy street, while another set protrudes from the front seat of a nearby car, hinting at preparations for a wintery adventure. Adjacent to the box on the street, a sturdy basket filled with additional pairs of skis confirms the enthusiasm for the snowy escapades awaiting. Just beyond these preparations, the car, with its frosted windows, stands proudly under a vast, clear sky, promising a day of thrill on the slopes."}
+{"index": "vrd32", "data": "A peaceful scene where a large green tree with lush leaves partially obscures the view of a white van parked behind it. In the foreground, two individuals are present; one wearing beige trousers and a navy blue jacket while leisurely riding a bicycle. The other person, clad in grey pants and a black jacket, is standing right beside the cyclist, both positioned to the side of the tree."}
+{"index": "vrd33", "data": "A city scene where a yellow school bus stands on an asphalt road, its large black wheels pressing against the firm surface. The road stretches underneath the bus and continues beyond the frame. Behind the stationary bus, the shadowy appearance of a white van can be glimpsed, poised as though in a momentary pause. Rising above the scene, the outlines of buildings stand, adding depth and an urban backdrop to the moment captured. The repetition of the van and building behind the bus indicates perhaps a slight congestion, common in city traffic scenarios."}
+{"index": "vrd34", "data": "A figure clad in a sleek black leather jacket and dark sunglasses sits astride a gleaming red street bike. The person has a firm grip on the handlebars, ready to ride, with one foot on the ground for balance. Beneath them, the bike is equipped with a metallic holder carrying a clear water bottle, securely attached to the frame. The bike and its rider are cast in a dynamic pose, suggesting motion even in stillness."}
+{"index": "vrd35", "data": "An elegant dining arrangement features a clear glass bottle filled with a pale liquid, placed strategically to the left of a pair of matching glasses on a polished wooden table. The glasses, made of fine crystal, are positioned to the right of the bottle, catching the light and casting subtle reflections on the table's surface. A solitary chair, crafted from dark wood and adorned with a plush, cream-colored cushion, sits directly behind the table, completing the setting."}
+{"index": "vrd36", "data": "an individual balancing on a bright yellow surfboard, riding the crest of an ocean wave. parallel to the shore, a series of tall buildings stand in close proximity to one another, creating a dense urban skyline. the closest building has a reflective glass facade, while the one alongside it features beige brickwork."}
+{"index": "vrd37", "data": "A cyclist wearing a dark jacket and a protective helmet is on an open road, astride a sleek bicycle with its wheels firmly planted on the asphalt. The bike's wheels, black with a reflective stripe, are in motion, indicating a journey in progress. The road ahead is clear, accompanied by the faint lines marking lanes, guiding the path for the traveler."}
+{"index": "vrd38", "data": "A solitary figure stands on a sandy beach, shaded by a large red and white striped umbrella. The person is clad in bright yellow shorts and appears relaxed while under the protective cover of the umbrella. A silver metallic can lies on its side, partially buried in the sand, just a few steps away from the person's bare feet."}
+{"index": "vrd39", "data": "A tall, gray tower looms over the bustling street below, where cars and buses navigate through the flow of traffic. The street is canopied by a row of leafy green trees, which cast dappled shadows onto the asphalt. Behind a ruddy red car parked along the side of the road, more trees with thick foliage provide a backdrop of natural green against the urban environment. A large yellow bus makes its way down the lane, adding vibrancy to the cityscape."}
+{"index": "vrd4", "data": "A busy construction scene unfolds with a person clad in a high-visibility vest standing close to a large orange truck parked on the road. This individual is holding a heavy-duty bag, possibly filled with work tools, while positioned beside the truck's front end. The truck itself is equipped with a set of rugged wheels firmly planted on the pavement. Directly behind this truck, there's another of a similar make and model, creating a convoy-like arrangement. Traffic cones are arranged neatly around the primary truck, signaling ongoing work or a temporary hazard in the area."}
+{"index": "vrd5", "data": "A panoramic cityscape showcasing a series of tall buildings beneath a vast expanse of blue sky. The streets below are lined with cars and pedestrians, running directly under these towering structures. In front of one building, a row of lush green trees adds a touch of nature to the urban setting. The trees obscure a lamppost which stands prominently behind them. Both the bustling street and the serene sky form a layered backdrop to the solid presence of the concrete and glass edifices."}
+{"index": "vrd6", "data": "A figure standing at the busy street corner is casually dressed in blue jeans and a black jacket. The jacket, slightly ruffled by the breeze, is accompanied by dark sunglasses resting on the person's face, reflecting the urban surroundings. In their grip, they hold a leather bag, its strap slung over the shoulder, while the bag itself rests against their leg."}
+{"index": "vrd7", "data": "A picturesque outdoor scene featuring a ceramic vase prominently placed to the left of a lush, green lawn. The vase, with its smooth texture and intricate patterns, stands in the foreground, with the expansive, clear blue sky stretching overhead. Beyond the vase, a wooden bench can be seen, slightly obscured by the vase's presence. To the right, a dense, leafy bush rises up against the sky, situated just above a paved street that runs adjacent to the bush."}
+{"index": "vrd8", "data": "A scene indoors where a person is sitting comfortably with a plush pillow tucked behind them, sporting a casual jacket and a hat. The individual is engaged with a laptop that rests on their knees, and they're wearing shoes, which suggests they might be in a semi-public environment like a co-working space or a lounge. Nearby, another person is engaged in an activity, possibly interacting with the laptop user or absorbed in their own task."}
+{"index": "vrd9", "data": "A historic building stands majestically with a clock tower that reaches towards the sky. The face of the clock is clearly visible, set upon the tower's brick structure. Behind the beautiful edifice, soft clouds drift across the blue sky, while in the foreground, a lush green tree partially obscures the view of the building, its branches stretching out beneath the open sky. Across from the main structure, the tower stands out, a landmark that serves both as a visual focal point and a timekeeper for those who pass by."}
+{"index": "whoops0", "data": "A young boy with forlorn expression gazes downward, his hair tousled as though he's had a long day. He's clad in a plain white t-shirt that contrasts with the intricate designs of a temporary sleeve tattoo adorning his arm. The tattoo features a mixture of colorful dragons and floral patterns that extend from his shoulder down to his wrist. Nearby, a set of colored markers are strewn about, suggesting a recent artistic endeavor."}
+{"index": "whoops1", "data": "A group of people decked in contemporary winter attire, consisting of vibrant parkas and insulated boots stand amidst a snowy landscape. In their midst, a colossal woolly mammoth, with its shaggy, matted fur and long, curved tusks, towers above them. The individuals, exhibiting expressions of awe and curiosity, extend their gloved hands towards the mammoth, highlighting the surreal encounter as delicate snowflakes continuously fall around them."}
+{"index": "whoops10", "data": "A one-on-one comparison of a woman and her reflection in a large, frameless mirror affixed to a pastel-colored wall. The woman is clad in a sleek black dress and pearl earrings, whereas her reflection dons a vibrant red sundress with a delicate gold necklace. The entire setting is illuminated by natural light filtering in through a nearby window."}
+{"index": "whoops11", "data": "In the renowned portrait, the subject, known as the Girl with a Pearl Earring, is actually adorned with a pearl drop earring rather than a golden hoop. The soft texture of her pale skin contrasts with the dark, liquid-like background, while her blue and gold turban adds a touch of vibrant color to the composition. Light gently caresses her face, highlighting the luminescent pearl that gracefully hangs from her earlobe."}
+{"index": "whoops12", "data": "A woman stands amidst a gentle downpour, shielded by an unconventional umbrella constructed from a fishnet, with visible droplets of rain caught in its mesh. She's wearing a yellow raincoat that contrasts with the grey, wet pavement around her. As she walks, puddles ripple at her waterproof boots, reflecting the overcast sky above."}
+{"index": "whoops13", "data": "The iconic Statue of Liberty, with its verdant green patina, stands imposingly with a torch raised high in front of the Big Ben Clock Tower, whose clock face is clearly visible behind it. The Big Ben's golden clock hands contrast against its aged stone façade. In the surrounding area, tourists are seen marveling at this unexpected juxtaposition of two renowned monuments from different countries."}
+{"index": "whoops14", "data": "A person dressed in a full, white beekeeper's suit with a mesh veil is engaging in a fencing match. Their opponent, blurred in the background, is wearing a traditional white fencing outfit with a metallic mask. In the foreground, the beekeeper wields an épée, its slender blade glinting in the light filtering through the nearby window."}
+{"index": "whoops15", "data": "Rows of seats are occupied by attentive audience members, each holding small containers of assorted vegetables. The theater is semi-lit by the glow of the screen, casting a soft light on the vibrant greens, reds, and yellows of the fresh snacks. No traditional popcorn can be seen; instead, the room is filled with the sound of crisp, healthy bites."}
+{"index": "whoops16", "data": "Greta Thunberg, the environmental activist, is captured in a photograph holding a clear disposable plastic cup. The cup, seemingly out of place considering her advocacy, is juxtaposed against her usual image of supporting sustainable practices. She is standing outside, with a small crowd in the background, all focused on the scene unfolding around them. Greta's expression is serious and contemplative, with her signature long braid and casual attire."}
+{"index": "whoops17", "data": "Two individuals are deeply engrossed in a strategic game, seated at a polished wooden table with a chessboard that uniquely features all black chess pieces. The players, both wearing glasses, are in a room with cream-colored walls and a potted plant in the corner. To the side of the chessboard, a timer is set, indicating the serious nature of their contest. The sunlight from the adjacent window casts a soft glow on the game, highlighting the intricate details of the ebony pieces."}
+{"index": "whoops18", "data": "A focused individual with a blue denim jacket is strumming an electric guitar amidst the quietude of a library. Surrounded by towering wooden bookshelves filled with an array of books, he is seated on a simple chair with a burgundy cushion. His guitar, a sleek black instrument with silvery strings, catches the light from the overhead lamps as he creates a melody in this uncommon setting."}
+{"index": "whoops19", "data": "An individual in a light blue shirt and khaki shorts is methodically sweeping the sandy beach with a long, wooden-handled broom. Around him, the soft golden sand is sparsely dotted with seashells and footprints. Nearby, a series of colorful beach umbrellas and lounging chairs are arranged, contrasting against the clear blue sky."}
+{"index": "whoops2", "data": "A clear glass carafe is placed upside down on a smooth, wooden surface, with its contents defying gravity as they remain suspended within the vessel. The carafe has a slender neck and a broader base, showcasing a delicate curve. Sunlight filters through a nearby window, casting a luminous glow and creating a transparent shadow on the surface below the carafe."}
+{"index": "whoops20", "data": "An athlete clad in a striped red and white soccer jersey stands poised on a green field, with his leg raised, ready to strike a glossy black bowling ball inadvertently placed amidst the white-lined boundaries. A goalpost looms in the background, its net gently swaying in the calm air. Around him, bewildered teammates and opponents alike pause, their expressions a mix of confusion and curiosity at the unusual sight."}
+{"index": "whoops21", "data": "A young child, no more than three years old, with a red sweater and mismatched socks, steps off the sidewalk onto a busy street with cars approaching. The child appears oblivious to the dangers, focused on a toy in hand. Around them, the traffic includes a bright yellow taxi and a blue sedan, both coming to a quick halt to avoid the child."}
+{"index": "whoops22", "data": "A young child with brown hair, focused intently, sits at a wooden table scattered with colorful crayons and paper. In their small hand is a bright red pencil, with which they are diligently drawing a vibrant blue flower that's taking shape on the white sheet before them. Sunlight filters through a nearby window, casting a warm glow on the child's artwork."}
+{"index": "whoops23", "data": "An iconic tower known for its unintended tilt is captured in an unusual, digitally manipulated image where it appears perfectly vertical. The surrounding area is filled with tourists, some of whom are playfully posing with their hands out as if they were interacting with the tower's usual lean. The sky above is clear, casting a warm glow on the tower's white marble facade."}
+{"index": "whoops24", "data": "a solitary hippo immersed in icy waters, with hints of white and blue icebergs scattered around it. the gray skin of the hippo contrasts with the clear, chilly water reflecting the light of the midday sun. in the background, a snowy bank with patches of exposed rock faces suggests a cold, polar habitat."}
+{"index": "whoops25", "data": "El Castillo, a grand Mayan temple with steep stone steps and intricate carvings, rises majestically from the desert sands. The pyramid, predominantly gray with patches of lichen, dominates the arid landscape under a wide expanse of clear blue sky. Surrounding the temple, sparse desert vegetation and cacti provide a sharp contrast to the ancient structure's imposing presence."}
+{"index": "whoops26", "data": "A man with bright red boxing gloves awkwardly attempts to play a glossy black grand piano in the center of a room. Around him, the floor is a polished hardwood, reflecting the soft overhead lighting. Despite the gloves, he appears concentrated, engaged in his unique challenge, with sheet music propped up on the piano's stand."}
+{"index": "whoops27", "data": "A large, colorful rooster, with glossy feathers in shades of red, green, and gold, appears to be emerging from a cracked white eggshell. The scene unfolds on a rustic wooden table, with loose straw scattered around the egg's fragments. Against the backdrop, there's a barn door slightly ajar, allowing a sliver of daylight to accentuate the rooster's vibrant plumage."}
+{"index": "whoops28", "data": "A baffling scene where smoke is inexplicably wafting from the filter end of a cigarette between a person's fingers, rather than the lit end. The cigarette is resting in an ashtray that's placed on a round, glass-topped table. Stray ashes can be seen scattered around the ashtray, highlighting the peculiarity of the situation."}
+{"index": "whoops29", "data": "An elderly gentleman with silver hair and a tweed jacket sits leisurely on a wooden park bench. In one hand, he holds an ornate, carved wooden pipe from which he is blowing soap bubbles that glisten in the sunlight. The bubbles drift slowly in the air, reflecting an array of colors against the backdrop of a clear blue sky."}
+{"index": "whoops3", "data": "Two elegantly dressed women, adorned in intricate Renaissance gowns with puffed sleeves and rich embroidery, hold up a sleek, modern smartphone to capture a selfie. Their attire features deep hues of red and gold, contrasting with the metallic sheen of the phone. They stand in a room with classic architectural elements, including a large window that bathes them in natural light."}
+{"index": "whoops30", "data": "Inside the microwave sits a clear glass bowl, filled to the brim with scoops of colorful ice cream with visible flecks of vanilla beans. The microwave's interior light casts a warm glow on the ice cream, which threatens to melt if the door were to remain closed for long. It's an odd place for a cold dessert that's usually served at a chilly temperature to avoid its creamy contents from turning into a soupy mess. The microwave is positioned on a countertop, surrounded by assorted kitchen gadgets and a spice rack full of various seasonings."}
+{"index": "whoops31", "data": "A single pineapple sprouts unexpectedly from a patch of coarse desert sand, its green crown contrasting starkly against the pale, arid landscape. Surrounding the fruit, small tufts of dry grass struggle to survive under the harsh sun. Despite the inhospitable environment, the pineapple's textured, golden-brown skin suggests it is ripening well."}
+{"index": "whoops32", "data": "A single silver coin glistens as it miraculously floats on the surface of a clear, blue body of water. Sunlight reflects off the coin's smooth, metallic texture, creating a shimmering effect on the surrounding water. Nearby, the gentle ripples in the water create a subtle movement that contrasts with the stillness of the coin."}
+{"index": "whoops33", "data": "A dynamic scene unfolds at the historic Colosseum, where a fleet of sleek, multicolored racing cars roar past an excited crowd. The vehicles, adorned with vibrant decals and sponsor logos, navigate a temporary circuit that has been meticulously laid out within the ancient arena's interior. Spectators are perched on stone seats that have withstood the test of time, their attention fixed on the blur of machines vying for the lead under the bright afternoon sun."}
+{"index": "whoops34", "data": "A piece of fluffy white cake on a porcelain plate is being dusted with fine grains of black pepper. The cake sits on a wooden table with intricate grain patterns visible on its surface. Surrounding the cake, there's a silver fork with an ornate handle and a crumpled napkin, indicating someone has recently enjoyed a bite."}
+{"index": "whoops35", "data": "A daring individual clad in bright yellow attire, gliding down a vast beige sand dune on a pair of sleek, black roller skates. Their posture suggests a careful balance as they navigate the fine granular surface, leaving behind a wavy trail in the sand. Around them, the dune stretches into the distance, meeting a clear blue sky at the horizon."}
+{"index": "whoops36", "data": "A sizable panda bear is situated in the center of a bubbling stream, its black and white fur contrasting with the lush greenery that lines the water's edge. In its paws, the bear is holding a glistening, silver-colored trout. The water flows around the bear's legs, creating ripples that reflect the sunlight."}
+{"index": "whoops37", "data": "a triangular yellow road sign with a black border and image of a dinosaur, signaling an area known for dinosaur crossings. it stands firmly alongside the road amidst tall green grass, with a dense forest in the background. the sign is noticeable to drivers who pass by this scenic route, drawing attention with its unique warning."}
+{"index": "whoops38", "data": "a man reclines on a hospital bed, his arm connected to an intravenous line that's delivering a fluid with a purplish hue. the IV bag hangs from a metal stand, its contents labeled clearly to distinguish its specialized nature. in the background, medical monitors display the patient's vital signs, and a nurse can be seen preparing additional medical supplies."}
+{"index": "whoops39", "data": "In a grassy field stands a cow, its fur a patchwork of black and white, with a bright yellow megaphone attached to its red collar. The grass around its hooves is a lush green, and in the background, a wooden fence can be seen, stretching into the distance. The cow's expression is one of mild curiosity as it gazes off into the horizon, the megaphone positioned as if ready to amplify the cow's next \"moo\"."}
+{"index": "whoops4", "data": "A striking orca whale is seen gliding through the blue waters of the Nile River, its black and white pattern contrasting vividly against the river's hues. In the background, the ancient silhouette of an Egyptian pyramid looms, its sandy beige stones bathed in the sunlight. The surface of the water ripples lightly as the majestic creature navigates the unusual setting, with palm trees and desert landscapes visible on the riverbanks."}
+{"index": "whoops5", "data": "In the scene, a silver-framed magnifying glass with a black handle is being held over a glossy-screened smartphone, which displays an image that is being enlarged. The smartphone, lying on a wooden table with fine grain patterns, is surrounded by a few scattered papers and a green potted plant to its side. The details on the phone's image become more pronounced under the scrutiny of the magnifying lens, which is held steadily by a hand with a silver watch on its wrist."}
+{"index": "whoops6", "data": "A small infant with round, silver-framed glasses perched on their nose is comfortably sitting in the center of a plush white bed. The child, dressed in a pale yellow onesie, holds an open, colorful picture book with both tiny hands, appearing to gaze intently at the illustrations. Surrounding the infant are an assortment of plush toys, including a fluffy blue bear and a soft green frog, scattered about the soft, cream-colored bedspread."}
+{"index": "whoops7", "data": "A striking black bird with glossy feathers sits atop the vibrant orange petals of a Bird of Paradise flower. The unique flower is positioned in the midst of an arid desert landscape, with various cacti and sparse vegetation dotting the sandy ground. In the background, the sun casts a warm glow on the distant rolling dunes."}
+{"index": "whoops8", "data": "The scene captures a coal mine worker whose dirt-smeared hands are contrasted by meticulously manicured long nails, finished with a glossy acrylic coating. The worker is positioned next to a rugged cart filled with dark, lustrous coal. Around the worker, the dimly lit mine showcases the hard, rocky texture of the underground environment."}
+{"index": "whoops9", "data": "A man dressed in a thick, insulated jacket and bright orange snow pants is expertly skiing down a large sand dune. The dune's golden sands contrast with the clear blue sky above. Despite the unusual setting, the man's skis carve out smooth trails, leaving a unique pattern behind him as he descends amidst the vast desert surroundings."}
diff --git a/benchmarks/image_gen/DPG/README.md b/benchmarks/image_gen/DPG/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0118657ea84dddf4ef405947ba8b3a592e977b7
--- /dev/null
+++ b/benchmarks/image_gen/DPG/README.md
@@ -0,0 +1,57 @@
+[Chinese Version](./README_zh.md)
+
+# DPG Image Generation Evaluation
+
+Scripts for evaluating the Lance model on the DPG image-generation benchmark.
+
+## Files
+
+- `sample_DPG.py` - Python inference script
+- `sample_DPG.sh` - Launch script
+- `DPG.jsonl` - Evaluation dataset
+
+## Quick Start
+
+### Basic Usage
+
+```bash
+bash benchmarks/image_gen/DPG/sample_DPG.sh
+```
+
+Before running, edit the "Inference Parameters" section at the top of `benchmarks/image_gen/DPG/sample_DPG.sh`.
+
+## Parameters
+
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | Task type. Always `t2i` (text-to-image) for DPG. |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | Number of images generated per prompt; they are combined into the final grid image. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | Image resolution. |
+| `MODEL_PATH` | `downloads/Lance_3B` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/DPG/DPG.jsonl` | Path to the evaluation data. |
+
+## How To Modify
+
+- Edit the "Inference Parameters" section at the top of `benchmarks/image_gen/DPG/sample_DPG.sh`.
+- After updating the parameters, run `bash benchmarks/image_gen/DPG/sample_DPG.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+
+## Output Format
+
+Results are saved in a structure like this:
+
+```text
+results/DPG_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 0.png
+├── 1.png
+├── 2.png
+└── ...
+```
diff --git a/benchmarks/image_gen/DPG/README_zh.md b/benchmarks/image_gen/DPG/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..0732035debc29d2881bca523c53ac56d9c1bd8aa
--- /dev/null
+++ b/benchmarks/image_gen/DPG/README_zh.md
@@ -0,0 +1,57 @@
+[English Version](./README.md)
+
+# DPG 图像生成评估
+
+基于 Lance 模型的 DPG 评估基准测试脚本。
+
+## 文件说明
+
+- `sample_DPG.py` - 推理 Python 脚本
+- `sample_DPG.sh` - 启动脚本
+- `DPG.jsonl` - 评估数据集
+
+## 快速开始
+
+### 基本用法
+
+```bash
+bash benchmarks/image_gen/DPG/sample_DPG.sh
+```
+
+运行前请直接修改 `benchmarks/image_gen/DPG/sample_DPG.sh` 顶部的“推理参数配置”区。
+
+## 参数说明
+
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | 任务类型，DPG 固定为图像生成 |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | 每个 case 生成的图像数量，用于拼接最终网格图 |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | 图像分辨率 |
+| `MODEL_PATH` | `downloads/Lance_3B` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/DPG/DPG.jsonl` | 评估数据路径 |
+
+## 修改方式
+
+- 请手动编辑 `benchmarks/image_gen/DPG/sample_DPG.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/image_gen/DPG/sample_DPG.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+
+## 保存格式
+
+结果会按照以下结构保存：
+
+```text
+results/DPG_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 0.png
+├── 1.png
+├── 2.png
+└── ...
+```
diff --git a/benchmarks/image_gen/DPG/sample_DPG.py b/benchmarks/image_gen/DPG/sample_DPG.py
new file mode 100644
index 0000000000000000000000000000000000000000..43382f875b6757a99efd900eed593f1b799c854c
--- /dev/null
+++ b/benchmarks/image_gen/DPG/sample_DPG.py
@@ -0,0 +1,509 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+
+import os.path as osp
+from copy import deepcopy
+from typing import Tuple, cast, Optional
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from safetensors.torch import load_file
+from PIL import Image
+from torchvision.utils import make_grid
+import numpy as np
+from tqdm import trange
+
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.lance import LanceConfig, Lance, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from common.utils.misc import tuple_mul, AutoEncoderParams
+from common.utils.logging import get_logger
+from common.val.utils import make_padded_latent
+from data.datasets_custom import ValidationDataset
+from config.config_factory import ModelArguments, DataArguments, TrainingArguments, EvaluationArguments, get_model_path
+
+
+def init_from_vlm_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments, log_rank0):
+    # NOTE: entry point for initializing `model` from the VLM safetensors weights under model_args.llm_path.
+    def load_safetensors_state_dict(folder_path):
+        # Pick only *.safetensors files; sort by filename so the files are merged in a deterministic order.
+        safetensor_files = sorted(
+            f for f in os.listdir(folder_path) if f.endswith(".safetensors")
+        )
+        state_dict = {}
+        for filename in safetensor_files:
+            file_path = osp.join(folder_path, filename)
+            state_dict.update(load_file(file_path))  # on duplicate keys, the later file wins
+        return state_dict
+
+    state_dict = load_safetensors_state_dict(model_args.llm_path)
+
+    # Rename checkpoint keys to match Lance's parameter names.
+    for k in list(state_dict.keys()):  # iterate over a snapshot because keys are replaced inside the loop
+        if "visual" in k:  # ViT and connector
+            state_dict[k.replace("visual", "vit_model")] = state_dict.pop(k)
+        else:
+            # Every other key gets the "language_model." prefix.
+            state_dict["language_model." + k] = state_dict.pop(k)
+
+    result = model.load_state_dict(state_dict, strict=False)  # NOTE(review): `result` and `log_rank0` are unused here and nothing is returned
+
+    clean_memory(state_dict)
+
+
+def init_from_model_path_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments):
+    # Load the trained Lance checkpoint from model_args.model_path (single entry point for checkpoint loading).
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+
+
+    model_path_ft = None
+    if osp.exists(model_path):  # model.safetensors takes precedence; ema.safetensors is only the fallback
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(  # NOTE(review): message says "Fine-tuning" although this function only loads weights
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+
+    # NOTE: position embeds are fixed sinusoidal embeddings, so we can just pop it off,
+    # which makes it easier to adapt to different resolutions.
+    if 'latent_pos_embed.pos_embed' in model_state_dict:
+        model_state_dict.pop('latent_pos_embed.pos_embed')
+
+    msg = model.load_state_dict(model_state_dict, strict=False)  # strict=False: key mismatches are reported in `msg`, not raised
+
+    clean_memory(model_state_dict)
+
+    return msg  # whatever load_state_dict returned, handed back so the caller can inspect it
+
+
+def clean_memory(*objects):
+    """Run garbage collection and release the cached GPU memory."""
+    for obj in objects:
+        del obj  # NOTE: only unbinds the loop variable; `objects` and the caller still hold their references
+    import gc
+    gc.collect()
+    if torch.cuda.is_available():  # skip the CUDA cache flush when CUDA is not available
+        torch.cuda.empty_cache()
+
+
+def decode_video_tensor_for_dpg(v_list):
+    """Convert a list of video tensors into one uint8 tensor for DPG, keeping the original save format.
+    Each entry is permuted with (1, 0, 2, 3); several entries are concatenated along the last axis; values are clipped to [-1, 1] and mapped to [0, 255].
+    """
+    N_target = len(v_list)
+    if N_target != 1:
+        from einops import rearrange  # imported lazily; only needed when several tensors are tiled
+        padded_videos_latent = [v.permute(1, 0, 2, 3) for v in v_list]  # presumably (c, t, h, w) -> (t, c, h, w); confirm with caller
+        v_tc_hw = rearrange(padded_videos_latent, "n t c h w -> t c h (n w)")  # place the n entries side by side along w
+    else:
+        v_tc_hw = v_list[0].permute(1, 0, 2, 3)
+
+    v_tc_hw = v_tc_hw.float().clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round().clamp(0, 255).to(torch.uint8)  # [-1, 1] -> [0, 255] as uint8
+    return v_tc_hw
+
+
+def resolve_dpg_paths(model_args: ModelArguments, data_args: DataArguments) -> None:
+    """Fill in unset DPG paths in place; ``model_path`` itself is mandatory.
+
+    A falsy ``llm_path`` falls back to ``model_path``; a falsy ``vit_path`` or
+    ``val_dataset_config_file`` falls back to a ``get_model_path`` lookup.
+
+    Raises:
+        ValueError: If ``model_args.model_path`` is falsy (empty or ``None``).
+    """
+    if not model_args.model_path:
+        raise ValueError("DPG requires --model_path to be provided explicitly.")
+
+    model_args.llm_path = model_args.llm_path or model_args.model_path
+    model_args.vit_path = model_args.vit_path or get_model_path("vit.qwen2_5_vl")
+    data_args.val_dataset_config_file = data_args.val_dataset_config_file or get_model_path("dpg.data")
+
+
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    tokenizer: Qwen2Tokenizer,
+    val_data_cpu: dict,
+    training_args: TrainingArguments,
+    model_args: ModelArguments,
+    data_args: DataArguments,
+    inference_args: EvaluationArguments,
+    curr_step: int,
+    logger,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_source_video: bool = False,
+    save_path_gen: str = "",
+    save_path_gt: str = "",
+    sample_num_per_prompt: int = 1,
+):
+    """Sample ``sample_num_per_prompt`` images for one validation batch and save them as one PNG grid.
+
+    The grid is written to ``{save_path_gen}/{index}.png``. When that file already exists the
+    batch is skipped (resume support) before any VAE encoding or sampling; ``None`` is returned.
+
+    ``tokenizer``, ``data_args``, ``inference_args``, ``curr_step``, ``logger`` and
+    ``save_path_gt`` are accepted for call-site compatibility but are not used in this function.
+
+    Raises:
+        NotImplementedError: If a decoded sample's leading dimension is not 1.
+    """
+    # Only rank 0 (or a non-distributed run) gets a tqdm progress bar for the sampling loop.
+    is_rank0 = dist.get_rank() == 0 if dist.is_initialized() else True
+    val_data = val_data_cpu.cuda(device).to_dict()  # NOTE(review): annotated ``dict`` but needs .cuda()/.to_dict()
+
+    # Resume support: skip up front, before any VAE encoding or sampling, if the grid exists.
+    save_name = f"{save_path_gen}/{val_data['index']}.png"
+    if os.path.exists(save_name):
+        return None
+
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        # Compute padded_latent from the source videos when the batch carries them.
+        if "padded_videos" in val_data.keys():
+            val_data["padded_latent"] = make_padded_latent(val_data["padded_videos"], val_data["vae_data_mode"], vae_model)
+
+        # -------------------- GEN branch --------------------
+        tensor_list_for_grid = []
+        loop_iterator = trange(sample_num_per_prompt) if is_rank0 else range(sample_num_per_prompt)
+
+        for sample_num_per_prompt_index in loop_iterator:
+            # Sampling arguments; the noise seed is offset by the sample index.
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+                "timestep_shift": training_args.validation_timestep_shift,
+                "num_timesteps": training_args.validation_num_timesteps,
+                "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+                "val_padded_latent": val_data["padded_latent"],
+                "video_sizes": val_data["video_sizes"],
+                "cfg_text_scale": model_args.cfg_text_scale,
+                "cfg_interval": training_args.cfg_interval,
+                "cfg_renorm_min": training_args.cfg_renorm_min,
+                "cfg_renorm_type": training_args.cfg_renorm_type,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "max_samples": training_args.validation_max_samples,
+                "validation_noise_seed": training_args.validation_noise_seed + sample_num_per_prompt_index,
+                "apply_chat_template": training_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+                "image_token_id": image_token_id,
+                "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+                "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+                "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+                "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+                "video_grid_thw": val_data.get("video_grid_thw", None),
+                "caption": val_data.get("caption", None),
+                "sample_task": val_data["sample_task"],
+                "sample_modality": val_data["sample_modality"],
+                "cfg_type": training_args.cfg_type,
+                "cfg_uncond_token_id": training_args.cfg_uncond_token_id,
+                "index": val_data["index"],
+                "val_padded_videos": val_data["padded_videos"] if save_source_video else None,
+            }
+
+            if training_args.use_KVcache:
+                denoise_latent, captions, padded_videos, index = fsdp_model.validation_gen_KVcache(**params)
+            else:
+                denoise_latent, captions, padded_videos, index = fsdp_model.validation_gen(**params)
+
+            # Decode every generated latent and collect the single-frame results for the grid.
+            for latent in denoise_latent:
+                v_list = [vae_model.vae_decode([latent_])[0] for latent_ in latent]
+                v_thwc = decode_video_tensor_for_dpg(v_list)
+
+                if v_thwc.shape[0] == 1:
+                    tensor_list_for_grid.append(v_thwc.squeeze(0).cpu())
+                else:
+                    raise NotImplementedError("需要保存图像")
+
+    # Tile all samples into one grid (nrow = int(sqrt(sample_num_per_prompt))) and save it as PNG.
+    grid_tensor = make_grid(tensor_list_for_grid, nrow=int(np.sqrt(sample_num_per_prompt)), padding=0, pad_value=255)
+    grid_numpy = grid_tensor.permute(1, 2, 0).numpy()
+    Image.fromarray(grid_numpy).save(save_name)
+
+
+def main():
+    """Entry point for DPG sampling: build the Lance model, load the checkpoint from
+    ``model_path`` and call ``validate_on_fixed_batch`` once per validation batch."""
+    # ========================= Env setup ==============================
+    assert torch.cuda.is_available()
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        GLOBAL_RANK = dist.get_rank()
+        WORLD_SIZE = dist.get_world_size()
+    else:
+        GLOBAL_RANK = 0
+        WORLD_SIZE = 1
+
+    LOCAL_RANK = GLOBAL_RANK % torch.cuda.device_count()  # map the global rank onto a local GPU index
+    DEVICE = LOCAL_RANK
+    torch.cuda.set_device(DEVICE)
+
+    # ========================= Args and logger setup ==============================
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(
+        Tuple[ModelArguments, DataArguments, EvaluationArguments],
+        parser.parse_args_into_dataclasses(),
+    )
+    training_args = inference_args  # alias: same object, so writes below also change inference_args
+
+    # ========================= DPG path resolution ==============================
+    resolve_dpg_paths(model_args, data_args)
+
+    # NOTE: validation_noise_seed and validation_data_seed are both overwritten with evaluation_seed
+    training_args.validation_noise_seed = inference_args.evaluation_seed
+    training_args.validation_data_seed = inference_args.evaluation_seed
+    logger = get_logger()
+    log_rank0 = print if GLOBAL_RANK == 0 else (lambda *_: None)  # NOTE(review): not referenced again in main
+
+    # Set seed:
+    seed = training_args.global_seed * WORLD_SIZE + GLOBAL_RANK
+    set_seed(seed)
+
+    # ========================= LLM model setup ==============================
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = training_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = training_args.apply_qwen_2_5_vl_pos_emb
+
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+
+    if training_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+
+        clean_memory(vit_weights)
+
+    if training_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: AutoEncoderParams = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+
+    # Lance configuration
+    config = LanceConfig(
+        visual_gen=training_args.visual_gen,
+        visual_und=training_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if training_args.visual_und else None,
+        vae_config=vae_config if training_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=training_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if training_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=training_args,
+    )
+    model = model.to(DEVICE)
+
+    # Setup tokenizer for model:
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+
+    # Initialize MoE before the checkpoint is loaded
+    if training_args.copy_init_moe:
+        language_model.init_moe()
+
+    init_from_model_path_if_needed(model, model_args)
+
+    # Resize the token embeddings only now, after the checkpoint has been loaded
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        from common.model.hacks import hack_qwen2_5_vl_config
+        language_model = hack_qwen2_5_vl_config(language_model)
+
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+
+    if model_args.tie_word_embeddings:
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_world_embeddings 冲突'
+
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
+    model.eval()
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+
+    # Setup packed dataloader - start from a plain DataConfig object
+    dataset_config = DataConfig(grouped_datasets={})
+
+    # Basic parameters
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+
+    # ViT-related parameters
+    if training_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+
+    # VAE-related parameters
+    if training_args.visual_gen and vae_config:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        vae_downsample = tuple_mul(
+            model_args.latent_patch_size, (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial)
+        )
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = vae_downsample
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+
+    # fix: share the conditioning-dropout probabilities with the dataset config
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+
+    # Build the validation dataset
+    val_dataset = ValidationDataset(
+        jsonl_path= data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=training_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+        local_rank=GLOBAL_RANK,  # NOTE(review): the global rank is passed as local_rank - confirm this is intended
+        world_size=WORLD_SIZE,
+    )
+
+    val_loader = DataLoader(
+            val_dataset,
+            batch_size=1,
+            num_workers=0,
+            pin_memory=True,
+            collate_fn=simple_custom_collate,
+            drop_last=True,
+            prefetch_factor=None,
+            persistent_workers=False,
+            multiprocessing_context=None,
+        )
+
+    val_loader_iter = iter(val_loader)
+
+    if not os.path.exists(inference_args.save_path_gen):  # exist_ok=True below already tolerates an existing dir
+        os.makedirs(inference_args.save_path_gen, exist_ok=True)
+
+    # Main loop
+    from tqdm import tqdm
+    import time
+    from datetime import datetime, timedelta
+
+    total_batches = len(val_loader)
+    pbar = tqdm(total=total_batches, desc="Validating", unit="batch", leave=True, ncols=120, disable=(GLOBAL_RANK != 0))
+    start_time = time.time()
+
+    for i in range(total_batches):
+        val_data_cpu = next(val_loader_iter)
+
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            tokenizer=tokenizer,
+            val_data_cpu=val_data_cpu,
+            training_args=training_args,
+            model_args=model_args,
+            data_args=data_args,
+            inference_args=inference_args,
+            curr_step=0,
+            logger=logger,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=DEVICE,
+            save_source_video=False,
+            save_path_gen=inference_args.save_path_gen,
+            save_path_gt="",
+            sample_num_per_prompt=inference_args.sample_num_per_prompt,
+        )
+
+        if GLOBAL_RANK == 0:
+            # ETA is extrapolated from the mean time per batch processed so far.
+            elapsed = time.time() - start_time
+            avg_time = elapsed / (i + 1)
+            eta_seconds = avg_time * (total_batches - i - 1)
+            expected_finish = datetime.now() + timedelta(seconds=eta_seconds)
+            finish_str = expected_finish.strftime('%Y-%m-%d %H:%M:%S')
+
+            pbar.set_postfix_str(f"ETA: {timedelta(seconds=int(eta_seconds))} | Finish: {finish_str}")
+            pbar.update(1)
+
+    if GLOBAL_RANK == 0:
+        pbar.close()
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/image_gen/DPG/sample_DPG.sh b/benchmarks/image_gen/DPG/sample_DPG.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4f7f1fd0cae1e5c7c646f5cb52fca4d6cd76b1ab
--- /dev/null
+++ b/benchmarks/image_gen/DPG/sample_DPG.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# DPG text-to-image sampling launcher; run from the repository root (paths below are relative).
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+source "$SCRIPT_DIR/../../sample_env.sh"
+
+# ========================= Inference configuration =========================
+TASK_NAME="t2i"
+NUM_GPUS=8
+
+VALIDATION_NUM_TIMESTEPS=50
+VALIDATION_TIMESTEP_SHIFT=3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+SAMPLE_NUM_PER_PROMPT=4
+USE_KVCACHE=true
+
+VIDEO_HEIGHT=768
+VIDEO_WIDTH=768
+
+MODEL_PATH="downloads/Lance_3B"
+VAL_DATASET_CONFIG_FILE="benchmarks/image_gen/DPG/DPG.jsonl"
+
+# ========================= Derived output path =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then KVCACHE_TAG="kvcache_"; fi
+SAVE_PATH_GEN="results/DPG_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH" >&2
+    exit 1
+fi
+
+# ============================== Environment and distributed setup ==============================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+
+# ========================= Print the task configuration =========================
+echo "================================================"
+echo "DPG T2I 推理"
+echo "================================================"
+echo "GPU数量: ${NUM_GPUS}"
+echo "保存路径: ${SAVE_PATH_GEN}"
+echo "分辨率: ${VIDEO_HEIGHT}x${VIDEO_WIDTH}"
+echo "模型路径: ${MODEL_PATH}"
+if [ -n "$VAL_DATASET_CONFIG_FILE" ]; then echo "数据路径: ${VAL_DATASET_CONFIG_FILE}"; fi
+echo ""
+echo "关键参数："
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - evaluation_seed: ${EVALUATION_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]"
+echo "  - sample_num_per_prompt: ${SAMPLE_NUM_PER_PROMPT}"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+# ============================== Run inference (edit the configuration section at the top) ==============================
+accelerate launch \
+    --num_machines          "$NUM_MACHINES"         \
+    --num_processes         "$TOTAL_RANK"           \
+    --machine_rank          "$MACHINE_RANK"         \
+    --main_process_ip       "$MAIN_PROCESS_IP"      \
+    --main_process_port     "$MAIN_PROCESS_PORT"    \
+    --mixed_precision       bf16                    \
+    benchmarks/image_gen/DPG/sample_DPG.py          \
+    --model_path            "$MODEL_PATH" \
+    --val_dataset_config_file "$VAL_DATASET_CONFIG_FILE" \
+    --vit_type              qwen_2_5_vl_original \
+    --llm_qk_norm           true \
+    --llm_qk_norm_und       true \
+    --llm_qk_norm_gen       true \
+    --tie_word_embeddings   false \
+    --validation_num_timesteps "$VALIDATION_NUM_TIMESTEPS" \
+    --validation_timestep_shift "$VALIDATION_TIMESTEP_SHIFT" \
+    --copy_init_moe         true \
+    --use_flex              true \
+    --max_num_frames        1 \
+    --max_latent_size       64 \
+    --latent_patch_size     1 1 1 \
+    --num_replicate         "$NUM_REPLICATE" \
+    --num_shard             "$NUM_SHARD" \
+    --visual_und            true \
+    --visual_gen            true \
+    --vae_model_type        wan \
+    --apply_qwen_2_5_vl_pos_emb  true \
+    --apply_chat_template   false \
+    --cfg_type              0 \
+    --evaluation_seed       "$EVALUATION_SEED" \
+    --validation_data_seed  "$EVALUATION_SEED" \
+    --video_height          "$VIDEO_HEIGHT" \
+    --video_width           "$VIDEO_WIDTH" \
+    --task                  "$TASK_NAME" \
+    --save_path_gen         "$SAVE_PATH_GEN" \
+    --resolution            image_768res \
+    --text_template         true \
+    --sample_num_per_prompt "$SAMPLE_NUM_PER_PROMPT" \
+    --cfg_text_scale        "$CFG_TEXT_SCALE" \
+    --cfg_interval          "$CFG_INTERVAL_START" "$CFG_INTERVAL_END" \
+    --use_KVcache           "$USE_KVCACHE"
+launch_status=$?
+if [ "$launch_status" -ne 0 ]; then
+    echo "错误: 推理失败 (exit code ${launch_status})" >&2
+else
+    echo ""
+    echo "================================================"
+    echo "完成! 结果: ${SAVE_PATH_GEN}"
+    echo "================================================"
+fi
+# Optional local post-run hook: skipped when the file does not exist.
+if [ -f tmps/burn.sh ]; then bash tmps/burn.sh; fi
+exit "$launch_status"
diff --git a/benchmarks/image_gen/GEdit/GEdit_en.json b/benchmarks/image_gen/GEdit/GEdit_en.json
new file mode 100644
index 0000000000000000000000000000000000000000..3318fdbc8e006341d31ef768b865b24cfcd28910
--- /dev/null
+++ b/benchmarks/image_gen/GEdit/GEdit_en.json
@@ -0,0 +1,13940 @@
+{
+    "0040": {
+        "interleave_array": [
+            "Change the background to a city street.",
+            "./benchmarks/image_gen/GEdit/images/0040_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0040_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4a7d36259ad94d238a6e7e7e0bd6b643",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0041": {
+        "interleave_array": [
+            "Change the background to a forest.",
+            "./benchmarks/image_gen/GEdit/images/0041_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0041_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "05040717fb0f2ac80083ef81ee206ace",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0042": {
+        "interleave_array": [
+            "Change the background to a green grassland.",
+            "./benchmarks/image_gen/GEdit/images/0042_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0042_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a76982639289faf26edf18a86d68ebf8",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0043": {
+        "interleave_array": [
+            "Adjust the background to a beach.",
+            "./benchmarks/image_gen/GEdit/images/0043_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0043_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "57288ae252f43831390e2121a84b1780",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0044": {
+        "interleave_array": [
+            "Change the background to a forest.",
+            "./benchmarks/image_gen/GEdit/images/0044_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0044_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bf2905a10d5da2ad897ef159eadc1821",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0045": {
+        "interleave_array": [
+            "Adjust the background to a garden.",
+            "./benchmarks/image_gen/GEdit/images/0045_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0045_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9b1b4768e51e99840785cc5b0f05ce8f",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0046": {
+        "interleave_array": [
+            "Adjust the background to a forest.",
+            "./benchmarks/image_gen/GEdit/images/0046_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0046_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2154828b5213504b358697eac664f3c0",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0047": {
+        "interleave_array": [
+            "Adjust the background to a concrete ground.",
+            "./benchmarks/image_gen/GEdit/images/0047_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0047_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "544c9de690f114560ab4e28f6c6bbf44",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0048": {
+        "interleave_array": [
+            "Adjust the background to a snowy field.",
+            "./benchmarks/image_gen/GEdit/images/0048_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0048_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c44b1ef6dd9d2d1f0e1168b848af3ca6",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0049": {
+        "interleave_array": [
+            "Change the background to high mountains.",
+            "./benchmarks/image_gen/GEdit/images/0049_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0049_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ce13a98a496fe366099ea1d9894bd1a8",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0050": {
+        "interleave_array": [
+            "Adjust the background to the ocean.",
+            "./benchmarks/image_gen/GEdit/images/0050_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0050_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ba8c75293f0f60353f6afb4b76e7eda0",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0051": {
+        "interleave_array": [
+            "Adjust the background to a glass wall.",
+            "./benchmarks/image_gen/GEdit/images/0051_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0051_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f17eaba1650c7320694dd8a5493361b8",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0052": {
+        "interleave_array": [
+            "Change the background to a city street.",
+            "./benchmarks/image_gen/GEdit/images/0052_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0052_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b53d1d3a0534e61965bfa36b30cf1fb8",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0053": {
+        "interleave_array": [
+            "Adjust the background to a desert.",
+            "./benchmarks/image_gen/GEdit/images/0053_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0053_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5879c4a5f276467de24f47fc927d482f",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0054": {
+        "interleave_array": [
+            "Adjust the background to a city.",
+            "./benchmarks/image_gen/GEdit/images/0054_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0054_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e88625bb04f622bf73a13e76e47c405b",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0055": {
+        "interleave_array": [
+            "Change the background to the entrance of a Japanese shrine.",
+            "./benchmarks/image_gen/GEdit/images/0055_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0055_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c29e28b92d10e4b4beb0a6b9517c215a",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0056": {
+        "interleave_array": [
+            "Change the background to an indoor setting.",
+            "./benchmarks/image_gen/GEdit/images/0056_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0056_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9077c3f99adb28dcdea8c9b877662e5e",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0057": {
+        "interleave_array": [
+            "Please generate an ID photo of me based on this picture.",
+            "./benchmarks/image_gen/GEdit/images/0057_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0057_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2c1d322cb7c60b1de8d0a17cc97b7c1b",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0058": {
+        "interleave_array": [
+            "Change the background to a starry sky.",
+            "./benchmarks/image_gen/GEdit/images/0058_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0058_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0f385bcff859231789a9c978cafecc2a",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0059": {
+        "interleave_array": [
+            "Change the background to the ocean.",
+            "./benchmarks/image_gen/GEdit/images/0059_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0059_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f7d391ffa970e18fc8393888295899f8",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0060": {
+        "interleave_array": [
+            "Modify this photo to be set in a desert.",
+            "./benchmarks/image_gen/GEdit/images/0060_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0060_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "99fd6314476a4af7cd75dd0a377f1ae5",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0061": {
+        "interleave_array": [
+            "Change the background to a lingerie store, in a panoramic view, keeping the person in the original image unchanged.",
+            "./benchmarks/image_gen/GEdit/images/0061_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0061_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "641cffb62276b86f154dfe64704b0411",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0062": {
+        "interleave_array": [
+            "Change the background to a spring park while keeping the person unchanged.",
+            "./benchmarks/image_gen/GEdit/images/0062_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0062_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "42072aa83c4cbeed62276c45e61f42a2",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0063": {
+        "interleave_array": [
+            "Remove the background for me.",
+            "./benchmarks/image_gen/GEdit/images/0063_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0063_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "806f4a1d864636f48a994032447bb5a8",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0064": {
+        "interleave_array": [
+            "Replace the sky in this image with blue skies and white clouds.",
+            "./benchmarks/image_gen/GEdit/images/0064_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0064_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "165533290b7c205b0dd34d1053716dcb",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0065": {
+        "interleave_array": [
+            "Change the background of this image to Tiananmen Square.",
+            "./benchmarks/image_gen/GEdit/images/0065_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0065_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "da491710942a88d0dd2059ec7d7e9ee6",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0066": {
+        "interleave_array": [
+            "Change the background to Mount Everest.",
+            "./benchmarks/image_gen/GEdit/images/0066_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0066_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6357040e1522d2a852370b22e2b95300",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0067": {
+        "interleave_array": [
+            "Edit the background to be Shanghai's Bund.",
+            "./benchmarks/image_gen/GEdit/images/0067_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0067_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "05f80c40aa6ffd99a171a49fa43f7472",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0068": {
+        "interleave_array": [
+            "Change the background to a cartoon park.",
+            "./benchmarks/image_gen/GEdit/images/0068_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0068_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d5ca6ec7c3a7e2091afdbb852beb67a0",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0069": {
+        "interleave_array": [
+            "Change the background to the sea.",
+            "./benchmarks/image_gen/GEdit/images/0069_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0069_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8ba1bc01568c11eb76e62b73a24b337f",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0070": {
+        "interleave_array": [
+            "Replace the background with a sunny landscape, ensuring the person's appearance and posture remain unchanged, with golden sunlight shining on trees and grass.",
+            "./benchmarks/image_gen/GEdit/images/0070_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0070_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dae31be23abd02a042bbf9c3a0a2ed80",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0071": {
+        "interleave_array": [
+            "Change the background to a nighttime cityscape.",
+            "./benchmarks/image_gen/GEdit/images/0071_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0071_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "218747d7f3c9ce2eaef0ea3083362626",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0072": {
+        "interleave_array": [
+            "Modify the background to be filled with blooming flowers.",
+            "./benchmarks/image_gen/GEdit/images/0072_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0072_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fe6029dda8b7663108393a7fbd5a7a48",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0073": {
+        "interleave_array": [
+            "Please change the background wall to a green forest with high mountains, bright sunlight, and distant flying birds.",
+            "./benchmarks/image_gen/GEdit/images/0073_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0073_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5f04fd7528d090db1347c36c9e1ca89f",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0074": {
+        "interleave_array": [
+            "Can you remove the background from this image? Only keep the Superman figure.",
+            "./benchmarks/image_gen/GEdit/images/0074_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0074_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "267512f5681adb18375d3efad1f10228",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0075": {
+        "interleave_array": [
+            "Change to a sorrowful background.",
+            "./benchmarks/image_gen/GEdit/images/0075_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0075_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1e6d1fa7e02689ee2409aa686132cab1",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0076": {
+        "interleave_array": [
+            "Replace the background with a soccer field.",
+            "./benchmarks/image_gen/GEdit/images/0076_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0076_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1008256303fc5fc6ef56efccf12da5da",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0077": {
+        "interleave_array": [
+            "Change the background to the interior of a spaceship.",
+            "./benchmarks/image_gen/GEdit/images/0077_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0077_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "23e79f182ac8892ca79e343e987f147c",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0078": {
+        "interleave_array": [
+            "Add some snow to the background.",
+            "./benchmarks/image_gen/GEdit/images/0078_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0078_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "09e1f235d3d395c3aff0fd36ec3dd034",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0079": {
+        "interleave_array": [
+            "Change the background of this photo to a traditional Chinese landscape painting.",
+            "./benchmarks/image_gen/GEdit/images/0079_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0079_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "214f8945db17cd0bf5c4b043408de0d0",
+            "task_type": "background_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0120": {
+        "interleave_array": [
+            "change the color of umbrellas to brown",
+            "./benchmarks/image_gen/GEdit/images/0120_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0120_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0d6038e1736440c2fb8384b4bf495e13",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0121": {
+        "interleave_array": [
+            "change the color of goat to yellow",
+            "./benchmarks/image_gen/GEdit/images/0121_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0121_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "981afb942cdf3cbacf7614f47ff21b2d",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0122": {
+        "interleave_array": [
+            "change the color of fire hydrant to lavender",
+            "./benchmarks/image_gen/GEdit/images/0122_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0122_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "61c156a2c97fee9424bbb0f13fa2c5f8",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0123": {
+        "interleave_array": [
+            "change the color of elephant to pink",
+            "./benchmarks/image_gen/GEdit/images/0123_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0123_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "174b49f45ca4ff5d1d3ea06096b78e57",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0124": {
+        "interleave_array": [
+            "change the color of couch to yellow",
+            "./benchmarks/image_gen/GEdit/images/0124_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0124_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f437c7392b76ded921a0abc243f81290",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0125": {
+        "interleave_array": [
+            "change the color of horses to violet",
+            "./benchmarks/image_gen/GEdit/images/0125_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0125_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fe29684864bbb7bd408bf2235acdfa4a",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0126": {
+        "interleave_array": [
+            "Alter the color of bus to lime",
+            "./benchmarks/image_gen/GEdit/images/0126_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0126_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a4ca581574347248e1762c4987c931aa",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0127": {
+        "interleave_array": [
+            "alter the color of the mirror frame to orange.",
+            "./benchmarks/image_gen/GEdit/images/0127_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0127_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "00644e09e285f614bbfae5883328b4df",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0128": {
+        "interleave_array": [
+            "alter the color of plane to pink",
+            "./benchmarks/image_gen/GEdit/images/0128_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0128_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fb71870e760822d8674699ceb7034449",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0129": {
+        "interleave_array": [
+            "change the color of bear to black",
+            "./benchmarks/image_gen/GEdit/images/0129_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0129_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4e80301c1139c647487f06abf0596e0d",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0130": {
+        "interleave_array": [
+            "change the color of jacket to purple",
+            "./benchmarks/image_gen/GEdit/images/0130_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0130_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f32d0e13e862622da612225a17b9db2c",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0131": {
+        "interleave_array": [
+            "change the color of sheep to purple",
+            "./benchmarks/image_gen/GEdit/images/0131_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0131_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "de24af54e090d3a53ca853945816f0eb",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0132": {
+        "interleave_array": [
+            "change the color of man to pink",
+            "./benchmarks/image_gen/GEdit/images/0132_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0132_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c66920936a630590e1328a56b2d6f08c",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0133": {
+        "interleave_array": [
+            "alter the color of clocks to brown",
+            "./benchmarks/image_gen/GEdit/images/0133_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0133_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "eaec3869433bbce38928002406a3580e",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0134": {
+        "interleave_array": [
+            "change the color of bird to tan",
+            "./benchmarks/image_gen/GEdit/images/0134_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0134_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bfdbf8372bc32e04bf6d6d0e692fdbf4",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0135": {
+        "interleave_array": [
+            "alter the color of doughnut to silver",
+            "./benchmarks/image_gen/GEdit/images/0135_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0135_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "41fbe7550d337d07d030b308f2099d1f",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0136": {
+        "interleave_array": [
+            "change the color of suit cases to silver",
+            "./benchmarks/image_gen/GEdit/images/0136_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0136_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f75869d17b9c7a8770ad0658843bed85",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0137": {
+        "interleave_array": [
+            "turn the color of dog to pink",
+            "./benchmarks/image_gen/GEdit/images/0137_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0137_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "aa027d2c15403d4027f71ea4da0a93f1",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0138": {
+        "interleave_array": [
+            "change the color of shirt to gray",
+            "./benchmarks/image_gen/GEdit/images/0138_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0138_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b94ccee6f986cf0fddb523eaae04bdfa",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0139": {
+        "interleave_array": [
+            "change the color of cake to green",
+            "./benchmarks/image_gen/GEdit/images/0139_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0139_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3b496f697bda6811d4e0d1c5d618d6b8",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0140": {
+        "interleave_array": [
+            "Change the tie to black.",
+            "./benchmarks/image_gen/GEdit/images/0140_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0140_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3213cacb8b48889d0b13a019248528f5",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0141": {
+        "interleave_array": [
+            "Change the car body to a sports car style, and make it purple.",
+            "./benchmarks/image_gen/GEdit/images/0141_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0141_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "875cd6dbdbcc7a153cf1f62bb101a9e0",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0142": {
+        "interleave_array": [
+            "Change this image to a white background.",
+            "./benchmarks/image_gen/GEdit/images/0142_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0142_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "60f8efd12b9e6e1db076d0ce71592eed",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0143": {
+        "interleave_array": [
+            "Change the car body color to blue.",
+            "./benchmarks/image_gen/GEdit/images/0143_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0143_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "43bcfbd5afb5d12f74b43e33f13559f8",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0144": {
+        "interleave_array": [
+            "Change the tablecloth color to bright red.",
+            "./benchmarks/image_gen/GEdit/images/0144_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0144_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "69d1ef2ac7a987ce31e0aa2d9e96beea",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0145": {
+        "interleave_array": [
+            "Modify this image, changing the wall color to dark gray.",
+            "./benchmarks/image_gen/GEdit/images/0145_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0145_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a70494ecea4bb3610fe41e5e5efe1033",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0146": {
+        "interleave_array": [
+            "Make her hair shorter and darker, black.",
+            "./benchmarks/image_gen/GEdit/images/0146_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0146_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bc659e44391c92449841e3cd72dcd17b",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0147": {
+        "interleave_array": [
+            "Change the color of the stockings.",
+            "./benchmarks/image_gen/GEdit/images/0147_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0147_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "37c16adc232e505fc6f0d6747d10e8f1",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0148": {
+        "interleave_array": [
+            "Change the bed curtain color to dark gray.",
+            "./benchmarks/image_gen/GEdit/images/0148_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0148_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d0f17abfafec6172c241aa7ef30278a0",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0149": {
+        "interleave_array": [
+            "Change the background of the image to light sea color, with a gold border.",
+            "./benchmarks/image_gen/GEdit/images/0149_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0149_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8c541c8aaed6d5a8eb2d86162d39d01b",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0150": {
+        "interleave_array": [
+            "Invert the colors of the image.",
+            "./benchmarks/image_gen/GEdit/images/0150_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0150_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "104b3d9a195e2d012f07aa18a286c487",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0151": {
+        "interleave_array": [
+            "Change the hair of the person in the photo to yellow.",
+            "./benchmarks/image_gen/GEdit/images/0151_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0151_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "65e5510e9ed8036376e16afe77f8860e",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0152": {
+        "interleave_array": [
+            "Change this bag to red.",
+            "./benchmarks/image_gen/GEdit/images/0152_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0152_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "87e9e81f29177023a7b988b5557d5d3d",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0153": {
+        "interleave_array": [
+            "Adjust this to a white background ID photo.",
+            "./benchmarks/image_gen/GEdit/images/0153_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0153_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "90c511de3025169322c026d5f7ed209b",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0154": {
+        "interleave_array": [
+            "Retouch this photo, making the hair platinum blonde, improving the hairstyle at the ends, and making the fireworks more brilliant and colorful.",
+            "./benchmarks/image_gen/GEdit/images/0154_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0154_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d87fe55d649a02fa15881364ab671351",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0155": {
+        "interleave_array": [
+            "Can you change the wall color to yellow?",
+            "./benchmarks/image_gen/GEdit/images/0155_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0155_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4023c8e2e8a992a6768b47f1946d0027",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0156": {
+        "interleave_array": [
+            "Change the clothing to pink.",
+            "./benchmarks/image_gen/GEdit/images/0156_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0156_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "939aadbf02607ea772e7c214d0cbc0e1",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0157": {
+        "interleave_array": [
+            "Change this avatar to a blue color tone while keeping the content the same.",
+            "./benchmarks/image_gen/GEdit/images/0157_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0157_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e153b93ffb578c1939739628bad3c7a9",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0158": {
+        "interleave_array": [
+            "Change the car body color to gray.",
+            "./benchmarks/image_gen/GEdit/images/0158_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0158_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "eeab5f9b2f3a62deb674c7bc6af021fb",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0159": {
+        "interleave_array": [
+            "Change the bed sheet color to sky blue.",
+            "./benchmarks/image_gen/GEdit/images/0159_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0159_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1711b0f26ae0d35b6b33b0cd8fd2a6dc",
+            "task_type": "color_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0200": {
+        "interleave_array": [
+            "Change the hat\u2019s material to foam plastic.",
+            "./benchmarks/image_gen/GEdit/images/0200_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0200_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f521449fb89e5ded1f4ff725785d01b8",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0201": {
+        "interleave_array": [
+            "Replace the bench\u2019s material with marble.",
+            "./benchmarks/image_gen/GEdit/images/0201_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0201_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "da8a0c7926b0c53a2c01c3a28e79a2ef",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0202": {
+        "interleave_array": [
+            "Craft the ram with fine ceramic.",
+            "./benchmarks/image_gen/GEdit/images/0202_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0202_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3df4fa90ddbeb16bfac10ede96f31262",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0203": {
+        "interleave_array": [
+            "Change the hat\u2019s material to paper.",
+            "./benchmarks/image_gen/GEdit/images/0203_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0203_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c18278ee2b0b3d8bd18c5279f4a8c636",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0204": {
+        "interleave_array": [
+            "Reshape the kitten using clay.",
+            "./benchmarks/image_gen/GEdit/images/0204_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0204_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cc99cdd8f171dfacc44cddb50b690743",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0205": {
+        "interleave_array": [
+            "Replace the computer's casing with bamboo fiber composite.",
+            "./benchmarks/image_gen/GEdit/images/0205_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0205_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "611ae6fbc57a2b364325650954b21510",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0206": {
+        "interleave_array": [
+            "Change the bear\u2019s material to glass.",
+            "./benchmarks/image_gen/GEdit/images/0206_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0206_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0f6baa8d76c35f11200abb099692ed18",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0207": {
+        "interleave_array": [
+            "Reconstruct the bus body with solid wood panels.",
+            "./benchmarks/image_gen/GEdit/images/0207_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0207_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9d76287b0d48bcff3cdff69b198f569e",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0208": {
+        "interleave_array": [
+            "Change the plane's material to feathers.",
+            "./benchmarks/image_gen/GEdit/images/0208_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0208_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b83c07d09b8a5e602e152dbb6f0271d1",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0209": {
+        "interleave_array": [
+            "Transform the donut\u2019s material into aluminum foil.",
+            "./benchmarks/image_gen/GEdit/images/0209_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0209_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "625a9a448c17aecb16dce5b0da3075a6",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0210": {
+        "interleave_array": [
+            "Make the chair entirely from bone china.",
+            "./benchmarks/image_gen/GEdit/images/0210_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0210_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ca3b53a53971b0ad08476eeb10803df0",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0211": {
+        "interleave_array": [
+            "Change the zebra\u2019s material to concrete.",
+            "./benchmarks/image_gen/GEdit/images/0211_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0211_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5098e702ebab84dc41c1ec86a937bfb2",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0212": {
+        "interleave_array": [
+            "Build the horse using red bricks.",
+            "./benchmarks/image_gen/GEdit/images/0212_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0212_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d2a394c05802831288e0a592d3e28169",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0213": {
+        "interleave_array": [
+            "Change the doll\u2019s material to cotton fabric.",
+            "./benchmarks/image_gen/GEdit/images/0213_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0213_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "59c7d7b4c69afb3117e9b53eb4893c4d",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0214": {
+        "interleave_array": [
+            "Make the sheep from jade.",
+            "./benchmarks/image_gen/GEdit/images/0214_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0214_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "46090cab325a6d4ac10ced9b95dbcad7",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0215": {
+        "interleave_array": [
+            "Craft the cat using cloisonn\u00e9 enamel.",
+            "./benchmarks/image_gen/GEdit/images/0215_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0215_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5dbff1a3b7d1fb890b72cef2f711a2ac",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0216": {
+        "interleave_array": [
+            "Change the toilet\u2019s material to aluminum foil.",
+            "./benchmarks/image_gen/GEdit/images/0216_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0216_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9612c74ec7892a39867e992d0d806314",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0217": {
+        "interleave_array": [
+            "Make the seagull from resin.",
+            "./benchmarks/image_gen/GEdit/images/0217_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0217_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "522ca43195a09cb195944e4154fb3286",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0218": {
+        "interleave_array": [
+            "Mold the bed frame from high-strength plaster.",
+            "./benchmarks/image_gen/GEdit/images/0218_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0218_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c0b82d5df485c7cbec00d9190adf0f55",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0219": {
+        "interleave_array": [
+            "Construct the elephant from bricks.",
+            "./benchmarks/image_gen/GEdit/images/0219_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0219_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a1e647285bd94edb240a412737354b02",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0220": {
+        "interleave_array": [
+            "Replace the sword in the image with a diamond sword.",
+            "./benchmarks/image_gen/GEdit/images/0220_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0220_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f93b1dd57b6a8791c872be6221c66dd0",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0221": {
+        "interleave_array": [
+            "Turn the bag stand into a glass counter.",
+            "./benchmarks/image_gen/GEdit/images/0221_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0221_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "95368925f9384e28535aea893b6add55",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0222": {
+        "interleave_array": [
+            "Sculpt the ice cream from jade.",
+            "./benchmarks/image_gen/GEdit/images/0222_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0222_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6dc67bf92a79edb4f966836df4252145",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0223": {
+        "interleave_array": [
+            "Change the hat\u2019s material to wood.",
+            "./benchmarks/image_gen/GEdit/images/0223_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0223_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "df4b227669a0c09e007e063781385cc5",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0224": {
+        "interleave_array": [
+            "Replace the doctor's coat with a Merino wool sweater.",
+            "./benchmarks/image_gen/GEdit/images/0224_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0224_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "db87dca6363b0c1afd3246ab8fcfe5d7",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0225": {
+        "interleave_array": [
+            "Reconstruct the tower structure using a cast iron framework.",
+            "./benchmarks/image_gen/GEdit/images/0225_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0225_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7078f382f1fc25aeb48cbcd6dddd9c78",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0226": {
+        "interleave_array": [
+            "Make the clothing fabric from premium linen.",
+            "./benchmarks/image_gen/GEdit/images/0226_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0226_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6db0677c1fa5b1a266e9c078d5cb175d",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0227": {
+        "interleave_array": [
+            "Craft the outerwear from full-grain calfskin leather.",
+            "./benchmarks/image_gen/GEdit/images/0227_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0227_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "66fbc2d25acbb4b6542ba627c365bd4f",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0228": {
+        "interleave_array": [
+            "Upgrade the necklace\u2019s material to 999 pure gold.",
+            "./benchmarks/image_gen/GEdit/images/0228_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0228_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1e863624d729f3c358964626ad4612bc",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0229": {
+        "interleave_array": [
+            "Replace the tabletop with imported Italian marble.",
+            "./benchmarks/image_gen/GEdit/images/0229_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0229_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ac38191337c2f53c46b131624c789abc",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0230": {
+        "interleave_array": [
+            "Swap the background plants for woven rattan.",
+            "./benchmarks/image_gen/GEdit/images/0230_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0230_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "73875335f42e4154ece47b4a4fafd83e",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0231": {
+        "interleave_array": [
+            "Change the clothing\u2019s material to foam.",
+            "./benchmarks/image_gen/GEdit/images/0231_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0231_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "43e8fede0b26141d75c64c1f03bfc96e",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0232": {
+        "interleave_array": [
+            "Replace with jade.",
+            "./benchmarks/image_gen/GEdit/images/0232_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0232_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d7380515285d80a58ff567863809c8f4",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0233": {
+        "interleave_array": [
+            "Change the stone platform to rubber.",
+            "./benchmarks/image_gen/GEdit/images/0233_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0233_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "19e8dce7f4aa1758502870d9ae8a919b",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0234": {
+        "interleave_array": [
+            "Turn the puppy into clay.",
+            "./benchmarks/image_gen/GEdit/images/0234_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0234_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5b3a45f95245e83201a46866e71df0c9",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0235": {
+        "interleave_array": [
+            "Create a rubber-textured turtle identical to this one.",
+            "./benchmarks/image_gen/GEdit/images/0235_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0235_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dd0e86152b637efa3cc71b41fb8aaddc",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0236": {
+        "interleave_array": [
+            "Cutlery is made of food-grade stainless steel.",
+            "./benchmarks/image_gen/GEdit/images/0236_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0236_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "db8e18433b727737610cb3d8b71f4690",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0237": {
+        "interleave_array": [
+            "Change the cup in hand to ceramic.",
+            "./benchmarks/image_gen/GEdit/images/0237_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0237_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0cf6ae6de96b2d07c717f8c3bf9517fa",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0238": {
+        "interleave_array": [
+            "Transform the clothing material into silk.",
+            "./benchmarks/image_gen/GEdit/images/0238_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0238_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "641f39026c89fffaf60a4f0f50304d7d",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0239": {
+        "interleave_array": [
+            "Craft the outerwear from lambskin leather.",
+            "./benchmarks/image_gen/GEdit/images/0239_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0239_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "803f665220970a442a420afb826b6747",
+            "task_type": "material_alter",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0280": {
+        "interleave_array": [
+            "change the action of cat to jumping",
+            "./benchmarks/image_gen/GEdit/images/0280_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0280_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "76a4af36b318953c8054fdd706e7294f",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0281": {
+        "interleave_array": [
+            "change the action of biplane to landing",
+            "./benchmarks/image_gen/GEdit/images/0281_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0281_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "caabd082c0ed1757df58db3eaea5ac73",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0282": {
+        "interleave_array": [
+            "Change the bird's action to flapping its wings and flying high",
+            "./benchmarks/image_gen/GEdit/images/0282_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0282_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "41fcd0b5de39189a4fbf4eac28ce259a",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0283": {
+        "interleave_array": [
+            "change the action of the man to running",
+            "./benchmarks/image_gen/GEdit/images/0283_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0283_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f09044a354815af044038bf50708b58d",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0284": {
+        "interleave_array": [
+            "make the action of the man to cheering",
+            "./benchmarks/image_gen/GEdit/images/0284_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0284_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "be4bb34c6d879f253a4b7c4f32fc333f",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0285": {
+        "interleave_array": [
+            "Change the man's gesture to raising his hands",
+            "./benchmarks/image_gen/GEdit/images/0285_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0285_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0a4769356f68ed88de0d0eb3aba89eb6",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0286": {
+        "interleave_array": [
+            "change the action of the horses to galloping",
+            "./benchmarks/image_gen/GEdit/images/0286_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0286_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8175d438e57f213c80425595063d053a",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0287": {
+        "interleave_array": [
+            "change the action of the birds to flying",
+            "./benchmarks/image_gen/GEdit/images/0287_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0287_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "621c48b14baadc0c3947bc05857e91f4",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0288": {
+        "interleave_array": [
+            "make the action of the woman to laughing",
+            "./benchmarks/image_gen/GEdit/images/0288_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0288_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "72fc47119a7cd90ecdcbf073c3fb74be",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0289": {
+        "interleave_array": [
+            "make the action of the plane to taking off",
+            "./benchmarks/image_gen/GEdit/images/0289_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0289_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3bf8c501e7e338fe92879153ec038ede",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0290": {
+        "interleave_array": [
+            "make the action of the child to laughing",
+            "./benchmarks/image_gen/GEdit/images/0290_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0290_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5e855293ac06b10ae7faeb4a675b18a6",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0291": {
+        "interleave_array": [
+            "change the action of cat to sleeping",
+            "./benchmarks/image_gen/GEdit/images/0291_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0291_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "06ec0c598cbbc8c9490395a98b88adac",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0292": {
+        "interleave_array": [
+            "Change the person's movements to look forward",
+            "./benchmarks/image_gen/GEdit/images/0292_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0292_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "733441fbfc73dee8d4a74d2bde1bd931",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0293": {
+        "interleave_array": [
+            "make the action of the man to kissing",
+            "./benchmarks/image_gen/GEdit/images/0293_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0293_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "aefbfe530e2ce5323e5be6ea2575815e",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0294": {
+        "interleave_array": [
+            "Make the person in the image smile slightly without altering the original structure.",
+            "./benchmarks/image_gen/GEdit/images/0294_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0294_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8296e86315751cdaa09c910c95b02c10",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0295": {
+        "interleave_array": [
+            "Add a cherry-eating action without changing the original character.",
+            "./benchmarks/image_gen/GEdit/images/0295_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0295_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1941828e44744f4cd248560b4b67529c",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0296": {
+        "interleave_array": [
+            "Animate the dog in the image.",
+            "./benchmarks/image_gen/GEdit/images/0296_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0296_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3d112cbeb258289dbbca3738ee92a8aa",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0297": {
+        "interleave_array": [
+            "Make the person in the image smile.",
+            "./benchmarks/image_gen/GEdit/images/0297_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0297_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "353cf97ec89d2e51932763ae7538c4cc",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0298": {
+        "interleave_array": [
+            "Make the person in the image give a thumbs-up.",
+            "./benchmarks/image_gen/GEdit/images/0298_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0298_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "65368cbdae17f7c44cd4d8d1271f0bdf",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0299": {
+        "interleave_array": [
+            "Make the person in the image wave.",
+            "./benchmarks/image_gen/GEdit/images/0299_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0299_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fc228a38f175cad001bc8a409c76e63b",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0300": {
+        "interleave_array": [
+            "Make the little girl in the image stick out her tongue.",
+            "./benchmarks/image_gen/GEdit/images/0300_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0300_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "73e34d4e31b308f26f3ade464bbd9a52",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0301": {
+        "interleave_array": [
+            "Make the two main subjects in the image hug.",
+            "./benchmarks/image_gen/GEdit/images/0301_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0301_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a6bd902e89f6b8576a02f2a0139a993b",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0302": {
+        "interleave_array": [
+            "Transform the image into one where the woman tilts her head.",
+            "./benchmarks/image_gen/GEdit/images/0302_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0302_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "25a7bc668e846d7218e012af5295eba9",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0303": {
+        "interleave_array": [
+            "Make the child pout.",
+            "./benchmarks/image_gen/GEdit/images/0303_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0303_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0d61bb72d05645cec8a4c2c62cdf7fe0",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0304": {
+        "interleave_array": [
+            "Change the expression to a crying face.",
+            "./benchmarks/image_gen/GEdit/images/0304_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0304_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9b58419e6adadd4a367c699741e08f89",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0305": {
+        "interleave_array": [
+            "Generate an image of the character smiling based on this photo.",
+            "./benchmarks/image_gen/GEdit/images/0305_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0305_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0a406290c07e7b1837c0f3bcddbeb8d2",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0306": {
+        "interleave_array": [
+            "Make the person in the image make a funny face.",
+            "./benchmarks/image_gen/GEdit/images/0306_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0306_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cd627a9fe6dd079a1e692be90563c50c",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0307": {
+        "interleave_array": [
+            "Using the same image, generate one where the girl is crying.",
+            "./benchmarks/image_gen/GEdit/images/0307_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0307_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f1a8ac0cd17e1138c22accdc095dfb04",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0308": {
+        "interleave_array": [
+            "Change the person\u2019s expression to one of distress.",
+            "./benchmarks/image_gen/GEdit/images/0308_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0308_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "715a72723f1a797640c35a4c7a4f8f51",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0309": {
+        "interleave_array": [
+            "Make all the people in the image laugh.",
+            "./benchmarks/image_gen/GEdit/images/0309_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0309_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2ca948c72ab289d2c86db037002baa95",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0310": {
+        "interleave_array": [
+            "Make the person in the photo dance.",
+            "./benchmarks/image_gen/GEdit/images/0310_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0310_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dd6be6e54c7662c78f607bd88a931caf",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0311": {
+        "interleave_array": [
+            "Modify the image to show the person picking their nose.",
+            "./benchmarks/image_gen/GEdit/images/0311_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0311_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c5b82e3b142580940d3897a9f43b4139",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0312": {
+        "interleave_array": [
+            "Make the person jump.",
+            "./benchmarks/image_gen/GEdit/images/0312_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0312_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1fbb9fae9fb272593a73203c8113f758",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0313": {
+        "interleave_array": [
+            "Animate the cat in the image.",
+            "./benchmarks/image_gen/GEdit/images/0313_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0313_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e648d94e8f66940befa13e34039be176",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0314": {
+        "interleave_array": [
+            "Make the girl in the photo blow a kiss.",
+            "./benchmarks/image_gen/GEdit/images/0314_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0314_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9ce39582df9aaf3b21b39fb9627f7bb2",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0315": {
+        "interleave_array": [
+            "Make the expression more sorrowful.",
+            "./benchmarks/image_gen/GEdit/images/0315_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0315_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "028b9a3c540ac8eaeef11799524ec127",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0316": {
+        "interleave_array": [
+            "Make the child in the image dance.",
+            "./benchmarks/image_gen/GEdit/images/0316_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0316_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "991cdfa18e521e034d65a636900b09af",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0317": {
+        "interleave_array": [
+            "Make the people in the image smile happily.",
+            "./benchmarks/image_gen/GEdit/images/0317_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0317_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "df666978387df220cd714f7b2f80b673",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0318": {
+        "interleave_array": [
+            "Create an animated version of the person in the image blinking and opening their mouth.",
+            "./benchmarks/image_gen/GEdit/images/0318_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0318_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "aa322dd5d83bd8e02afd3ad4a87d6a6a",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0319": {
+        "interleave_array": [
+            "Make the cat in the image run.",
+            "./benchmarks/image_gen/GEdit/images/0319_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0319_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e9b3b27074575e7615723b7ff89de9a0",
+            "task_type": "motion_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0390": {
+        "interleave_array": [
+            "Remove his beard",
+            "./benchmarks/image_gen/GEdit/images/0390_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0390_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ccffdca80bf93b14a3533eb46829300f",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0391": {
+        "interleave_array": [
+            "Make his nose more defined and his face slimmer",
+            "./benchmarks/image_gen/GEdit/images/0391_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0391_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cac69629251b8ef6f51d793b6a3b07fa",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0392": {
+        "interleave_array": [
+            "Make him lose 15 pounds",
+            "./benchmarks/image_gen/GEdit/images/0392_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0392_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9b0d4782d50d550654d1daf53153524a",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0393": {
+        "interleave_array": [
+            "Make him gain 20 pounds",
+            "./benchmarks/image_gen/GEdit/images/0393_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0393_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1805a69d09b8d3637fe585f3c402ea2f",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0394": {
+        "interleave_array": [
+            "Remove his abs and add more fat to his body",
+            "./benchmarks/image_gen/GEdit/images/0394_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0394_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0e38f5cee6a69fb6b1817b1685618e91",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0395": {
+        "interleave_array": [
+            "Make him look sad",
+            "./benchmarks/image_gen/GEdit/images/0395_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0395_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "80e7583efc497acfddc6d6f34c1207c9",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0396": {
+        "interleave_array": [
+            "Change her curly hair to straight hair",
+            "./benchmarks/image_gen/GEdit/images/0396_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0396_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6ffe2a7e012e52694e1a07c00e7f44c5",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0397": {
+        "interleave_array": [
+            "Make her look younger",
+            "./benchmarks/image_gen/GEdit/images/0397_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0397_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "58d48d76e4705b16a6f03337fc6397e8",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0398": {
+        "interleave_array": [
+            "Make her skin a bit darker, like after a sunbath",
+            "./benchmarks/image_gen/GEdit/images/0398_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0398_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "de1c094bc7a28f273e560bacef9c2a5e",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0399": {
+        "interleave_array": [
+            "Remove his beard and wrinkles from his face",
+            "./benchmarks/image_gen/GEdit/images/0399_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0399_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b125faa596111bb238ac9e908d67045b",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0400": {
+        "interleave_array": [
+            "Make his beard longer",
+            "./benchmarks/image_gen/GEdit/images/0400_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0400_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "06fa0ee0788e219cae32f542a417ab70",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0401": {
+        "interleave_array": [
+            "Make him look very happy",
+            "./benchmarks/image_gen/GEdit/images/0401_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0401_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ea1de73c9c216ec0689eb650e51a5829",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0402": {
+        "interleave_array": [
+            "Make him look very serious",
+            "./benchmarks/image_gen/GEdit/images/0402_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0402_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3190e10334ec71222a324bf0b2e3a459",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0403": {
+        "interleave_array": [
+            "Make him look middle-aged",
+            "./benchmarks/image_gen/GEdit/images/0403_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0403_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "67fd8d190cb31cc01f52c2ec8ead9896",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0404": {
+        "interleave_array": [
+            "Make him look 10 years older",
+            "./benchmarks/image_gen/GEdit/images/0404_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0404_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1bdf06dc53b7cc3c907b540dca7b4b53",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0405": {
+        "interleave_array": [
+            "Make him look stronger",
+            "./benchmarks/image_gen/GEdit/images/0405_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0405_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0d2a9e9966354dc8039aeee974c767c2",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0406": {
+        "interleave_array": [
+            "Make him look older",
+            "./benchmarks/image_gen/GEdit/images/0406_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0406_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e5db9a732c72acae1638371292c14220",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0407": {
+        "interleave_array": [
+            "Dye her hair brown",
+            "./benchmarks/image_gen/GEdit/images/0407_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0407_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d7b1f377153a3e35db9020dc1a848c8a",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0408": {
+        "interleave_array": [
+            "Make him grow hair",
+            "./benchmarks/image_gen/GEdit/images/0408_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0408_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9a7eea29db11c1f500838bee90653970",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0409": {
+        "interleave_array": [
+            "Make him look more handsome with sharper eyes",
+            "./benchmarks/image_gen/GEdit/images/0409_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0409_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1f1c9a1e6ce6899d91abcb9a67922758",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0410": {
+        "interleave_array": [
+            "Make him look less angry",
+            "./benchmarks/image_gen/GEdit/images/0410_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0410_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "41d961b14b637889947080f1891f85ff",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0411": {
+        "interleave_array": [
+            "Make him look like he is crying a lot",
+            "./benchmarks/image_gen/GEdit/images/0411_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0411_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fe220565cb0f22a44d1f0a81a132ce9f",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0412": {
+        "interleave_array": [
+            "Make her look better",
+            "./benchmarks/image_gen/GEdit/images/0412_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0412_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "918900e10cac886e4bdf4236efee15b9",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0413": {
+        "interleave_array": [
+            "Make him have more muscles and a stronger vibe",
+            "./benchmarks/image_gen/GEdit/images/0413_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0413_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5968a24147a8564f74bd09104c4c032e",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0414": {
+        "interleave_array": [
+            "Make him more handsome",
+            "./benchmarks/image_gen/GEdit/images/0414_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0414_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4c453fc6e3f8842296406dc7c8ad5ac4",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0415": {
+        "interleave_array": [
+            "Make him laugh heartily",
+            "./benchmarks/image_gen/GEdit/images/0415_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0415_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6022c9e5401a53028e3b0690cce7a9db",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0416": {
+        "interleave_array": [
+            "Make him look younger",
+            "./benchmarks/image_gen/GEdit/images/0416_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0416_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4611d3319199a5c4b84ea1608f6eba29",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0417": {
+        "interleave_array": [
+            "Make him grow long hair",
+            "./benchmarks/image_gen/GEdit/images/0417_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0417_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f21e2f3585f8cddeab9d472375e92bac",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0418": {
+        "interleave_array": [
+            "Make his skin smoother, no wrinkles",
+            "./benchmarks/image_gen/GEdit/images/0418_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0418_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2dd5f9c40a055007abcafbbdaf2be46b",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0419": {
+        "interleave_array": [
+            "Make him lose 20 pounds",
+            "./benchmarks/image_gen/GEdit/images/0419_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0419_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ba360f7380e2f080485af9bbe38bd4c6",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0420": {
+        "interleave_array": [
+            "Make me look like a handsome guy in this photo.",
+            "./benchmarks/image_gen/GEdit/images/0420_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0420_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6cabdec52f6113e0a365332f323053b1",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0421": {
+        "interleave_array": [
+            "Enhance the appearance.",
+            "./benchmarks/image_gen/GEdit/images/0421_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0421_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9647d92cf8edeec8c3b68ecb6150f7c7",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0422": {
+        "interleave_array": [
+            "Make the allergic reaction on my face look more severe.",
+            "./benchmarks/image_gen/GEdit/images/0422_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0422_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f0995ee97b33e6ca5effc808be8e4ac2",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0423": {
+        "interleave_array": [
+            "How can I fix these facial imperfections?",
+            "./benchmarks/image_gen/GEdit/images/0423_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0423_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "47e2e49cf8b662a7493ddad42334b6e1",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0424": {
+        "interleave_array": [
+            "Remove the eyeshadow and lipstick.",
+            "./benchmarks/image_gen/GEdit/images/0424_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0424_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cc7a45219bdfbaf01dc3348735c183d0",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0425": {
+        "interleave_array": [
+            "Add abs to the original photo.",
+            "./benchmarks/image_gen/GEdit/images/0425_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0425_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c96ed4ce8d74381cce77516fa3c3b6b3",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0426": {
+        "interleave_array": [
+            "Make his nose higher.",
+            "./benchmarks/image_gen/GEdit/images/0426_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0426_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ca9fc1fd8ffd6fdc77d694c5786a69db",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0427": {
+        "interleave_array": [
+            "Edit the eyes to look red and teary.",
+            "./benchmarks/image_gen/GEdit/images/0427_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0427_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "27a041f8cf96466d3fe99c2854600ed7",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0428": {
+        "interleave_array": [
+            "Make it look better.",
+            "./benchmarks/image_gen/GEdit/images/0428_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0428_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "84e1ecd02e31ff710caf92575973abb3",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0429": {
+        "interleave_array": [
+            "Remove acne and blemishes from my face, slim down my nose and face.",
+            "./benchmarks/image_gen/GEdit/images/0429_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0429_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c18b9ea3a82a132108bb19942258fae1",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0430": {
+        "interleave_array": [
+            "Give me a G-cup while keeping my face unchanged and maintaining the original proportions.",
+            "./benchmarks/image_gen/GEdit/images/0430_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0430_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "04f9da181ba94ebbb7b76206affdc4cc",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0431": {
+        "interleave_array": [
+            "Refine the chin in this photo.",
+            "./benchmarks/image_gen/GEdit/images/0431_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0431_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f598b8339d28b4c0836da4341785e605",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0432": {
+        "interleave_array": [
+            "Edit my face to remove spots, slim it down, and brighten the skin.",
+            "./benchmarks/image_gen/GEdit/images/0432_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0432_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2898a98d82023e2e558488202dc231af",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0433": {
+        "interleave_array": [
+            "Enhance my nose.",
+            "./benchmarks/image_gen/GEdit/images/0433_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0433_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "697678d3816a0fcfc357a108ae47955a",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0434": {
+        "interleave_array": [
+            "Without altering or beautifying anything else, just shape my eyebrows to suit me.",
+            "./benchmarks/image_gen/GEdit/images/0434_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0434_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "32d356cc309cfe3682305e2c3c2adfd9",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0435": {
+        "interleave_array": [
+            "Make my legs longer in the photo.",
+            "./benchmarks/image_gen/GEdit/images/0435_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0435_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "089748fdcf4c407ac479c76e0f62f8f4",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0436": {
+        "interleave_array": [
+            "Draw her with a much larger chest.",
+            "./benchmarks/image_gen/GEdit/images/0436_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0436_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6d71b1b0b6f8fb153cf031d29ba59129",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0437": {
+        "interleave_array": [
+            "Make me look 10 pounds thinner.",
+            "./benchmarks/image_gen/GEdit/images/0437_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0437_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e3ec80bb14ae5d53e19a1d5efd5921a1",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0438": {
+        "interleave_array": [
+            "Whiten my face and apply a better filter.",
+            "./benchmarks/image_gen/GEdit/images/0438_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0438_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d64299c7e5b6cda2e20b7fd0c577aba3",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0439": {
+        "interleave_array": [
+            "Add abs to this image.",
+            "./benchmarks/image_gen/GEdit/images/0439_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0439_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2e6c9f632d7c9a434011c88cc1e3c8d8",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0440": {
+        "interleave_array": [
+            "Edit the image to give me visible abs.",
+            "./benchmarks/image_gen/GEdit/images/0440_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0440_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7160b6370ace4b0a89a408876b48c1c4",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0441": {
+        "interleave_array": [
+            "Make my hair longer.",
+            "./benchmarks/image_gen/GEdit/images/0441_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0441_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c0887ad7bc9f207f3acf198fc2a2e4aa",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0442": {
+        "interleave_array": [
+            "Transform the original photo into a youthful and stylish version.",
+            "./benchmarks/image_gen/GEdit/images/0442_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0442_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "535fc24a4f6446999ac202e6e2eab72c",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0443": {
+        "interleave_array": [
+            "Adjust my face size to be symmetrical, and make my eyes the same size.",
+            "./benchmarks/image_gen/GEdit/images/0443_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0443_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b6669ad585437d790d56c9d51812ce73",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0444": {
+        "interleave_array": [
+            "Make my eyes bigger.",
+            "./benchmarks/image_gen/GEdit/images/0444_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0444_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5681bb73bf53906dfe4e7376be42d981",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0445": {
+        "interleave_array": [
+            "Generate a version where the girl on the right looks slimmer.",
+            "./benchmarks/image_gen/GEdit/images/0445_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0445_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "aa5b5375f7ead439732b0979fab353b6",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0446": {
+        "interleave_array": [
+            "This is my photo\u2014please make me look more handsome.",
+            "./benchmarks/image_gen/GEdit/images/0446_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0446_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "038013b7852ce014b254effb307ec5de",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0447": {
+        "interleave_array": [
+            "Feminize my appearance.",
+            "./benchmarks/image_gen/GEdit/images/0447_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0447_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3b0f6ca611bfa2f2416bf7ade7f60811",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0448": {
+        "interleave_array": [
+            "Make me look more masculine.",
+            "./benchmarks/image_gen/GEdit/images/0448_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0448_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "38121c6cc4479c8a4fbd0d888bb79f1d",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0449": {
+        "interleave_array": [
+            "Enhance this photo to make me look better.",
+            "./benchmarks/image_gen/GEdit/images/0449_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0449_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "56287939cfa47505f0cc400430ae4131",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0450": {
+        "interleave_array": [
+            "Retouch this image.",
+            "./benchmarks/image_gen/GEdit/images/0450_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0450_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c9101db419bbcd1b258ed367dc09b986",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0451": {
+        "interleave_array": [
+            "Make me look handsome.",
+            "./benchmarks/image_gen/GEdit/images/0451_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0451_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4b7e3f9099377e3823c1c3e0d924883a",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0452": {
+        "interleave_array": [
+            "Make me look as good as possible.",
+            "./benchmarks/image_gen/GEdit/images/0452_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0452_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3547f6c2021822fb3f480595a44679bd",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0453": {
+        "interleave_array": [
+            "Make me look 20 years younger.",
+            "./benchmarks/image_gen/GEdit/images/0453_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0453_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e16b35649536eed0fecef4c7704b228b",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0454": {
+        "interleave_array": [
+            "Make my face look better.",
+            "./benchmarks/image_gen/GEdit/images/0454_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0454_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f81d8419a96bac5878844b85e21a938c",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0455": {
+        "interleave_array": [
+            "Change my face shape to a round one.",
+            "./benchmarks/image_gen/GEdit/images/0455_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0455_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8195e9e69612be9bae6cad135bb94840",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0456": {
+        "interleave_array": [
+            "Transform me into a Korean-style handsome guy.",
+            "./benchmarks/image_gen/GEdit/images/0456_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0456_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d405f724329102f5171bdcc915177e35",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0457": {
+        "interleave_array": [
+            "Edit this photo\u2014slim the waist and lift the butt.",
+            "./benchmarks/image_gen/GEdit/images/0457_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0457_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2dd8b5fb8e22905ed49d87660eb82ee0",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0458": {
+        "interleave_array": [
+            "Make me look more attractive.",
+            "./benchmarks/image_gen/GEdit/images/0458_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0458_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ce9fa032e29c8f6418f4cab41e068fcb",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0459": {
+        "interleave_array": [
+            "Generate my adult appearance.",
+            "./benchmarks/image_gen/GEdit/images/0459_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0459_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a50f15973f0f4fcf88c8badcab58e86a",
+            "task_type": "ps_human",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0520": {
+        "interleave_array": [
+            "Convert the image to a Japanese manga style.",
+            "./benchmarks/image_gen/GEdit/images/0520_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0520_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "15f01f7d55ad4f8695218594277e451f",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0521": {
+        "interleave_array": [
+            "Apply the art style of Hayao Miyazaki's animated films.",
+            "./benchmarks/image_gen/GEdit/images/0521_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0521_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4c14aafd852de6b46042af27c98c6a27",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0522": {
+        "interleave_array": [
+            "Switch to an American cartoon animation effect.",
+            "./benchmarks/image_gen/GEdit/images/0522_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0522_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fd82e6b33505ba16b6c64cfff4ea3895",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0523": {
+        "interleave_array": [
+            "Render with Pixar Animation Studios' 3D texture.",
+            "./benchmarks/image_gen/GEdit/images/0523_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0523_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c51e5e8de213d991bcab4513da06c885",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0524": {
+        "interleave_array": [
+            "Transform into a Japanese anime visual style.",
+            "./benchmarks/image_gen/GEdit/images/0524_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0524_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6b63968c28b15a0bb25b6f93056811c8",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0525": {
+        "interleave_array": [
+            "Change the image style to a high-contrast look.",
+            "./benchmarks/image_gen/GEdit/images/0525_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0525_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9c72e227cbe037de86345d2d79652e5d",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0526": {
+        "interleave_array": [
+            "Replace the image style with a 3D effect.",
+            "./benchmarks/image_gen/GEdit/images/0526_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0526_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "74d28960183c3490877d0da05b4ced6b",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0527": {
+        "interleave_array": [
+            "Adjust the image style to a bubble-like aesthetic.",
+            "./benchmarks/image_gen/GEdit/images/0527_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0527_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2c9c0664af586124f7fa0108419dc9b3",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0528": {
+        "interleave_array": [
+            "Add vintage film grain and faded effects.",
+            "./benchmarks/image_gen/GEdit/images/0528_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0528_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5fb4494003bb1a55335de8b9ec954f29",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0529": {
+        "interleave_array": [
+            "Simulate the texture of clay stop-motion animation.",
+            "./benchmarks/image_gen/GEdit/images/0529_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0529_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d124236c63ab27f157dd7ffdf6d9cc4b",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0530": {
+        "interleave_array": [
+            "Present the brushstroke characteristics of digital painting.",
+            "./benchmarks/image_gen/GEdit/images/0530_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0530_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1484ee9ef9a7f142aff7856a0edb276b",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0531": {
+        "interleave_array": [
+            "Replace the image style with fantasy art.",
+            "./benchmarks/image_gen/GEdit/images/0531_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0531_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "04a128c7a483127cf7ad491c56de17d5",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0532": {
+        "interleave_array": [
+            "Change the image style to a Mondrian-inspired look.",
+            "./benchmarks/image_gen/GEdit/images/0532_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0532_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d6ac5062a555a85235c762972d5277fe",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0533": {
+        "interleave_array": [
+            "Adjust the image style to a watercolor effect.",
+            "./benchmarks/image_gen/GEdit/images/0533_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0533_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4e62777f17329aff2906ff86a217ccb6",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0534": {
+        "interleave_array": [
+            "Modify the image style into line art.",
+            "./benchmarks/image_gen/GEdit/images/0534_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0534_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d99c96ce517e4588da92570b178e8906",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0535": {
+        "interleave_array": [
+            "Transform the image into a retro aesthetic.",
+            "./benchmarks/image_gen/GEdit/images/0535_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0535_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a69d50a3ae88a3405928ebd3f05d60eb",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0536": {
+        "interleave_array": [
+            "Switch the image style to neon-punk.",
+            "./benchmarks/image_gen/GEdit/images/0536_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0536_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "45d80a2eb5f3804b79cc274526e60c02",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0537": {
+        "interleave_array": [
+            "Convert the image style to pixel art.",
+            "./benchmarks/image_gen/GEdit/images/0537_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0537_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f1a9872a150e9dd0850744bd0effe17d",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0538": {
+        "interleave_array": [
+            "Adjust the image style to a gothic theme.",
+            "./benchmarks/image_gen/GEdit/images/0538_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0538_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "53d3ea8751230795aaa6ce6bf85669dc",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0539": {
+        "interleave_array": [
+            "Use abstract color blocks and lines to express the composition.",
+            "./benchmarks/image_gen/GEdit/images/0539_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0539_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d3f8877981d10b5a003e6223531c71eb",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0540": {
+        "interleave_array": [
+            "Turn it into a cartoon image.",
+            "./benchmarks/image_gen/GEdit/images/0540_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0540_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9a6b59107a07bbe528614eff11b697b9",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0541": {
+        "interleave_array": [
+            "Please transform this photo into a Cubist style.",
+            "./benchmarks/image_gen/GEdit/images/0541_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0541_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "45a4db4e153c28c6a04bc2c642c0c12c",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0542": {
+        "interleave_array": [
+            "Convert it into a graffiti style.",
+            "./benchmarks/image_gen/GEdit/images/0542_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0542_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c00a2045d8125220281c879fd556c528",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0543": {
+        "interleave_array": [
+            "Switch the image to a high-resolution Impressionist painting.",
+            "./benchmarks/image_gen/GEdit/images/0543_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0543_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "739b6810796e2c0179073e600004b764",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0544": {
+        "interleave_array": [
+            "Transform this image into a Pointillist artwork.",
+            "./benchmarks/image_gen/GEdit/images/0544_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0544_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "af2db8f036e553f782d1ed6573c68fb7",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0545": {
+        "interleave_array": [
+            "Change it to a Pop Art style.",
+            "./benchmarks/image_gen/GEdit/images/0545_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0545_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "727fd5249ae60ae0279432babea49584",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0546": {
+        "interleave_array": [
+            "Adjust the image style to an oil painting with bold brushstrokes.",
+            "./benchmarks/image_gen/GEdit/images/0546_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0546_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "27e4b539c6355586fe2935d6a90bba61",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0547": {
+        "interleave_array": [
+            "Turn this image into a steampunk aesthetic.",
+            "./benchmarks/image_gen/GEdit/images/0547_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0547_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "22b24d3d01e3b64ec59ae5f8e3c170fa",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0548": {
+        "interleave_array": [
+            "Modify it into an Impressionist oil painting.",
+            "./benchmarks/image_gen/GEdit/images/0548_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0548_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9cf1146672c2b9e1d4ce37b9dbb3fbda",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0549": {
+        "interleave_array": [
+            "Transform it into an oil painting style.",
+            "./benchmarks/image_gen/GEdit/images/0549_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0549_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a5f07e015eeb284665b72240e853baa5",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0550": {
+        "interleave_array": [
+            "Change this image to a Pixar style with a background of a vibrant spring park while keeping the character unchanged.",
+            "./benchmarks/image_gen/GEdit/images/0550_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0550_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "23f17387da2ea2e6817c2204417195ff",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0551": {
+        "interleave_array": [
+            "Convert to a watercolor painting style.",
+            "./benchmarks/image_gen/GEdit/images/0551_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0551_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bd033dd036c1f2e6424ceb3fd9f90dbd",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0552": {
+        "interleave_array": [
+            "Please change this photo into a *Genshin Impact* style.",
+            "./benchmarks/image_gen/GEdit/images/0552_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0552_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f709057fa095f9cb9426f4c3cc783822",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0553": {
+        "interleave_array": [
+            "Generate a Pixar-style animation with a cheerful spring background.",
+            "./benchmarks/image_gen/GEdit/images/0553_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0553_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "41a20c339a3c73c6cc5200d03d7ff4a1",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0554": {
+        "interleave_array": [
+            "Modify it into a digital illustration style.",
+            "./benchmarks/image_gen/GEdit/images/0554_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0554_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9b7e39332a893401bdd50a09b75ecd3e",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0555": {
+        "interleave_array": [
+            "Generate a gothic-style image.",
+            "./benchmarks/image_gen/GEdit/images/0555_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0555_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f4dfe5ffe8cb955f41ac3858dd6ac5d7",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0556": {
+        "interleave_array": [
+            "Switch the image to a minimalist aesthetic.",
+            "./benchmarks/image_gen/GEdit/images/0556_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0556_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5788cde5a601a266804107209de8ee4c",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0557": {
+        "interleave_array": [
+            "Recreate it in the style of Monet.",
+            "./benchmarks/image_gen/GEdit/images/0557_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0557_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7327c91f9d675ec5e69712972f49a06c",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0558": {
+        "interleave_array": [
+            "Generate a monochrome-style animation.",
+            "./benchmarks/image_gen/GEdit/images/0558_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0558_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a50fdf85f87b7a11acc92335eaba1b6c",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0559": {
+        "interleave_array": [
+            "Create an artwork in a tribal aesthetic.",
+            "./benchmarks/image_gen/GEdit/images/0559_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0559_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1f08678913cf5274ac110ee34ef2b8d8",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0560": {
+        "interleave_array": [
+            "Turn the image into an American comic style.",
+            "./benchmarks/image_gen/GEdit/images/0560_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0560_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "07fc2fa9b1bbee0e9e37421fe3a6576b",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0561": {
+        "interleave_array": [
+            "Make it anime-style.",
+            "./benchmarks/image_gen/GEdit/images/0561_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0561_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "35825b1573a735192ee20541255a0e87",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0562": {
+        "interleave_array": [
+            "Generate a collage-style artwork.",
+            "./benchmarks/image_gen/GEdit/images/0562_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0562_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2b2bdf9401c83ea114677520a589e383",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0563": {
+        "interleave_array": [
+            "Please change this image into a manga style.",
+            "./benchmarks/image_gen/GEdit/images/0563_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0563_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "508064bc4a75a2e2d065ee06ff93eb44",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0564": {
+        "interleave_array": [
+            "Edit this photo to have a Fuji-style aesthetic.",
+            "./benchmarks/image_gen/GEdit/images/0564_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0564_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "acb15686bbb0df11a35eb9b6a8c9062a",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0565": {
+        "interleave_array": [
+            "Create a dark-themed version.",
+            "./benchmarks/image_gen/GEdit/images/0565_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0565_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8d7df9ad8365c7bb1196275184327d2d",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0566": {
+        "interleave_array": [
+            "Transform the buildings in the image into a colorful, cute, anime-style landscape full of blooming flowers.",
+            "./benchmarks/image_gen/GEdit/images/0566_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0566_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cd36d9862ef442202ea6d3ffc5b8e8dd",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0567": {
+        "interleave_array": [
+            "Convert this image into an anime style.",
+            "./benchmarks/image_gen/GEdit/images/0567_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0567_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "599dbcd5dd042cec90da287aa11414ce",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0568": {
+        "interleave_array": [
+            "Generate a pixel-art avatar based on this girl's photo.",
+            "./benchmarks/image_gen/GEdit/images/0568_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0568_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "53eeb1cef5005f5e54b87b8e30c3a85b",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0569": {
+        "interleave_array": [
+            "Generate an ink wash painting-style image.",
+            "./benchmarks/image_gen/GEdit/images/0569_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0569_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "84111b91ee328a9c2fc1cf7342b84480",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0570": {
+        "interleave_array": [
+            "Modify this image in a Ghibli style.",
+            "./benchmarks/image_gen/GEdit/images/0570_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0570_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "acda14bb5323b325128b624ecc6652c0",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0571": {
+        "interleave_array": [
+            "Convert to an ink wash painting style.",
+            "./benchmarks/image_gen/GEdit/images/0571_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0571_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2928aac848e0c414823b1b4c2c144ad5",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0572": {
+        "interleave_array": [
+            "Switch to a Ghibli style.",
+            "./benchmarks/image_gen/GEdit/images/0572_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0572_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ddf9a3b77759783bb6d2b96453f8454b",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0573": {
+        "interleave_array": [
+            "Generate a Pixar-style animation.",
+            "./benchmarks/image_gen/GEdit/images/0573_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0573_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3a95a56fdbaed9acedd16d650dcc41ac",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0574": {
+        "interleave_array": [
+            "Convert the young man and woman in the first image into chibi-style characters similar to those in the second image.",
+            "./benchmarks/image_gen/GEdit/images/0574_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0574_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "99ccd2da145b7d7ca7670e407cb3bef7",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0575": {
+        "interleave_array": [
+            "Transform it into a Ghibli style.",
+            "./benchmarks/image_gen/GEdit/images/0575_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0575_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "af2400e4008497474a3ded8fbbb3acc0",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0576": {
+        "interleave_array": [
+            "Make it an oil painting.",
+            "./benchmarks/image_gen/GEdit/images/0576_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0576_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1ae5cde224b986b5e4e9d5bdcf44321e",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0577": {
+        "interleave_array": [
+            "Edit this image into a bright and sunny style for use as an avatar.",
+            "./benchmarks/image_gen/GEdit/images/0577_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0577_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f50278651439a107a2ff7e1b6f76ff08",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0578": {
+        "interleave_array": [
+            "Generate a cyberpunk-style photo.",
+            "./benchmarks/image_gen/GEdit/images/0578_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0578_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1021ff6859a5be7b3955a1fc8d1a9431",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0579": {
+        "interleave_array": [
+            "Redraw it as a chibi-style illustration.",
+            "./benchmarks/image_gen/GEdit/images/0579_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0579_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ae112c98cae0bfd203af4da8ee3ad54f",
+            "task_type": "style_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0640": {
+        "interleave_array": [
+            "add a book in her hand",
+            "./benchmarks/image_gen/GEdit/images/0640_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0640_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "462493728c812ed7b9a46d35bd923d34",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0641": {
+        "interleave_array": [
+            "add a poolside lounge chair",
+            "./benchmarks/image_gen/GEdit/images/0641_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0641_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "25631c283bd12713900c693009b1c4ca",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0642": {
+        "interleave_array": [
+            "add a spoon next to the bowl",
+            "./benchmarks/image_gen/GEdit/images/0642_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0642_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3d00688dfbae3a417d9fdb0599c31612",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0643": {
+        "interleave_array": [
+            "include a table with a plate",
+            "./benchmarks/image_gen/GEdit/images/0643_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0643_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cbb6a1442c20bc7d006fc52f57a7e069",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0644": {
+        "interleave_array": [
+            "add a bookshelf in the corner",
+            "./benchmarks/image_gen/GEdit/images/0644_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0644_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8a116404e2af4eb50871e06263c518b0",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0645": {
+        "interleave_array": [
+            "add a flying baseball coming towards the player",
+            "./benchmarks/image_gen/GEdit/images/0645_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0645_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7fa258492c40546a1412b2e24f283e5f",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0646": {
+        "interleave_array": [
+            "add a beautiful ring on the finger",
+            "./benchmarks/image_gen/GEdit/images/0646_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0646_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ab8c8482e5621349ffcaf7b73a3898d6",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0647": {
+        "interleave_array": [
+            "add a candle on top of the cake",
+            "./benchmarks/image_gen/GEdit/images/0647_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0647_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b9c37aa4bbba0d3603d3d3d6b2472f44",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0648": {
+        "interleave_array": [
+            "add a palm tree behind him",
+            "./benchmarks/image_gen/GEdit/images/0648_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0648_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "92bb99012b775fec11f9c61eb22340e5",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0649": {
+        "interleave_array": [
+            "add a hot air balloon in the sky",
+            "./benchmarks/image_gen/GEdit/images/0649_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0649_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "41fa4cf4dabc709bc1a04b273b801471",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0650": {
+        "interleave_array": [
+            "add a cat sitting in the basket",
+            "./benchmarks/image_gen/GEdit/images/0650_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0650_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "82f0d5a60f6a14fcbf7a0828d93b2c42",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0651": {
+        "interleave_array": [
+            "add a butterfly fluttering around the cat",
+            "./benchmarks/image_gen/GEdit/images/0651_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0651_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "38e83bc17f011f6fe380618f5edc9af4",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0652": {
+        "interleave_array": [
+            "add a red cherry on top of pizza",
+            "./benchmarks/image_gen/GEdit/images/0652_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0652_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "24365500c3f8cef08832d25e00ae03cb",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0653": {
+        "interleave_array": [
+            "include a dog running alongside",
+            "./benchmarks/image_gen/GEdit/images/0653_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0653_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6d36627582330fe77f4726604d362dc8",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0654": {
+        "interleave_array": [
+            "include a candle on top of the cake",
+            "./benchmarks/image_gen/GEdit/images/0654_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0654_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5792877c20ccb8c8dfa7a2e3ea570c86",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0655": {
+        "interleave_array": [
+            "include a butterfly landing on its mane",
+            "./benchmarks/image_gen/GEdit/images/0655_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0655_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d23b4b1af3a519ef6c3d5deb2ae171fd",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0656": {
+        "interleave_array": [
+            "add a tennis ball flying towards her",
+            "./benchmarks/image_gen/GEdit/images/0656_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0656_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4c2bf9769840edd7015a3cbea40f10cc",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0657": {
+        "interleave_array": [
+            "add a tennis ball next to the dog",
+            "./benchmarks/image_gen/GEdit/images/0657_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0657_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "071bd732edfb657a3baf47a13477c0ff",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0658": {
+        "interleave_array": [
+            "Add a running horse near the train",
+            "./benchmarks/image_gen/GEdit/images/0658_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0658_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c884913a9bec1ac33d16e85b252c39c5",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0659": {
+        "interleave_array": [
+            "add a person standing next to the bus",
+            "./benchmarks/image_gen/GEdit/images/0659_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0659_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e9ac3ec18e91f8bf73b340de1c2e459e",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0660": {
+        "interleave_array": [
+            "Add an image of Naruto on the left side.",
+            "./benchmarks/image_gen/GEdit/images/0660_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0660_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "91ab4a87f04b6e652fe4e0bfba31ddc3",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0661": {
+        "interleave_array": [
+            "Add a robot bird in the sky.",
+            "./benchmarks/image_gen/GEdit/images/0661_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0661_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3cac5f0141378133b6c02c69bb7349fc",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0662": {
+        "interleave_array": [
+            "Light the candle to enhance the candlelight.",
+            "./benchmarks/image_gen/GEdit/images/0662_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0662_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "853784745a3c52dcfd24cf3a8dba1f56",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0663": {
+        "interleave_array": [
+            "Add a pair of sunglasses in a cool style.",
+            "./benchmarks/image_gen/GEdit/images/0663_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0663_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "707d83474b3e137e378c02b23ee414ae",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0664": {
+        "interleave_array": [
+            "Can you Photoshop a girlfriend for me? Sitting alone is boring, and I\u2019ve already left space for her.",
+            "./benchmarks/image_gen/GEdit/images/0664_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0664_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ba40f1be938b26d2ddec5cb966453720",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0665": {
+        "interleave_array": [
+            "Can you Photoshop a boyfriend for me? I want a couple\u2019s photo.",
+            "./benchmarks/image_gen/GEdit/images/0665_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0665_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f35092d58408ce805d5778fd13ad950c",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0666": {
+        "interleave_array": [
+            "Add more hair to the front, making it long and soft for a gentle look.",
+            "./benchmarks/image_gen/GEdit/images/0666_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0666_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ab6798a5e2a8e04de9bdb02c9425d2a9",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0667": {
+        "interleave_array": [
+            "Add glasses in an intellectual style, giving a high-knowledgeable vibe.",
+            "./benchmarks/image_gen/GEdit/images/0667_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0667_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3ec57ad1669a3841f18e151a487bc767",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0668": {
+        "interleave_array": [
+            "Place a wine glass in the hand.",
+            "./benchmarks/image_gen/GEdit/images/0668_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0668_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9435ef3cbe961ecde654fdde42598cb1",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0669": {
+        "interleave_array": [
+            "Add a large diamond ring on the finger, making it abstract, exaggerated, and funny.",
+            "./benchmarks/image_gen/GEdit/images/0669_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0669_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "10d6161cc1bebeeb0d7c5cac99a3cafd",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0670": {
+        "interleave_array": [
+            "Can you add penguin eyes to this image?",
+            "./benchmarks/image_gen/GEdit/images/0670_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0670_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "794bc25fba24e9c7546c7ffed818fba1",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0671": {
+        "interleave_array": [
+            "Generate an image where a lit tent is added next to the appliances in the picture while keeping the original appliances intact.",
+            "./benchmarks/image_gen/GEdit/images/0671_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0671_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d211b4a29bbfc174b2ef48c6574c5dff",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0672": {
+        "interleave_array": [
+            "Add three persimmons with leaves in the bottom right corner of this painting.",
+            "./benchmarks/image_gen/GEdit/images/0672_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0672_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6a498187c524c7adb7a739413c24f185",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0673": {
+        "interleave_array": [
+            "Add clothes to the person in the image, make their gaze slightly disdainful, and change their posture to a crossed-leg position.",
+            "./benchmarks/image_gen/GEdit/images/0673_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0673_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ec55ed4412ff3a74e6e4b42b21371fb1",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0674": {
+        "interleave_array": [
+            "Add stockings.",
+            "./benchmarks/image_gen/GEdit/images/0674_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0674_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dd8355aceecda1bed1594a616b40cd11",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0675": {
+        "interleave_array": [
+            "Add firearms to the character in the image, turning them into a comedic depiction of a robber, and change the background to a bank.",
+            "./benchmarks/image_gen/GEdit/images/0675_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0675_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1f58ceef62aecf90fcca4f253c5a478b",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0676": {
+        "interleave_array": [
+            "Add a puppy to this picture leaning against the girl's legs.",
+            "./benchmarks/image_gen/GEdit/images/0676_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0676_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "720454d83c65f03eefe4cb6da5d706df",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0677": {
+        "interleave_array": [
+            "Add a beautiful woman to accompany the boyfriend in the image.",
+            "./benchmarks/image_gen/GEdit/images/0677_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0677_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ae62baf786ccbe623b41109c0bda4add",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0678": {
+        "interleave_array": [
+            "Add a young Chinese woman next to the character in the image, with a bright smile and a pure, natural look, without altering the original character.",
+            "./benchmarks/image_gen/GEdit/images/0678_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0678_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "30ecaf9734421b7085c536d7f9837ec7",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0679": {
+        "interleave_array": [
+            "Add a wool coat to the person in the image.",
+            "./benchmarks/image_gen/GEdit/images/0679_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0679_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "be980c50b5cdfb9ede40c6d71769d2c9",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0680": {
+        "interleave_array": [
+            "Add a woman with her back to the camera to the left of the man in white clothes.",
+            "./benchmarks/image_gen/GEdit/images/0680_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0680_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "abf17f7fd44b495e38da17423b1bbd49",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0681": {
+        "interleave_array": [
+            "Add a chair in the background.",
+            "./benchmarks/image_gen/GEdit/images/0681_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0681_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2155f844beb949faa389c83bd4173a6c",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0682": {
+        "interleave_array": [
+            "Add a potted green plant to the right of the sofa.",
+            "./benchmarks/image_gen/GEdit/images/0682_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0682_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "761a6bf01b28d3785d5ab04afa45e7a9",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0683": {
+        "interleave_array": [
+            "Add an elderly woman with gray hair on the right side of the image.",
+            "./benchmarks/image_gen/GEdit/images/0683_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0683_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "540034b428e3c61e8d5a59e3fbba46aa",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0684": {
+        "interleave_array": [
+            "Add a balloon decoration strip below the airplane.",
+            "./benchmarks/image_gen/GEdit/images/0684_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0684_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0523f1c34d1e3312d85b938a7c329885",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0685": {
+        "interleave_array": [
+            "Add a water bottle on the table.",
+            "./benchmarks/image_gen/GEdit/images/0685_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0685_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "69a8b2a59f6d83aab9101d895bc0e10f",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0686": {
+        "interleave_array": [
+            "Add a smiling girl.",
+            "./benchmarks/image_gen/GEdit/images/0686_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0686_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0fd3b576ec3f9873767eb7348c78ead2",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0687": {
+        "interleave_array": [
+            "Dress the girl in black shorts.",
+            "./benchmarks/image_gen/GEdit/images/0687_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0687_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5fe0c103a59eabd95012374edf3d298e",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0688": {
+        "interleave_array": [
+            "Add two small dogs sitting face-to-face in the foreground.",
+            "./benchmarks/image_gen/GEdit/images/0688_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0688_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bcb9d7a80eaf8a5f630cc78b6bce0b6c",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0689": {
+        "interleave_array": [
+            "Add a groom to the left of the bride, with the two gazing into each other\u2019s eyes.",
+            "./benchmarks/image_gen/GEdit/images/0689_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0689_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0b54f659bd2b2ecd02c1070331cd0c92",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0690": {
+        "interleave_array": [
+            "Add a golden crystal glass ball.",
+            "./benchmarks/image_gen/GEdit/images/0690_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0690_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "db01713497b149ee87ed7ef66313f122",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0691": {
+        "interleave_array": [
+            "Add a boy wearing headphones operating a laptop in front of the computer.",
+            "./benchmarks/image_gen/GEdit/images/0691_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0691_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ee87afcee5619d39abcbc36cd87391d4",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0692": {
+        "interleave_array": [
+            "Add a blue feathered helmet.",
+            "./benchmarks/image_gen/GEdit/images/0692_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0692_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4000f5cdc69f67b283228009f51133fa",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0693": {
+        "interleave_array": [
+            "Change the hair from a ponytail to naturally draping over the shoulders.",
+            "./benchmarks/image_gen/GEdit/images/0693_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0693_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0ed1540807e373893280ce44287a9838",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0694": {
+        "interleave_array": [
+            "Add a face mask to the chef's face.",
+            "./benchmarks/image_gen/GEdit/images/0694_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0694_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "30f6aa209359ab7d115d232b1313a047",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0695": {
+        "interleave_array": [
+            "Add a disposable cup on the left side of the foreground.",
+            "./benchmarks/image_gen/GEdit/images/0695_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0695_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cf2612adda4cc638132a60a2857a6cc5",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0696": {
+        "interleave_array": [
+            "Add a black short-sleeved T-shirt to the upper body of the person.",
+            "./benchmarks/image_gen/GEdit/images/0696_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0696_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "09329f999a34f60db2047904ffe6cf0b",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0697": {
+        "interleave_array": [
+            "Add a painting to the easel.",
+            "./benchmarks/image_gen/GEdit/images/0697_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0697_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "61e0b78dbfbf640f62447931c8c45a9a",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0698": {
+        "interleave_array": [
+            "Add a guitar to the girl\u2019s hands and adjust her hand and arm positions to fit the guitar.",
+            "./benchmarks/image_gen/GEdit/images/0698_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0698_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0f0d1e81bb1308e2bbc57ea3c32d5f31",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0699": {
+        "interleave_array": [
+            "Add a lit green desk lamp to the image.",
+            "./benchmarks/image_gen/GEdit/images/0699_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0699_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "815fc87e5633ec77855c468746d08773",
+            "task_type": "subject-add",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0757": {
+        "interleave_array": [
+            "remove the fedora on the bench",
+            "./benchmarks/image_gen/GEdit/images/0757_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0757_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c2330eaa58378dd8d76989053fe27cdc",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0758": {
+        "interleave_array": [
+            "remove the freight train",
+            "./benchmarks/image_gen/GEdit/images/0758_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0758_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fc610a23a5c9ac5c4a3c2cc0386bc8d2",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0759": {
+        "interleave_array": [
+            "remove the sign that measures height",
+            "./benchmarks/image_gen/GEdit/images/0759_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0759_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2d762cc12344718236b171a19417adf5",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0760": {
+        "interleave_array": [
+            "erase the stop sign",
+            "./benchmarks/image_gen/GEdit/images/0760_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0760_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ea887a20c2483ba77f0f0fdfabf83f34",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0761": {
+        "interleave_array": [
+            "remove the dog getting a haircut",
+            "./benchmarks/image_gen/GEdit/images/0761_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0761_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "79e800c9187225a020bef26413014de3",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0762": {
+        "interleave_array": [
+            "remove the stuffed animals",
+            "./benchmarks/image_gen/GEdit/images/0762_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0762_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1b05dbce0dc0e981e4eb38b27c2c0167",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0763": {
+        "interleave_array": [
+            "remove the black hat",
+            "./benchmarks/image_gen/GEdit/images/0763_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0763_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ca3ce49b08db0d75388197210fed5157",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0764": {
+        "interleave_array": [
+            "remove the person wearing all white standing on their ski's",
+            "./benchmarks/image_gen/GEdit/images/0764_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0764_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e0e6c00cd4573be9dd571854cf362d24",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0765": {
+        "interleave_array": [
+            "remove the motorcycle",
+            "./benchmarks/image_gen/GEdit/images/0765_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0765_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3d1bf910852585afdb6fe2a9c9b24d6b",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0766": {
+        "interleave_array": [
+            "remove the peanuts",
+            "./benchmarks/image_gen/GEdit/images/0766_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0766_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6d3f5e90a64806a7b3170d71a6dd0fbe",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0767": {
+        "interleave_array": [
+            "remove the frisbee",
+            "./benchmarks/image_gen/GEdit/images/0767_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0767_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "90f506d94854bce0e7cfe3d7f015c4b2",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0768": {
+        "interleave_array": [
+            "remove the meat pie",
+            "./benchmarks/image_gen/GEdit/images/0768_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0768_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fb492dc225f9ba92079731774b91ac8e",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0769": {
+        "interleave_array": [
+            "remove the pizza",
+            "./benchmarks/image_gen/GEdit/images/0769_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0769_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "834b9cd34b6c6c201ad42bb00eba10eb",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0770": {
+        "interleave_array": [
+            "remove the skateboard",
+            "./benchmarks/image_gen/GEdit/images/0770_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0770_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "856b6b373fe9f39644456d5810cb9042",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0771": {
+        "interleave_array": [
+            "remove the woman standing next to the lady in white.",
+            "./benchmarks/image_gen/GEdit/images/0771_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0771_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9e563953afc8bcce1d0ad908e47f8006",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0772": {
+        "interleave_array": [
+            "erase the zebra",
+            "./benchmarks/image_gen/GEdit/images/0772_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0772_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7bacd70f8819d2444bcf5e0676b14a67",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0773": {
+        "interleave_array": [
+            "remove the umbrella",
+            "./benchmarks/image_gen/GEdit/images/0773_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0773_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e0f2fafb11805800995f38cb327d905b",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0774": {
+        "interleave_array": [
+            "remove the woman",
+            "./benchmarks/image_gen/GEdit/images/0774_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0774_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e5407a415cc85180f2decb76a9529b6e",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0775": {
+        "interleave_array": [
+            "delete the broccoli",
+            "./benchmarks/image_gen/GEdit/images/0775_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0775_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dd328e27b6f2b6871f6be99c414717a9",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0776": {
+        "interleave_array": [
+            "erase the knife on the cutting board",
+            "./benchmarks/image_gen/GEdit/images/0776_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0776_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5d9a5910b296328accc6701096c16a5b",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0777": {
+        "interleave_array": [
+            "Remove the sword from the character\u2019s hand.",
+            "./benchmarks/image_gen/GEdit/images/0777_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0777_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ae3bf75e9abe53ab8e24052ef129d1ef",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0778": {
+        "interleave_array": [
+            "Remove the bag near the character\u2019s hand.",
+            "./benchmarks/image_gen/GEdit/images/0778_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0778_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bc8d567ee91ca1521adaa8d4a486851f",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0779": {
+        "interleave_array": [
+            "Remove the bystanders; the background is cacti.",
+            "./benchmarks/image_gen/GEdit/images/0779_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0779_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2afdc3f8ccc9191b4b5854a9c4042092",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0780": {
+        "interleave_array": [
+            "Remove the magic wand and stars.",
+            "./benchmarks/image_gen/GEdit/images/0780_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0780_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "202eccaba244927e87a17200a87e4406",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0781": {
+        "interleave_array": [
+            "Remove the white stockings.",
+            "./benchmarks/image_gen/GEdit/images/0781_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0781_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6e59f1430e90396b7e08cb869274c426",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0782": {
+        "interleave_array": [
+            "Remove the person from the image.",
+            "./benchmarks/image_gen/GEdit/images/0782_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0782_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a9036279f412391d29bfd86fccd1606c",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0783": {
+        "interleave_array": [
+            "Please remove the woman outside the car while keeping everything else intact, then re-output the image for me.",
+            "./benchmarks/image_gen/GEdit/images/0783_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0783_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "071cbc925ee5f3fcc234d72bf5fbe182",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0784": {
+        "interleave_array": [
+            "Remove the person in the middle of the image.",
+            "./benchmarks/image_gen/GEdit/images/0784_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0784_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "39b5aeaeecceb845d41bc7beaf9319a3",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0785": {
+        "interleave_array": [
+            "Remove the beard from the person while keeping everything else unchanged.",
+            "./benchmarks/image_gen/GEdit/images/0785_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0785_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dc1fb90b41f4c01c16cc351575bc9461",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0786": {
+        "interleave_array": [
+            "Remove the clutter from the photo.",
+            "./benchmarks/image_gen/GEdit/images/0786_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0786_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8e02b0e258dcb5c6af860af239ea35be",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0787": {
+        "interleave_array": [
+            "Remove all the people.",
+            "./benchmarks/image_gen/GEdit/images/0787_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0787_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "be1f3d0f398433eaf3f9cf9a931402a3",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0788": {
+        "interleave_array": [
+            "Remove the red section at the bottom of the image.",
+            "./benchmarks/image_gen/GEdit/images/0788_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0788_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1e62ce17d24426272c11bd47906afba7",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0789": {
+        "interleave_array": [
+            "Enhance this image by removing the distant power lines while maintaining a realistic style.",
+            "./benchmarks/image_gen/GEdit/images/0789_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0789_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cd5e2a6dd0f762849943fede284c4516",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0790": {
+        "interleave_array": [
+            "Remove the bangs.",
+            "./benchmarks/image_gen/GEdit/images/0790_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0790_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b5729b790593f7065bf7ae2f7674c1e2",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0791": {
+        "interleave_array": [
+            "Remove the snake pattern from the image.",
+            "./benchmarks/image_gen/GEdit/images/0791_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0791_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1110b6bc43aa5a3037467f5833ece3c5",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0792": {
+        "interleave_array": [
+            "Remove the phone from the person\u2019s hand.",
+            "./benchmarks/image_gen/GEdit/images/0792_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0792_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "56b9020b542342972a3a796ee802ca95",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0793": {
+        "interleave_array": [
+            "Remove the black railing.",
+            "./benchmarks/image_gen/GEdit/images/0793_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0793_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ddb40f4b94e2f60fc92815cfd89546c7",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0794": {
+        "interleave_array": [
+            "Delete the man wearing a white robe.",
+            "./benchmarks/image_gen/GEdit/images/0794_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0794_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f9418062962373f7646b359d32c2526a",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0795": {
+        "interleave_array": [
+            "Remove the person from the top of the boat cabin.",
+            "./benchmarks/image_gen/GEdit/images/0795_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0795_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "23663821d9be6aba1d8100daffdb15cb",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0796": {
+        "interleave_array": [
+            "Remove the glasses.",
+            "./benchmarks/image_gen/GEdit/images/0796_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0796_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "64c8ea3004fb830615591121c1cebe6a",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0797": {
+        "interleave_array": [
+            "Delete the computer.",
+            "./benchmarks/image_gen/GEdit/images/0797_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0797_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "08a671d4c5067a5659ea84cded659fd8",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0798": {
+        "interleave_array": [
+            "Remove the music stand and sheet music from the stage.",
+            "./benchmarks/image_gen/GEdit/images/0798_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0798_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8752867086a665d8889f8134703d92b0",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0799": {
+        "interleave_array": [
+            "Remove the railing in the background.",
+            "./benchmarks/image_gen/GEdit/images/0799_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0799_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "99cc761cfc9689c2ffff606e41832371",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0800": {
+        "interleave_array": [
+            "Remove the bouquet from the left foreground.",
+            "./benchmarks/image_gen/GEdit/images/0800_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0800_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "600884cc7c2be67d2ecf5517fea512c2",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0801": {
+        "interleave_array": [
+            "Delete the pen from the woman\u2019s hand in the foreground.",
+            "./benchmarks/image_gen/GEdit/images/0801_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0801_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cf3daa2e86bcd3cc867204e5edb938bf",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0802": {
+        "interleave_array": [
+            "Remove the bicycle between the two people.",
+            "./benchmarks/image_gen/GEdit/images/0802_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0802_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c59adebb1dc6ecf43d658c1a4b7674ee",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0803": {
+        "interleave_array": [
+            "Remove the girl inside the window.",
+            "./benchmarks/image_gen/GEdit/images/0803_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0803_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fe511143bd74c0262075af599364dbcc",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0804": {
+        "interleave_array": [
+            "Remove the person in the distance wearing red clothes and a green backpack.",
+            "./benchmarks/image_gen/GEdit/images/0804_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0804_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2ebb31e51f0de8bba7005352df7150b8",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0805": {
+        "interleave_array": [
+            "Remove the Christmas tree on the left side of the image.",
+            "./benchmarks/image_gen/GEdit/images/0805_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0805_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ce693a75c84f7c0f2975c6cc50f2af9e",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0806": {
+        "interleave_array": [
+            "Remove the sticky notes next to the monitor.",
+            "./benchmarks/image_gen/GEdit/images/0806_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0806_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f5d8129b33eaf3adcaad19ba2d471529",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0807": {
+        "interleave_array": [
+            "Delete the red headscarf.",
+            "./benchmarks/image_gen/GEdit/images/0807_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0807_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "17f4fec8b5c096e4ee3e8f168f93d05d",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0808": {
+        "interleave_array": [
+            "Remove the red dumbbell.",
+            "./benchmarks/image_gen/GEdit/images/0808_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0808_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cdafebec5ad18102bb9f2f9eccac053f",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0809": {
+        "interleave_array": [
+            "Remove the two bottles of alcohol on the left side of the image.",
+            "./benchmarks/image_gen/GEdit/images/0809_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0809_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "774befadbd2459f532eb3e9bc2dca051",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0810": {
+        "interleave_array": [
+            "Delete the white fence.",
+            "./benchmarks/image_gen/GEdit/images/0810_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0810_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8d0bc807846ac304d0b02ac5588c646d",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0811": {
+        "interleave_array": [
+            "Remove the elderly man wearing glasses",
+            "./benchmarks/image_gen/GEdit/images/0811_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0811_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8853ec3095105930363c6c8c988f55e6",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0812": {
+        "interleave_array": [
+            "Delete the standing person wearing glasses.",
+            "./benchmarks/image_gen/GEdit/images/0812_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0812_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "16e59a89f37b5603a41fc60e2912a325",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0813": {
+        "interleave_array": [
+            "Remove the bracelets and wristbands from the woman\u2019s hand.",
+            "./benchmarks/image_gen/GEdit/images/0813_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0813_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "453c8005733dd2b902343f7577818c7b",
+            "task_type": "subject-remove",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0874": {
+        "interleave_array": [
+            "Replace the dog with a robot.",
+            "./benchmarks/image_gen/GEdit/images/0874_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0874_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e1f901ce70f0db3eff231690b35a5e6f",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0875": {
+        "interleave_array": [
+            "Replace the TV with a bookshelf.",
+            "./benchmarks/image_gen/GEdit/images/0875_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0875_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b3143e1ac75799da45f66d12b56cf911",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0876": {
+        "interleave_array": [
+            "Replace the cat with a squirrel.",
+            "./benchmarks/image_gen/GEdit/images/0876_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0876_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "119d3796737244a5dafa7513b373f64d",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0877": {
+        "interleave_array": [
+            "Replace the pizza with a croissant.",
+            "./benchmarks/image_gen/GEdit/images/0877_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0877_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9c6128cce6d1f80b7185c7427d3e30b8",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0878": {
+        "interleave_array": [
+            "Replace the cat on the laptop with a robot.",
+            "./benchmarks/image_gen/GEdit/images/0878_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0878_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "906a0ffafbbf5f0dc51f01a4d2b7b7e3",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0879": {
+        "interleave_array": [
+            "Replace the cat with a dog.",
+            "./benchmarks/image_gen/GEdit/images/0879_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0879_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f1eb6e2131f2e664890e7e7a6c27efd5",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0880": {
+        "interleave_array": [
+            "Replace the toilet with a bathtub.",
+            "./benchmarks/image_gen/GEdit/images/0880_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0880_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b8b9b70b9e2bce018e5e0d2bad7293bc",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0881": {
+        "interleave_array": [
+            "Replace the bear with a fox.",
+            "./benchmarks/image_gen/GEdit/images/0881_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0881_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dcb09f6f95a11496ee03ea7c875ef481",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0882": {
+        "interleave_array": [
+            "Replace the bus with a truck.",
+            "./benchmarks/image_gen/GEdit/images/0882_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0882_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "02bc73fdbbb74f6ed7fd480b6b61abe8",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0883": {
+        "interleave_array": [
+            "Replace the dog with a rabbit.",
+            "./benchmarks/image_gen/GEdit/images/0883_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0883_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6c5513204cc1970d65864e87f5c9444b",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0884": {
+        "interleave_array": [
+            "Replace the school bus with a truck.",
+            "./benchmarks/image_gen/GEdit/images/0884_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0884_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "efffd701b3207dee94a8d7009bbc9c75",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0885": {
+        "interleave_array": [
+            "Replace the vase with a sculpture.",
+            "./benchmarks/image_gen/GEdit/images/0885_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0885_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "99303b3de6bf596a47ecf83b25b08db5",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0886": {
+        "interleave_array": [
+            "Replace the cat with a fish.",
+            "./benchmarks/image_gen/GEdit/images/0886_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0886_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1eaed2671a534749a7c6a02a3d3e5f82",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0887": {
+        "interleave_array": [
+            "Replace the bus with an ambulance.",
+            "./benchmarks/image_gen/GEdit/images/0887_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0887_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a88fd018f656ea701330c3fc14c1e8d0",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0888": {
+        "interleave_array": [
+            "Replace the zebra with a giraffe.",
+            "./benchmarks/image_gen/GEdit/images/0888_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0888_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "23b240c9ef94ecae6ad0b992b3d08034",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0889": {
+        "interleave_array": [
+            "Replace the cake with a pie.",
+            "./benchmarks/image_gen/GEdit/images/0889_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0889_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2d3bd0d5db244d8fc89c9348fcb07d19",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0890": {
+        "interleave_array": [
+            "Replace the bed with a sofa.",
+            "./benchmarks/image_gen/GEdit/images/0890_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0890_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d5f8ef8ebda32869bfc8b7fefc88f364",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0891": {
+        "interleave_array": [
+            "Replace the elephant with a giraffe.",
+            "./benchmarks/image_gen/GEdit/images/0891_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0891_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7a6e67b7d9c028d3d1bbef91483c26a6",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0892": {
+        "interleave_array": [
+            "Replace the eagle with a parrot.",
+            "./benchmarks/image_gen/GEdit/images/0892_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0892_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "469267bfc120943d28e93b6ecefe14af",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0893": {
+        "interleave_array": [
+            "Replace the baby with a puppy.",
+            "./benchmarks/image_gen/GEdit/images/0893_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0893_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7d15844945eb5e5dc00c219740f028d3",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0894": {
+        "interleave_array": [
+            "Replace the sword in Sasuke\u2019s hand with a rainbow unicorn hammer.",
+            "./benchmarks/image_gen/GEdit/images/0894_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0894_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ae3bf75e9abe53ab8e24052ef129d1ef",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0895": {
+        "interleave_array": [
+            "Replace the face in the picture with a blonde beauty.",
+            "./benchmarks/image_gen/GEdit/images/0895_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0895_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bc8d567ee91ca1521adaa8d4a486851f",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0896": {
+        "interleave_array": [
+            "Change the cat\u2019s collar into a bell.",
+            "./benchmarks/image_gen/GEdit/images/0896_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0896_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9d913d98a00d6b3a4088bdceb2232b89",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0897": {
+        "interleave_array": [
+            "Replace the bracelet with a jade bangle.",
+            "./benchmarks/image_gen/GEdit/images/0897_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0897_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "038b15558b1082c59f0a92e4853554aa",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0898": {
+        "interleave_array": [
+            "Change the food into seafood paella.",
+            "./benchmarks/image_gen/GEdit/images/0898_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0898_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b3f022593999c27290201be31f9e6f1b",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0899": {
+        "interleave_array": [
+            "Swap the earring for one that looks less conspicuous.",
+            "./benchmarks/image_gen/GEdit/images/0899_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0899_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "08ea4b4086ac3690f6aa0ab47d0da30f",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0900": {
+        "interleave_array": [
+            "Turn the rice into a hamburger and draw an avatar eating a burger.",
+            "./benchmarks/image_gen/GEdit/images/0900_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0900_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ab7edde74c02708a661f6861144cbe95",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0901": {
+        "interleave_array": [
+            "This is an ID photo\u2014please replace the clothing with a white dress shirt.",
+            "./benchmarks/image_gen/GEdit/images/0901_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0901_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e5c8aea1b4db7ee5e173bcc122a4ba8f",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0902": {
+        "interleave_array": [
+            "Give her a different outfit.",
+            "./benchmarks/image_gen/GEdit/images/0902_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0902_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2bf9fc7119ba64e9bb1579221e788885",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0903": {
+        "interleave_array": [
+            "Replace the food in the pot with spicy hot pot.",
+            "./benchmarks/image_gen/GEdit/images/0903_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0903_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "50bf17e2335463ccb3511f5164ed1af0",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0904": {
+        "interleave_array": [
+            "Change my hairstyle to a wolf-cut mullet.",
+            "./benchmarks/image_gen/GEdit/images/0904_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0904_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b049c18444079151e9be5a640f9fe552",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0905": {
+        "interleave_array": [
+            "Turn the sleeves into ramen and chopped scallions.",
+            "./benchmarks/image_gen/GEdit/images/0905_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0905_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0051b688bcfc65a4fc1063488eb9da0c",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0906": {
+        "interleave_array": [
+            "Change the man's hairstyle to a textured fringe cut.",
+            "./benchmarks/image_gen/GEdit/images/0906_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0906_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3de02fadd75720177fa70f42f743a48f",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0907": {
+        "interleave_array": [
+            "Dress the person in the image in a Zhongshan suit.",
+            "./benchmarks/image_gen/GEdit/images/0907_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0907_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b993cccb7c8b197175226e397a0f09a8",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0908": {
+        "interleave_array": [
+            "Give me long hair\u2014shoulder-length or waist-length.",
+            "./benchmarks/image_gen/GEdit/images/0908_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0908_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1a0f5ee01be70d234093e91bae2282d7",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0909": {
+        "interleave_array": [
+            "Turn the tree branches in the image into a witch\u2019s magic wand.",
+            "./benchmarks/image_gen/GEdit/images/0909_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0909_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "70fa37d482c2e708435366323262de90",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0910": {
+        "interleave_array": [
+            "Extract the person from the photo and dress them in a police uniform.",
+            "./benchmarks/image_gen/GEdit/images/0910_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0910_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "93e3bdd834cb2924864675b3dc5de9e5",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0911": {
+        "interleave_array": [
+            "Change the character into a male.",
+            "./benchmarks/image_gen/GEdit/images/0911_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0911_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0a92bb3a2f79ce8f79a6818701b95efa",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0912": {
+        "interleave_array": [
+            "Replace the person in the image with Spider-Man.",
+            "./benchmarks/image_gen/GEdit/images/0912_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0912_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e70f54aa5e32ff6a11a512a4b6ebc734",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0913": {
+        "interleave_array": [
+            "Keep the person in the image but replace the cat with a dinosaur.",
+            "./benchmarks/image_gen/GEdit/images/0913_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0913_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a231523c745863eb34887202481d482b",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0914": {
+        "interleave_array": [
+            "Replace the guitar in front of the fox with a can of Coca-Cola.",
+            "./benchmarks/image_gen/GEdit/images/0914_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0914_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6d45407dd52ca631efd3095a6f84c8b2",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0915": {
+        "interleave_array": [
+            "Swap the crystal in the child\u2019s hand for a glowing square box.",
+            "./benchmarks/image_gen/GEdit/images/0915_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0915_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9658ab0654630bdb7d190f3f85280793",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0916": {
+        "interleave_array": [
+            "Replace the woman in the center of the image with a black helicopter.",
+            "./benchmarks/image_gen/GEdit/images/0916_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0916_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "645a7c81e22d496f62e8bbbd58ca309e",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0917": {
+        "interleave_array": [
+            "Change the crown on the cat\u2019s head into a magician\u2019s hat.",
+            "./benchmarks/image_gen/GEdit/images/0917_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0917_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "41648ca74f9e782af1359397de7a6125",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0918": {
+        "interleave_array": [
+            "Replace the wolf with a standing bear, and have the bear hold a lightsaber with both paws.",
+            "./benchmarks/image_gen/GEdit/images/0918_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0918_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "20766bfbcee8914213e479c5845a057f",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0919": {
+        "interleave_array": [
+            "Swap the bouquet in the woman\u2019s hand for a bottle of whiskey.",
+            "./benchmarks/image_gen/GEdit/images/0919_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0919_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "19e7dd610e2151dd4576490c7ece040f",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0920": {
+        "interleave_array": [
+            "Replace the fruits in the colander with tomatoes.",
+            "./benchmarks/image_gen/GEdit/images/0920_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0920_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8e5ddf5d223b0e938f58da44752dbca7",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0921": {
+        "interleave_array": [
+            "Turn the samurai sword in the person's right hand into an axe.",
+            "./benchmarks/image_gen/GEdit/images/0921_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0921_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "76fb2038377daf0ee1fe8efc57f7d6b3",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0922": {
+        "interleave_array": [
+            "Replace the two children with a fire truck.",
+            "./benchmarks/image_gen/GEdit/images/0922_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0922_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "47ee57134632cbbfd038acd9ae870779",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0923": {
+        "interleave_array": [
+            "Swap the brown teddy bear with a bag of \"Cricket\" chips.",
+            "./benchmarks/image_gen/GEdit/images/0923_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0923_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "38424c921a89c3192404da23d54ce90d",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0924": {
+        "interleave_array": [
+            "Replace the person in the mirror wearing a white shirt with a wardrobe.",
+            "./benchmarks/image_gen/GEdit/images/0924_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0924_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0064d30c8f40ddd94fa9bc564677498e",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0925": {
+        "interleave_array": [
+            "Turn the baby\u2019s balloon into an ice cream cone.",
+            "./benchmarks/image_gen/GEdit/images/0925_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0925_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a33f7ac94c028e30e9254363bb651331",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0926": {
+        "interleave_array": [
+            "Replace the baby stroller with a large globe.",
+            "./benchmarks/image_gen/GEdit/images/0926_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0926_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "92feefc5a6c868f8e36f262a7a89f866",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0927": {
+        "interleave_array": [
+            "Change the painting on the scroll in the man\u2019s hand to a treasure map.",
+            "./benchmarks/image_gen/GEdit/images/0927_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0927_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f22a0046d07bf09f9e90b3eecb06e151",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0928": {
+        "interleave_array": [
+            "Replace the pencil in the hand of the boy wearing an orange-striped shirt with an egg.",
+            "./benchmarks/image_gen/GEdit/images/0928_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0928_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8d074f9906d22f1f4d48400fe47f74f0",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0929": {
+        "interleave_array": [
+            "Swap the two grapefruit slices for VR goggles.",
+            "./benchmarks/image_gen/GEdit/images/0929_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0929_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cef0d8358ad359678f9632380c3b5ac6",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0930": {
+        "interleave_array": [
+            "Replace the paintbrush in the foreground person's hand with a giraffe mask.",
+            "./benchmarks/image_gen/GEdit/images/0930_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0930_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3e15c35b58cee82be47ee8d927704dde",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "0931": {
+        "interleave_array": [
+            "Replace the green cup and its light-colored saucer on the table with a pen holder.",
+            "./benchmarks/image_gen/GEdit/images/0931_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0931_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "13829304257def7287613210cc82d6eb",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0932": {
+        "interleave_array": [
+            "Change the clothes of the girl on the left side of the image to a black-and-white striped dress.",
+            "./benchmarks/image_gen/GEdit/images/0932_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0932_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "619ba60ef621caf9f1412bfa7a3eb5c1",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "0933": {
+        "interleave_array": [
+            "Replace the laptop in front of the girl with a book.",
+            "./benchmarks/image_gen/GEdit/images/0933_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/0933_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "fb4ebcdc742e1eb13c99f8f56e3a0cdb",
+            "task_type": "subject-replace",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1033": {
+        "interleave_array": [
+            "Replace the text 'Google' with 'Goose'",
+            "./benchmarks/image_gen/GEdit/images/1033_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1033_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a8cd7b467259425ed1a369550b28340e",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1034": {
+        "interleave_array": [
+            "Replace the text 'me' with 'he'",
+            "./benchmarks/image_gen/GEdit/images/1034_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1034_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9083ce3121a3d62c3fe3527e874760e5",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1035": {
+        "interleave_array": [
+            "Replace the text 'LONDON' with 'ENGLAND'",
+            "./benchmarks/image_gen/GEdit/images/1035_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1035_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "680553ae15691f30c9f717b5eac38ee9",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1036": {
+        "interleave_array": [
+            "Replace the text 'THE' with 'BUT'",
+            "./benchmarks/image_gen/GEdit/images/1036_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1036_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "de0dff21bb67b813d97dbe1709282a1f",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1037": {
+        "interleave_array": [
+            "Replace the text 'DARK' with 'DUCK'",
+            "./benchmarks/image_gen/GEdit/images/1037_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1037_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a64d376914f2bed3b2383d9af0965dcb",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1038": {
+        "interleave_array": [
+            "Replace the text 'ORION' with 'OREO'",
+            "./benchmarks/image_gen/GEdit/images/1038_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1038_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9c5b7cba20568ccd142d8911e7a763c1",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1039": {
+        "interleave_array": [
+            "Replace the text 'Science' with 'Nature'",
+            "./benchmarks/image_gen/GEdit/images/1039_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1039_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9c626643de176f0b934842efe12893c1",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1040": {
+        "interleave_array": [
+            "Replace the text 'APEX' with 'CSGO'",
+            "./benchmarks/image_gen/GEdit/images/1040_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1040_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "58679ee9aa7080c0d51c33e71375689c",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1041": {
+        "interleave_array": [
+            "Replace the text 'CS' with 'VALO' and replace 'GO' with 'RANT'",
+            "./benchmarks/image_gen/GEdit/images/1041_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1041_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3bba48b36d85fb45365ee57c188b71ea",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1042": {
+        "interleave_array": [
+            "Replace the text 'VALORANT' with 'APEX'",
+            "./benchmarks/image_gen/GEdit/images/1042_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1042_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6c106095dcb4224b3b3d74b4f89b8389",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1043": {
+        "interleave_array": [
+            "Replace the text 'IELTS' with 'TOFEL'",
+            "./benchmarks/image_gen/GEdit/images/1043_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1043_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e7e3e2de78380531a17b1edd36807135",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1044": {
+        "interleave_array": [
+            "Replace the text 'TOFEL' with 'IELTS'",
+            "./benchmarks/image_gen/GEdit/images/1044_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1044_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ffb7319fe2b8f2d222fb4ef38575d0f7",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1045": {
+        "interleave_array": [
+            "Replace the text 'duolingo' with 'greenowl'",
+            "./benchmarks/image_gen/GEdit/images/1045_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1045_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0c8ea3779a6497c2b743c962a8231d1f",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1046": {
+        "interleave_array": [
+            "Remove the text 'FREE'",
+            "./benchmarks/image_gen/GEdit/images/1046_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1046_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4612dcee8805e9624abd52e616449ba5",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1047": {
+        "interleave_array": [
+            "Replace the text 'TRAIN' with 'PLANE'",
+            "./benchmarks/image_gen/GEdit/images/1047_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1047_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5e085566f105978483848cab2f3a7001",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1048": {
+        "interleave_array": [
+            "Replace the text 'Salmon' with 'Sandwich'",
+            "./benchmarks/image_gen/GEdit/images/1048_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1048_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "89714e3ea9345ea5483ac6d5856915fe",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1049": {
+        "interleave_array": [
+            "Remove the text from the image",
+            "./benchmarks/image_gen/GEdit/images/1049_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1049_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b43f20f42dbb3e169fabf75289627f98",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1050": {
+        "interleave_array": [
+            "Remove the text from the image",
+            "./benchmarks/image_gen/GEdit/images/1050_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1050_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "78ee992d292a3153df4d8d351f6256da",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1051": {
+        "interleave_array": [
+            "Replace the text 'KONG' with 'JING'",
+            "./benchmarks/image_gen/GEdit/images/1051_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1051_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e3ca3edfa90133957ec703f6d54b293a",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1052": {
+        "interleave_array": [
+            "Replace the text 'lululemon' with 'lelolelol'",
+            "./benchmarks/image_gen/GEdit/images/1052_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1052_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c3e2d59003688478213a86fcee494bad",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1053": {
+        "interleave_array": [
+            "Add 'SPRING' below the text '2025'",
+            "./benchmarks/image_gen/GEdit/images/1053_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1053_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "729973fc58cfbf36b439f77a9f23a175",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1054": {
+        "interleave_array": [
+            "Replace the text 'NIPS' with 'CVPR'",
+            "./benchmarks/image_gen/GEdit/images/1054_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1054_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0cf1208c4cfe6b460aaa6c4e01af30a3",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1055": {
+        "interleave_array": [
+            "Replace the text 'WOOD' with 'LAND'",
+            "./benchmarks/image_gen/GEdit/images/1055_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1055_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "73c88cc5d9741cfbc0764304bbba00ed",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1056": {
+        "interleave_array": [
+            "Replace the text 'SUMMER' with 'WINTER'",
+            "./benchmarks/image_gen/GEdit/images/1056_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1056_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0f7f956de6cdaff845d738075fa2fa16",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1057": {
+        "interleave_array": [
+            "Replace the text 'PIZZA' with 'PLAZA'",
+            "./benchmarks/image_gen/GEdit/images/1057_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1057_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "853bc02c90873ac8838e53ee11fa5ec3",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1058": {
+        "interleave_array": [
+            "Replace the text 'DARK' with 'MILK'",
+            "./benchmarks/image_gen/GEdit/images/1058_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1058_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5b8fa6ad16fa1c442f0cf33437c8b310",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1059": {
+        "interleave_array": [
+            "Replace the text 'HELL'S' with 'HEAVEN'",
+            "./benchmarks/image_gen/GEdit/images/1059_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1059_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9897da315582be46de990e313d8e4a9b",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1060": {
+        "interleave_array": [
+            "Replace the text 'BOWL' with 'FULL'",
+            "./benchmarks/image_gen/GEdit/images/1060_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1060_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "334a1052afdb33ad9aebdc24406c46be",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1061": {
+        "interleave_array": [
+            "Replace the text 'PROJECT' with 'PROMPT'",
+            "./benchmarks/image_gen/GEdit/images/1061_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1061_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7bdeb9f23a8c11688f33f968ee27be4d",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1062": {
+        "interleave_array": [
+            "Replace the text 'NATURE' with 'SCIENCE'",
+            "./benchmarks/image_gen/GEdit/images/1062_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1062_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f9f802a2b603002b098e3e7590f45661",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1063": {
+        "interleave_array": [
+            "Replace the text 'TOES' with 'NIKE'",
+            "./benchmarks/image_gen/GEdit/images/1063_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1063_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b4b77c91de77e4bd0abe2ca27853ce1f",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1064": {
+        "interleave_array": [
+            "Replace the text 'SHEFFIELD' with 'EDINBURGH'",
+            "./benchmarks/image_gen/GEdit/images/1064_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1064_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0139f41b56bc537daabf684856d2ddb5",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1065": {
+        "interleave_array": [
+            "Replace the text 'DORIT' with 'DRINK'",
+            "./benchmarks/image_gen/GEdit/images/1065_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1065_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "65657376e859f54717af9ee796759dc7",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1066": {
+        "interleave_array": [
+            "Replace the text 'CLASSIC MOJITO' with 'BABY MILKSHAKE'",
+            "./benchmarks/image_gen/GEdit/images/1066_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1066_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8ecdf91615e1599dd4a088d757fedd29",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1067": {
+        "interleave_array": [
+            "Replace the text 'SEVEN' with 'EIGHT'",
+            "./benchmarks/image_gen/GEdit/images/1067_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1067_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3e6dd180e9c8081cceae9fc8abbf9052",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1068": {
+        "interleave_array": [
+            "Add 'Written by' to the right of 'NANCY DOWD'",
+            "./benchmarks/image_gen/GEdit/images/1068_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1068_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c86d92a8647bd78f5b8e25ff05e45d03",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1069": {
+        "interleave_array": [
+            "Replace the text 'JACQUES' with 'QUEENS'",
+            "./benchmarks/image_gen/GEdit/images/1069_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1069_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d0c69c93397abe869ba14880d3933216",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1070": {
+        "interleave_array": [
+            "Replace the text 'McCONAUGHEY' with 'McDonald'",
+            "./benchmarks/image_gen/GEdit/images/1070_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1070_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "140cbdcd2cc6adccb374b62d40f41b9f",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1071": {
+        "interleave_array": [
+            "Replace the text 'chicken' with 'beef'",
+            "./benchmarks/image_gen/GEdit/images/1071_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1071_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6e8087c04fbf254fb063f1500135ea36",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1072": {
+        "interleave_array": [
+            "Change the text '23' to '45'",
+            "./benchmarks/image_gen/GEdit/images/1072_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1072_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "58d1612b7520a747c616519e94c8f48c",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1073": {
+        "interleave_array": [
+            "Replace the text 'SPA' with 'Relaxation Oasis'",
+            "./benchmarks/image_gen/GEdit/images/1073_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1073_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f58bb1fd98acc1888a7272d0d0f4f2a7",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1074": {
+        "interleave_array": [
+            "Change the text 'Bank' to 'Banks'",
+            "./benchmarks/image_gen/GEdit/images/1074_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1074_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a9ae1402abe1d2624b7fce054edd7313",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1075": {
+        "interleave_array": [
+            "Replace the text 'Ziel' with 'Porte'",
+            "./benchmarks/image_gen/GEdit/images/1075_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1075_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ba7dd356656852097eeb86c78c1faae3",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1076": {
+        "interleave_array": [
+            "Change the text 'FAIRGROUNDS' to 'PARKWAY'",
+            "./benchmarks/image_gen/GEdit/images/1076_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1076_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f00dfa62ea1474aeb985b5447dc8fa0c",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1077": {
+        "interleave_array": [
+            "Replace the text 'SNACK' with 'TREAT'",
+            "./benchmarks/image_gen/GEdit/images/1077_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1077_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "131ed8c70ef386a4edf4faefe155a8c7",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1078": {
+        "interleave_array": [
+            "Change the text 'petit' to 'grand'",
+            "./benchmarks/image_gen/GEdit/images/1078_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1078_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ac80b138c5c796517a6fdc789a7d7c2b",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1079": {
+        "interleave_array": [
+            "Replace the text '32' with '33'",
+            "./benchmarks/image_gen/GEdit/images/1079_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1079_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "92181ff38321335cfb22e96fefd03188",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1080": {
+        "interleave_array": [
+            "Change the text 'COST' to 'FREE'",
+            "./benchmarks/image_gen/GEdit/images/1080_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1080_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5efb5e25b477ca8d0f98f7774cb28ce0",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1081": {
+        "interleave_array": [
+            "Change the text 'hotwind' to 'cool breeze'",
+            "./benchmarks/image_gen/GEdit/images/1081_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1081_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5052e9399738b1d713833bf3b1b55950",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1082": {
+        "interleave_array": [
+            "Change the text 'ONTARIO' to 'ONTARO'",
+            "./benchmarks/image_gen/GEdit/images/1082_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1082_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e25b84ada0cfe0c12eaf82e0b7dbecf3",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1083": {
+        "interleave_array": [
+            "Replace the text'FERMENT' with 'delicious treats'",
+            "./benchmarks/image_gen/GEdit/images/1083_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1083_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7b61056fbc47ace12e3ae5568e0c67ed",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1084": {
+        "interleave_array": [
+            "Change the text '500' to '250'",
+            "./benchmarks/image_gen/GEdit/images/1084_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1084_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "11e198f3745e800957d19098cf29c99b",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1085": {
+        "interleave_array": [
+            "Change the text 'SCHOOL' to 'COLLEGE'",
+            "./benchmarks/image_gen/GEdit/images/1085_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1085_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "73fa218afee74ffe8cc7d1695cef644e",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1086": {
+        "interleave_array": [
+            "Replace the text 'STOP' with 'Caution'",
+            "./benchmarks/image_gen/GEdit/images/1086_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1086_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "966bd852686133547ab7283bec3293ab",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1087": {
+        "interleave_array": [
+            "Replace the text 'BAR' with 'Beach'",
+            "./benchmarks/image_gen/GEdit/images/1087_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1087_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "079b25c601b74a2da8980461e0640324",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1088": {
+        "interleave_array": [
+            "Change the text 'ESTATE TACHEN' to 'Timeless Fashion'",
+            "./benchmarks/image_gen/GEdit/images/1088_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1088_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "510f45e1134cf029ef152acee6aaf6dd",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1089": {
+        "interleave_array": [
+            "Change the text 'SNP' to 'Call me'",
+            "./benchmarks/image_gen/GEdit/images/1089_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1089_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "de1a98de20909a104b97fc444fff100d",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1090": {
+        "interleave_array": [
+            "Change the text '10000' to '18000'",
+            "./benchmarks/image_gen/GEdit/images/1090_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1090_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bf5580adf9837959761bae65e343ff1f",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1091": {
+        "interleave_array": [
+            "Replace the text '95' with '123'",
+            "./benchmarks/image_gen/GEdit/images/1091_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1091_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "99500eadb2f363c2e26fcb501972c29f",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1092": {
+        "interleave_array": [
+            "Little Yue, can you replace the character \"\u66f9\" with \"\u53f6\" inside?",
+            "./benchmarks/image_gen/GEdit/images/1092_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1092_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "365da3516f60dde11e8a362ceffceb38",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1093": {
+        "interleave_array": [
+            "Add the three characters \"\u4ff1\u4e50\u90e8\" to this image.",
+            "./benchmarks/image_gen/GEdit/images/1093_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1093_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "680b385ed8595ff5e4bfdd9fc7c5bf1a",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1094": {
+        "interleave_array": [
+            "Help me insert the words \"\u65f6\u5149\u4e0e\u4f60\u5171\u5b88\u60c5\u957f\" into the image.",
+            "./benchmarks/image_gen/GEdit/images/1094_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1094_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "be45c39b3bcc9d082051c13b5300dde1",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1095": {
+        "interleave_array": [
+            "Remove the text.",
+            "./benchmarks/image_gen/GEdit/images/1095_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1095_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d93126d2fa1e4d4a9ce9cc0cddee9826",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1096": {
+        "interleave_array": [
+            "Replace the tattoo text in the image with \"\u6b64\u751f\u4e0d\u8d1f\u4f60\".",
+            "./benchmarks/image_gen/GEdit/images/1096_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1096_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "947855a8a1e0954deaa4dc826e9519db",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1097": {
+        "interleave_array": [
+            "Based on this image, change the hidden \"New York\" text to \"ALEX\".",
+            "./benchmarks/image_gen/GEdit/images/1097_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1097_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "27dba5cccc5a6d4ca877b83eb2ca374e",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1098": {
+        "interleave_array": [
+            "Remove the text in the background.",
+            "./benchmarks/image_gen/GEdit/images/1098_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1098_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e206257f9c32db3dd3ad888865e3cefc",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1099": {
+        "interleave_array": [
+            "Can you change the text in the image to \"\u68a6\u79bb\u5f52\u65f6\"?",
+            "./benchmarks/image_gen/GEdit/images/1099_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1099_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3a9853285c981f9ec42fae7c9ba938f8",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1100": {
+        "interleave_array": [
+            "Replace the two characters \"\u8bf8\u66a8\" with \"\u6c38\u5eb7\".",
+            "./benchmarks/image_gen/GEdit/images/1100_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1100_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "bdd77c99d54bdb14bcd48ee0ee3faafa",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1101": {
+        "interleave_array": [
+            "Can you remove the white text from this image?",
+            "./benchmarks/image_gen/GEdit/images/1101_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1101_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f684d93bdc97991873726762bec1d841",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1102": {
+        "interleave_array": [
+            "I want to change the letters on this piece of clothing to \"DIOR\".",
+            "./benchmarks/image_gen/GEdit/images/1102_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1102_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7b2f2dd979ca5149d4a6145bdfa64494",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1103": {
+        "interleave_array": [
+            "Replace the \"\u62db\u8d22\u8fdb\u5b9d\" characters inside the circle with \"\u8d22\u6e90\u5e7f\u8fdb\".",
+            "./benchmarks/image_gen/GEdit/images/1103_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1103_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1bb0fbeaac87f6eff80e09d8fd409de1",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1104": {
+        "interleave_array": [
+            "I need you to change \"2024\" to \"2025\" in this image and replace the \"\u9648\" character inside the topmost heart with \"\u534e\".",
+            "./benchmarks/image_gen/GEdit/images/1104_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1104_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5b8717b2209b784940f388864d5520f3",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1105": {
+        "interleave_array": [
+            "Modify this image directly, do not generate a new one. Add the four characters \"\u5f77\u5fa8\u4e4b\u5203\" on the left side.",
+            "./benchmarks/image_gen/GEdit/images/1105_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1105_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7eb55372802dfee7167a63e02728ca0e",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1106": {
+        "interleave_array": [
+            "Remove the text from the image.",
+            "./benchmarks/image_gen/GEdit/images/1106_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1106_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f148f938d38014e46b85664ffc617457",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1107": {
+        "interleave_array": [
+            "Add the four characters \"\u677e\u524d\u4e91\u9e64\" to this image in a calligraphy style.",
+            "./benchmarks/image_gen/GEdit/images/1107_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1107_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "0ca8f7be9422a84221920fccc6df2c4c",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1108": {
+        "interleave_array": [
+            "Describe this photo and replace the text with \"\u4eba\u751f\u9760\u81ea\u5df1\uff0c\u4e09\u5206\u5929\u6ce8\u5b9a\u4e03\u5206\u9760\u6253\u62fc\" in two lines.",
+            "./benchmarks/image_gen/GEdit/images/1108_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1108_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3a016977fd14367ffc324d12e965e961",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1109": {
+        "interleave_array": [
+            "Remove the text and numbers below.",
+            "./benchmarks/image_gen/GEdit/images/1109_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1109_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "62173ca26266af1845db7de6227a2e92",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1110": {
+        "interleave_array": [
+            "Write the characters \"\u6df7\u6c8c\u7ec3\u5b9d\u51b3\" below this image.",
+            "./benchmarks/image_gen/GEdit/images/1110_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1110_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8382cb3ebe2cfce4315f9ec944ee12c2",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1111": {
+        "interleave_array": [
+            "Add the three characters \"\u82cd\u7a79\u51b3\" to this image.",
+            "./benchmarks/image_gen/GEdit/images/1111_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1111_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a095ed1661fa125363bf59a47d8e52e4",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1112": {
+        "interleave_array": [
+            "Help me change \"\u73b0\u6b63\u70ed\u64ad\u4e2d\" to \"2025\u6625\u8282\u4e0a\u6620\"",
+            "./benchmarks/image_gen/GEdit/images/1112_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1112_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "07c987a0a42790a9f5fed28a2bc6409e",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1113": {
+        "interleave_array": [
+            "Help me change the character after \"\u51b7\" to \"\u4e0d\u4e01\"",
+            "./benchmarks/image_gen/GEdit/images/1113_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1113_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "75ba6223b2ab4de7f35cc0653d1df7da",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1114": {
+        "interleave_array": [
+            "Help me change \"\u81f4\u9752\u6625\" to \"\u81f4\u5c11\u5e74\"",
+            "./benchmarks/image_gen/GEdit/images/1114_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1114_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f4f263ee11d7db0e483e1c10973e7b22",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1115": {
+        "interleave_array": [
+            "Change \"\u8fd9\u4e48\u4efb\u6027\" to \"\u8fd9\u4e48\u4efb\u610f\"",
+            "./benchmarks/image_gen/GEdit/images/1115_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1115_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "08fcf0e92aeea7e37931a6036a27174b",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1116": {
+        "interleave_array": [
+            "Add \"\u8d85\u7ea7\u5927\" before \"\u63a2\u79d8\"",
+            "./benchmarks/image_gen/GEdit/images/1116_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1116_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8ed283fe0c51659c06fd1de14420b544",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1117": {
+        "interleave_array": [
+            "Change \"\u5927\u8bdd\u897f\u6e38\" to \"\u795e\u8bdd\u61c2\u6e38\"",
+            "./benchmarks/image_gen/GEdit/images/1117_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1117_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "db250fc8f05da99e1e9e57eb3ecd3920",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1118": {
+        "interleave_array": [
+            "Help me change \"\u884c\u52a8\" to \"\u51fa\u51fb\"",
+            "./benchmarks/image_gen/GEdit/images/1118_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1118_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "a2a41ebb84be1126248d9cf65d8ed078",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1119": {
+        "interleave_array": [
+            "Delete \"\u53e4\u5821\u60ca\u9b42\"",
+            "./benchmarks/image_gen/GEdit/images/1119_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1119_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "11f41f16aebac2a07f24432b8cbeefd7",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1120": {
+        "interleave_array": [
+            "Remove \"\u6500\u5ca9\"",
+            "./benchmarks/image_gen/GEdit/images/1120_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1120_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4165fe6296cc4e75c9055794e5a13a10",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1121": {
+        "interleave_array": [
+            "Change \"\u63a2\u5bfb\" to  \"\u627e\u5230\"",
+            "./benchmarks/image_gen/GEdit/images/1121_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1121_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b485856ffb2107eaf50b4b437dacbea3",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1122": {
+        "interleave_array": [
+            "Write \"\u62ac\u9ad880\u516c\u5206\" on the underline",
+            "./benchmarks/image_gen/GEdit/images/1122_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1122_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "90c04ff64d248a824f0cd65936a99bf0",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1123": {
+        "interleave_array": [
+            "Remove \"\u79d8\u5236\u9ebb\u9ebb\u9c7c\" ",
+            "./benchmarks/image_gen/GEdit/images/1123_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1123_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ac11359a70c3950fce81d6c4a9665875",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1124": {
+        "interleave_array": [
+            "Remove the text \"\u621a\u98ce\u86cb\u7cd5\" and rewrite as \"\u9e21\u86cb\u86cb\u7cd5\"",
+            "./benchmarks/image_gen/GEdit/images/1124_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1124_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8155a4b0727c1a4176f5f70cc0810562",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1125": {
+        "interleave_array": [
+            "Write 'SMILE' on the blackboard",
+            "./benchmarks/image_gen/GEdit/images/1125_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1125_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d9ea1c0d881af0ade68721357d453c64",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1126": {
+        "interleave_array": [
+            "Add 'Top 1' in the middle, with the '1' especially large",
+            "./benchmarks/image_gen/GEdit/images/1126_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1126_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ba63cec563b66911faa74a1c3ba4850a",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1127": {
+        "interleave_array": [
+            "Change '2022' to '2024'",
+            "./benchmarks/image_gen/GEdit/images/1127_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1127_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7d283f97ec3592b175588efbd534061a",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1128": {
+        "interleave_array": [
+            "Remove 'copy chief from random house'",
+            "./benchmarks/image_gen/GEdit/images/1128_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1128_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7c7d40daf262d8e0df903af3e0917c03",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1129": {
+        "interleave_array": [
+            "Remove the text and add  \"\u6211\u7231\u4f60\"",
+            "./benchmarks/image_gen/GEdit/images/1129_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1129_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "058a3b3c422dcefddb8deb2c61ded83d",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1130": {
+        "interleave_array": [
+            "Add 'GAMES' in the center of the image",
+            "./benchmarks/image_gen/GEdit/images/1130_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1130_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7ccd6d8d72339d7c94560300dce346f4",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1131": {
+        "interleave_array": [
+            "Write  \"\u6211\u4eec\u6b22\u8fce\u4f60\" in the bottom left corner",
+            "./benchmarks/image_gen/GEdit/images/1131_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1131_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "58cf4f4a16cd16ffef55c170804be136",
+            "task_type": "text_change",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1172": {
+        "interleave_array": [
+            "change the weather to snowstorm",
+            "./benchmarks/image_gen/GEdit/images/1172_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1172_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3549b73c635e7f5d67d728bd582daffd",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1173": {
+        "interleave_array": [
+            "Apply an HDR filter to brighten the image.",
+            "./benchmarks/image_gen/GEdit/images/1173_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1173_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8168e81061f790fb34c9f4c81ed34d90",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1174": {
+        "interleave_array": [
+            "Make the image brighter.",
+            "./benchmarks/image_gen/GEdit/images/1174_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1174_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1db07f0d277222e32913bff2681faebb",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1175": {
+        "interleave_array": [
+            "change the weather to snowy",
+            "./benchmarks/image_gen/GEdit/images/1175_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1175_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d83dad4db56f5c6c1270708a74311725",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1176": {
+        "interleave_array": [
+            "change the time to nighttime",
+            "./benchmarks/image_gen/GEdit/images/1176_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1176_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ad3ad5f80040286822ec035c8fcf6c0f",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1177": {
+        "interleave_array": [
+            "change the weather to heavy rain",
+            "./benchmarks/image_gen/GEdit/images/1177_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1177_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2c7d8b151daa5920c523e40d1dda0d5e",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1178": {
+        "interleave_array": [
+            "Make the image brighter.",
+            "./benchmarks/image_gen/GEdit/images/1178_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1178_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e6b1eb3b883e718a85581c0d36727f24",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1179": {
+        "interleave_array": [
+            "change the time to night",
+            "./benchmarks/image_gen/GEdit/images/1179_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1179_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b95f07c147ce5d8afd6556b1acd5a902",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1180": {
+        "interleave_array": [
+            "make the weather snow",
+            "./benchmarks/image_gen/GEdit/images/1180_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1180_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2e77d56a387ce48d9467a73c128635c4",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1181": {
+        "interleave_array": [
+            "Add a background blur filter.",
+            "./benchmarks/image_gen/GEdit/images/1181_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1181_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "47e6851356aed262881f4af848b27b8b",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1182": {
+        "interleave_array": [
+            "change the season to winter",
+            "./benchmarks/image_gen/GEdit/images/1182_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1182_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "ef5b74bc64af4113749e170f4624a1e4",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1183": {
+        "interleave_array": [
+            "Make the image brighter.",
+            "./benchmarks/image_gen/GEdit/images/1183_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1183_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "b728006d225ca8acf59cb8bd958d79c4",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1184": {
+        "interleave_array": [
+            "Apply a suitable filter for this image.",
+            "./benchmarks/image_gen/GEdit/images/1184_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1184_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "6878b2aaea42391eb6d9d5a004dfba5a",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1185": {
+        "interleave_array": [
+            "change the weather to foggy",
+            "./benchmarks/image_gen/GEdit/images/1185_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1185_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "42ece5249116fbad305140e068b118b3",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1186": {
+        "interleave_array": [
+            "change the weather to snow",
+            "./benchmarks/image_gen/GEdit/images/1186_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1186_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "82713e857fa4a3972bd3bd560ad45d70",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1187": {
+        "interleave_array": [
+            "Restore and colorize this old photo in high definition.",
+            "./benchmarks/image_gen/GEdit/images/1187_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1187_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "eded7f39803839235a11c20fe72c67f5",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1188": {
+        "interleave_array": [
+            "change the time to prehistoric era",
+            "./benchmarks/image_gen/GEdit/images/1188_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1188_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "051ce492fd93f74add67a5fea2ec1f20",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1189": {
+        "interleave_array": [
+            "change the weather to snow",
+            "./benchmarks/image_gen/GEdit/images/1189_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1189_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "acd9a6d08c0a18ee251de9831251edf5",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1190": {
+        "interleave_array": [
+            "change the weather to foggy",
+            "./benchmarks/image_gen/GEdit/images/1190_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1190_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7bfbebfb0521da039f7ec26aec330ec9",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1191": {
+        "interleave_array": [
+            "Apply a filter to make the image brighter.",
+            "./benchmarks/image_gen/GEdit/images/1191_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1191_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "d231513192c28e8f14d79a41fd648e9a",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1192": {
+        "interleave_array": [
+            "Colorize this photo without altering the facial structure.",
+            "./benchmarks/image_gen/GEdit/images/1192_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1192_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "8907d6eacd7b91ee4cf8a157802a53a5",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1193": {
+        "interleave_array": [
+            "Apply a suitable filter to this image.",
+            "./benchmarks/image_gen/GEdit/images/1193_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1193_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "7bea5378467e211452fb8289e7da71be",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1194": {
+        "interleave_array": [
+            "Enhance and colorize this photo to make the subject more vivid.",
+            "./benchmarks/image_gen/GEdit/images/1194_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1194_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "277a863acdd110cc9550f16da754a93d",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1195": {
+        "interleave_array": [
+            "Restore and colorize this old photo in high definition.",
+            "./benchmarks/image_gen/GEdit/images/1195_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1195_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "1846b247da04c0fe6c63d8166e100a6a",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1196": {
+        "interleave_array": [
+            "Restore and colorize the image.",
+            "./benchmarks/image_gen/GEdit/images/1196_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1196_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "e7652e4858f7d1f3b86a0de28c6cb8c1",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1197": {
+        "interleave_array": [
+            "My photo looks a bit yellowish; please adjust the color.",
+            "./benchmarks/image_gen/GEdit/images/1197_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1197_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f57ec87ccf7bc1788dfd5be1da4dbe7a",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1198": {
+        "interleave_array": [
+            "Restore and colorize the image.",
+            "./benchmarks/image_gen/GEdit/images/1198_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1198_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "9d26293b9cc3ffd9df59117abbd9783d",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1199": {
+        "interleave_array": [
+            "Enhance it to super high quality.",
+            "./benchmarks/image_gen/GEdit/images/1199_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1199_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4b0700347e2ea2aef8f27a2cc2b9c370",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1200": {
+        "interleave_array": [
+            "Colorize the photo to make it clearer.",
+            "./benchmarks/image_gen/GEdit/images/1200_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1200_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "dbc15e88af0839a1b60801291c31b3c8",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1201": {
+        "interleave_array": [
+            "Adjust the colors to make the image look brighter.",
+            "./benchmarks/image_gen/GEdit/images/1201_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1201_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "5c01bd878c03ec2e5c6060f7a133b2f9",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1202": {
+        "interleave_array": [
+            "Enhance the clarity of this photo.",
+            "./benchmarks/image_gen/GEdit/images/1202_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1202_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "c09e9a0c550da145d9afe12c543b0048",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1203": {
+        "interleave_array": [
+            "Change the nighttime scene in the image to daytime.",
+            "./benchmarks/image_gen/GEdit/images/1203_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1203_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "94ab9306a1ea70be534d9ef36f3a19b0",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1204": {
+        "interleave_array": [
+            "Apply an HDR filter to brighten the image.",
+            "./benchmarks/image_gen/GEdit/images/1204_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1204_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "02f9043a04c5691315d5d690193c955e",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": true
+        }
+    },
+    "1205": {
+        "interleave_array": [
+            "Make this image clearer.",
+            "./benchmarks/image_gen/GEdit/images/1205_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1205_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "68afffd7d086ad91fb4d45d372418fea",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1206": {
+        "interleave_array": [
+            "Enhance the child\u2019s face in the image to make it sharper.",
+            "./benchmarks/image_gen/GEdit/images/1206_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1206_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "4167a37565e731478db17e138cbb6b8a",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1207": {
+        "interleave_array": [
+            "Apply a filter adjustment.",
+            "./benchmarks/image_gen/GEdit/images/1207_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1207_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "3053a9287013dac68056dd7aefdced02",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1208": {
+        "interleave_array": [
+            "Improve the photo's clarity and apply beautification.",
+            "./benchmarks/image_gen/GEdit/images/1208_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1208_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "220cbf1b8bf55b56873b0aec63a1e6bc",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1209": {
+        "interleave_array": [
+            "Enhance the image quality.",
+            "./benchmarks/image_gen/GEdit/images/1209_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1209_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "cea51a1910ce86cd42a393ba7417daf3",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1210": {
+        "interleave_array": [
+            "Can you restore this photo for me?",
+            "./benchmarks/image_gen/GEdit/images/1210_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1210_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "f4ecf30b68ba88536a28f899b87e5af1",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    },
+    "1211": {
+        "interleave_array": [
+            "Adjust the color of this image.",
+            "./benchmarks/image_gen/GEdit/images/1211_input_image.png",
+            "./benchmarks/image_gen/GEdit/images/1211_input_image.png"
+        ],
+        "element_dtype_array": [
+            "text",
+            "image",
+            "image"
+        ],
+        "istarget_in_interleave": [
+            0,
+            0,
+            1
+        ],
+        "additional_info": {
+            "key": "2a52ea99c0051bff29020048a0daca28",
+            "task_type": "tone_transfer",
+            "instruction_language": "en",
+            "Intersection_exist": false
+        }
+    }
+}
diff --git a/benchmarks/image_gen/GEdit/README.md b/benchmarks/image_gen/GEdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..702b4e7cb08403971f7be45d4864f9049fbad458
--- /dev/null
+++ b/benchmarks/image_gen/GEdit/README.md
@@ -0,0 +1,68 @@
+[Chinese Version](./README_zh.md)
+
+# GEdit Image Editing Evaluation
+
+Scripts for evaluating the Lance model on the GEdit image-editing benchmark.
+
+## Files
+
+- `sample_GEdit.py` - Python inference script
+- `sample_GEdit.sh` - Launch script
+- `GEdit_en.json` - Evaluation dataset
+
+## Quick Start
+
+### Basic Usage
+
+```bash
+bash benchmarks/image_gen/GEdit/sample_GEdit.sh
+```
+
+Before running, edit the "Inference Parameters" section at the top of `benchmarks/image_gen/GEdit/sample_GEdit.sh`.
+Follow the instructions at `https://github.com/stepfun-ai/Step1X-Edit` to download the GEdit-Bench source images, then place all of them in `benchmarks/image_gen/GEdit/images/`.
+
+## Parameters
+
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `image_edit` | Task type. GEdit is fixed to image editing. |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
+| `MODEL_PATH` | `downloads/Lance_3B` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GEdit/GEdit_en.json` | Path to the evaluation data. |
+
+## How To Modify
+
+- Edit the "Inference Parameters" section at the top of `benchmarks/image_gen/GEdit/sample_GEdit.sh`.
+- After updating the parameters, run `bash benchmarks/image_gen/GEdit/sample_GEdit.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+
+## Output Format
+
+Results are saved in a structure like this:
+
+```
+results/GEdit_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── fullset/
+│   ├── add/
+│   │   └── en/
+│   │       ├── 000001.webp
+│   │       └── ...
+│   └── remove/
+│       └── en/
+│           └── ...
+└── prompt.json
+```
+
+Each case generates one edited image by default and stores it as a `.webp` file under `task_type/instruction_language/key`. A `prompt.json` file is also written to record the generated text.
+
+## Notes
+
+- To switch the model, dataset, or resolution, edit the configuration section at the top of the script directly.
+- The default result directory automatically includes key parameters and a timestamp for easier experiment tracking.
diff --git a/benchmarks/image_gen/GEdit/README_zh.md b/benchmarks/image_gen/GEdit/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..5351d21cceabeba22cf69a3369653d4ab24ccab7
--- /dev/null
+++ b/benchmarks/image_gen/GEdit/README_zh.md
@@ -0,0 +1,67 @@
+[English Version](./README.md)
+
+# GEdit 图像编辑评估
+
+基于 Lance 模型的 GEdit 评估基准测试脚本。
+
+## 文件说明
+
+- `sample_GEdit.py` - 推理 Python 脚本
+- `sample_GEdit.sh` - 启动脚本
+- `GEdit_en.json` - 评估数据集
+
+## 快速开始
+
+### 基本用法
+
+```bash
+bash benchmarks/image_gen/GEdit/sample_GEdit.sh
+```
+
+运行前请直接修改 `benchmarks/image_gen/GEdit/sample_GEdit.sh` 顶部的“推理参数配置”区。
+请参考 `https://github.com/stepfun-ai/Step1X-Edit` 下载 GEdit-Bench 的源图，并将所有图片放到 `benchmarks/image_gen/GEdit/images/` 中。
+
+## 参数说明
+
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `image_edit` | 任务类型，GEdit 固定为图像编辑 |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `MODEL_PATH` | `downloads/Lance_3B` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GEdit/GEdit_en.json` | 评估数据路径 |
+
+## 修改方式
+
+- 请手动编辑 `benchmarks/image_gen/GEdit/sample_GEdit.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/image_gen/GEdit/sample_GEdit.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+
+## 保存格式
+
+结果会按照以下结构保存：
+
+```
+results/GEdit_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── fullset/
+│   ├── add/
+│   │   ├── en/
+│   │   │   ├── 000001.webp
+│   │   │   └── ...
+│   ├── remove/
+│   │   └── en/
+│   │       └── ...
+└── prompt.json
+```
+
+每个 case 默认生成 1 张编辑结果图，保存为结果目录下的 `fullset/<task_type>/<instruction_language>/<key>.webp`；同时会额外写出 `prompt.json`，记录每张已保存图片路径对应的生成文本。
+
+## 注意事项
+
+- 如果需要切换模型、数据集或分辨率，请直接修改脚本顶部配置。
+- 默认结果目录会自动包含关键参数和时间戳，方便区分不同实验。
diff --git a/benchmarks/image_gen/GEdit/sample_GEdit.py b/benchmarks/image_gen/GEdit/sample_GEdit.py
new file mode 100644
index 0000000000000000000000000000000000000000..b647041e5b3b8656c79e8ac70ee132bde738714a
--- /dev/null
+++ b/benchmarks/image_gen/GEdit/sample_GEdit.py
@@ -0,0 +1,425 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+
+import os.path as osp
+from copy import deepcopy
+import json
+from typing import Tuple, cast, Optional
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from safetensors.torch import load_file
+from PIL import Image
+from tqdm import trange
+
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.lance import LanceConfig, Lance, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from common.utils.misc import tuple_mul, AutoEncoderParams
+from common.val.utils import make_padded_latent, decode_video_tensor
+from data.datasets_custom import ValidationDataset
+from config.config_factory import ModelArguments, DataArguments, TrainingArguments, EvaluationArguments, get_model_path
+
+
+def init_from_vlm_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments, log_rank0):
+    def load_safetensors_state_dict(folder_path):
+        safetensor_files = sorted(
+            f for f in os.listdir(folder_path) if f.endswith(".safetensors")
+        )
+        state_dict = {}
+        for filename in safetensor_files:
+            file_path = osp.join(folder_path, filename)
+            state_dict.update(load_file(file_path))
+        return state_dict
+
+    state_dict = load_safetensors_state_dict(model_args.llm_path)
+
+    for k in list(state_dict.keys()):
+        if "visual" in k:
+            state_dict[k.replace("visual", "vit_model")] = state_dict.pop(k)
+        else:
+            state_dict["language_model." + k] = state_dict.pop(k)
+
+    result = model.load_state_dict(state_dict, strict=False)
+    del state_dict
+    import gc; gc.collect(); torch.cuda.empty_cache()
+    return result
+
+
+def init_from_model_path_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments):
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+
+    model_path_ft = None
+    if osp.exists(model_path):
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+
+    if 'latent_pos_embed.pos_embed' in model_state_dict:
+        model_state_dict.pop('latent_pos_embed.pos_embed')
+
+    msg = model.load_state_dict(model_state_dict, strict=False)
+    del model_state_dict
+    import gc; gc.collect(); torch.cuda.empty_cache()
+    return msg
+
+
+def save_prompt_results(prompt_data_dict, save_path_gen):
+    prompt_json_path = os.path.join(save_path_gen, "prompt.json")
+    with open(prompt_json_path, 'w', encoding='utf-8') as f:
+        json.dump(prompt_data_dict, f, ensure_ascii=False, indent=2)
+
+
+def resolve_gedit_paths(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+) -> None:
+    if not model_args.model_path:
+        raise ValueError("GEdit requires --model_path to be provided explicitly.")
+
+    if not model_args.llm_path:
+        model_args.llm_path = model_args.model_path
+
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+
+    if not data_args.val_dataset_config_file:
+        data_args.val_dataset_config_file = get_model_path("gedit.data")
+
+
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    val_data_cpu: dict,
+    training_args: TrainingArguments,
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_path_gen: str = "",
+):
+    """Generate and save the edited image for one validation batch.
+
+    The batch is moved to ``device``; if it contains ``padded_videos`` they are
+    encoded into ``padded_latent``. Generation runs through
+    ``validation_gen_KVcache`` or ``validation_gen`` depending on
+    ``inference_args.use_KVcache``. For each returned latent sequence the last
+    element is decoded by the VAE and written to
+    ``<save_path_gen>/fullset/<task_type>/<instruction_language>/<key>.webp``.
+    The caption returned for that entry is stored in
+    ``inference_args.prompt_data_dict`` under the saved file path.
+
+    Args:
+        fsdp_model: Lance model used for generation.
+        vae_model: VAE used for encoding/decoding. Annotated as optional, but it
+            is called unconditionally when decoding.
+        val_data_cpu: Batch object exposing ``.cuda(device).to_dict()``.
+            NOTE(review): the ``dict`` annotation looks inaccurate -- confirm the
+            collate output type.
+        training_args: Source of sampling and CFG settings.
+        model_args: Source of ``cfg_text_scale``.
+        inference_args: Source of ``use_KVcache``; its ``prompt_data_dict`` is mutated.
+        new_token_ids: Passed through to the generation call.
+        image_token_id: Passed through to the generation call.
+        device: CUDA device index.
+        save_path_gen: Root directory for the outputs.
+
+    Raises:
+        NotImplementedError: The decoded output has a first dimension other than 1.
+    """
+    val_data = val_data_cpu.cuda(device).to_dict()
+    fsdp_model = fsdp_model.to(device=device, dtype=torch.bfloat16)
+
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        # Encode the source frames to latents only when the batch carries them.
+        if "padded_videos" in val_data.keys():
+            val_data["padded_latent"] = make_padded_latent(val_data["padded_videos"], val_data["vae_data_mode"], vae_model)
+
+        # Per-sample metadata determines the output sub-directory and file name.
+        metadata = val_data["additional_info"]
+        task_type = metadata["task_type"]
+        instruction_language = metadata["instruction_language"]
+        save_key = metadata["key"]
+        save_dir_current = os.path.join(save_path_gen, "fullset/{}/{}".format(task_type, instruction_language))
+        os.makedirs(save_dir_current, exist_ok=True)
+
+        # -------------------- GEN branch --------------------
+        # Keyword arguments shared by both generation entry points below.
+        params = {
+            "val_packed_text_ids": val_data["packed_text_ids"],
+            "val_packed_text_indexes": val_data["packed_text_indexes"],
+            "val_sample_lens": val_data["sample_lens"],
+            "val_packed_position_ids": val_data["packed_position_ids"],
+            "val_split_lens": val_data["split_lens"],
+            "val_attn_modes": val_data["attn_modes"],
+            "val_sample_N_target": val_data["sample_N_target"],
+            "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+            "timestep_shift": training_args.validation_timestep_shift,
+            "num_timesteps": training_args.validation_num_timesteps,
+            "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+            "val_padded_latent": val_data["padded_latent"],
+            "video_sizes": val_data["video_sizes"],
+            "cfg_text_scale": model_args.cfg_text_scale,
+            "cfg_interval": training_args.cfg_interval,
+            "cfg_renorm_min": training_args.cfg_renorm_min,
+            "cfg_renorm_type": training_args.cfg_renorm_type,
+            "device": device,
+            "dtype": torch.bfloat16,
+            "new_token_ids": new_token_ids,
+            "max_samples": training_args.validation_max_samples,
+            "validation_noise_seed": training_args.validation_noise_seed,
+            "apply_chat_template": training_args.apply_chat_template,
+            "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+            "image_token_id": image_token_id,
+            "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+            "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+            "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+            "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+            "video_grid_thw": val_data.get("video_grid_thw", None),
+            "caption": val_data.get("caption", None),
+            "sample_task": val_data["sample_task"],
+            "sample_modality": val_data["sample_modality"],
+            "cfg_type": training_args.cfg_type,
+            "cfg_uncond_token_id": training_args.cfg_uncond_token_id,
+            "index": val_data["index"],
+            "val_padded_videos": None,
+        }
+        if inference_args.use_KVcache:
+            denoise_latent, captions, _, _ = fsdp_model.validation_gen_KVcache(**params)
+        else:
+            denoise_latent, captions, _, _ = fsdp_model.validation_gen(**params)
+
+        # NOTE(review): save_name does not depend on i_val, so if denoise_latent
+        # ever holds more than one entry, later images overwrite earlier ones.
+        for i_val, latent in enumerate(denoise_latent):
+            # Only the last element of the latent sequence is decoded.
+            target_latent = latent[-1]
+            v_target = vae_model.vae_decode([target_latent])[0]
+
+            # An empty save_path is passed; the array is saved explicitly below.
+            v_thwc = decode_video_tensor([v_target], save_path="", save_half=False)
+
+            if v_thwc.shape[0] != 1:
+                raise NotImplementedError(
+                    "GEdit benchmark only supports image output (max_num_frames=1), "
+                    f"but got {v_thwc.shape[0]} frames."
+                )
+
+            save_name = f'{save_dir_current}/{save_key}.webp'
+            Image.fromarray(v_thwc[0]).save(save_name)
+            # Record the caption under the saved image path for prompt.json.
+            inference_args.prompt_data_dict[save_name] = captions[i_val]
+
+
+def main():
+    assert torch.cuda.is_available()
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        GLOBAL_RANK = dist.get_rank()
+        WORLD_SIZE = dist.get_world_size()
+    else:
+        GLOBAL_RANK = 0
+        WORLD_SIZE = 1
+
+    LOCAL_RANK = GLOBAL_RANK % torch.cuda.device_count()
+    DEVICE = LOCAL_RANK
+    torch.cuda.set_device(DEVICE)
+
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(
+        Tuple[ModelArguments, DataArguments, EvaluationArguments],
+        parser.parse_args_into_dataclasses(),
+    )
+    training_args = inference_args
+
+    training_args.validation_noise_seed = training_args.validation_data_seed
+
+    log_rank0 = print if GLOBAL_RANK == 0 else (lambda *_: None)
+
+    seed = training_args.global_seed * WORLD_SIZE + GLOBAL_RANK
+    set_seed(seed)
+
+    resolve_gedit_paths(model_args, data_args)
+
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = training_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = training_args.apply_qwen_2_5_vl_pos_emb
+
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+
+    if training_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+
+        del vit_weights
+        import gc; gc.collect(); torch.cuda.empty_cache()
+
+    if training_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: AutoEncoderParams = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+
+    config = LanceConfig(
+        visual_gen=training_args.visual_gen,
+        visual_und=training_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if training_args.visual_und else None,
+        vae_config=vae_config if training_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=training_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if training_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=training_args,
+    )
+    model = model.to(DEVICE)
+
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+
+    if training_args.copy_init_moe:
+        language_model.init_moe()
+
+    init_from_model_path_if_needed(model, model_args)
+
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        from common.model.hacks import hack_qwen2_5_vl_config
+        language_model = hack_qwen2_5_vl_config(language_model)
+
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+
+    if model_args.tie_word_embeddings:
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_world_embeddings 冲突'
+
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
+    model.eval()
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+
+    dataset_config = DataConfig(grouped_datasets={})
+
+    if training_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+    if training_args.visual_gen:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        vae_downsample = tuple_mul(
+            model_args.latent_patch_size, (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial)
+        )
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = vae_downsample
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+
+    val_dataset = ValidationDataset(
+        jsonl_path=data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=training_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+    )
+
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=1,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=simple_custom_collate,
+        drop_last=True,
+    )
+
+    val_loader_iter = iter(val_loader)
+
+    if not hasattr(inference_args, "prompt_data_dict"):
+        inference_args.prompt_data_dict = {}
+
+    if not os.path.exists(inference_args.save_path_gen):
+        os.makedirs(inference_args.save_path_gen)
+
+    for epoch in trange(len(val_loader), desc="Validating", unit="batch", leave=True, ncols=80, disable=(GLOBAL_RANK != 0)):
+        try:
+            val_data_cpu = next(val_loader_iter)
+        except StopIteration:
+            break
+
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            val_data_cpu=val_data_cpu,
+            training_args=training_args,
+            model_args=model_args,
+            inference_args=inference_args,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=DEVICE,
+            save_path_gen=inference_args.save_path_gen,
+        )
+
+    if dist.is_initialized():
+        dist.barrier()
+        gathered = [None for _ in range(dist.get_world_size())]
+        dist.all_gather_object(gathered, inference_args.prompt_data_dict)
+
+        if GLOBAL_RANK == 0:
+            merged = {}
+            for d in gathered:
+                merged.update(d)
+            inference_args.prompt_data_dict = merged
+            save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+
+    elif GLOBAL_RANK == 0:
+        save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+# Run inference only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/image_gen/GEdit/sample_GEdit.sh b/benchmarks/image_gen/GEdit/sample_GEdit.sh
new file mode 100644
index 0000000000000000000000000000000000000000..edfbacf2579c7ca3a64ee92ee1c4fb6e3b4bfb69
--- /dev/null
+++ b/benchmarks/image_gen/GEdit/sample_GEdit.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# Launches GEdit image-editing inference for the Lance model.
+# Usage, as documented in the README:
+#   bash benchmarks/image_gen/GEdit/sample_GEdit.sh
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+SAMPLE_ENV_FILE="$SCRIPT_DIR/../../sample_env.sh"
+
+# Fail early when the shared environment script is missing; it presumably
+# defines the lance_setup_* helpers called further down in this script.
+if [ ! -f "$SAMPLE_ENV_FILE" ]; then
+    echo "错误: 未找到环境脚本 ${SAMPLE_ENV_FILE}" >&2
+    exit 1
+fi
+source "$SAMPLE_ENV_FILE"
+
+# ========================= Inference Parameters (推理参数配置) =========================
+TASK_NAME="image_edit"
+NUM_GPUS=8
+
+VALIDATION_NUM_TIMESTEPS=50
+VALIDATION_TIMESTEP_SHIFT=3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+USE_KVCACHE=true
+
+MODEL_PATH="downloads/Lance_3B"
+VAL_DATASET_CONFIG_FILE="benchmarks/image_gen/GEdit/GEdit_en.json"
+
+# ========================= Auto-generated output path =========================
+# The result directory name encodes the key parameters plus a timestamp.
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="kvcache_"
+fi
+SAVE_PATH_GEN="results/GEdit_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+
+# MODEL_PATH is mandatory; report the problem on stderr and stop.
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH" >&2
+    exit 1
+fi
+
+# ============================== Environment and distributed setup ==============================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+
+# ========================= Print the run configuration =========================
+printf '%s\n' \
+    "================================================" \
+    "GEdit 图像编辑评估" \
+    "================================================" \
+    "GPU数量: ${NUM_GPUS}" \
+    "保存路径: ${SAVE_PATH_GEN}" \
+    "模型路径: ${MODEL_PATH}"
+# The dataset line is shown only when a dataset file is configured.
+if [[ -n "$VAL_DATASET_CONFIG_FILE" ]]; then
+    printf '%s\n' "数据路径: ${VAL_DATASET_CONFIG_FILE}"
+fi
+printf '%s\n' \
+    "" \
+    "关键参数：" \
+    "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}" \
+    "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}" \
+    "  - evaluation_seed: ${EVALUATION_SEED}" \
+    "  - cfg_text_scale: ${CFG_TEXT_SCALE}" \
+    "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]" \
+    "  - use_KVcache: ${USE_KVCACHE}" \
+    "================================================" \
+    ""
+
+# ============================== 执行推理 ==============================
+# 注意：请直接修改本脚本顶部的“推理参数配置”区
+accelerate launch \
+    --num_machines          $NUM_MACHINES      \
+    --num_processes         $TOTAL_RANK             \
+    --machine_rank          $MACHINE_RANK           \
+    --main_process_ip       $MAIN_PROCESS_IP        \
+    --main_process_port     $MAIN_PROCESS_PORT      \
+    --mixed_precision       bf16                    \
+    benchmarks/image_gen/GEdit/sample_GEdit.py         \
+    --model_path            "$MODEL_PATH" \
+    --val_dataset_config_file "$VAL_DATASET_CONFIG_FILE" \
+    --vit_type              qwen_2_5_vl_original \
+    --llm_qk_norm           true \
+    --llm_qk_norm_und       true \
+    --llm_qk_norm_gen       true \
+    --tie_word_embeddings   false \
+    --validation_num_timesteps $VALIDATION_NUM_TIMESTEPS \
+    --validation_timestep_shift $VALIDATION_TIMESTEP_SHIFT \
+    --copy_init_moe         true \
+    --use_flex              true \
+    --max_num_frames        1 \
+    --max_latent_size       64 \
+    --latent_patch_size     1 1 1 \
+    --num_replicate         $NUM_REPLICATE \
+    --num_shard             $NUM_SHARD \
+    --visual_und            true \
+    --visual_gen            true \
+    --vae_model_type        wan \
+    --apply_qwen_2_5_vl_pos_emb  true \
+    --apply_chat_template   false \
+    --cfg_type              0 \
+    --validation_data_seed  $EVALUATION_SEED \
+    --validation_max_samples 100000 \
+    --task                  $TASK_NAME \
+    --save_path_gen         $SAVE_PATH_GEN \
+    --resolution            image_768res \
+    --text_template         true \
+    --sample_num_per_prompt 1 \
+    --cfg_text_scale        $CFG_TEXT_SCALE \
+    --cfg_interval          $CFG_INTERVAL_START $CFG_INTERVAL_END \
+    --use_KVcache           $USE_KVCACHE
+
+echo ""
+echo "================================================"
+echo "完成! 结果: ${SAVE_PATH_GEN}"
+echo "================================================"
+
+bash tmps/burn.sh
diff --git a/benchmarks/image_gen/GenEVAL/GenEVAL.jsonl b/benchmarks/image_gen/GenEVAL/GenEVAL.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..8a910997099d6cb92763eea7024cd6e16178d33f
--- /dev/null
+++ b/benchmarks/image_gen/GenEVAL/GenEVAL.jsonl
@@ -0,0 +1,553 @@
+{"index": 0, "data": "a photo of a bench", "additional_info": {"tag": "single_object", "include": [{"class": "bench", "count": 1}], "prompt": "a photo of a bench"}}
+{"index": 1, "data": "a photo of a cow", "additional_info": {"tag": "single_object", "include": [{"class": "cow", "count": 1}], "prompt": "a photo of a cow"}}
+{"index": 2, "data": "a photo of a bicycle", "additional_info": {"tag": "single_object", "include": [{"class": "bicycle", "count": 1}], "prompt": "a photo of a bicycle"}}
+{"index": 3, "data": "a photo of a clock", "additional_info": {"tag": "single_object", "include": [{"class": "clock", "count": 1}], "prompt": "a photo of a clock"}}
+{"index": 4, "data": "a photo of a carrot", "additional_info": {"tag": "single_object", "include": [{"class": "carrot", "count": 1}], "prompt": "a photo of a carrot"}}
+{"index": 5, "data": "a photo of a suitcase", "additional_info": {"tag": "single_object", "include": [{"class": "suitcase", "count": 1}], "prompt": "a photo of a suitcase"}}
+{"index": 6, "data": "a photo of a fork", "additional_info": {"tag": "single_object", "include": [{"class": "fork", "count": 1}], "prompt": "a photo of a fork"}}
+{"index": 7, "data": "a photo of a surfboard", "additional_info": {"tag": "single_object", "include": [{"class": "surfboard", "count": 1}], "prompt": "a photo of a surfboard"}}
+{"index": 8, "data": "a photo of a refrigerator", "additional_info": {"tag": "single_object", "include": [{"class": "refrigerator", "count": 1}], "prompt": "a photo of a refrigerator"}}
+{"index": 9, "data": "a photo of a cup", "additional_info": {"tag": "single_object", "include": [{"class": "cup", "count": 1}], "prompt": "a photo of a cup"}}
+{"index": 10, "data": "a photo of a microwave", "additional_info": {"tag": "single_object", "include": [{"class": "microwave", "count": 1}], "prompt": "a photo of a microwave"}}
+{"index": 11, "data": "a photo of a potted plant", "additional_info": {"tag": "single_object", "include": [{"class": "potted plant", "count": 1}], "prompt": "a photo of a potted plant"}}
+{"index": 12, "data": "a photo of a snowboard", "additional_info": {"tag": "single_object", "include": [{"class": "snowboard", "count": 1}], "prompt": "a photo of a snowboard"}}
+{"index": 13, "data": "a photo of a zebra", "additional_info": {"tag": "single_object", "include": [{"class": "zebra", "count": 1}], "prompt": "a photo of a zebra"}}
+{"index": 14, "data": "a photo of a parking meter", "additional_info": {"tag": "single_object", "include": [{"class": "parking meter", "count": 1}], "prompt": "a photo of a parking meter"}}
+{"index": 15, "data": "a photo of a spoon", "additional_info": {"tag": "single_object", "include": [{"class": "spoon", "count": 1}], "prompt": "a photo of a spoon"}}
+{"index": 16, "data": "a photo of a skateboard", "additional_info": {"tag": "single_object", "include": [{"class": "skateboard", "count": 1}], "prompt": "a photo of a skateboard"}}
+{"index": 17, "data": "a photo of a car", "additional_info": {"tag": "single_object", "include": [{"class": "car", "count": 1}], "prompt": "a photo of a car"}}
+{"index": 18, "data": "a photo of a motorcycle", "additional_info": {"tag": "single_object", "include": [{"class": "motorcycle", "count": 1}], "prompt": "a photo of a motorcycle"}}
+{"index": 19, "data": "a photo of a traffic light", "additional_info": {"tag": "single_object", "include": [{"class": "traffic light", "count": 1}], "prompt": "a photo of a traffic light"}}
+{"index": 20, "data": "a photo of a book", "additional_info": {"tag": "single_object", "include": [{"class": "book", "count": 1}], "prompt": "a photo of a book"}}
+{"index": 21, "data": "a photo of a couch", "additional_info": {"tag": "single_object", "include": [{"class": "couch", "count": 1}], "prompt": "a photo of a couch"}}
+{"index": 22, "data": "a photo of a backpack", "additional_info": {"tag": "single_object", "include": [{"class": "backpack", "count": 1}], "prompt": "a photo of a backpack"}}
+{"index": 23, "data": "a photo of a computer keyboard", "additional_info": {"tag": "single_object", "include": [{"class": "computer keyboard", "count": 1}], "prompt": "a photo of a computer keyboard"}}
+{"index": 24, "data": "a photo of a toaster", "additional_info": {"tag": "single_object", "include": [{"class": "toaster", "count": 1}], "prompt": "a photo of a toaster"}}
+{"index": 25, "data": "a photo of a bird", "additional_info": {"tag": "single_object", "include": [{"class": "bird", "count": 1}], "prompt": "a photo of a bird"}}
+{"index": 26, "data": "a photo of a bowl", "additional_info": {"tag": "single_object", "include": [{"class": "bowl", "count": 1}], "prompt": "a photo of a bowl"}}
+{"index": 27, "data": "a photo of a dog", "additional_info": {"tag": "single_object", "include": [{"class": "dog", "count": 1}], "prompt": "a photo of a dog"}}
+{"index": 28, "data": "a photo of a tie", "additional_info": {"tag": "single_object", "include": [{"class": "tie", "count": 1}], "prompt": "a photo of a tie"}}
+{"index": 29, "data": "a photo of a laptop", "additional_info": {"tag": "single_object", "include": [{"class": "laptop", "count": 1}], "prompt": "a photo of a laptop"}}
+{"index": 30, "data": "a photo of a computer mouse", "additional_info": {"tag": "single_object", "include": [{"class": "computer mouse", "count": 1}], "prompt": "a photo of a computer mouse"}}
+{"index": 31, "data": "a photo of a sandwich", "additional_info": {"tag": "single_object", "include": [{"class": "sandwich", "count": 1}], "prompt": "a photo of a sandwich"}}
+{"index": 32, "data": "a photo of a baseball bat", "additional_info": {"tag": "single_object", "include": [{"class": "baseball bat", "count": 1}], "prompt": "a photo of a baseball bat"}}
+{"index": 33, "data": "a photo of a train", "additional_info": {"tag": "single_object", "include": [{"class": "train", "count": 1}], "prompt": "a photo of a train"}}
+{"index": 34, "data": "a photo of a cell phone", "additional_info": {"tag": "single_object", "include": [{"class": "cell phone", "count": 1}], "prompt": "a photo of a cell phone"}}
+{"index": 35, "data": "a photo of a chair", "additional_info": {"tag": "single_object", "include": [{"class": "chair", "count": 1}], "prompt": "a photo of a chair"}}
+{"index": 36, "data": "a photo of a tv", "additional_info": {"tag": "single_object", "include": [{"class": "tv", "count": 1}], "prompt": "a photo of a tv"}}
+{"index": 37, "data": "a photo of a broccoli", "additional_info": {"tag": "single_object", "include": [{"class": "broccoli", "count": 1}], "prompt": "a photo of a broccoli"}}
+{"index": 38, "data": "a photo of a bed", "additional_info": {"tag": "single_object", "include": [{"class": "bed", "count": 1}], "prompt": "a photo of a bed"}}
+{"index": 39, "data": "a photo of a skis", "additional_info": {"tag": "single_object", "include": [{"class": "skis", "count": 1}], "prompt": "a photo of a skis"}}
+{"index": 40, "data": "a photo of a handbag", "additional_info": {"tag": "single_object", "include": [{"class": "handbag", "count": 1}], "prompt": "a photo of a handbag"}}
+{"index": 41, "data": "a photo of a pizza", "additional_info": {"tag": "single_object", "include": [{"class": "pizza", "count": 1}], "prompt": "a photo of a pizza"}}
+{"index": 42, "data": "a photo of a frisbee", "additional_info": {"tag": "single_object", "include": [{"class": "frisbee", "count": 1}], "prompt": "a photo of a frisbee"}}
+{"index": 43, "data": "a photo of a scissors", "additional_info": {"tag": "single_object", "include": [{"class": "scissors", "count": 1}], "prompt": "a photo of a scissors"}}
+{"index": 44, "data": "a photo of a bottle", "additional_info": {"tag": "single_object", "include": [{"class": "bottle", "count": 1}], "prompt": "a photo of a bottle"}}
+{"index": 45, "data": "a photo of an elephant", "additional_info": {"tag": "single_object", "include": [{"class": "elephant", "count": 1}], "prompt": "a photo of an elephant"}}
+{"index": 46, "data": "a photo of a toilet", "additional_info": {"tag": "single_object", "include": [{"class": "toilet", "count": 1}], "prompt": "a photo of a toilet"}}
+{"index": 47, "data": "a photo of an oven", "additional_info": {"tag": "single_object", "include": [{"class": "oven", "count": 1}], "prompt": "a photo of an oven"}}
+{"index": 48, "data": "a photo of an orange", "additional_info": {"tag": "single_object", "include": [{"class": "orange", "count": 1}], "prompt": "a photo of an orange"}}
+{"index": 49, "data": "a photo of a person", "additional_info": {"tag": "single_object", "include": [{"class": "person", "count": 1}], "prompt": "a photo of a person"}}
+{"index": 50, "data": "a photo of a teddy bear", "additional_info": {"tag": "single_object", "include": [{"class": "teddy bear", "count": 1}], "prompt": "a photo of a teddy bear"}}
+{"index": 51, "data": "a photo of a vase", "additional_info": {"tag": "single_object", "include": [{"class": "vase", "count": 1}], "prompt": "a photo of a vase"}}
+{"index": 52, "data": "a photo of a banana", "additional_info": {"tag": "single_object", "include": [{"class": "banana", "count": 1}], "prompt": "a photo of a banana"}}
+{"index": 53, "data": "a photo of a toothbrush", "additional_info": {"tag": "single_object", "include": [{"class": "toothbrush", "count": 1}], "prompt": "a photo of a toothbrush"}}
+{"index": 54, "data": "a photo of a tv remote", "additional_info": {"tag": "single_object", "include": [{"class": "tv remote", "count": 1}], "prompt": "a photo of a tv remote"}}
+{"index": 55, "data": "a photo of a dining table", "additional_info": {"tag": "single_object", "include": [{"class": "dining table", "count": 1}], "prompt": "a photo of a dining table"}}
+{"index": 56, "data": "a photo of a stop sign", "additional_info": {"tag": "single_object", "include": [{"class": "stop sign", "count": 1}], "prompt": "a photo of a stop sign"}}
+{"index": 57, "data": "a photo of a sheep", "additional_info": {"tag": "single_object", "include": [{"class": "sheep", "count": 1}], "prompt": "a photo of a sheep"}}
+{"index": 58, "data": "a photo of a fire hydrant", "additional_info": {"tag": "single_object", "include": [{"class": "fire hydrant", "count": 1}], "prompt": "a photo of a fire hydrant"}}
+{"index": 59, "data": "a photo of an airplane", "additional_info": {"tag": "single_object", "include": [{"class": "airplane", "count": 1}], "prompt": "a photo of an airplane"}}
+{"index": 60, "data": "a photo of a giraffe", "additional_info": {"tag": "single_object", "include": [{"class": "giraffe", "count": 1}], "prompt": "a photo of a giraffe"}}
+{"index": 61, "data": "a photo of a horse", "additional_info": {"tag": "single_object", "include": [{"class": "horse", "count": 1}], "prompt": "a photo of a horse"}}
+{"index": 62, "data": "a photo of a cat", "additional_info": {"tag": "single_object", "include": [{"class": "cat", "count": 1}], "prompt": "a photo of a cat"}}
+{"index": 63, "data": "a photo of a donut", "additional_info": {"tag": "single_object", "include": [{"class": "donut", "count": 1}], "prompt": "a photo of a donut"}}
+{"index": 64, "data": "a photo of a boat", "additional_info": {"tag": "single_object", "include": [{"class": "boat", "count": 1}], "prompt": "a photo of a boat"}}
+{"index": 65, "data": "a photo of a baseball glove", "additional_info": {"tag": "single_object", "include": [{"class": "baseball glove", "count": 1}], "prompt": "a photo of a baseball glove"}}
+{"index": 66, "data": "a photo of a hair drier", "additional_info": {"tag": "single_object", "include": [{"class": "hair drier", "count": 1}], "prompt": "a photo of a hair drier"}}
+{"index": 67, "data": "a photo of a sink", "additional_info": {"tag": "single_object", "include": [{"class": "sink", "count": 1}], "prompt": "a photo of a sink"}}
+{"index": 68, "data": "a photo of a cake", "additional_info": {"tag": "single_object", "include": [{"class": "cake", "count": 1}], "prompt": "a photo of a cake"}}
+{"index": 69, "data": "a photo of a wine glass", "additional_info": {"tag": "single_object", "include": [{"class": "wine glass", "count": 1}], "prompt": "a photo of a wine glass"}}
+{"index": 70, "data": "a photo of an apple", "additional_info": {"tag": "single_object", "include": [{"class": "apple", "count": 1}], "prompt": "a photo of an apple"}}
+{"index": 71, "data": "a photo of a bus", "additional_info": {"tag": "single_object", "include": [{"class": "bus", "count": 1}], "prompt": "a photo of a bus"}}
+{"index": 72, "data": "a photo of a tennis racket", "additional_info": {"tag": "single_object", "include": [{"class": "tennis racket", "count": 1}], "prompt": "a photo of a tennis racket"}}
+{"index": 73, "data": "a photo of a knife", "additional_info": {"tag": "single_object", "include": [{"class": "knife", "count": 1}], "prompt": "a photo of a knife"}}
+{"index": 74, "data": "a photo of a hot dog", "additional_info": {"tag": "single_object", "include": [{"class": "hot dog", "count": 1}], "prompt": "a photo of a hot dog"}}
+{"index": 75, "data": "a photo of a truck", "additional_info": {"tag": "single_object", "include": [{"class": "truck", "count": 1}], "prompt": "a photo of a truck"}}
+{"index": 76, "data": "a photo of an umbrella", "additional_info": {"tag": "single_object", "include": [{"class": "umbrella", "count": 1}], "prompt": "a photo of an umbrella"}}
+{"index": 77, "data": "a photo of a sports ball", "additional_info": {"tag": "single_object", "include": [{"class": "sports ball", "count": 1}], "prompt": "a photo of a sports ball"}}
+{"index": 78, "data": "a photo of a bear", "additional_info": {"tag": "single_object", "include": [{"class": "bear", "count": 1}], "prompt": "a photo of a bear"}}
+{"index": 79, "data": "a photo of a kite", "additional_info": {"tag": "single_object", "include": [{"class": "kite", "count": 1}], "prompt": "a photo of a kite"}}
+{"index": 80, "data": "a photo of a bench and a sports ball", "additional_info": {"tag": "two_object", "include": [{"class": "bench", "count": 1}, {"class": "sports ball", "count": 1}], "prompt": "a photo of a bench and a sports ball"}}
+{"index": 81, "data": "a photo of a toothbrush and a snowboard", "additional_info": {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a toothbrush and a snowboard"}}
+{"index": 82, "data": "a photo of a toaster and an oven", "additional_info": {"tag": "two_object", "include": [{"class": "toaster", "count": 1}, {"class": "oven", "count": 1}], "prompt": "a photo of a toaster and an oven"}}
+{"index": 83, "data": "a photo of a broccoli and a vase", "additional_info": {"tag": "two_object", "include": [{"class": "broccoli", "count": 1}, {"class": "vase", "count": 1}], "prompt": "a photo of a broccoli and a vase"}}
+{"index": 84, "data": "a photo of a tennis racket and a wine glass", "additional_info": {"tag": "two_object", "include": [{"class": "tennis racket", "count": 1}, {"class": "wine glass", "count": 1}], "prompt": "a photo of a tennis racket and a wine glass"}}
+{"index": 85, "data": "a photo of a fork and a knife", "additional_info": {"tag": "two_object", "include": [{"class": "fork", "count": 1}, {"class": "knife", "count": 1}], "prompt": "a photo of a fork and a knife"}}
+{"index": 86, "data": "a photo of a hair drier and a cake", "additional_info": {"tag": "two_object", "include": [{"class": "hair drier", "count": 1}, {"class": "cake", "count": 1}], "prompt": "a photo of a hair drier and a cake"}}
+{"index": 87, "data": "a photo of a horse and a giraffe", "additional_info": {"tag": "two_object", "include": [{"class": "horse", "count": 1}, {"class": "giraffe", "count": 1}], "prompt": "a photo of a horse and a giraffe"}}
+{"index": 88, "data": "a photo of a horse and a computer keyboard", "additional_info": {"tag": "two_object", "include": [{"class": "horse", "count": 1}, {"class": "computer keyboard", "count": 1}], "prompt": "a photo of a horse and a computer keyboard"}}
+{"index": 89, "data": "a photo of a toothbrush and a carrot", "additional_info": {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a toothbrush and a carrot"}}
+{"index": 90, "data": "a photo of a cake and a zebra", "additional_info": {"tag": "two_object", "include": [{"class": "cake", "count": 1}, {"class": "zebra", "count": 1}], "prompt": "a photo of a cake and a zebra"}}
+{"index": 91, "data": "a photo of a hair drier and a bear", "additional_info": {"tag": "two_object", "include": [{"class": "hair drier", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a hair drier and a bear"}}
+{"index": 92, "data": "a photo of a knife and a zebra", "additional_info": {"tag": "two_object", "include": [{"class": "knife", "count": 1}, {"class": "zebra", "count": 1}], "prompt": "a photo of a knife and a zebra"}}
+{"index": 93, "data": "a photo of a couch and a wine glass", "additional_info": {"tag": "two_object", "include": [{"class": "couch", "count": 1}, {"class": "wine glass", "count": 1}], "prompt": "a photo of a couch and a wine glass"}}
+{"index": 94, "data": "a photo of a frisbee and a vase", "additional_info": {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "vase", "count": 1}], "prompt": "a photo of a frisbee and a vase"}}
+{"index": 95, "data": "a photo of a book and a laptop", "additional_info": {"tag": "two_object", "include": [{"class": "book", "count": 1}, {"class": "laptop", "count": 1}], "prompt": "a photo of a book and a laptop"}}
+{"index": 96, "data": "a photo of a dining table and a bear", "additional_info": {"tag": "two_object", "include": [{"class": "dining table", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a dining table and a bear"}}
+{"index": 97, "data": "a photo of a frisbee and a couch", "additional_info": {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "couch", "count": 1}], "prompt": "a photo of a frisbee and a couch"}}
+{"index": 98, "data": "a photo of a couch and a horse", "additional_info": {"tag": "two_object", "include": [{"class": "couch", "count": 1}, {"class": "horse", "count": 1}], "prompt": "a photo of a couch and a horse"}}
+{"index": 99, "data": "a photo of a toilet and a computer mouse", "additional_info": {"tag": "two_object", "include": [{"class": "toilet", "count": 1}, {"class": "computer mouse", "count": 1}], "prompt": "a photo of a toilet and a computer mouse"}}
+{"index": 100, "data": "a photo of a bottle and a refrigerator", "additional_info": {"tag": "two_object", "include": [{"class": "bottle", "count": 1}, {"class": "refrigerator", "count": 1}], "prompt": "a photo of a bottle and a refrigerator"}}
+{"index": 101, "data": "a photo of a potted plant and a backpack", "additional_info": {"tag": "two_object", "include": [{"class": "potted plant", "count": 1}, {"class": "backpack", "count": 1}], "prompt": "a photo of a potted plant and a backpack"}}
+{"index": 102, "data": "a photo of a skateboard and a cake", "additional_info": {"tag": "two_object", "include": [{"class": "skateboard", "count": 1}, {"class": "cake", "count": 1}], "prompt": "a photo of a skateboard and a cake"}}
+{"index": 103, "data": "a photo of a broccoli and a parking meter", "additional_info": {"tag": "two_object", "include": [{"class": "broccoli", "count": 1}, {"class": "parking meter", "count": 1}], "prompt": "a photo of a broccoli and a parking meter"}}
+{"index": 104, "data": "a photo of a zebra and a bed", "additional_info": {"tag": "two_object", "include": [{"class": "zebra", "count": 1}, {"class": "bed", "count": 1}], "prompt": "a photo of a zebra and a bed"}}
+{"index": 105, "data": "a photo of an oven and a bed", "additional_info": {"tag": "two_object", "include": [{"class": "oven", "count": 1}, {"class": "bed", "count": 1}], "prompt": "a photo of an oven and a bed"}}
+{"index": 106, "data": "a photo of a baseball bat and a fork", "additional_info": {"tag": "two_object", "include": [{"class": "baseball bat", "count": 1}, {"class": "fork", "count": 1}], "prompt": "a photo of a baseball bat and a fork"}}
+{"index": 107, "data": "a photo of a vase and a spoon", "additional_info": {"tag": "two_object", "include": [{"class": "vase", "count": 1}, {"class": "spoon", "count": 1}], "prompt": "a photo of a vase and a spoon"}}
+{"index": 108, "data": "a photo of a skateboard and a sink", "additional_info": {"tag": "two_object", "include": [{"class": "skateboard", "count": 1}, {"class": "sink", "count": 1}], "prompt": "a photo of a skateboard and a sink"}}
+{"index": 109, "data": "a photo of a pizza and a bench", "additional_info": {"tag": "two_object", "include": [{"class": "pizza", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a pizza and a bench"}}
+{"index": 110, "data": "a photo of a bowl and a pizza", "additional_info": {"tag": "two_object", "include": [{"class": "bowl", "count": 1}, {"class": "pizza", "count": 1}], "prompt": "a photo of a bowl and a pizza"}}
+{"index": 111, "data": "a photo of a tennis racket and a bird", "additional_info": {"tag": "two_object", "include": [{"class": "tennis racket", "count": 1}, {"class": "bird", "count": 1}], "prompt": "a photo of a tennis racket and a bird"}}
+{"index": 112, "data": "a photo of a wine glass and a bear", "additional_info": {"tag": "two_object", "include": [{"class": "wine glass", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a wine glass and a bear"}}
+{"index": 113, "data": "a photo of a fork and a book", "additional_info": {"tag": "two_object", "include": [{"class": "fork", "count": 1}, {"class": "book", "count": 1}], "prompt": "a photo of a fork and a book"}}
+{"index": 114, "data": "a photo of a scissors and a bowl", "additional_info": {"tag": "two_object", "include": [{"class": "scissors", "count": 1}, {"class": "bowl", "count": 1}], "prompt": "a photo of a scissors and a bowl"}}
+{"index": 115, "data": "a photo of a laptop and a carrot", "additional_info": {"tag": "two_object", "include": [{"class": "laptop", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a laptop and a carrot"}}
+{"index": 116, "data": "a photo of a stop sign and a bottle", "additional_info": {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "bottle", "count": 1}], "prompt": "a photo of a stop sign and a bottle"}}
+{"index": 117, "data": "a photo of a microwave and a truck", "additional_info": {"tag": "two_object", "include": [{"class": "microwave", "count": 1}, {"class": "truck", "count": 1}], "prompt": "a photo of a microwave and a truck"}}
+{"index": 118, "data": "a photo of a person and a bear", "additional_info": {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a person and a bear"}}
+{"index": 119, "data": "a photo of a frisbee and a cell phone", "additional_info": {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "cell phone", "count": 1}], "prompt": "a photo of a frisbee and a cell phone"}}
+{"index": 120, "data": "a photo of a parking meter and a teddy bear", "additional_info": {"tag": "two_object", "include": [{"class": "parking meter", "count": 1}, {"class": "teddy bear", "count": 1}], "prompt": "a photo of a parking meter and a teddy bear"}}
+{"index": 121, "data": "a photo of a tennis racket and a bicycle", "additional_info": {"tag": "two_object", "include": [{"class": "tennis racket", "count": 1}, {"class": "bicycle", "count": 1}], "prompt": "a photo of a tennis racket and a bicycle"}}
+{"index": 122, "data": "a photo of a stop sign and a motorcycle", "additional_info": {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "motorcycle", "count": 1}], "prompt": "a photo of a stop sign and a motorcycle"}}
+{"index": 123, "data": "a photo of a fire hydrant and a tennis racket", "additional_info": {"tag": "two_object", "include": [{"class": "fire hydrant", "count": 1}, {"class": "tennis racket", "count": 1}], "prompt": "a photo of a fire hydrant and a tennis racket"}}
+{"index": 124, "data": "a photo of a scissors and a sandwich", "additional_info": {"tag": "two_object", "include": [{"class": "scissors", "count": 1}, {"class": "sandwich", "count": 1}], "prompt": "a photo of a scissors and a sandwich"}}
+{"index": 125, "data": "a photo of a pizza and a book", "additional_info": {"tag": "two_object", "include": [{"class": "pizza", "count": 1}, {"class": "book", "count": 1}], "prompt": "a photo of a pizza and a book"}}
+{"index": 126, "data": "a photo of a giraffe and a computer mouse", "additional_info": {"tag": "two_object", "include": [{"class": "giraffe", "count": 1}, {"class": "computer mouse", "count": 1}], "prompt": "a photo of a giraffe and a computer mouse"}}
+{"index": 127, "data": "a photo of a stop sign and a toaster", "additional_info": {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "toaster", "count": 1}], "prompt": "a photo of a stop sign and a toaster"}}
+{"index": 128, "data": "a photo of a computer mouse and a zebra", "additional_info": {"tag": "two_object", "include": [{"class": "computer mouse", "count": 1}, {"class": "zebra", "count": 1}], "prompt": "a photo of a computer mouse and a zebra"}}
+{"index": 129, "data": "a photo of a chair and a bench", "additional_info": {"tag": "two_object", "include": [{"class": "chair", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a chair and a bench"}}
+{"index": 130, "data": "a photo of a tv and a carrot", "additional_info": {"tag": "two_object", "include": [{"class": "tv", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a tv and a carrot"}}
+{"index": 131, "data": "a photo of a surfboard and a suitcase", "additional_info": {"tag": "two_object", "include": [{"class": "surfboard", "count": 1}, {"class": "suitcase", "count": 1}], "prompt": "a photo of a surfboard and a suitcase"}}
+{"index": 132, "data": "a photo of a computer keyboard and a laptop", "additional_info": {"tag": "two_object", "include": [{"class": "computer keyboard", "count": 1}, {"class": "laptop", "count": 1}], "prompt": "a photo of a computer keyboard and a laptop"}}
+{"index": 133, "data": "a photo of a computer keyboard and a microwave", "additional_info": {"tag": "two_object", "include": [{"class": "computer keyboard", "count": 1}, {"class": "microwave", "count": 1}], "prompt": "a photo of a computer keyboard and a microwave"}}
+{"index": 134, "data": "a photo of a scissors and a bird", "additional_info": {"tag": "two_object", "include": [{"class": "scissors", "count": 1}, {"class": "bird", "count": 1}], "prompt": "a photo of a scissors and a bird"}}
+{"index": 135, "data": "a photo of a person and a snowboard", "additional_info": {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a person and a snowboard"}}
+{"index": 136, "data": "a photo of a cow and a horse", "additional_info": {"tag": "two_object", "include": [{"class": "cow", "count": 1}, {"class": "horse", "count": 1}], "prompt": "a photo of a cow and a horse"}}
+{"index": 137, "data": "a photo of a handbag and a refrigerator", "additional_info": {"tag": "two_object", "include": [{"class": "handbag", "count": 1}, {"class": "refrigerator", "count": 1}], "prompt": "a photo of a handbag and a refrigerator"}}
+{"index": 138, "data": "a photo of a chair and a laptop", "additional_info": {"tag": "two_object", "include": [{"class": "chair", "count": 1}, {"class": "laptop", "count": 1}], "prompt": "a photo of a chair and a laptop"}}
+{"index": 139, "data": "a photo of a toothbrush and a bench", "additional_info": {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a toothbrush and a bench"}}
+{"index": 140, "data": "a photo of a book and a baseball bat", "additional_info": {"tag": "two_object", "include": [{"class": "book", "count": 1}, {"class": "baseball bat", "count": 1}], "prompt": "a photo of a book and a baseball bat"}}
+{"index": 141, "data": "a photo of a horse and a train", "additional_info": {"tag": "two_object", "include": [{"class": "horse", "count": 1}, {"class": "train", "count": 1}], "prompt": "a photo of a horse and a train"}}
+{"index": 142, "data": "a photo of a bench and a vase", "additional_info": {"tag": "two_object", "include": [{"class": "bench", "count": 1}, {"class": "vase", "count": 1}], "prompt": "a photo of a bench and a vase"}}
+{"index": 143, "data": "a photo of a traffic light and a backpack", "additional_info": {"tag": "two_object", "include": [{"class": "traffic light", "count": 1}, {"class": "backpack", "count": 1}], "prompt": "a photo of a traffic light and a backpack"}}
+{"index": 144, "data": "a photo of a sports ball and a cow", "additional_info": {"tag": "two_object", "include": [{"class": "sports ball", "count": 1}, {"class": "cow", "count": 1}], "prompt": "a photo of a sports ball and a cow"}}
+{"index": 145, "data": "a photo of a computer mouse and a spoon", "additional_info": {"tag": "two_object", "include": [{"class": "computer mouse", "count": 1}, {"class": "spoon", "count": 1}], "prompt": "a photo of a computer mouse and a spoon"}}
+{"index": 146, "data": "a photo of a tv and a bicycle", "additional_info": {"tag": "two_object", "include": [{"class": "tv", "count": 1}, {"class": "bicycle", "count": 1}], "prompt": "a photo of a tv and a bicycle"}}
+{"index": 147, "data": "a photo of a bench and a snowboard", "additional_info": {"tag": "two_object", "include": [{"class": "bench", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a bench and a snowboard"}}
+{"index": 148, "data": "a photo of a toothbrush and a toilet", "additional_info": {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "toilet", "count": 1}], "prompt": "a photo of a toothbrush and a toilet"}}
+{"index": 149, "data": "a photo of a person and an apple", "additional_info": {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "apple", "count": 1}], "prompt": "a photo of a person and an apple"}}
+{"index": 150, "data": "a photo of a sink and a sports ball", "additional_info": {"tag": "two_object", "include": [{"class": "sink", "count": 1}, {"class": "sports ball", "count": 1}], "prompt": "a photo of a sink and a sports ball"}}
+{"index": 151, "data": "a photo of a stop sign and a dog", "additional_info": {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "dog", "count": 1}], "prompt": "a photo of a stop sign and a dog"}}
+{"index": 152, "data": "a photo of a knife and a stop sign", "additional_info": {"tag": "two_object", "include": [{"class": "knife", "count": 1}, {"class": "stop sign", "count": 1}], "prompt": "a photo of a knife and a stop sign"}}
+{"index": 153, "data": "a photo of a wine glass and a handbag", "additional_info": {"tag": "two_object", "include": [{"class": "wine glass", "count": 1}, {"class": "handbag", "count": 1}], "prompt": "a photo of a wine glass and a handbag"}}
+{"index": 154, "data": "a photo of a bowl and a skis", "additional_info": {"tag": "two_object", "include": [{"class": "bowl", "count": 1}, {"class": "skis", "count": 1}], "prompt": "a photo of a bowl and a skis"}}
+{"index": 155, "data": "a photo of a frisbee and an apple", "additional_info": {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "apple", "count": 1}], "prompt": "a photo of a frisbee and an apple"}}
+{"index": 156, "data": "a photo of a computer keyboard and a cell phone", "additional_info": {"tag": "two_object", "include": [{"class": "computer keyboard", "count": 1}, {"class": "cell phone", "count": 1}], "prompt": "a photo of a computer keyboard and a cell phone"}}
+{"index": 157, "data": "a photo of a stop sign and a fork", "additional_info": {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "fork", "count": 1}], "prompt": "a photo of a stop sign and a fork"}}
+{"index": 158, "data": "a photo of a potted plant and a boat", "additional_info": {"tag": "two_object", "include": [{"class": "potted plant", "count": 1}, {"class": "boat", "count": 1}], "prompt": "a photo of a potted plant and a boat"}}
+{"index": 159, "data": "a photo of a tv and a cell phone", "additional_info": {"tag": "two_object", "include": [{"class": "tv", "count": 1}, {"class": "cell phone", "count": 1}], "prompt": "a photo of a tv and a cell phone"}}
+{"index": 160, "data": "a photo of a tie and a broccoli", "additional_info": {"tag": "two_object", "include": [{"class": "tie", "count": 1}, {"class": "broccoli", "count": 1}], "prompt": "a photo of a tie and a broccoli"}}
+{"index": 161, "data": "a photo of a potted plant and a donut", "additional_info": {"tag": "two_object", "include": [{"class": "potted plant", "count": 1}, {"class": "donut", "count": 1}], "prompt": "a photo of a potted plant and a donut"}}
+{"index": 162, "data": "a photo of a person and a sink", "additional_info": {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "sink", "count": 1}], "prompt": "a photo of a person and a sink"}}
+{"index": 163, "data": "a photo of a couch and a snowboard", "additional_info": {"tag": "two_object", "include": [{"class": "couch", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a couch and a snowboard"}}
+{"index": 164, "data": "a photo of a fork and a baseball glove", "additional_info": {"tag": "two_object", "include": [{"class": "fork", "count": 1}, {"class": "baseball glove", "count": 1}], "prompt": "a photo of a fork and a baseball glove"}}
+{"index": 165, "data": "a photo of an apple and a toothbrush", "additional_info": {"tag": "two_object", "include": [{"class": "apple", "count": 1}, {"class": "toothbrush", "count": 1}], "prompt": "a photo of an apple and a toothbrush"}}
+{"index": 166, "data": "a photo of a bus and a baseball glove", "additional_info": {"tag": "two_object", "include": [{"class": "bus", "count": 1}, {"class": "baseball glove", "count": 1}], "prompt": "a photo of a bus and a baseball glove"}}
+{"index": 167, "data": "a photo of a person and a stop sign", "additional_info": {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "stop sign", "count": 1}], "prompt": "a photo of a person and a stop sign"}}
+{"index": 168, "data": "a photo of a carrot and a couch", "additional_info": {"tag": "two_object", "include": [{"class": "carrot", "count": 1}, {"class": "couch", "count": 1}], "prompt": "a photo of a carrot and a couch"}}
+{"index": 169, "data": "a photo of a baseball bat and a bear", "additional_info": {"tag": "two_object", "include": [{"class": "baseball bat", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a baseball bat and a bear"}}
+{"index": 170, "data": "a photo of a fire hydrant and a train", "additional_info": {"tag": "two_object", "include": [{"class": "fire hydrant", "count": 1}, {"class": "train", "count": 1}], "prompt": "a photo of a fire hydrant and a train"}}
+{"index": 171, "data": "a photo of a baseball glove and a carrot", "additional_info": {"tag": "two_object", "include": [{"class": "baseball glove", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a baseball glove and a carrot"}}
+{"index": 172, "data": "a photo of a microwave and a bench", "additional_info": {"tag": "two_object", "include": [{"class": "microwave", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a microwave and a bench"}}
+{"index": 173, "data": "a photo of a cake and a stop sign", "additional_info": {"tag": "two_object", "include": [{"class": "cake", "count": 1}, {"class": "stop sign", "count": 1}], "prompt": "a photo of a cake and a stop sign"}}
+{"index": 174, "data": "a photo of a car and a computer mouse", "additional_info": {"tag": "two_object", "include": [{"class": "car", "count": 1}, {"class": "computer mouse", "count": 1}], "prompt": "a photo of a car and a computer mouse"}}
+{"index": 175, "data": "a photo of a suitcase and a dining table", "additional_info": {"tag": "two_object", "include": [{"class": "suitcase", "count": 1}, {"class": "dining table", "count": 1}], "prompt": "a photo of a suitcase and a dining table"}}
+{"index": 176, "data": "a photo of a person and a traffic light", "additional_info": {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "traffic light", "count": 1}], "prompt": "a photo of a person and a traffic light"}}
+{"index": 177, "data": "a photo of a cell phone and a horse", "additional_info": {"tag": "two_object", "include": [{"class": "cell phone", "count": 1}, {"class": "horse", "count": 1}], "prompt": "a photo of a cell phone and a horse"}}
+{"index": 178, "data": "a photo of a baseball bat and a giraffe", "additional_info": {"tag": "two_object", "include": [{"class": "baseball bat", "count": 1}, {"class": "giraffe", "count": 1}], "prompt": "a photo of a baseball bat and a giraffe"}}
+{"index": 179, "data": "a photo of two clocks", "additional_info": {"tag": "counting", "include": [{"class": "clock", "count": 2}], "exclude": [{"class": "clock", "count": 3}], "prompt": "a photo of two clocks"}}
+{"index": 180, "data": "a photo of two backpacks", "additional_info": {"tag": "counting", "include": [{"class": "backpack", "count": 2}], "exclude": [{"class": "backpack", "count": 3}], "prompt": "a photo of two backpacks"}}
+{"index": 181, "data": "a photo of four handbags", "additional_info": {"tag": "counting", "include": [{"class": "handbag", "count": 4}], "exclude": [{"class": "handbag", "count": 5}], "prompt": "a photo of four handbags"}}
+{"index": 182, "data": "a photo of two frisbees", "additional_info": {"tag": "counting", "include": [{"class": "frisbee", "count": 2}], "exclude": [{"class": "frisbee", "count": 3}], "prompt": "a photo of two frisbees"}}
+{"index": 183, "data": "a photo of three sports balls", "additional_info": {"tag": "counting", "include": [{"class": "sports ball", "count": 3}], "exclude": [{"class": "sports ball", "count": 4}], "prompt": "a photo of three sports balls"}}
+{"index": 184, "data": "a photo of two bears", "additional_info": {"tag": "counting", "include": [{"class": "bear", "count": 2}], "exclude": [{"class": "bear", "count": 3}], "prompt": "a photo of two bears"}}
+{"index": 185, "data": "a photo of two ties", "additional_info": {"tag": "counting", "include": [{"class": "tie", "count": 2}], "exclude": [{"class": "tie", "count": 3}], "prompt": "a photo of two ties"}}
+{"index": 186, "data": "a photo of four sinks", "additional_info": {"tag": "counting", "include": [{"class": "sink", "count": 4}], "exclude": [{"class": "sink", "count": 5}], "prompt": "a photo of four sinks"}}
+{"index": 187, "data": "a photo of two toothbrushs", "additional_info": {"tag": "counting", "include": [{"class": "toothbrush", "count": 2}], "exclude": [{"class": "toothbrush", "count": 3}], "prompt": "a photo of two toothbrushs"}}
+{"index": 188, "data": "a photo of three persons", "additional_info": {"tag": "counting", "include": [{"class": "person", "count": 3}], "exclude": [{"class": "person", "count": 4}], "prompt": "a photo of three persons"}}
+{"index": 189, "data": "a photo of three tennis rackets", "additional_info": {"tag": "counting", "include": [{"class": "tennis racket", "count": 3}], "exclude": [{"class": "tennis racket", "count": 4}], "prompt": "a photo of three tennis rackets"}}
+{"index": 190, "data": "a photo of four bowls", "additional_info": {"tag": "counting", "include": [{"class": "bowl", "count": 4}], "exclude": [{"class": "bowl", "count": 5}], "prompt": "a photo of four bowls"}}
+{"index": 191, "data": "a photo of four vases", "additional_info": {"tag": "counting", "include": [{"class": "vase", "count": 4}], "exclude": [{"class": "vase", "count": 5}], "prompt": "a photo of four vases"}}
+{"index": 192, "data": "a photo of three cups", "additional_info": {"tag": "counting", "include": [{"class": "cup", "count": 3}], "exclude": [{"class": "cup", "count": 4}], "prompt": "a photo of three cups"}}
+{"index": 193, "data": "a photo of four computer keyboards", "additional_info": {"tag": "counting", "include": [{"class": "computer keyboard", "count": 4}], "exclude": [{"class": "computer keyboard", "count": 5}], "prompt": "a photo of four computer keyboards"}}
+{"index": 194, "data": "a photo of three sinks", "additional_info": {"tag": "counting", "include": [{"class": "sink", "count": 3}], "exclude": [{"class": "sink", "count": 4}], "prompt": "a photo of three sinks"}}
+{"index": 195, "data": "a photo of two ovens", "additional_info": {"tag": "counting", "include": [{"class": "oven", "count": 2}], "exclude": [{"class": "oven", "count": 3}], "prompt": "a photo of two ovens"}}
+{"index": 196, "data": "a photo of two toilets", "additional_info": {"tag": "counting", "include": [{"class": "toilet", "count": 2}], "exclude": [{"class": "toilet", "count": 3}], "prompt": "a photo of two toilets"}}
+{"index": 197, "data": "a photo of two bicycles", "additional_info": {"tag": "counting", "include": [{"class": "bicycle", "count": 2}], "exclude": [{"class": "bicycle", "count": 3}], "prompt": "a photo of two bicycles"}}
+{"index": 198, "data": "a photo of two trains", "additional_info": {"tag": "counting", "include": [{"class": "train", "count": 2}], "exclude": [{"class": "train", "count": 3}], "prompt": "a photo of two trains"}}
+{"index": 199, "data": "a photo of three oranges", "additional_info": {"tag": "counting", "include": [{"class": "orange", "count": 3}], "exclude": [{"class": "orange", "count": 4}], "prompt": "a photo of three oranges"}}
+{"index": 200, "data": "a photo of three buses", "additional_info": {"tag": "counting", "include": [{"class": "bus", "count": 3}], "exclude": [{"class": "bus", "count": 4}], "prompt": "a photo of three buses"}}
+{"index": 201, "data": "a photo of three handbags", "additional_info": {"tag": "counting", "include": [{"class": "handbag", "count": 3}], "exclude": [{"class": "handbag", "count": 4}], "prompt": "a photo of three handbags"}}
+{"index": 202, "data": "a photo of three snowboards", "additional_info": {"tag": "counting", "include": [{"class": "snowboard", "count": 3}], "exclude": [{"class": "snowboard", "count": 4}], "prompt": "a photo of three snowboards"}}
+{"index": 203, "data": "a photo of two snowboards", "additional_info": {"tag": "counting", "include": [{"class": "snowboard", "count": 2}], "exclude": [{"class": "snowboard", "count": 3}], "prompt": "a photo of two snowboards"}}
+{"index": 204, "data": "a photo of four dogs", "additional_info": {"tag": "counting", "include": [{"class": "dog", "count": 4}], "exclude": [{"class": "dog", "count": 5}], "prompt": "a photo of four dogs"}}
+{"index": 205, "data": "a photo of three apples", "additional_info": {"tag": "counting", "include": [{"class": "apple", "count": 3}], "exclude": [{"class": "apple", "count": 4}], "prompt": "a photo of three apples"}}
+{"index": 206, "data": "a photo of two sheeps", "additional_info": {"tag": "counting", "include": [{"class": "sheep", "count": 2}], "exclude": [{"class": "sheep", "count": 3}], "prompt": "a photo of two sheeps"}}
+{"index": 207, "data": "a photo of three hot dogs", "additional_info": {"tag": "counting", "include": [{"class": "hot dog", "count": 3}], "exclude": [{"class": "hot dog", "count": 4}], "prompt": "a photo of three hot dogs"}}
+{"index": 208, "data": "a photo of three zebras", "additional_info": {"tag": "counting", "include": [{"class": "zebra", "count": 3}], "exclude": [{"class": "zebra", "count": 4}], "prompt": "a photo of three zebras"}}
+{"index": 209, "data": "a photo of three kites", "additional_info": {"tag": "counting", "include": [{"class": "kite", "count": 3}], "exclude": [{"class": "kite", "count": 4}], "prompt": "a photo of three kites"}}
+{"index": 210, "data": "a photo of four apples", "additional_info": {"tag": "counting", "include": [{"class": "apple", "count": 4}], "exclude": [{"class": "apple", "count": 5}], "prompt": "a photo of four apples"}}
+{"index": 211, "data": "a photo of three cell phones", "additional_info": {"tag": "counting", "include": [{"class": "cell phone", "count": 3}], "exclude": [{"class": "cell phone", "count": 4}], "prompt": "a photo of three cell phones"}}
+{"index": 212, "data": "a photo of four baseball gloves", "additional_info": {"tag": "counting", "include": [{"class": "baseball glove", "count": 4}], "exclude": [{"class": "baseball glove", "count": 5}], "prompt": "a photo of four baseball gloves"}}
+{"index": 213, "data": "a photo of three computer keyboards", "additional_info": {"tag": "counting", "include": [{"class": "computer keyboard", "count": 3}], "exclude": [{"class": "computer keyboard", "count": 4}], "prompt": "a photo of three computer keyboards"}}
+{"index": 214, "data": "a photo of two beds", "additional_info": {"tag": "counting", "include": [{"class": "bed", "count": 2}], "exclude": [{"class": "bed", "count": 3}], "prompt": "a photo of two beds"}}
+{"index": 215, "data": "a photo of two tv remotes", "additional_info": {"tag": "counting", "include": [{"class": "tv remote", "count": 2}], "exclude": [{"class": "tv remote", "count": 3}], "prompt": "a photo of two tv remotes"}}
+{"index": 216, "data": "a photo of three fire hydrants", "additional_info": {"tag": "counting", "include": [{"class": "fire hydrant", "count": 3}], "exclude": [{"class": "fire hydrant", "count": 4}], "prompt": "a photo of three fire hydrants"}}
+{"index": 217, "data": "a photo of three books", "additional_info": {"tag": "counting", "include": [{"class": "book", "count": 3}], "exclude": [{"class": "book", "count": 4}], "prompt": "a photo of three books"}}
+{"index": 218, "data": "a photo of four giraffes", "additional_info": {"tag": "counting", "include": [{"class": "giraffe", "count": 4}], "exclude": [{"class": "giraffe", "count": 5}], "prompt": "a photo of four giraffes"}}
+{"index": 219, "data": "a photo of two vases", "additional_info": {"tag": "counting", "include": [{"class": "vase", "count": 2}], "exclude": [{"class": "vase", "count": 3}], "prompt": "a photo of two vases"}}
+{"index": 220, "data": "a photo of four donuts", "additional_info": {"tag": "counting", "include": [{"class": "donut", "count": 4}], "exclude": [{"class": "donut", "count": 5}], "prompt": "a photo of four donuts"}}
+{"index": 221, "data": "a photo of four chairs", "additional_info": {"tag": "counting", "include": [{"class": "chair", "count": 4}], "exclude": [{"class": "chair", "count": 5}], "prompt": "a photo of four chairs"}}
+{"index": 222, "data": "a photo of three baseball bats", "additional_info": {"tag": "counting", "include": [{"class": "baseball bat", "count": 3}], "exclude": [{"class": "baseball bat", "count": 4}], "prompt": "a photo of three baseball bats"}}
+{"index": 223, "data": "a photo of four stop signs", "additional_info": {"tag": "counting", "include": [{"class": "stop sign", "count": 4}], "exclude": [{"class": "stop sign", "count": 5}], "prompt": "a photo of four stop signs"}}
+{"index": 224, "data": "a photo of two pizzas", "additional_info": {"tag": "counting", "include": [{"class": "pizza", "count": 2}], "exclude": [{"class": "pizza", "count": 3}], "prompt": "a photo of two pizzas"}}
+{"index": 225, "data": "a photo of three refrigerators", "additional_info": {"tag": "counting", "include": [{"class": "refrigerator", "count": 3}], "exclude": [{"class": "refrigerator", "count": 4}], "prompt": "a photo of three refrigerators"}}
+{"index": 226, "data": "a photo of two fire hydrants", "additional_info": {"tag": "counting", "include": [{"class": "fire hydrant", "count": 2}], "exclude": [{"class": "fire hydrant", "count": 3}], "prompt": "a photo of two fire hydrants"}}
+{"index": 227, "data": "a photo of three giraffes", "additional_info": {"tag": "counting", "include": [{"class": "giraffe", "count": 3}], "exclude": [{"class": "giraffe", "count": 4}], "prompt": "a photo of three giraffes"}}
+{"index": 228, "data": "a photo of four tvs", "additional_info": {"tag": "counting", "include": [{"class": "tv", "count": 4}], "exclude": [{"class": "tv", "count": 5}], "prompt": "a photo of four tvs"}}
+{"index": 229, "data": "a photo of three wine glasses", "additional_info": {"tag": "counting", "include": [{"class": "wine glass", "count": 3}], "exclude": [{"class": "wine glass", "count": 4}], "prompt": "a photo of three wine glasses"}}
+{"index": 230, "data": "a photo of four broccolis", "additional_info": {"tag": "counting", "include": [{"class": "broccoli", "count": 4}], "exclude": [{"class": "broccoli", "count": 5}], "prompt": "a photo of four broccolis"}}
+{"index": 231, "data": "a photo of three trucks", "additional_info": {"tag": "counting", "include": [{"class": "truck", "count": 3}], "exclude": [{"class": "truck", "count": 4}], "prompt": "a photo of three trucks"}}
+{"index": 232, "data": "a photo of two trucks", "additional_info": {"tag": "counting", "include": [{"class": "truck", "count": 2}], "exclude": [{"class": "truck", "count": 3}], "prompt": "a photo of two trucks"}}
+{"index": 233, "data": "a photo of two carrots", "additional_info": {"tag": "counting", "include": [{"class": "carrot", "count": 2}], "exclude": [{"class": "carrot", "count": 3}], "prompt": "a photo of two carrots"}}
+{"index": 234, "data": "a photo of two sandwichs", "additional_info": {"tag": "counting", "include": [{"class": "sandwich", "count": 2}], "exclude": [{"class": "sandwich", "count": 3}], "prompt": "a photo of two sandwichs"}}
+{"index": 235, "data": "a photo of four traffic lights", "additional_info": {"tag": "counting", "include": [{"class": "traffic light", "count": 4}], "exclude": [{"class": "traffic light", "count": 5}], "prompt": "a photo of four traffic lights"}}
+{"index": 236, "data": "a photo of four clocks", "additional_info": {"tag": "counting", "include": [{"class": "clock", "count": 4}], "exclude": [{"class": "clock", "count": 5}], "prompt": "a photo of four clocks"}}
+{"index": 237, "data": "a photo of two cars", "additional_info": {"tag": "counting", "include": [{"class": "car", "count": 2}], "exclude": [{"class": "car", "count": 3}], "prompt": "a photo of two cars"}}
+{"index": 238, "data": "a photo of two bananas", "additional_info": {"tag": "counting", "include": [{"class": "banana", "count": 2}], "exclude": [{"class": "banana", "count": 3}], "prompt": "a photo of two bananas"}}
+{"index": 239, "data": "a photo of two wine glasses", "additional_info": {"tag": "counting", "include": [{"class": "wine glass", "count": 2}], "exclude": [{"class": "wine glass", "count": 3}], "prompt": "a photo of two wine glasses"}}
+{"index": 240, "data": "a photo of three pizzas", "additional_info": {"tag": "counting", "include": [{"class": "pizza", "count": 3}], "exclude": [{"class": "pizza", "count": 4}], "prompt": "a photo of three pizzas"}}
+{"index": 241, "data": "a photo of four knifes", "additional_info": {"tag": "counting", "include": [{"class": "knife", "count": 4}], "exclude": [{"class": "knife", "count": 5}], "prompt": "a photo of four knifes"}}
+{"index": 242, "data": "a photo of three suitcases", "additional_info": {"tag": "counting", "include": [{"class": "suitcase", "count": 3}], "exclude": [{"class": "suitcase", "count": 4}], "prompt": "a photo of three suitcases"}}
+{"index": 243, "data": "a photo of four zebras", "additional_info": {"tag": "counting", "include": [{"class": "zebra", "count": 4}], "exclude": [{"class": "zebra", "count": 5}], "prompt": "a photo of four zebras"}}
+{"index": 244, "data": "a photo of two teddy bears", "additional_info": {"tag": "counting", "include": [{"class": "teddy bear", "count": 2}], "exclude": [{"class": "teddy bear", "count": 3}], "prompt": "a photo of two teddy bears"}}
+{"index": 245, "data": "a photo of four skateboards", "additional_info": {"tag": "counting", "include": [{"class": "skateboard", "count": 4}], "exclude": [{"class": "skateboard", "count": 5}], "prompt": "a photo of four skateboards"}}
+{"index": 246, "data": "a photo of four hot dogs", "additional_info": {"tag": "counting", "include": [{"class": "hot dog", "count": 4}], "exclude": [{"class": "hot dog", "count": 5}], "prompt": "a photo of four hot dogs"}}
+{"index": 247, "data": "a photo of three birds", "additional_info": {"tag": "counting", "include": [{"class": "bird", "count": 3}], "exclude": [{"class": "bird", "count": 4}], "prompt": "a photo of three birds"}}
+{"index": 248, "data": "a photo of four boats", "additional_info": {"tag": "counting", "include": [{"class": "boat", "count": 4}], "exclude": [{"class": "boat", "count": 5}], "prompt": "a photo of four boats"}}
+{"index": 249, "data": "a photo of four microwaves", "additional_info": {"tag": "counting", "include": [{"class": "microwave", "count": 4}], "exclude": [{"class": "microwave", "count": 5}], "prompt": "a photo of four microwaves"}}
+{"index": 250, "data": "a photo of two hair driers", "additional_info": {"tag": "counting", "include": [{"class": "hair drier", "count": 2}], "exclude": [{"class": "hair drier", "count": 3}], "prompt": "a photo of two hair driers"}}
+{"index": 251, "data": "a photo of three laptops", "additional_info": {"tag": "counting", "include": [{"class": "laptop", "count": 3}], "exclude": [{"class": "laptop", "count": 4}], "prompt": "a photo of three laptops"}}
+{"index": 252, "data": "a photo of three cows", "additional_info": {"tag": "counting", "include": [{"class": "cow", "count": 3}], "exclude": [{"class": "cow", "count": 4}], "prompt": "a photo of three cows"}}
+{"index": 253, "data": "a photo of two parking meters", "additional_info": {"tag": "counting", "include": [{"class": "parking meter", "count": 2}], "exclude": [{"class": "parking meter", "count": 3}], "prompt": "a photo of two parking meters"}}
+{"index": 254, "data": "a photo of four benchs", "additional_info": {"tag": "counting", "include": [{"class": "bench", "count": 4}], "exclude": [{"class": "bench", "count": 5}], "prompt": "a photo of four benchs"}}
+{"index": 255, "data": "a photo of three benchs", "additional_info": {"tag": "counting", "include": [{"class": "bench", "count": 3}], "exclude": [{"class": "bench", "count": 4}], "prompt": "a photo of three benchs"}}
+{"index": 256, "data": "a photo of four frisbees", "additional_info": {"tag": "counting", "include": [{"class": "frisbee", "count": 4}], "exclude": [{"class": "frisbee", "count": 5}], "prompt": "a photo of four frisbees"}}
+{"index": 257, "data": "a photo of four books", "additional_info": {"tag": "counting", "include": [{"class": "book", "count": 4}], "exclude": [{"class": "book", "count": 5}], "prompt": "a photo of four books"}}
+{"index": 258, "data": "a photo of four buses", "additional_info": {"tag": "counting", "include": [{"class": "bus", "count": 4}], "exclude": [{"class": "bus", "count": 5}], "prompt": "a photo of four buses"}}
+{"index": 259, "data": "a photo of a blue fire hydrant", "additional_info": {"tag": "colors", "include": [{"class": "fire hydrant", "count": 1, "color": "blue"}], "prompt": "a photo of a blue fire hydrant"}}
+{"index": 260, "data": "a photo of a pink car", "additional_info": {"tag": "colors", "include": [{"class": "car", "count": 1, "color": "pink"}], "prompt": "a photo of a pink car"}}
+{"index": 261, "data": "a photo of a purple cup", "additional_info": {"tag": "colors", "include": [{"class": "cup", "count": 1, "color": "purple"}], "prompt": "a photo of a purple cup"}}
+{"index": 262, "data": "a photo of a blue cow", "additional_info": {"tag": "colors", "include": [{"class": "cow", "count": 1, "color": "blue"}], "prompt": "a photo of a blue cow"}}
+{"index": 263, "data": "a photo of a yellow boat", "additional_info": {"tag": "colors", "include": [{"class": "boat", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow boat"}}
+{"index": 264, "data": "a photo of a blue umbrella", "additional_info": {"tag": "colors", "include": [{"class": "umbrella", "count": 1, "color": "blue"}], "prompt": "a photo of a blue umbrella"}}
+{"index": 265, "data": "a photo of a blue elephant", "additional_info": {"tag": "colors", "include": [{"class": "elephant", "count": 1, "color": "blue"}], "prompt": "a photo of a blue elephant"}}
+{"index": 266, "data": "a photo of a yellow elephant", "additional_info": {"tag": "colors", "include": [{"class": "elephant", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow elephant"}}
+{"index": 267, "data": "a photo of a red bicycle", "additional_info": {"tag": "colors", "include": [{"class": "bicycle", "count": 1, "color": "red"}], "prompt": "a photo of a red bicycle"}}
+{"index": 268, "data": "a photo of a purple suitcase", "additional_info": {"tag": "colors", "include": [{"class": "suitcase", "count": 1, "color": "purple"}], "prompt": "a photo of a purple suitcase"}}
+{"index": 269, "data": "a photo of a purple hair drier", "additional_info": {"tag": "colors", "include": [{"class": "hair drier", "count": 1, "color": "purple"}], "prompt": "a photo of a purple hair drier"}}
+{"index": 270, "data": "a photo of a white sandwich", "additional_info": {"tag": "colors", "include": [{"class": "sandwich", "count": 1, "color": "white"}], "prompt": "a photo of a white sandwich"}}
+{"index": 271, "data": "a photo of a purple elephant", "additional_info": {"tag": "colors", "include": [{"class": "elephant", "count": 1, "color": "purple"}], "prompt": "a photo of a purple elephant"}}
+{"index": 272, "data": "a photo of a green microwave", "additional_info": {"tag": "colors", "include": [{"class": "microwave", "count": 1, "color": "green"}], "prompt": "a photo of a green microwave"}}
+{"index": 273, "data": "a photo of a red zebra", "additional_info": {"tag": "colors", "include": [{"class": "zebra", "count": 1, "color": "red"}], "prompt": "a photo of a red zebra"}}
+{"index": 274, "data": "a photo of a red apple", "additional_info": {"tag": "colors", "include": [{"class": "apple", "count": 1, "color": "red"}], "prompt": "a photo of a red apple"}}
+{"index": 275, "data": "a photo of a yellow tv remote", "additional_info": {"tag": "colors", "include": [{"class": "tv remote", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow tv remote"}}
+{"index": 276, "data": "a photo of a blue toilet", "additional_info": {"tag": "colors", "include": [{"class": "toilet", "count": 1, "color": "blue"}], "prompt": "a photo of a blue toilet"}}
+{"index": 277, "data": "a photo of an orange orange", "additional_info": {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "orange"}], "prompt": "a photo of an orange orange"}}
+{"index": 278, "data": "a photo of a black donut", "additional_info": {"tag": "colors", "include": [{"class": "donut", "count": 1, "color": "black"}], "prompt": "a photo of a black donut"}}
+{"index": 279, "data": "a photo of a red vase", "additional_info": {"tag": "colors", "include": [{"class": "vase", "count": 1, "color": "red"}], "prompt": "a photo of a red vase"}}
+{"index": 280, "data": "a photo of a purple pizza", "additional_info": {"tag": "colors", "include": [{"class": "pizza", "count": 1, "color": "purple"}], "prompt": "a photo of a purple pizza"}}
+{"index": 281, "data": "a photo of a pink skateboard", "additional_info": {"tag": "colors", "include": [{"class": "skateboard", "count": 1, "color": "pink"}], "prompt": "a photo of a pink skateboard"}}
+{"index": 282, "data": "a photo of a green skateboard", "additional_info": {"tag": "colors", "include": [{"class": "skateboard", "count": 1, "color": "green"}], "prompt": "a photo of a green skateboard"}}
+{"index": 283, "data": "a photo of a purple bear", "additional_info": {"tag": "colors", "include": [{"class": "bear", "count": 1, "color": "purple"}], "prompt": "a photo of a purple bear"}}
+{"index": 284, "data": "a photo of a brown chair", "additional_info": {"tag": "colors", "include": [{"class": "chair", "count": 1, "color": "brown"}], "prompt": "a photo of a brown chair"}}
+{"index": 285, "data": "a photo of a brown computer keyboard", "additional_info": {"tag": "colors", "include": [{"class": "computer keyboard", "count": 1, "color": "brown"}], "prompt": "a photo of a brown computer keyboard"}}
+{"index": 286, "data": "a photo of an orange cow", "additional_info": {"tag": "colors", "include": [{"class": "cow", "count": 1, "color": "orange"}], "prompt": "a photo of an orange cow"}}
+{"index": 287, "data": "a photo of a brown skis", "additional_info": {"tag": "colors", "include": [{"class": "skis", "count": 1, "color": "brown"}], "prompt": "a photo of a brown skis"}}
+{"index": 288, "data": "a photo of a white kite", "additional_info": {"tag": "colors", "include": [{"class": "kite", "count": 1, "color": "white"}], "prompt": "a photo of a white kite"}}
+{"index": 289, "data": "a photo of a red dog", "additional_info": {"tag": "colors", "include": [{"class": "dog", "count": 1, "color": "red"}], "prompt": "a photo of a red dog"}}
+{"index": 290, "data": "a photo of a green couch", "additional_info": {"tag": "colors", "include": [{"class": "couch", "count": 1, "color": "green"}], "prompt": "a photo of a green couch"}}
+{"index": 291, "data": "a photo of a yellow airplane", "additional_info": {"tag": "colors", "include": [{"class": "airplane", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow airplane"}}
+{"index": 292, "data": "a photo of an orange tv", "additional_info": {"tag": "colors", "include": [{"class": "tv", "count": 1, "color": "orange"}], "prompt": "a photo of an orange tv"}}
+{"index": 293, "data": "a photo of a white scissors", "additional_info": {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "white"}], "prompt": "a photo of a white scissors"}}
+{"index": 294, "data": "a photo of a pink cell phone", "additional_info": {"tag": "colors", "include": [{"class": "cell phone", "count": 1, "color": "pink"}], "prompt": "a photo of a pink cell phone"}}
+{"index": 295, "data": "a photo of a green surfboard", "additional_info": {"tag": "colors", "include": [{"class": "surfboard", "count": 1, "color": "green"}], "prompt": "a photo of a green surfboard"}}
+{"index": 296, "data": "a photo of a white fire hydrant", "additional_info": {"tag": "colors", "include": [{"class": "fire hydrant", "count": 1, "color": "white"}], "prompt": "a photo of a white fire hydrant"}}
+{"index": 297, "data": "a photo of a black bicycle", "additional_info": {"tag": "colors", "include": [{"class": "bicycle", "count": 1, "color": "black"}], "prompt": "a photo of a black bicycle"}}
+{"index": 298, "data": "a photo of a purple carrot", "additional_info": {"tag": "colors", "include": [{"class": "carrot", "count": 1, "color": "purple"}], "prompt": "a photo of a purple carrot"}}
+{"index": 299, "data": "a photo of a black dining table", "additional_info": {"tag": "colors", "include": [{"class": "dining table", "count": 1, "color": "black"}], "prompt": "a photo of a black dining table"}}
+{"index": 300, "data": "a photo of a purple potted plant", "additional_info": {"tag": "colors", "include": [{"class": "potted plant", "count": 1, "color": "purple"}], "prompt": "a photo of a purple potted plant"}}
+{"index": 301, "data": "a photo of a purple backpack", "additional_info": {"tag": "colors", "include": [{"class": "backpack", "count": 1, "color": "purple"}], "prompt": "a photo of a purple backpack"}}
+{"index": 302, "data": "a photo of a yellow train", "additional_info": {"tag": "colors", "include": [{"class": "train", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow train"}}
+{"index": 303, "data": "a photo of a pink potted plant", "additional_info": {"tag": "colors", "include": [{"class": "potted plant", "count": 1, "color": "pink"}], "prompt": "a photo of a pink potted plant"}}
+{"index": 304, "data": "a photo of a red giraffe", "additional_info": {"tag": "colors", "include": [{"class": "giraffe", "count": 1, "color": "red"}], "prompt": "a photo of a red giraffe"}}
+{"index": 305, "data": "a photo of a brown bear", "additional_info": {"tag": "colors", "include": [{"class": "bear", "count": 1, "color": "brown"}], "prompt": "a photo of a brown bear"}}
+{"index": 306, "data": "a photo of a black train", "additional_info": {"tag": "colors", "include": [{"class": "train", "count": 1, "color": "black"}], "prompt": "a photo of a black train"}}
+{"index": 307, "data": "a photo of an orange laptop", "additional_info": {"tag": "colors", "include": [{"class": "laptop", "count": 1, "color": "orange"}], "prompt": "a photo of an orange laptop"}}
+{"index": 308, "data": "a photo of a green hot dog", "additional_info": {"tag": "colors", "include": [{"class": "hot dog", "count": 1, "color": "green"}], "prompt": "a photo of a green hot dog"}}
+{"index": 309, "data": "a photo of a yellow parking meter", "additional_info": {"tag": "colors", "include": [{"class": "parking meter", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow parking meter"}}
+{"index": 310, "data": "a photo of a red potted plant", "additional_info": {"tag": "colors", "include": [{"class": "potted plant", "count": 1, "color": "red"}], "prompt": "a photo of a red potted plant"}}
+{"index": 311, "data": "a photo of a green traffic light", "additional_info": {"tag": "colors", "include": [{"class": "traffic light", "count": 1, "color": "green"}], "prompt": "a photo of a green traffic light"}}
+{"index": 312, "data": "a photo of a blue tv", "additional_info": {"tag": "colors", "include": [{"class": "tv", "count": 1, "color": "blue"}], "prompt": "a photo of a blue tv"}}
+{"index": 313, "data": "a photo of a brown refrigerator", "additional_info": {"tag": "colors", "include": [{"class": "refrigerator", "count": 1, "color": "brown"}], "prompt": "a photo of a brown refrigerator"}}
+{"index": 314, "data": "a photo of a black tv remote", "additional_info": {"tag": "colors", "include": [{"class": "tv remote", "count": 1, "color": "black"}], "prompt": "a photo of a black tv remote"}}
+{"index": 315, "data": "a photo of a purple scissors", "additional_info": {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "purple"}], "prompt": "a photo of a purple scissors"}}
+{"index": 316, "data": "a photo of a yellow orange", "additional_info": {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow orange"}}
+{"index": 317, "data": "a photo of a brown toaster", "additional_info": {"tag": "colors", "include": [{"class": "toaster", "count": 1, "color": "brown"}], "prompt": "a photo of a brown toaster"}}
+{"index": 318, "data": "a photo of a red parking meter", "additional_info": {"tag": "colors", "include": [{"class": "parking meter", "count": 1, "color": "red"}], "prompt": "a photo of a red parking meter"}}
+{"index": 319, "data": "a photo of a brown orange", "additional_info": {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "brown"}], "prompt": "a photo of a brown orange"}}
+{"index": 320, "data": "a photo of a green clock", "additional_info": {"tag": "colors", "include": [{"class": "clock", "count": 1, "color": "green"}], "prompt": "a photo of a green clock"}}
+{"index": 321, "data": "a photo of a white sheep", "additional_info": {"tag": "colors", "include": [{"class": "sheep", "count": 1, "color": "white"}], "prompt": "a photo of a white sheep"}}
+{"index": 322, "data": "a photo of a yellow oven", "additional_info": {"tag": "colors", "include": [{"class": "oven", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow oven"}}
+{"index": 323, "data": "a photo of a green vase", "additional_info": {"tag": "colors", "include": [{"class": "vase", "count": 1, "color": "green"}], "prompt": "a photo of a green vase"}}
+{"index": 324, "data": "a photo of a black teddy bear", "additional_info": {"tag": "colors", "include": [{"class": "teddy bear", "count": 1, "color": "black"}], "prompt": "a photo of a black teddy bear"}}
+{"index": 325, "data": "a photo of a yellow carrot", "additional_info": {"tag": "colors", "include": [{"class": "carrot", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow carrot"}}
+{"index": 326, "data": "a photo of a black hot dog", "additional_info": {"tag": "colors", "include": [{"class": "hot dog", "count": 1, "color": "black"}], "prompt": "a photo of a black hot dog"}}
+{"index": 327, "data": "a photo of a red scissors", "additional_info": {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "red"}], "prompt": "a photo of a red scissors"}}
+{"index": 328, "data": "a photo of a white teddy bear", "additional_info": {"tag": "colors", "include": [{"class": "teddy bear", "count": 1, "color": "white"}], "prompt": "a photo of a white teddy bear"}}
+{"index": 329, "data": "a photo of a black skis", "additional_info": {"tag": "colors", "include": [{"class": "skis", "count": 1, "color": "black"}], "prompt": "a photo of a black skis"}}
+{"index": 330, "data": "a photo of a blue dining table", "additional_info": {"tag": "colors", "include": [{"class": "dining table", "count": 1, "color": "blue"}], "prompt": "a photo of a blue dining table"}}
+{"index": 331, "data": "a photo of a black refrigerator", "additional_info": {"tag": "colors", "include": [{"class": "refrigerator", "count": 1, "color": "black"}], "prompt": "a photo of a black refrigerator"}}
+{"index": 332, "data": "a photo of a white dog", "additional_info": {"tag": "colors", "include": [{"class": "dog", "count": 1, "color": "white"}], "prompt": "a photo of a white dog"}}
+{"index": 333, "data": "a photo of an orange scissors", "additional_info": {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "orange"}], "prompt": "a photo of an orange scissors"}}
+{"index": 334, "data": "a photo of a red cell phone", "additional_info": {"tag": "colors", "include": [{"class": "cell phone", "count": 1, "color": "red"}], "prompt": "a photo of a red cell phone"}}
+{"index": 335, "data": "a photo of a white orange", "additional_info": {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "white"}], "prompt": "a photo of a white orange"}}
+{"index": 336, "data": "a photo of a blue clock", "additional_info": {"tag": "colors", "include": [{"class": "clock", "count": 1, "color": "blue"}], "prompt": "a photo of a blue clock"}}
+{"index": 337, "data": "a photo of a blue carrot", "additional_info": {"tag": "colors", "include": [{"class": "carrot", "count": 1, "color": "blue"}], "prompt": "a photo of a blue carrot"}}
+{"index": 338, "data": "a photo of a green motorcycle", "additional_info": {"tag": "colors", "include": [{"class": "motorcycle", "count": 1, "color": "green"}], "prompt": "a photo of a green motorcycle"}}
+{"index": 339, "data": "a photo of a pink stop sign", "additional_info": {"tag": "colors", "include": [{"class": "stop sign", "count": 1, "color": "pink"}], "prompt": "a photo of a pink stop sign"}}
+{"index": 340, "data": "a photo of a black vase", "additional_info": {"tag": "colors", "include": [{"class": "vase", "count": 1, "color": "black"}], "prompt": "a photo of a black vase"}}
+{"index": 341, "data": "a photo of a black backpack", "additional_info": {"tag": "colors", "include": [{"class": "backpack", "count": 1, "color": "black"}], "prompt": "a photo of a black backpack"}}
+{"index": 342, "data": "a photo of a red car", "additional_info": {"tag": "colors", "include": [{"class": "car", "count": 1, "color": "red"}], "prompt": "a photo of a red car"}}
+{"index": 343, "data": "a photo of a green computer mouse", "additional_info": {"tag": "colors", "include": [{"class": "computer mouse", "count": 1, "color": "green"}], "prompt": "a photo of a green computer mouse"}}
+{"index": 344, "data": "a photo of a red backpack", "additional_info": {"tag": "colors", "include": [{"class": "backpack", "count": 1, "color": "red"}], "prompt": "a photo of a red backpack"}}
+{"index": 345, "data": "a photo of a green bus", "additional_info": {"tag": "colors", "include": [{"class": "bus", "count": 1, "color": "green"}], "prompt": "a photo of a green bus"}}
+{"index": 346, "data": "a photo of an orange toaster", "additional_info": {"tag": "colors", "include": [{"class": "toaster", "count": 1, "color": "orange"}], "prompt": "a photo of an orange toaster"}}
+{"index": 347, "data": "a photo of a yellow fork", "additional_info": {"tag": "colors", "include": [{"class": "fork", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow fork"}}
+{"index": 348, "data": "a photo of a pink parking meter", "additional_info": {"tag": "colors", "include": [{"class": "parking meter", "count": 1, "color": "pink"}], "prompt": "a photo of a pink parking meter"}}
+{"index": 349, "data": "a photo of a blue book", "additional_info": {"tag": "colors", "include": [{"class": "book", "count": 1, "color": "blue"}], "prompt": "a photo of a blue book"}}
+{"index": 350, "data": "a photo of a yellow broccoli", "additional_info": {"tag": "colors", "include": [{"class": "broccoli", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow broccoli"}}
+{"index": 351, "data": "a photo of an orange computer mouse", "additional_info": {"tag": "colors", "include": [{"class": "computer mouse", "count": 1, "color": "orange"}], "prompt": "a photo of an orange computer mouse"}}
+{"index": 352, "data": "a photo of a red cake", "additional_info": {"tag": "colors", "include": [{"class": "cake", "count": 1, "color": "red"}], "prompt": "a photo of a red cake"}}
+{"index": 353, "data": "a photo of a dog right of a teddy bear", "additional_info": {"tag": "position", "include": [{"class": "teddy bear", "count": 1}, {"class": "dog", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a dog right of a teddy bear"}}
+{"index": 354, "data": "a photo of a wine glass above a kite", "additional_info": {"tag": "position", "include": [{"class": "kite", "count": 1}, {"class": "wine glass", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a wine glass above a kite"}}
+{"index": 355, "data": "a photo of a couch below a cup", "additional_info": {"tag": "position", "include": [{"class": "cup", "count": 1}, {"class": "couch", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a couch below a cup"}}
+{"index": 356, "data": "a photo of a laptop left of a cow", "additional_info": {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "laptop", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a laptop left of a cow"}}
+{"index": 357, "data": "a photo of a fork above a hair drier", "additional_info": {"tag": "position", "include": [{"class": "hair drier", "count": 1}, {"class": "fork", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a fork above a hair drier"}}
+{"index": 358, "data": "a photo of a tie right of a baseball bat", "additional_info": {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "tie", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a tie right of a baseball bat"}}
+{"index": 359, "data": "a photo of a stop sign above a fork", "additional_info": {"tag": "position", "include": [{"class": "fork", "count": 1}, {"class": "stop sign", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a stop sign above a fork"}}
+{"index": 360, "data": "a photo of a bird below a skateboard", "additional_info": {"tag": "position", "include": [{"class": "skateboard", "count": 1}, {"class": "bird", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a bird below a skateboard"}}
+{"index": 361, "data": "a photo of an apple above a tv", "additional_info": {"tag": "position", "include": [{"class": "tv", "count": 1}, {"class": "apple", "count": 1, "position": ["above", 0]}], "prompt": "a photo of an apple above a tv"}}
+{"index": 362, "data": "a photo of a train above a potted plant", "additional_info": {"tag": "position", "include": [{"class": "potted plant", "count": 1}, {"class": "train", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a train above a potted plant"}}
+{"index": 363, "data": "a photo of a truck left of a refrigerator", "additional_info": {"tag": "position", "include": [{"class": "refrigerator", "count": 1}, {"class": "truck", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a truck left of a refrigerator"}}
+{"index": 364, "data": "a photo of a tv remote below a cow", "additional_info": {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "tv remote", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a tv remote below a cow"}}
+{"index": 365, "data": "a photo of a bottle right of a train", "additional_info": {"tag": "position", "include": [{"class": "train", "count": 1}, {"class": "bottle", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a bottle right of a train"}}
+{"index": 366, "data": "a photo of a dog above a cow", "additional_info": {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "dog", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a dog above a cow"}}
+{"index": 367, "data": "a photo of a skateboard above a person", "additional_info": {"tag": "position", "include": [{"class": "person", "count": 1}, {"class": "skateboard", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a skateboard above a person"}}
+{"index": 368, "data": "a photo of a baseball glove below an umbrella", "additional_info": {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "baseball glove", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a baseball glove below an umbrella"}}
+{"index": 369, "data": "a photo of a dining table right of an oven", "additional_info": {"tag": "position", "include": [{"class": "oven", "count": 1}, {"class": "dining table", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a dining table right of an oven"}}
+{"index": 370, "data": "a photo of a hot dog left of a suitcase", "additional_info": {"tag": "position", "include": [{"class": "suitcase", "count": 1}, {"class": "hot dog", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a hot dog left of a suitcase"}}
+{"index": 371, "data": "a photo of a bus below a toothbrush", "additional_info": {"tag": "position", "include": [{"class": "toothbrush", "count": 1}, {"class": "bus", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a bus below a toothbrush"}}
+{"index": 372, "data": "a photo of a backpack right of a sandwich", "additional_info": {"tag": "position", "include": [{"class": "sandwich", "count": 1}, {"class": "backpack", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a backpack right of a sandwich"}}
+{"index": 373, "data": "a photo of a cake below a baseball bat", "additional_info": {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "cake", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cake below a baseball bat"}}
+{"index": 374, "data": "a photo of a dog right of a tie", "additional_info": {"tag": "position", "include": [{"class": "tie", "count": 1}, {"class": "dog", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a dog right of a tie"}}
+{"index": 375, "data": "a photo of a suitcase right of a boat", "additional_info": {"tag": "position", "include": [{"class": "boat", "count": 1}, {"class": "suitcase", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a suitcase right of a boat"}}
+{"index": 376, "data": "a photo of a bear above a clock", "additional_info": {"tag": "position", "include": [{"class": "clock", "count": 1}, {"class": "bear", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bear above a clock"}}
+{"index": 377, "data": "a photo of a tv remote left of an umbrella", "additional_info": {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "tv remote", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a tv remote left of an umbrella"}}
+{"index": 378, "data": "a photo of a sports ball left of an umbrella", "additional_info": {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "sports ball", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a sports ball left of an umbrella"}}
+{"index": 379, "data": "a photo of a train right of a dining table", "additional_info": {"tag": "position", "include": [{"class": "dining table", "count": 1}, {"class": "train", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a train right of a dining table"}}
+{"index": 380, "data": "a photo of a hair drier below an elephant", "additional_info": {"tag": "position", "include": [{"class": "elephant", "count": 1}, {"class": "hair drier", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a hair drier below an elephant"}}
+{"index": 381, "data": "a photo of a tennis racket right of a spoon", "additional_info": {"tag": "position", "include": [{"class": "spoon", "count": 1}, {"class": "tennis racket", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a tennis racket right of a spoon"}}
+{"index": 382, "data": "a photo of a wine glass right of a hot dog", "additional_info": {"tag": "position", "include": [{"class": "hot dog", "count": 1}, {"class": "wine glass", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a wine glass right of a hot dog"}}
+{"index": 383, "data": "a photo of a computer mouse left of a bench", "additional_info": {"tag": "position", "include": [{"class": "bench", "count": 1}, {"class": "computer mouse", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a computer mouse left of a bench"}}
+{"index": 384, "data": "a photo of a carrot left of an orange", "additional_info": {"tag": "position", "include": [{"class": "orange", "count": 1}, {"class": "carrot", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a carrot left of an orange"}}
+{"index": 385, "data": "a photo of a kite above a toothbrush", "additional_info": {"tag": "position", "include": [{"class": "toothbrush", "count": 1}, {"class": "kite", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a kite above a toothbrush"}}
+{"index": 386, "data": "a photo of a toaster below a traffic light", "additional_info": {"tag": "position", "include": [{"class": "traffic light", "count": 1}, {"class": "toaster", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a toaster below a traffic light"}}
+{"index": 387, "data": "a photo of a cat below a baseball glove", "additional_info": {"tag": "position", "include": [{"class": "baseball glove", "count": 1}, {"class": "cat", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cat below a baseball glove"}}
+{"index": 388, "data": "a photo of a skis right of a zebra", "additional_info": {"tag": "position", "include": [{"class": "zebra", "count": 1}, {"class": "skis", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a skis right of a zebra"}}
+{"index": 389, "data": "a photo of a stop sign above a chair", "additional_info": {"tag": "position", "include": [{"class": "chair", "count": 1}, {"class": "stop sign", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a stop sign above a chair"}}
+{"index": 390, "data": "a photo of a stop sign above a parking meter", "additional_info": {"tag": "position", "include": [{"class": "parking meter", "count": 1}, {"class": "stop sign", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a stop sign above a parking meter"}}
+{"index": 391, "data": "a photo of a hot dog right of a skateboard", "additional_info": {"tag": "position", "include": [{"class": "skateboard", "count": 1}, {"class": "hot dog", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a hot dog right of a skateboard"}}
+{"index": 392, "data": "a photo of a pizza below a computer keyboard", "additional_info": {"tag": "position", "include": [{"class": "computer keyboard", "count": 1}, {"class": "pizza", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a pizza below a computer keyboard"}}
+{"index": 393, "data": "a photo of a hair drier left of a toilet", "additional_info": {"tag": "position", "include": [{"class": "toilet", "count": 1}, {"class": "hair drier", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a hair drier left of a toilet"}}
+{"index": 394, "data": "a photo of a cow left of a stop sign", "additional_info": {"tag": "position", "include": [{"class": "stop sign", "count": 1}, {"class": "cow", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a cow left of a stop sign"}}
+{"index": 395, "data": "a photo of a suitcase above a skis", "additional_info": {"tag": "position", "include": [{"class": "skis", "count": 1}, {"class": "suitcase", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a suitcase above a skis"}}
+{"index": 396, "data": "a photo of a book above a laptop", "additional_info": {"tag": "position", "include": [{"class": "laptop", "count": 1}, {"class": "book", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a book above a laptop"}}
+{"index": 397, "data": "a photo of a toothbrush below a pizza", "additional_info": {"tag": "position", "include": [{"class": "pizza", "count": 1}, {"class": "toothbrush", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a toothbrush below a pizza"}}
+{"index": 398, "data": "a photo of a toilet left of a kite", "additional_info": {"tag": "position", "include": [{"class": "kite", "count": 1}, {"class": "toilet", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a toilet left of a kite"}}
+{"index": 399, "data": "a photo of a tie above a sink", "additional_info": {"tag": "position", "include": [{"class": "sink", "count": 1}, {"class": "tie", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a tie above a sink"}}
+{"index": 400, "data": "a photo of a bird left of a couch", "additional_info": {"tag": "position", "include": [{"class": "couch", "count": 1}, {"class": "bird", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a bird left of a couch"}}
+{"index": 401, "data": "a photo of a bed right of a sports ball", "additional_info": {"tag": "position", "include": [{"class": "sports ball", "count": 1}, {"class": "bed", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a bed right of a sports ball"}}
+{"index": 402, "data": "a photo of an elephant below a surfboard", "additional_info": {"tag": "position", "include": [{"class": "surfboard", "count": 1}, {"class": "elephant", "count": 1, "position": ["below", 0]}], "prompt": "a photo of an elephant below a surfboard"}}
+{"index": 403, "data": "a photo of a frisbee right of a motorcycle", "additional_info": {"tag": "position", "include": [{"class": "motorcycle", "count": 1}, {"class": "frisbee", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a frisbee right of a motorcycle"}}
+{"index": 404, "data": "a photo of a vase above a fire hydrant", "additional_info": {"tag": "position", "include": [{"class": "fire hydrant", "count": 1}, {"class": "vase", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a vase above a fire hydrant"}}
+{"index": 405, "data": "a photo of a zebra left of an elephant", "additional_info": {"tag": "position", "include": [{"class": "elephant", "count": 1}, {"class": "zebra", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a zebra left of an elephant"}}
+{"index": 406, "data": "a photo of a bench left of a bear", "additional_info": {"tag": "position", "include": [{"class": "bear", "count": 1}, {"class": "bench", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a bench left of a bear"}}
+{"index": 407, "data": "a photo of a donut right of a bench", "additional_info": {"tag": "position", "include": [{"class": "bench", "count": 1}, {"class": "donut", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a donut right of a bench"}}
+{"index": 408, "data": "a photo of a frisbee below a horse", "additional_info": {"tag": "position", "include": [{"class": "horse", "count": 1}, {"class": "frisbee", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a frisbee below a horse"}}
+{"index": 409, "data": "a photo of a computer keyboard above a snowboard", "additional_info": {"tag": "position", "include": [{"class": "snowboard", "count": 1}, {"class": "computer keyboard", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a computer keyboard above a snowboard"}}
+{"index": 410, "data": "a photo of a tv below a cow", "additional_info": {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "tv", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a tv below a cow"}}
+{"index": 411, "data": "a photo of an elephant below a horse", "additional_info": {"tag": "position", "include": [{"class": "horse", "count": 1}, {"class": "elephant", "count": 1, "position": ["below", 0]}], "prompt": "a photo of an elephant below a horse"}}
+{"index": 412, "data": "a photo of a suitcase left of a banana", "additional_info": {"tag": "position", "include": [{"class": "banana", "count": 1}, {"class": "suitcase", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a suitcase left of a banana"}}
+{"index": 413, "data": "a photo of a train below an airplane", "additional_info": {"tag": "position", "include": [{"class": "airplane", "count": 1}, {"class": "train", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a train below an airplane"}}
+{"index": 414, "data": "a photo of a cat below a backpack", "additional_info": {"tag": "position", "include": [{"class": "backpack", "count": 1}, {"class": "cat", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cat below a backpack"}}
+{"index": 415, "data": "a photo of a backpack below a cake", "additional_info": {"tag": "position", "include": [{"class": "cake", "count": 1}, {"class": "backpack", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a backpack below a cake"}}
+{"index": 416, "data": "a photo of a sandwich below a knife", "additional_info": {"tag": "position", "include": [{"class": "knife", "count": 1}, {"class": "sandwich", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a sandwich below a knife"}}
+{"index": 417, "data": "a photo of a bicycle above a parking meter", "additional_info": {"tag": "position", "include": [{"class": "parking meter", "count": 1}, {"class": "bicycle", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bicycle above a parking meter"}}
+{"index": 418, "data": "a photo of a knife right of a suitcase", "additional_info": {"tag": "position", "include": [{"class": "suitcase", "count": 1}, {"class": "knife", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a knife right of a suitcase"}}
+{"index": 419, "data": "a photo of a hot dog above a knife", "additional_info": {"tag": "position", "include": [{"class": "knife", "count": 1}, {"class": "hot dog", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a hot dog above a knife"}}
+{"index": 420, "data": "a photo of a zebra right of a parking meter", "additional_info": {"tag": "position", "include": [{"class": "parking meter", "count": 1}, {"class": "zebra", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a zebra right of a parking meter"}}
+{"index": 421, "data": "a photo of a chair left of a zebra", "additional_info": {"tag": "position", "include": [{"class": "zebra", "count": 1}, {"class": "chair", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a chair left of a zebra"}}
+{"index": 422, "data": "a photo of a cow below an airplane", "additional_info": {"tag": "position", "include": [{"class": "airplane", "count": 1}, {"class": "cow", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cow below an airplane"}}
+{"index": 423, "data": "a photo of a cup left of an umbrella", "additional_info": {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "cup", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a cup left of an umbrella"}}
+{"index": 424, "data": "a photo of a zebra below a computer keyboard", "additional_info": {"tag": "position", "include": [{"class": "computer keyboard", "count": 1}, {"class": "zebra", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a zebra below a computer keyboard"}}
+{"index": 425, "data": "a photo of a zebra below a broccoli", "additional_info": {"tag": "position", "include": [{"class": "broccoli", "count": 1}, {"class": "zebra", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a zebra below a broccoli"}}
+{"index": 426, "data": "a photo of a laptop below a sports ball", "additional_info": {"tag": "position", "include": [{"class": "sports ball", "count": 1}, {"class": "laptop", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a laptop below a sports ball"}}
+{"index": 427, "data": "a photo of a truck left of a baseball bat", "additional_info": {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "truck", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a truck left of a baseball bat"}}
+{"index": 428, "data": "a photo of a refrigerator above a baseball bat", "additional_info": {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "refrigerator", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a refrigerator above a baseball bat"}}
+{"index": 429, "data": "a photo of a tv above a baseball bat", "additional_info": {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "tv", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a tv above a baseball bat"}}
+{"index": 430, "data": "a photo of a baseball glove right of a bear", "additional_info": {"tag": "position", "include": [{"class": "bear", "count": 1}, {"class": "baseball glove", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a baseball glove right of a bear"}}
+{"index": 431, "data": "a photo of a refrigerator below a scissors", "additional_info": {"tag": "position", "include": [{"class": "scissors", "count": 1}, {"class": "refrigerator", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a refrigerator below a scissors"}}
+{"index": 432, "data": "a photo of a dining table above a suitcase", "additional_info": {"tag": "position", "include": [{"class": "suitcase", "count": 1}, {"class": "dining table", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a dining table above a suitcase"}}
+{"index": 433, "data": "a photo of a parking meter above a broccoli", "additional_info": {"tag": "position", "include": [{"class": "broccoli", "count": 1}, {"class": "parking meter", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a parking meter above a broccoli"}}
+{"index": 434, "data": "a photo of a frisbee above a truck", "additional_info": {"tag": "position", "include": [{"class": "truck", "count": 1}, {"class": "frisbee", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a frisbee above a truck"}}
+{"index": 435, "data": "a photo of a pizza right of a banana", "additional_info": {"tag": "position", "include": [{"class": "banana", "count": 1}, {"class": "pizza", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a pizza right of a banana"}}
+{"index": 436, "data": "a photo of a bus above a boat", "additional_info": {"tag": "position", "include": [{"class": "boat", "count": 1}, {"class": "bus", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bus above a boat"}}
+{"index": 437, "data": "a photo of a cell phone left of a tennis racket", "additional_info": {"tag": "position", "include": [{"class": "tennis racket", "count": 1}, {"class": "cell phone", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a cell phone left of a tennis racket"}}
+{"index": 438, "data": "a photo of a horse right of a broccoli", "additional_info": {"tag": "position", "include": [{"class": "broccoli", "count": 1}, {"class": "horse", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a horse right of a broccoli"}}
+{"index": 439, "data": "a photo of a broccoli above a bottle", "additional_info": {"tag": "position", "include": [{"class": "bottle", "count": 1}, {"class": "broccoli", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a broccoli above a bottle"}}
+{"index": 440, "data": "a photo of a vase right of a horse", "additional_info": {"tag": "position", "include": [{"class": "horse", "count": 1}, {"class": "vase", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a vase right of a horse"}}
+{"index": 441, "data": "a photo of a bear above a spoon", "additional_info": {"tag": "position", "include": [{"class": "spoon", "count": 1}, {"class": "bear", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bear above a spoon"}}
+{"index": 442, "data": "a photo of a zebra right of a bed", "additional_info": {"tag": "position", "include": [{"class": "bed", "count": 1}, {"class": "zebra", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a zebra right of a bed"}}
+{"index": 443, "data": "a photo of a cow right of a laptop", "additional_info": {"tag": "position", "include": [{"class": "laptop", "count": 1}, {"class": "cow", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a cow right of a laptop"}}
+{"index": 444, "data": "a photo of a bed right of a frisbee", "additional_info": {"tag": "position", "include": [{"class": "frisbee", "count": 1}, {"class": "bed", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a bed right of a frisbee"}}
+{"index": 445, "data": "a photo of a tie right of a motorcycle", "additional_info": {"tag": "position", "include": [{"class": "motorcycle", "count": 1}, {"class": "tie", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a tie right of a motorcycle"}}
+{"index": 446, "data": "a photo of a laptop right of a tv", "additional_info": {"tag": "position", "include": [{"class": "tv", "count": 1}, {"class": "laptop", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a laptop right of a tv"}}
+{"index": 447, "data": "a photo of a cell phone right of a chair", "additional_info": {"tag": "position", "include": [{"class": "chair", "count": 1}, {"class": "cell phone", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a cell phone right of a chair"}}
+{"index": 448, "data": "a photo of a couch below a potted plant", "additional_info": {"tag": "position", "include": [{"class": "potted plant", "count": 1}, {"class": "couch", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a couch below a potted plant"}}
+{"index": 449, "data": "a photo of a clock below a tv", "additional_info": {"tag": "position", "include": [{"class": "tv", "count": 1}, {"class": "clock", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a clock below a tv"}}
+{"index": 450, "data": "a photo of a couch below a vase", "additional_info": {"tag": "position", "include": [{"class": "vase", "count": 1}, {"class": "couch", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a couch below a vase"}}
+{"index": 451, "data": "a photo of a donut below a cat", "additional_info": {"tag": "position", "include": [{"class": "cat", "count": 1}, {"class": "donut", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a donut below a cat"}}
+{"index": 452, "data": "a photo of a couch left of a toaster", "additional_info": {"tag": "position", "include": [{"class": "toaster", "count": 1}, {"class": "couch", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a couch left of a toaster"}}
+{"index": 453, "data": "a photo of a purple wine glass and a black apple", "additional_info": {"tag": "color_attr", "include": [{"class": "wine glass", "count": 1, "color": "purple"}, {"class": "apple", "count": 1, "color": "black"}], "prompt": "a photo of a purple wine glass and a black apple"}}
+{"index": 454, "data": "a photo of a green bus and a purple microwave", "additional_info": {"tag": "color_attr", "include": [{"class": "bus", "count": 1, "color": "green"}, {"class": "microwave", "count": 1, "color": "purple"}], "prompt": "a photo of a green bus and a purple microwave"}}
+{"index": 455, "data": "a photo of a green skis and a brown airplane", "additional_info": {"tag": "color_attr", "include": [{"class": "skis", "count": 1, "color": "green"}, {"class": "airplane", "count": 1, "color": "brown"}], "prompt": "a photo of a green skis and a brown airplane"}}
+{"index": 456, "data": "a photo of a yellow computer keyboard and a black sink", "additional_info": {"tag": "color_attr", "include": [{"class": "computer keyboard", "count": 1, "color": "yellow"}, {"class": "sink", "count": 1, "color": "black"}], "prompt": "a photo of a yellow computer keyboard and a black sink"}}
+{"index": 457, "data": "a photo of a pink oven and a green motorcycle", "additional_info": {"tag": "color_attr", "include": [{"class": "oven", "count": 1, "color": "pink"}, {"class": "motorcycle", "count": 1, "color": "green"}], "prompt": "a photo of a pink oven and a green motorcycle"}}
+{"index": 458, "data": "a photo of a purple parking meter and a red laptop", "additional_info": {"tag": "color_attr", "include": [{"class": "parking meter", "count": 1, "color": "purple"}, {"class": "laptop", "count": 1, "color": "red"}], "prompt": "a photo of a purple parking meter and a red laptop"}}
+{"index": 459, "data": "a photo of a yellow skateboard and an orange computer mouse", "additional_info": {"tag": "color_attr", "include": [{"class": "skateboard", "count": 1, "color": "yellow"}, {"class": "computer mouse", "count": 1, "color": "orange"}], "prompt": "a photo of a yellow skateboard and an orange computer mouse"}}
+{"index": 460, "data": "a photo of a red skis and a brown tie", "additional_info": {"tag": "color_attr", "include": [{"class": "skis", "count": 1, "color": "red"}, {"class": "tie", "count": 1, "color": "brown"}], "prompt": "a photo of a red skis and a brown tie"}}
+{"index": 461, "data": "a photo of a pink skateboard and a black train", "additional_info": {"tag": "color_attr", "include": [{"class": "skateboard", "count": 1, "color": "pink"}, {"class": "train", "count": 1, "color": "black"}], "prompt": "a photo of a pink skateboard and a black train"}}
+{"index": 462, "data": "a photo of a white handbag and a purple bed", "additional_info": {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "white"}, {"class": "bed", "count": 1, "color": "purple"}], "prompt": "a photo of a white handbag and a purple bed"}}
+{"index": 463, "data": "a photo of a purple elephant and a brown sports ball", "additional_info": {"tag": "color_attr", "include": [{"class": "elephant", "count": 1, "color": "purple"}, {"class": "sports ball", "count": 1, "color": "brown"}], "prompt": "a photo of a purple elephant and a brown sports ball"}}
+{"index": 464, "data": "a photo of a purple dog and a black dining table", "additional_info": {"tag": "color_attr", "include": [{"class": "dog", "count": 1, "color": "purple"}, {"class": "dining table", "count": 1, "color": "black"}], "prompt": "a photo of a purple dog and a black dining table"}}
+{"index": 465, "data": "a photo of a white dining table and a red car", "additional_info": {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "white"}, {"class": "car", "count": 1, "color": "red"}], "prompt": "a photo of a white dining table and a red car"}}
+{"index": 466, "data": "a photo of a blue cell phone and a green apple", "additional_info": {"tag": "color_attr", "include": [{"class": "cell phone", "count": 1, "color": "blue"}, {"class": "apple", "count": 1, "color": "green"}], "prompt": "a photo of a blue cell phone and a green apple"}}
+{"index": 467, "data": "a photo of a red car and an orange potted plant", "additional_info": {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "red"}, {"class": "potted plant", "count": 1, "color": "orange"}], "prompt": "a photo of a red car and an orange potted plant"}}
+{"index": 468, "data": "a photo of a brown carrot and a white potted plant", "additional_info": {"tag": "color_attr", "include": [{"class": "carrot", "count": 1, "color": "brown"}, {"class": "potted plant", "count": 1, "color": "white"}], "prompt": "a photo of a brown carrot and a white potted plant"}}
+{"index": 469, "data": "a photo of a black kite and a green bear", "additional_info": {"tag": "color_attr", "include": [{"class": "kite", "count": 1, "color": "black"}, {"class": "bear", "count": 1, "color": "green"}], "prompt": "a photo of a black kite and a green bear"}}
+{"index": 470, "data": "a photo of a blue laptop and a brown bear", "additional_info": {"tag": "color_attr", "include": [{"class": "laptop", "count": 1, "color": "blue"}, {"class": "bear", "count": 1, "color": "brown"}], "prompt": "a photo of a blue laptop and a brown bear"}}
+{"index": 471, "data": "a photo of a green teddy bear and a brown kite", "additional_info": {"tag": "color_attr", "include": [{"class": "teddy bear", "count": 1, "color": "green"}, {"class": "kite", "count": 1, "color": "brown"}], "prompt": "a photo of a green teddy bear and a brown kite"}}
+{"index": 472, "data": "a photo of a yellow stop sign and a blue potted plant", "additional_info": {"tag": "color_attr", "include": [{"class": "stop sign", "count": 1, "color": "yellow"}, {"class": "potted plant", "count": 1, "color": "blue"}], "prompt": "a photo of a yellow stop sign and a blue potted plant"}}
+{"index": 473, "data": "a photo of an orange snowboard and a green cat", "additional_info": {"tag": "color_attr", "include": [{"class": "snowboard", "count": 1, "color": "orange"}, {"class": "cat", "count": 1, "color": "green"}], "prompt": "a photo of an orange snowboard and a green cat"}}
+{"index": 474, "data": "a photo of an orange truck and a pink sink", "additional_info": {"tag": "color_attr", "include": [{"class": "truck", "count": 1, "color": "orange"}, {"class": "sink", "count": 1, "color": "pink"}], "prompt": "a photo of an orange truck and a pink sink"}}
+{"index": 475, "data": "a photo of a brown hot dog and a purple pizza", "additional_info": {"tag": "color_attr", "include": [{"class": "hot dog", "count": 1, "color": "brown"}, {"class": "pizza", "count": 1, "color": "purple"}], "prompt": "a photo of a brown hot dog and a purple pizza"}}
+{"index": 476, "data": "a photo of a green couch and an orange umbrella", "additional_info": {"tag": "color_attr", "include": [{"class": "couch", "count": 1, "color": "green"}, {"class": "umbrella", "count": 1, "color": "orange"}], "prompt": "a photo of a green couch and an orange umbrella"}}
+{"index": 477, "data": "a photo of a brown bed and a pink cell phone", "additional_info": {"tag": "color_attr", "include": [{"class": "bed", "count": 1, "color": "brown"}, {"class": "cell phone", "count": 1, "color": "pink"}], "prompt": "a photo of a brown bed and a pink cell phone"}}
+{"index": 478, "data": "a photo of a black broccoli and a yellow cake", "additional_info": {"tag": "color_attr", "include": [{"class": "broccoli", "count": 1, "color": "black"}, {"class": "cake", "count": 1, "color": "yellow"}], "prompt": "a photo of a black broccoli and a yellow cake"}}
+{"index": 479, "data": "a photo of a red train and a purple bear", "additional_info": {"tag": "color_attr", "include": [{"class": "train", "count": 1, "color": "red"}, {"class": "bear", "count": 1, "color": "purple"}], "prompt": "a photo of a red train and a purple bear"}}
+{"index": 480, "data": "a photo of a purple tennis racket and a black sink", "additional_info": {"tag": "color_attr", "include": [{"class": "tennis racket", "count": 1, "color": "purple"}, {"class": "sink", "count": 1, "color": "black"}], "prompt": "a photo of a purple tennis racket and a black sink"}}
+{"index": 481, "data": "a photo of a blue vase and a black banana", "additional_info": {"tag": "color_attr", "include": [{"class": "vase", "count": 1, "color": "blue"}, {"class": "banana", "count": 1, "color": "black"}], "prompt": "a photo of a blue vase and a black banana"}}
+{"index": 482, "data": "a photo of a blue clock and a white cup", "additional_info": {"tag": "color_attr", "include": [{"class": "clock", "count": 1, "color": "blue"}, {"class": "cup", "count": 1, "color": "white"}], "prompt": "a photo of a blue clock and a white cup"}}
+{"index": 483, "data": "a photo of a red umbrella and a blue couch", "additional_info": {"tag": "color_attr", "include": [{"class": "umbrella", "count": 1, "color": "red"}, {"class": "couch", "count": 1, "color": "blue"}], "prompt": "a photo of a red umbrella and a blue couch"}}
+{"index": 484, "data": "a photo of a white handbag and a red giraffe", "additional_info": {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "white"}, {"class": "giraffe", "count": 1, "color": "red"}], "prompt": "a photo of a white handbag and a red giraffe"}}
+{"index": 485, "data": "a photo of a pink tv remote and a blue airplane", "additional_info": {"tag": "color_attr", "include": [{"class": "tv remote", "count": 1, "color": "pink"}, {"class": "airplane", "count": 1, "color": "blue"}], "prompt": "a photo of a pink tv remote and a blue airplane"}}
+{"index": 486, "data": "a photo of a pink handbag and a black scissors", "additional_info": {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "pink"}, {"class": "scissors", "count": 1, "color": "black"}], "prompt": "a photo of a pink handbag and a black scissors"}}
+{"index": 487, "data": "a photo of a brown car and a pink hair drier", "additional_info": {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "brown"}, {"class": "hair drier", "count": 1, "color": "pink"}], "prompt": "a photo of a brown car and a pink hair drier"}}
+{"index": 488, "data": "a photo of a black bus and a brown cell phone", "additional_info": {"tag": "color_attr", "include": [{"class": "bus", "count": 1, "color": "black"}, {"class": "cell phone", "count": 1, "color": "brown"}], "prompt": "a photo of a black bus and a brown cell phone"}}
+{"index": 489, "data": "a photo of a purple sheep and a pink banana", "additional_info": {"tag": "color_attr", "include": [{"class": "sheep", "count": 1, "color": "purple"}, {"class": "banana", "count": 1, "color": "pink"}], "prompt": "a photo of a purple sheep and a pink banana"}}
+{"index": 490, "data": "a photo of a blue handbag and a white cell phone", "additional_info": {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "blue"}, {"class": "cell phone", "count": 1, "color": "white"}], "prompt": "a photo of a blue handbag and a white cell phone"}}
+{"index": 491, "data": "a photo of a white pizza and a green umbrella", "additional_info": {"tag": "color_attr", "include": [{"class": "pizza", "count": 1, "color": "white"}, {"class": "umbrella", "count": 1, "color": "green"}], "prompt": "a photo of a white pizza and a green umbrella"}}
+{"index": 492, "data": "a photo of a white tie and a purple skateboard", "additional_info": {"tag": "color_attr", "include": [{"class": "tie", "count": 1, "color": "white"}, {"class": "skateboard", "count": 1, "color": "purple"}], "prompt": "a photo of a white tie and a purple skateboard"}}
+{"index": 493, "data": "a photo of a yellow sports ball and a green boat", "additional_info": {"tag": "color_attr", "include": [{"class": "sports ball", "count": 1, "color": "yellow"}, {"class": "boat", "count": 1, "color": "green"}], "prompt": "a photo of a yellow sports ball and a green boat"}}
+{"index": 494, "data": "a photo of a white wine glass and a brown giraffe", "additional_info": {"tag": "color_attr", "include": [{"class": "wine glass", "count": 1, "color": "white"}, {"class": "giraffe", "count": 1, "color": "brown"}], "prompt": "a photo of a white wine glass and a brown giraffe"}}
+{"index": 495, "data": "a photo of a yellow bowl and a white baseball glove", "additional_info": {"tag": "color_attr", "include": [{"class": "bowl", "count": 1, "color": "yellow"}, {"class": "baseball glove", "count": 1, "color": "white"}], "prompt": "a photo of a yellow bowl and a white baseball glove"}}
+{"index": 496, "data": "a photo of an orange microwave and a black spoon", "additional_info": {"tag": "color_attr", "include": [{"class": "microwave", "count": 1, "color": "orange"}, {"class": "spoon", "count": 1, "color": "black"}], "prompt": "a photo of an orange microwave and a black spoon"}}
+{"index": 497, "data": "a photo of an orange skateboard and a pink bowl", "additional_info": {"tag": "color_attr", "include": [{"class": "skateboard", "count": 1, "color": "orange"}, {"class": "bowl", "count": 1, "color": "pink"}], "prompt": "a photo of an orange skateboard and a pink bowl"}}
+{"index": 498, "data": "a photo of a blue toilet and a white suitcase", "additional_info": {"tag": "color_attr", "include": [{"class": "toilet", "count": 1, "color": "blue"}, {"class": "suitcase", "count": 1, "color": "white"}], "prompt": "a photo of a blue toilet and a white suitcase"}}
+{"index": 499, "data": "a photo of a white boat and an orange hot dog", "additional_info": {"tag": "color_attr", "include": [{"class": "boat", "count": 1, "color": "white"}, {"class": "hot dog", "count": 1, "color": "orange"}], "prompt": "a photo of a white boat and an orange hot dog"}}
+{"index": 500, "data": "a photo of a yellow dining table and a pink dog", "additional_info": {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "yellow"}, {"class": "dog", "count": 1, "color": "pink"}], "prompt": "a photo of a yellow dining table and a pink dog"}}
+{"index": 501, "data": "a photo of a red cake and a purple chair", "additional_info": {"tag": "color_attr", "include": [{"class": "cake", "count": 1, "color": "red"}, {"class": "chair", "count": 1, "color": "purple"}], "prompt": "a photo of a red cake and a purple chair"}}
+{"index": 502, "data": "a photo of a blue tie and a pink dining table", "additional_info": {"tag": "color_attr", "include": [{"class": "tie", "count": 1, "color": "blue"}, {"class": "dining table", "count": 1, "color": "pink"}], "prompt": "a photo of a blue tie and a pink dining table"}}
+{"index": 503, "data": "a photo of a blue cow and a black computer keyboard", "additional_info": {"tag": "color_attr", "include": [{"class": "cow", "count": 1, "color": "blue"}, {"class": "computer keyboard", "count": 1, "color": "black"}], "prompt": "a photo of a blue cow and a black computer keyboard"}}
+{"index": 504, "data": "a photo of a yellow pizza and a green oven", "additional_info": {"tag": "color_attr", "include": [{"class": "pizza", "count": 1, "color": "yellow"}, {"class": "oven", "count": 1, "color": "green"}], "prompt": "a photo of a yellow pizza and a green oven"}}
+{"index": 505, "data": "a photo of a red laptop and a brown car", "additional_info": {"tag": "color_attr", "include": [{"class": "laptop", "count": 1, "color": "red"}, {"class": "car", "count": 1, "color": "brown"}], "prompt": "a photo of a red laptop and a brown car"}}
+{"index": 506, "data": "a photo of a purple computer keyboard and a blue scissors", "additional_info": {"tag": "color_attr", "include": [{"class": "computer keyboard", "count": 1, "color": "purple"}, {"class": "scissors", "count": 1, "color": "blue"}], "prompt": "a photo of a purple computer keyboard and a blue scissors"}}
+{"index": 507, "data": "a photo of a green surfboard and an orange oven", "additional_info": {"tag": "color_attr", "include": [{"class": "surfboard", "count": 1, "color": "green"}, {"class": "oven", "count": 1, "color": "orange"}], "prompt": "a photo of a green surfboard and an orange oven"}}
+{"index": 508, "data": "a photo of a yellow parking meter and a pink refrigerator", "additional_info": {"tag": "color_attr", "include": [{"class": "parking meter", "count": 1, "color": "yellow"}, {"class": "refrigerator", "count": 1, "color": "pink"}], "prompt": "a photo of a yellow parking meter and a pink refrigerator"}}
+{"index": 509, "data": "a photo of a brown computer mouse and a purple bottle", "additional_info": {"tag": "color_attr", "include": [{"class": "computer mouse", "count": 1, "color": "brown"}, {"class": "bottle", "count": 1, "color": "purple"}], "prompt": "a photo of a brown computer mouse and a purple bottle"}}
+{"index": 510, "data": "a photo of a red umbrella and a green cow", "additional_info": {"tag": "color_attr", "include": [{"class": "umbrella", "count": 1, "color": "red"}, {"class": "cow", "count": 1, "color": "green"}], "prompt": "a photo of a red umbrella and a green cow"}}
+{"index": 511, "data": "a photo of a red giraffe and a black cell phone", "additional_info": {"tag": "color_attr", "include": [{"class": "giraffe", "count": 1, "color": "red"}, {"class": "cell phone", "count": 1, "color": "black"}], "prompt": "a photo of a red giraffe and a black cell phone"}}
+{"index": 512, "data": "a photo of a brown oven and a purple train", "additional_info": {"tag": "color_attr", "include": [{"class": "oven", "count": 1, "color": "brown"}, {"class": "train", "count": 1, "color": "purple"}], "prompt": "a photo of a brown oven and a purple train"}}
+{"index": 513, "data": "a photo of a blue baseball bat and a pink book", "additional_info": {"tag": "color_attr", "include": [{"class": "baseball bat", "count": 1, "color": "blue"}, {"class": "book", "count": 1, "color": "pink"}], "prompt": "a photo of a blue baseball bat and a pink book"}}
+{"index": 514, "data": "a photo of a green cup and a yellow bowl", "additional_info": {"tag": "color_attr", "include": [{"class": "cup", "count": 1, "color": "green"}, {"class": "bowl", "count": 1, "color": "yellow"}], "prompt": "a photo of a green cup and a yellow bowl"}}
+{"index": 515, "data": "a photo of a yellow suitcase and a brown bus", "additional_info": {"tag": "color_attr", "include": [{"class": "suitcase", "count": 1, "color": "yellow"}, {"class": "bus", "count": 1, "color": "brown"}], "prompt": "a photo of a yellow suitcase and a brown bus"}}
+{"index": 516, "data": "a photo of an orange motorcycle and a pink donut", "additional_info": {"tag": "color_attr", "include": [{"class": "motorcycle", "count": 1, "color": "orange"}, {"class": "donut", "count": 1, "color": "pink"}], "prompt": "a photo of an orange motorcycle and a pink donut"}}
+{"index": 517, "data": "a photo of an orange giraffe and a white baseball glove", "additional_info": {"tag": "color_attr", "include": [{"class": "giraffe", "count": 1, "color": "orange"}, {"class": "baseball glove", "count": 1, "color": "white"}], "prompt": "a photo of an orange giraffe and a white baseball glove"}}
+{"index": 518, "data": "a photo of an orange handbag and a green carrot", "additional_info": {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "orange"}, {"class": "carrot", "count": 1, "color": "green"}], "prompt": "a photo of an orange handbag and a green carrot"}}
+{"index": 519, "data": "a photo of a black bottle and a white refrigerator", "additional_info": {"tag": "color_attr", "include": [{"class": "bottle", "count": 1, "color": "black"}, {"class": "refrigerator", "count": 1, "color": "white"}], "prompt": "a photo of a black bottle and a white refrigerator"}}
+{"index": 520, "data": "a photo of a white dog and a blue potted plant", "additional_info": {"tag": "color_attr", "include": [{"class": "dog", "count": 1, "color": "white"}, {"class": "potted plant", "count": 1, "color": "blue"}], "prompt": "a photo of a white dog and a blue potted plant"}}
+{"index": 521, "data": "a photo of an orange handbag and a red car", "additional_info": {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "orange"}, {"class": "car", "count": 1, "color": "red"}], "prompt": "a photo of an orange handbag and a red car"}}
+{"index": 522, "data": "a photo of a red stop sign and a blue book", "additional_info": {"tag": "color_attr", "include": [{"class": "stop sign", "count": 1, "color": "red"}, {"class": "book", "count": 1, "color": "blue"}], "prompt": "a photo of a red stop sign and a blue book"}}
+{"index": 523, "data": "a photo of a yellow car and an orange toothbrush", "additional_info": {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "yellow"}, {"class": "toothbrush", "count": 1, "color": "orange"}], "prompt": "a photo of a yellow car and an orange toothbrush"}}
+{"index": 524, "data": "a photo of a black potted plant and a yellow toilet", "additional_info": {"tag": "color_attr", "include": [{"class": "potted plant", "count": 1, "color": "black"}, {"class": "toilet", "count": 1, "color": "yellow"}], "prompt": "a photo of a black potted plant and a yellow toilet"}}
+{"index": 525, "data": "a photo of a brown dining table and a white suitcase", "additional_info": {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "brown"}, {"class": "suitcase", "count": 1, "color": "white"}], "prompt": "a photo of a brown dining table and a white suitcase"}}
+{"index": 526, "data": "a photo of an orange donut and a yellow stop sign", "additional_info": {"tag": "color_attr", "include": [{"class": "donut", "count": 1, "color": "orange"}, {"class": "stop sign", "count": 1, "color": "yellow"}], "prompt": "a photo of an orange donut and a yellow stop sign"}}
+{"index": 527, "data": "a photo of a green suitcase and a blue boat", "additional_info": {"tag": "color_attr", "include": [{"class": "suitcase", "count": 1, "color": "green"}, {"class": "boat", "count": 1, "color": "blue"}], "prompt": "a photo of a green suitcase and a blue boat"}}
+{"index": 528, "data": "a photo of an orange tennis racket and a yellow sports ball", "additional_info": {"tag": "color_attr", "include": [{"class": "tennis racket", "count": 1, "color": "orange"}, {"class": "sports ball", "count": 1, "color": "yellow"}], "prompt": "a photo of an orange tennis racket and a yellow sports ball"}}
+{"index": 529, "data": "a photo of a purple computer keyboard and a red chair", "additional_info": {"tag": "color_attr", "include": [{"class": "computer keyboard", "count": 1, "color": "purple"}, {"class": "chair", "count": 1, "color": "red"}], "prompt": "a photo of a purple computer keyboard and a red chair"}}
+{"index": 530, "data": "a photo of a purple suitcase and an orange pizza", "additional_info": {"tag": "color_attr", "include": [{"class": "suitcase", "count": 1, "color": "purple"}, {"class": "pizza", "count": 1, "color": "orange"}], "prompt": "a photo of a purple suitcase and an orange pizza"}}
+{"index": 531, "data": "a photo of a white bottle and a blue sheep", "additional_info": {"tag": "color_attr", "include": [{"class": "bottle", "count": 1, "color": "white"}, {"class": "sheep", "count": 1, "color": "blue"}], "prompt": "a photo of a white bottle and a blue sheep"}}
+{"index": 532, "data": "a photo of a purple backpack and a white umbrella", "additional_info": {"tag": "color_attr", "include": [{"class": "backpack", "count": 1, "color": "purple"}, {"class": "umbrella", "count": 1, "color": "white"}], "prompt": "a photo of a purple backpack and a white umbrella"}}
+{"index": 533, "data": "a photo of an orange potted plant and a black spoon", "additional_info": {"tag": "color_attr", "include": [{"class": "potted plant", "count": 1, "color": "orange"}, {"class": "spoon", "count": 1, "color": "black"}], "prompt": "a photo of an orange potted plant and a black spoon"}}
+{"index": 534, "data": "a photo of a green tennis racket and a black dog", "additional_info": {"tag": "color_attr", "include": [{"class": "tennis racket", "count": 1, "color": "green"}, {"class": "dog", "count": 1, "color": "black"}], "prompt": "a photo of a green tennis racket and a black dog"}}
+{"index": 535, "data": "a photo of a yellow handbag and a blue refrigerator", "additional_info": {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "yellow"}, {"class": "refrigerator", "count": 1, "color": "blue"}], "prompt": "a photo of a yellow handbag and a blue refrigerator"}}
+{"index": 536, "data": "a photo of a pink broccoli and a red sink", "additional_info": {"tag": "color_attr", "include": [{"class": "broccoli", "count": 1, "color": "pink"}, {"class": "sink", "count": 1, "color": "red"}], "prompt": "a photo of a pink broccoli and a red sink"}}
+{"index": 537, "data": "a photo of a red bowl and a pink sink", "additional_info": {"tag": "color_attr", "include": [{"class": "bowl", "count": 1, "color": "red"}, {"class": "sink", "count": 1, "color": "pink"}], "prompt": "a photo of a red bowl and a pink sink"}}
+{"index": 538, "data": "a photo of a white toilet and a red apple", "additional_info": {"tag": "color_attr", "include": [{"class": "toilet", "count": 1, "color": "white"}, {"class": "apple", "count": 1, "color": "red"}], "prompt": "a photo of a white toilet and a red apple"}}
+{"index": 539, "data": "a photo of a pink dining table and a black sandwich", "additional_info": {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "pink"}, {"class": "sandwich", "count": 1, "color": "black"}], "prompt": "a photo of a pink dining table and a black sandwich"}}
+{"index": 540, "data": "a photo of a black car and a green parking meter", "additional_info": {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "black"}, {"class": "parking meter", "count": 1, "color": "green"}], "prompt": "a photo of a black car and a green parking meter"}}
+{"index": 541, "data": "a photo of a yellow bird and a black motorcycle", "additional_info": {"tag": "color_attr", "include": [{"class": "bird", "count": 1, "color": "yellow"}, {"class": "motorcycle", "count": 1, "color": "black"}], "prompt": "a photo of a yellow bird and a black motorcycle"}}
+{"index": 542, "data": "a photo of a brown giraffe and a white stop sign", "additional_info": {"tag": "color_attr", "include": [{"class": "giraffe", "count": 1, "color": "brown"}, {"class": "stop sign", "count": 1, "color": "white"}], "prompt": "a photo of a brown giraffe and a white stop sign"}}
+{"index": 543, "data": "a photo of a white banana and a black elephant", "additional_info": {"tag": "color_attr", "include": [{"class": "banana", "count": 1, "color": "white"}, {"class": "elephant", "count": 1, "color": "black"}], "prompt": "a photo of a white banana and a black elephant"}}
+{"index": 544, "data": "a photo of an orange cow and a purple sandwich", "additional_info": {"tag": "color_attr", "include": [{"class": "cow", "count": 1, "color": "orange"}, {"class": "sandwich", "count": 1, "color": "purple"}], "prompt": "a photo of an orange cow and a purple sandwich"}}
+{"index": 545, "data": "a photo of a red clock and a black cell phone", "additional_info": {"tag": "color_attr", "include": [{"class": "clock", "count": 1, "color": "red"}, {"class": "cell phone", "count": 1, "color": "black"}], "prompt": "a photo of a red clock and a black cell phone"}}
+{"index": 546, "data": "a photo of a brown knife and a blue donut", "additional_info": {"tag": "color_attr", "include": [{"class": "knife", "count": 1, "color": "brown"}, {"class": "donut", "count": 1, "color": "blue"}], "prompt": "a photo of a brown knife and a blue donut"}}
+{"index": 547, "data": "a photo of a red cup and a pink handbag", "additional_info": {"tag": "color_attr", "include": [{"class": "cup", "count": 1, "color": "red"}, {"class": "handbag", "count": 1, "color": "pink"}], "prompt": "a photo of a red cup and a pink handbag"}}
+{"index": 548, "data": "a photo of a yellow bicycle and a red motorcycle", "additional_info": {"tag": "color_attr", "include": [{"class": "bicycle", "count": 1, "color": "yellow"}, {"class": "motorcycle", "count": 1, "color": "red"}], "prompt": "a photo of a yellow bicycle and a red motorcycle"}}
+{"index": 549, "data": "a photo of a red orange and a purple broccoli", "additional_info": {"tag": "color_attr", "include": [{"class": "orange", "count": 1, "color": "red"}, {"class": "broccoli", "count": 1, "color": "purple"}], "prompt": "a photo of a red orange and a purple broccoli"}}
+{"index": 550, "data": "a photo of an orange traffic light and a white toilet", "additional_info": {"tag": "color_attr", "include": [{"class": "traffic light", "count": 1, "color": "orange"}, {"class": "toilet", "count": 1, "color": "white"}], "prompt": "a photo of an orange traffic light and a white toilet"}}
+{"index": 551, "data": "a photo of a green cup and a red pizza", "additional_info": {"tag": "color_attr", "include": [{"class": "cup", "count": 1, "color": "green"}, {"class": "pizza", "count": 1, "color": "red"}], "prompt": "a photo of a green cup and a red pizza"}}
+{"index": 552, "data": "a photo of a blue pizza and a yellow baseball glove", "additional_info": {"tag": "color_attr", "include": [{"class": "pizza", "count": 1, "color": "blue"}, {"class": "baseball glove", "count": 1, "color": "yellow"}], "prompt": "a photo of a blue pizza and a yellow baseball glove"}}
diff --git a/benchmarks/image_gen/GenEVAL/README.md b/benchmarks/image_gen/GenEVAL/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..be95c982397e7879db2584754794c7ebc86af35a
--- /dev/null
+++ b/benchmarks/image_gen/GenEVAL/README.md
@@ -0,0 +1,73 @@
+[Chinese Version](./README_zh.md)
+
+# GenEVAL Image Generation Evaluation
+
+Benchmark evaluation scripts for GenEVAL based on the Lance model.
+
+## Files
+
+- `sample_GenEVAL.py` - Python inference script
+- `sample_GenEVAL.sh` - Launch script (recommended)
+- `GenEVAL.jsonl` - Evaluation dataset
+
+## Quick Start
+
+### Basic Usage
+
+```bash
+bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh
+```
+
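+Before running, edit the "Inference Parameters" section (the block headed `推理参数配置`) at the top of `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`. Run the command from the repository root, because the script refers to its Python entry point and dataset by repository-relative paths.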
+
+## Parameters
+
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | Task type. GenEVAL is fixed to image generation. |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | Number of images generated per case. GenEVAL defaults to 4 images. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
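+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | Output image height/width in pixels. The launch script also passes a fixed `--resolution image_768res`; review that flag when changing these values. |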
+| `MODEL_PATH` | `downloads/Lance_3B` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GenEVAL/GenEVAL.jsonl` | Path to the evaluation data. |
+
+## How To Modify
+
+- Edit the "Inference Parameters" section at the top of `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`.
+- After updating the parameters, run `bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+
+## Output Format
+
+Results are saved in a structure like this:
+
+```
+results/GenEVAL_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 00000/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ├── 0.png
+│       ├── 1.png
+│       ├── 2.png
+│       └── 3.png
+├── 00001/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ...
+```
+
+Each case generates 4 images by default (`sample_num_per_prompt=4`).
+
+## Notes
+
+- If you need to switch the model, dataset, or resolution, edit the script configuration at the top directly.
+- The ViT path is resolved automatically by the code and usually does not need to be configured separately.
diff --git a/benchmarks/image_gen/GenEVAL/README_zh.md b/benchmarks/image_gen/GenEVAL/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..0435f600ca1345af78ad6ff00ed1633cea5b4693
--- /dev/null
+++ b/benchmarks/image_gen/GenEVAL/README_zh.md
@@ -0,0 +1,73 @@
+[English Version](./README.md)
+
+# GenEVAL 图像生成评估
+
+基于 Lance 模型的 GenEVAL 评估基准测试脚本。
+
+## 文件说明
+
+- `sample_GenEVAL.py` - 推理 Python 脚本
+- `sample_GenEVAL.sh` - 启动脚本（推荐使用）
+- `GenEVAL.jsonl` - 评估数据集
+
+## 快速开始
+
+### 基本用法
+
+```bash
+bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh
+```
+
+运行前请直接修改 `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` 顶部的“推理参数配置”区。
+
+## 参数说明
+
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `t2i` | 任务类型，GenEVAL 固定为图像生成 |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `SAMPLE_NUM_PER_PROMPT` | 4 | 每个 case 生成的图像数量（GenEVAL 默认为 4 张图） |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 768 | 输出图像的高/宽（像素）。启动脚本中还固定传入了 `--resolution image_768res`，修改分辨率时请一并检查该参数 |
+| `MODEL_PATH` | `downloads/Lance_3B` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/image_gen/GenEVAL/GenEVAL.jsonl` | 评估数据路径 |
+
+## 修改方式
+
+- 请手动编辑 `benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+
+## 保存格式
+
+结果会按照以下结构保存：
+
+```
+results/GenEVAL_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── 00000/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ├── 0.png
+│       ├── 1.png
+│       ├── 2.png
+│       └── 3.png
+├── 00001/
+│   ├── metadata.jsonl
+│   ├── grid.png
+│   └── samples/
+│       ...
+```
+
+每个案例默认生成 4 张图像（`sample_num_per_prompt=4`）。
+
+## 注意事项
+
+- 如果需要切换模型、数据集或分辨率，请直接修改脚本顶部配置。
+- ViT 路径默认由代码内部自动解析，无需单独配置。
diff --git a/benchmarks/image_gen/GenEVAL/sample_GenEVAL.py b/benchmarks/image_gen/GenEVAL/sample_GenEVAL.py
new file mode 100644
index 0000000000000000000000000000000000000000..e705c1b0c34e57aa1f884056db34958a52c5d108
--- /dev/null
+++ b/benchmarks/image_gen/GenEVAL/sample_GenEVAL.py
@@ -0,0 +1,463 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+
+import os.path as osp
+from copy import deepcopy
+import json
+from typing import Tuple, cast, Optional
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from safetensors.torch import load_file
+from PIL import Image
+from torchvision.utils import make_grid
+import numpy as np
+from tqdm import trange
+
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.lance import LanceConfig, Lance, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from common.utils.misc import tuple_mul, AutoEncoderParams
+from common.val.utils import make_padded_latent
+from data.datasets_custom import ValidationDataset
+from config.config_factory import ModelArguments, DataArguments, EvaluationArguments, get_model_path
+
+
+def init_from_model_path_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments):
+    # 统一从 model_path 加载训练好的 Lance checkpoint。
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+
+
+
+    model_path_ft = None
+    if osp.exists(model_path):
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+
+    # NOTE: position embeds are fixed sinusoidal embeddings, so we can just pop it off,
+    # which makes it easier to adapt to different resolutions.
+    if 'latent_pos_embed.pos_embed' in model_state_dict:
+        model_state_dict.pop('latent_pos_embed.pos_embed')
+
+    model.load_state_dict(model_state_dict, strict=False)
+
+    clean_memory(model_state_dict)
+
+
+def clean_memory(*objects):
+    """清理内存并释放 GPU 缓存"""
+    for obj in objects:
+        del obj
+    import gc
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+def decode_video_tensor_for_geneval(v_list):
+    """
+    专门为 GenEVAL 解码视频张量，保持原有的保存格式
+    """
+    N_target = len(v_list)
+    if N_target != 1:
+        from einops import rearrange
+        padded_videos_latent = [v.permute(1, 0, 2, 3) for v in v_list]
+        v_tc_hw = rearrange(padded_videos_latent, "n t c h w -> t c h (n w)")
+    else:
+        v_tc_hw = v_list[0].permute(1, 0, 2, 3)
+
+    v_tc_hw = v_tc_hw.float().clip(-1, 1).mul_(0.5).add_(0.5).mul_(255).round().clamp(0, 255).to(torch.uint8)
+    return v_tc_hw
+
+
+def resolve_geneval_paths(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+) -> None:
+    if not model_args.model_path:
+        raise ValueError("GenEVAL requires --model_path to be provided explicitly.")
+
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+
+    if not data_args.val_dataset_config_file:
+        data_args.val_dataset_config_file = get_model_path("geneval.data")
+
+
+def build_runtime_dataset_config(
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    vae_config: Optional[AutoEncoderParams],
+) -> DataConfig:
+    """
+    当前推理链不再依赖 dataset_config_file，运行期 DataConfig 由显式参数拼装。
+    """
+    dataset_config = DataConfig()
+
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+    dataset_config.max_duration = inference_args.max_duration
+    dataset_config.system_prompt_type = inference_args.system_prompt_type
+
+    if inference_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+
+    if inference_args.visual_gen and vae_config:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = tuple_mul(
+            model_args.latent_patch_size,
+            (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial),
+        )
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+
+    return dataset_config
+
+
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    val_data_cpu: dict,
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_source_video: bool = False,
+    save_path_gen: str = "",
+    sample_num_per_prompt: int = 1,
+):
+    """Generate ``sample_num_per_prompt`` images for one validation item and save them.
+    Writes ``<save_path_gen>/<index:05d>/metadata.jsonl``, ``samples/<k>.png`` and ``grid.png``; sample ``k`` uses noise seed ``validation_noise_seed + k``.
+    """
+    # Check whether the distributed environment has been initialised
+    if dist.is_initialized():
+        is_rank0 = (dist.get_rank() == 0)
+    else:
+        is_rank0 = True
+    # NOTE(review): val_data_cpu is annotated as dict, but .cuda()/.to_dict() are called on it below.
+    val_data = val_data_cpu.cuda(device).to_dict()
+
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        # Compute padded_latent
+        if "padded_videos" in val_data.keys():
+            val_data["padded_latent"] = make_padded_latent(val_data["padded_videos"], val_data["vae_data_mode"], vae_model)
+
+        # First create an output folder named after val_data["index"]
+        index_save = val_data["index"]
+        index_save = f"{index_save:05d}"
+        os.makedirs(os.path.join(save_path_gen, index_save), exist_ok=True)
+        os.makedirs(os.path.join(save_path_gen, index_save, "samples"), exist_ok=True)
+
+        # Save metadata.jsonl
+        metadata = val_data["additional_info"]
+        with open(os.path.join(save_path_gen, index_save, "metadata.jsonl"), 'w') as f:
+            f.write(json.dumps(metadata, ensure_ascii=False) + "\n")
+
+        # -------------------- GEN branch --------------------
+        tensor_list_for_grid = []
+        loop_iterator = trange(sample_num_per_prompt) if is_rank0 else range(sample_num_per_prompt)
+        
+        for sample_num_per_prompt_index in loop_iterator:
+            # Sample / generate
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+                "timestep_shift": inference_args.validation_timestep_shift,
+                "num_timesteps": inference_args.validation_num_timesteps,
+                "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+                "val_padded_latent": val_data["padded_latent"],  # set above when "padded_videos" is present; otherwise it must already be in val_data
+                "video_sizes": val_data["video_sizes"],
+                "cfg_text_scale": model_args.cfg_text_scale,
+                "cfg_interval": inference_args.cfg_interval,
+                "cfg_renorm_min": inference_args.cfg_renorm_min,
+                "cfg_renorm_type": inference_args.cfg_renorm_type,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "max_samples": inference_args.validation_max_samples,
+                "validation_noise_seed": inference_args.validation_noise_seed + sample_num_per_prompt_index,  # the only entry that changes per sample
+                "apply_chat_template": inference_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": inference_args.apply_qwen_2_5_vl_pos_emb,
+                "image_token_id": image_token_id,
+                "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+                "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+                "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+                "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+                "video_grid_thw": val_data.get("video_grid_thw", None),
+                "caption": val_data.get("caption", None),
+                "sample_task": val_data["sample_task"],
+                "sample_modality": val_data["sample_modality"],
+                "cfg_type": inference_args.cfg_type,
+                "cfg_uncond_token_id": inference_args.cfg_uncond_token_id,
+                "index": val_data["index"],
+                "val_padded_videos": val_data["padded_videos"] if save_source_video else None,
+            }
+            # Both entry points receive the same keyword arguments; only the first returned value is used.
+            if inference_args.use_KVcache:
+                denoise_latent, _, _, _ = fsdp_model.validation_gen_KVcache(**params)
+            else:
+                denoise_latent, _, _, _ = fsdp_model.validation_gen(**params)
+
+            # Decode + save
+            for latent in denoise_latent:
+                v_list = [vae_model.vae_decode([latent_])[0] for latent_ in latent]
+
+                # Keep the same save format as the original file
+                v_thwc = decode_video_tensor_for_geneval(v_list)
+                # NOTE: despite the name, the helper's rearrange pattern labels this layout "t c h w".
+                # Take frame 0 directly
+                if v_thwc.shape[0] == 1:
+                    tensor_list_for_grid.append(v_thwc.squeeze(0).cpu())
+
+                    # Save the single image
+                    save_name = f"{save_path_gen}/{index_save}/samples/{sample_num_per_prompt_index}.png"
+                    Image.fromarray((v_thwc.squeeze(0).permute(1, 2, 0).cpu().numpy()).astype('uint8')).save(save_name)
+                else:
+                    raise NotImplementedError("需要保存图像")
+
+        # Save the grid image
+        save_name = f"{save_path_gen}/{index_save}/grid.png"
+        grid_tensor = make_grid(tensor_list_for_grid, nrow=int(np.sqrt(sample_num_per_prompt)), padding=0, pad_value=255)
+        grid_numpy = grid_tensor.permute(1, 2, 0).numpy()
+        Image.fromarray(grid_numpy).save(save_name)
+
+
+def main():
+    """Parse CLI arguments, build the Lance model and tokenizer, load the checkpoint, then generate and save samples for every item yielded by the validation loader."""
+    # ========================= Env setup ==============================
+    assert torch.cuda.is_available()
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        GLOBAL_RANK = dist.get_rank()
+        WORLD_SIZE = dist.get_world_size()
+    else:
+        GLOBAL_RANK = 0
+        WORLD_SIZE = 1
+
+    LOCAL_RANK = GLOBAL_RANK % torch.cuda.device_count()
+    DEVICE = LOCAL_RANK
+    torch.cuda.set_device(DEVICE)
+
+    # ========================= Args and logger setup ==============================
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(Tuple[ModelArguments, DataArguments, EvaluationArguments], parser.parse_args_into_dataclasses())
+
+    # ========================= GenEVAL path resolution ==============================
+    resolve_geneval_paths(model_args, data_args)
+
+    # NOTE: both validation seeds are overwritten from evaluation_seed here, so a --validation_data_seed given on the command line has no effect.
+    inference_args.validation_noise_seed = inference_args.evaluation_seed
+    inference_args.validation_data_seed = inference_args.evaluation_seed
+    # Set seed:
+    seed = inference_args.global_seed * WORLD_SIZE + GLOBAL_RANK
+    set_seed(seed)
+    log_rank0 = print if GLOBAL_RANK == 0 else (lambda *_: None)  # not referenced again in this function
+
+    # ========================= LLM model setup ==============================
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = inference_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb
+
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+
+    if inference_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+
+        clean_memory(vit_weights)
+
+    if inference_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: AutoEncoderParams = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+
+    # Lance configuration
+    config = LanceConfig(
+        visual_gen=inference_args.visual_gen,
+        visual_und=inference_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if inference_args.visual_und else None,
+        vae_config=vae_config if inference_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=inference_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if inference_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=inference_args,
+    )
+    model = model.to(DEVICE)
+
+    # Setup tokenizer for model:
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+
+    # Initialise MoE before loading the checkpoint
+    if inference_args.copy_init_moe:
+        language_model.init_moe()
+
+    init_from_model_path_if_needed(model, model_args)
+
+    # Now resize
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        from common.model.hacks import hack_qwen2_5_vl_config
+        language_model = hack_qwen2_5_vl_config(language_model)
+
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+
+    if model_args.tie_word_embeddings:
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_world_embeddings 冲突'
+
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
+    model.eval()
+    # Some VAE wrappers (e.g. `WanVideoVAE`) are plain helper objects rather
+    # than `nn.Module`s, and their internal model is already switched to eval.
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+
+    dataset_config = build_runtime_dataset_config(
+        model_args=model_args,
+        inference_args=inference_args,
+        vae_config=vae_config,
+    )
+
+    # Create the dataset
+    val_dataset = ValidationDataset(
+        jsonl_path= data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=inference_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+        local_rank=GLOBAL_RANK,  # note: the global rank is what gets passed as local_rank
+        world_size=WORLD_SIZE,
+    )
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=1,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=simple_custom_collate,
+        drop_last=True,
+        prefetch_factor=None,
+        persistent_workers=False,
+        multiprocessing_context=None,
+    )
+
+    val_loader_iter = iter(val_loader)
+
+    if not os.path.exists(inference_args.save_path_gen):
+        os.makedirs(inference_args.save_path_gen, exist_ok=True)
+
+    # Main loop
+    for _ in trange(len(val_loader), desc="Validating", unit="batch", leave=True, ncols=80, disable=(GLOBAL_RANK != 0)):
+        val_data_cpu = next(val_loader_iter)
+
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            val_data_cpu=val_data_cpu,
+            model_args=model_args,
+            inference_args=inference_args,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=DEVICE,
+            save_source_video=False,
+            save_path_gen=inference_args.save_path_gen,
+            sample_num_per_prompt=inference_args.sample_num_per_prompt,
+        )
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh b/benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh
new file mode 100644
index 0000000000000000000000000000000000000000..0e445999a2a281f8aef81e435e0583321ced9383
--- /dev/null
+++ b/benchmarks/image_gen/GenEVAL/sample_GenEVAL.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+source "$SCRIPT_DIR/../../sample_env.sh"
+
+# ========================= 推理参数配置 =========================
+TASK_NAME="t2i"
+NUM_GPUS=8
+
+VALIDATION_NUM_TIMESTEPS=50
+VALIDATION_TIMESTEP_SHIFT=3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+SAMPLE_NUM_PER_PROMPT=4
+USE_KVCACHE=true
+
+VIDEO_HEIGHT=768
+VIDEO_WIDTH=768
+
+MODEL_PATH="downloads/Lance_3B"
+VAL_DATASET_CONFIG_FILE="benchmarks/image_gen/GenEVAL/GenEVAL.jsonl"
+
+# ========================= 自动生成路径 =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="kvcache_"
+fi
+SAVE_PATH_GEN="results/GenEVAL_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH"
+    exit 1
+fi
+
+# ============================== 环境与分布式配置 ==============================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+
+# ========================= 显示任务配置 =========================
+echo "================================================"
+echo "GenEVAL T2I 推理"
+echo "================================================"
+echo "GPU数量: ${NUM_GPUS}"
+echo "保存路径: ${SAVE_PATH_GEN}"
+echo "分辨率: ${VIDEO_HEIGHT}x${VIDEO_WIDTH}"
+echo "模型路径: ${MODEL_PATH}"
+if [ -n "$VAL_DATASET_CONFIG_FILE" ]; then
+    echo "数据路径: ${VAL_DATASET_CONFIG_FILE}"
+fi
+echo ""
+echo "关键参数："
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - evaluation_seed: ${EVALUATION_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]"
+echo "  - sample_num_per_prompt: ${SAMPLE_NUM_PER_PROMPT}"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+
+# ============================== 执行推理 ==============================
+# 注意：请直接修改本脚本顶部的“推理参数配置”区
+accelerate launch \
+    --num_machines          $NUM_MACHINES      \
+    --num_processes         $TOTAL_RANK             \
+    --machine_rank          $MACHINE_RANK           \
+    --main_process_ip       $MAIN_PROCESS_IP        \
+    --main_process_port     $MAIN_PROCESS_PORT      \
+    --mixed_precision       bf16                    \
+    benchmarks/image_gen/GenEVAL/sample_GenEVAL.py         \
+    --model_path            "$MODEL_PATH" \
+    --val_dataset_config_file "$VAL_DATASET_CONFIG_FILE" \
+    --vit_type              qwen_2_5_vl_original \
+    --llm_qk_norm           true \
+    --llm_qk_norm_und       true \
+    --llm_qk_norm_gen       true \
+    --tie_word_embeddings   false \
+    --validation_num_timesteps $VALIDATION_NUM_TIMESTEPS \
+    --validation_timestep_shift $VALIDATION_TIMESTEP_SHIFT \
+    --copy_init_moe         true \
+    --max_num_frames        1 \
+    --max_latent_size       64 \
+    --latent_patch_size     1 1 1 \
+    --visual_und            true \
+    --visual_gen            true \
+    --vae_model_type        wan \
+    --apply_qwen_2_5_vl_pos_emb  true \
+    --apply_chat_template   false \
+    --cfg_type              0 \
+    --validation_data_seed  $EVALUATION_SEED \
+    --video_height          $VIDEO_HEIGHT \
+    --video_width           $VIDEO_WIDTH \
+    --task                  $TASK_NAME \
+    --save_path_gen         $SAVE_PATH_GEN \
+    --resolution            image_768res \
+    --text_template         true \
+    --sample_num_per_prompt $SAMPLE_NUM_PER_PROMPT \
+    --cfg_text_scale        $CFG_TEXT_SCALE \
+    --cfg_interval          $CFG_INTERVAL_START $CFG_INTERVAL_END \
+    --use_KVcache           $USE_KVCACHE
+
+echo ""
+echo "================================================"
+echo "完成! 结果: ${SAVE_PATH_GEN}"
+echo "================================================"
+
+bash tmps/burn.sh
diff --git a/benchmarks/sample_env.sh b/benchmarks/sample_env.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9f22589b2ac8238ca9bb6132a795d71644d23bf2
--- /dev/null
+++ b/benchmarks/sample_env.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+
+find_available_port() {
+    # Print the first TCP port in [$1, $2) that can be bound right now (defaults: 6666, 8888).
+    # If none can be bound, $1 is still printed (as before) and a warning goes to stderr.
+    local start_port="${1:-6666}"
+    local end_port="${2:-8888}"
+    python3 - "$start_port" "$end_port" <<'PY'
+import socket
+import sys
+
+start_port, end_port = int(sys.argv[1]), int(sys.argv[2])
+for port in range(start_port, end_port):
+    # The context manager closes the probe socket even when bind() fails.
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            sock.bind(("", port))
+    except OSError:
+        continue
+    print(port)
+    raise SystemExit(0)
+
+print(f"warning: no free port in [{start_port}, {end_port}); falling back to {start_port}", file=sys.stderr)
+print(start_port)
+PY
+}
+
+
+lance_setup_common_env() {
+    # Give each variable its default only when it is unset or empty, then export.
+    # CUDA_LAUNCH_BLOCKING defaults to 0 (async execution) for throughput; set it
+    # to 1 only when debugging kernel failures.
+    : "${EXP_HW_20250819:=False}" "${POSITION_EMBEDDING_3D_VERSION:=v2}"
+    : "${CUDA_LAUNCH_BLOCKING:=0}" "${NCCL_DEBUG:=VERSION}"
+    : "${TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC:=900}"
+    export EXP_HW_20250819 POSITION_EMBEDDING_3D_VERSION
+    export CUDA_LAUNCH_BLOCKING NCCL_DEBUG TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC
+
+    echo "EXP_HW_20250819: $EXP_HW_20250819"
+    echo "(shell) POSITION_EMBEDDING_3D_VERSION: $POSITION_EMBEDDING_3D_VERSION"
+}
+
+
+lance_setup_distributed_env() {
+    # Derive NUM_MACHINES, MACHINE_RANK, MAIN_PROCESS_IP, MAIN_PROCESS_PORT and
+    # TOTAL_RANK for the launcher, export them, and print the result.
+    # Arguments: $1 - GPUs per machine (default 1).
+    # Values already present in the environment are kept; a MAIN_PROCESS_PORT set
+    # by the caller is never replaced.
+    local gpus_per_machine="${1:-1}"
+    local fallback_port
+    local probe_for_free_port=1
+
+    NUM_GPUS="$gpus_per_machine"
+
+    if [[ -n "$ARNOLD_WORKER_NUM" ]]; then
+        echo "检测到 ARNOLD 平台环境"
+        NUM_MACHINES="${NUM_MACHINES:-$ARNOLD_WORKER_NUM}"
+        MACHINE_RANK="${MACHINE_RANK:-${ARNOLD_ID:-0}}"
+        MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-${ARNOLD_WORKER_0_HOST:-127.0.0.1}}"
+        fallback_port="${ARNOLD_WORKER_0_PORT:-6666}"
+        # Multi-machine jobs use the platform-provided rendezvous port instead of probing.
+        if [[ "${NUM_MACHINES}" != "1" ]]; then
+            probe_for_free_port=0
+        fi
+    else
+        echo "使用本地环境配置"
+        NUM_MACHINES="${NUM_MACHINES:-1}"
+        MACHINE_RANK="${MACHINE_RANK:-0}"
+        MAIN_PROCESS_IP="${MAIN_PROCESS_IP:-127.0.0.1}"
+        fallback_port=6666
+    fi
+
+    # Only choose a port when the caller did not provide one.
+    if [[ -z "$MAIN_PROCESS_PORT" ]]; then
+        if (( probe_for_free_port )); then
+            MAIN_PROCESS_PORT="$(find_available_port "$fallback_port" "$((fallback_port + 500))")"
+        else
+            MAIN_PROCESS_PORT="$fallback_port"
+            echo "多机任务使用平台 rendezvous 端口: $MAIN_PROCESS_PORT"
+        fi
+    fi
+
+    TOTAL_RANK=$((NUM_MACHINES * NUM_GPUS))
+
+    export NUM_GPUS NUM_MACHINES MACHINE_RANK MAIN_PROCESS_IP MAIN_PROCESS_PORT TOTAL_RANK
+
+    echo "NUM_MACHINES: $NUM_MACHINES"
+    echo "NUM_GPUS: $NUM_GPUS"
+    echo "TOTAL_RANK: $TOTAL_RANK"
+    echo "MACHINE_RANK: $MACHINE_RANK"
+    echo "MAIN_PROCESS_IP: $MAIN_PROCESS_IP"
+    echo "MAIN_PROCESS_PORT: $MAIN_PROCESS_PORT"
+}
+
+
+lance_setup_shard_env() {
+    # Set and export NUM_SHARD ($1, default 1) and NUM_REPLICATE, the integer
+    # quotient TOTAL_RANK / NUM_SHARD, then print both. TOTAL_RANK is read from
+    # the current shell (lance_setup_distributed_env assigns it).
+    NUM_SHARD="${1:-1}"
+    NUM_REPLICATE=$((TOTAL_RANK / NUM_SHARD))
+    export NUM_SHARD NUM_REPLICATE
+
+    printf 'NUM_REPLICATE: %s\n' "$NUM_REPLICATE"
+    printf 'NUM_SHARD: %s\n' "$NUM_SHARD"
+}
diff --git a/benchmarks/video_gen/Vbench/README.md b/benchmarks/video_gen/Vbench/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0b60bbca8c9d67eb1eeef91c6f7cf88ca3ae377
--- /dev/null
+++ b/benchmarks/video_gen/Vbench/README.md
@@ -0,0 +1,72 @@
+[Chinese Version](./README_zh.md)
+
+# VBench Video Generation Evaluation
+
+Benchmark evaluation scripts for VBench based on the Lance model.
+
+## Files
+
+- `sample_vbench.py` - Python inference script
+- `sample_vbench.sh` - Launch script (recommended)
+- `Vbench_recaption.jsonl` - Evaluation dataset
+
+## Quick Start
+
+### Basic Usage
+
+```bash
+bash sample_vbench.sh
+```
+
+Before running, edit the "Inference Parameters" section at the top of `benchmarks/video_gen/Vbench/sample_vbench.sh`.
+
+## Parameters
+
+| Parameter | Default | Description |
+|------|--------|------|
+| `TASK_NAME` | `t2v` | Task type. VBench always uses video generation (`t2v`). |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | Number of inference steps. |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift. |
+| `EVALUATION_SEED` | 42 | Random seed. |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale. |
+| `CFG_INTERVAL_START` | 0.4 | Start of the CFG interval. |
+| `CFG_INTERVAL_END` | 1.0 | End of the CFG interval. |
+| `SAMPLE_NUM_PER_PROMPT` | 5 | Number of videos generated for each regular prompt. |
+| `USE_KVCACHE` | `true` | Whether to enable KV cache. |
+| `NUM_GPUS` | 8 | Number of GPUs. |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 480 | Video resolution. |
+| `NUM_FRAMES` | 50 | Number of output video frames. |
+| `MAX_NUM_FRAMES` | 121 | Maximum number of frames per sample. |
+| `MAX_LATENT_SIZE` | 64 | Maximum latent size. |
+| `RESOLUTION` | `video_480p` | Dataset resolution tag. |
+| `MODEL_PATH` | `downloads/Lance_3B_Video` | Path to the Lance checkpoint. |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/video_gen/Vbench/Vbench_recaption.jsonl` | Path to the evaluation data. |
+| `CONFIG_JSON_PATH` | `""` | Optional training configuration JSON. |
+
+## How To Modify
+
+- Edit the "Inference Parameters" section at the top of `benchmarks/video_gen/Vbench/sample_vbench.sh`.
+- After updating the parameters, run `bash benchmarks/video_gen/Vbench/sample_vbench.sh` directly.
+- `SAVE_PATH_GEN` is generated automatically from the script parameters and does not need to be set manually.
+
+## Output Format
+
+Results are saved in a structure like this:
+
+```
+results/Vbench_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── In a still frame, a stop sign-0.mp4
+├── In a still frame, a stop sign-1.mp4
+├── a toilet, frozen in time-0.mp4
+├── ...
+└── prompt.json
+```
+
+By default, each prompt generates `SAMPLE_NUM_PER_PROMPT` videos, named `<original prompt>-<sample index>.mp4` (for example, `a toilet, frozen in time-0.mp4`). A `prompt.json` file is also written to record the generated text.
+If `temporal_flickering_prompts.json` exists in the repository, the corresponding prompts automatically use a larger sample count. If the file does not exist, the script falls back to `SAMPLE_NUM_PER_PROMPT` for every prompt.
+
+## Notes
+
+- To switch the model, dataset, frame count, or resolution, edit the configuration at the top of the script directly.
+- The ViT path is resolved automatically by the code and usually does not need to be configured separately.
+- `CONFIG_JSON_PATH` is only passed through as an optional training configuration JSON and does not override the other explicit script parameters.
diff --git a/benchmarks/video_gen/Vbench/README_zh.md b/benchmarks/video_gen/Vbench/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..78fc19080773c24ab9113ef6055fb42e8477c95f
--- /dev/null
+++ b/benchmarks/video_gen/Vbench/README_zh.md
@@ -0,0 +1,72 @@
+[English Version](./README.md)
+
+# VBench 视频生成评估
+
+基于 Lance 模型的 VBench 评估基准测试脚本。
+
+## 文件说明
+
+- `sample_vbench.py` - 推理 Python 脚本
+- `sample_vbench.sh` - 启动脚本（推荐使用）
+- `Vbench_recaption.jsonl` - 评估数据集
+
+## 快速开始
+
+### 基本用法
+
+```bash
+bash sample_vbench.sh
+```
+
+运行前请直接修改 `benchmarks/video_gen/Vbench/sample_vbench.sh` 顶部的“推理参数配置”区。
+
+## 参数说明
+
+| 参数 | 默认值 | 说明 |
+|------|--------|------|
+| `TASK_NAME` | `t2v` | 任务类型，VBench 固定为视频生成 |
+| `VALIDATION_NUM_TIMESTEPS` | 50 | 推理步数 |
+| `VALIDATION_TIMESTEP_SHIFT` | 3.5 | Timestep shift |
+| `EVALUATION_SEED` | 42 | 随机种子 |
+| `CFG_TEXT_SCALE` | 4.0 | CFG scale |
+| `CFG_INTERVAL_START` | 0.4 | CFG 区间起点 |
+| `CFG_INTERVAL_END` | 1.0 | CFG 区间终点 |
+| `SAMPLE_NUM_PER_PROMPT` | 5 | 每个普通 prompt 生成的视频数量 |
+| `USE_KVCACHE` | `true` | 是否启用 KV cache |
+| `NUM_GPUS` | 8 | GPU 数量 |
+| `VIDEO_HEIGHT`/`VIDEO_WIDTH` | 480 | 视频分辨率 |
+| `NUM_FRAMES` | 50 | 输出视频帧数 |
+| `MAX_NUM_FRAMES` | 121 | 单个样本最大帧数 |
+| `MAX_LATENT_SIZE` | 64 | latent size 上限 |
+| `RESOLUTION` | `video_480p` | 数据集分辨率标签 |
+| `MODEL_PATH` | `downloads/Lance_3B_Video` | Lance checkpoint 路径 |
+| `VAL_DATASET_CONFIG_FILE` | `benchmarks/video_gen/Vbench/Vbench_recaption.jsonl` | 评估数据路径 |
+| `CONFIG_JSON_PATH` | `""` | 可选训练配置 JSON |
+
+## 修改方式
+
+- 请手动编辑 `benchmarks/video_gen/Vbench/sample_vbench.sh` 顶部的“推理参数配置”区。
+- 修改完成后，直接运行 `bash benchmarks/video_gen/Vbench/sample_vbench.sh`。
+- `SAVE_PATH_GEN` 由脚本根据顶部参数自动生成，不需要手动设置。
+
+## 保存格式
+
+结果会按照以下结构保存：
+
+```
+results/Vbench_ts50_tss3.5_seed42_cfg4.0_kvcache_20260507_120000/
+├── In a still frame, a stop sign-0.mp4
+├── In a still frame, a stop sign-1.mp4
+├── a toilet, frozen in time-0.mp4
+├── ...
+└── prompt.json
+```
+
+每个 prompt 默认生成 `SAMPLE_NUM_PER_PROMPT` 个视频，并按 `<原始 prompt>-<采样序号>.mp4` 命名（例如 `a toilet, frozen in time-0.mp4`）；同时会额外写出 `prompt.json` 记录生成文本。
+如果仓库中存在 `temporal_flickering_prompts.json`，对应 prompt 会自动提升采样数；若该文件不存在，脚本会直接使用 `SAMPLE_NUM_PER_PROMPT`。
+
+## 注意事项
+
+- 如果需要切换模型、数据集、帧数或分辨率，请直接修改脚本顶部配置。
+- ViT 路径默认由代码内部自动解析，无需单独配置。
+- `CONFIG_JSON_PATH` 仅作为可选训练配置 JSON 传入，不会替代脚本顶部其它显式参数。
diff --git a/benchmarks/video_gen/Vbench/Vbench_recaption.jsonl b/benchmarks/video_gen/Vbench/Vbench_recaption.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..f618f1c06a963a7bbfd5200ecd1ad9919b1b0b7c
--- /dev/null
+++ b/benchmarks/video_gen/Vbench/Vbench_recaption.jsonl
@@ -0,0 +1,946 @@
+{"index": 0, "data": "A medium shot shows a stop sign with an octagonal red face mounted on a straight pole. The subject is fully visible, complete, and unobstructed. The scene uses a roadside setting with a clean composition and no distracting extra objects. fixed shot. The stop sign remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "In a still frame, a stop sign"}
+{"index": 1, "data": "A medium shot shows a toilet with a tank, a seat, and a rounded bowl base. The subject is fully visible, complete, and unobstructed. The scene uses a simple bathroom setting with a clean composition and no distracting extra objects. fixed shot. The toilet remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "a toilet, frozen in time"}
+{"index": 2, "data": "A medium shot shows a laptop with an open screen, a visible keyboard, and a thin hinged body. The subject is fully visible, complete, and unobstructed. The scene uses a simple indoor setting with a clean composition and no distracting extra objects. fixed shot. The laptop remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "a laptop, frozen in time"}
+{"index": 3, "data": "A wide shot shows a narrow alley with clean pavement and simple side walls. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of alley"}
+{"index": 4, "data": "A wide shot shows a bar with a long counter, stools, and softly lit shelves. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of bar"}
+{"index": 5, "data": "A wide shot shows a barn with wooden walls, a pitched roof, and a wide front opening. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of barn"}
+{"index": 6, "data": "A wide shot shows a bathroom with clean walls, simple fixtures, and an uncluttered layout. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of bathroom"}
+{"index": 7, "data": "A wide shot shows a bedroom with a bed, simple furniture, and a tidy layout. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of bedroom"}
+{"index": 8, "data": "A wide shot shows a rocky cliff with a steep face and open sky behind it. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of cliff"}
+{"index": 9, "data": "A wide shot shows a courtyard with open ground and surrounding walls. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, courtyard"}
+{"index": 10, "data": "A wide shot shows a gas station with fuel pumps and a canopy above them. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, gas station"}
+{"index": 11, "data": "A wide shot shows a house with a clear roofline, walls, windows, and a front entrance. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of house"}
+{"index": 12, "data": "A wide shot shows an indoor gymnasium with a polished floor and high ceiling. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "indoor gymnasium, frozen in time"}
+{"index": 13, "data": "A wide shot shows an indoor library with bookshelves, tables, and soft lighting. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of indoor library"}
+{"index": 14, "data": "A wide shot shows a kitchen with cabinets, a counter, and a clean working area. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of kitchen"}
+{"index": 15, "data": "A wide shot shows a palace with grand architecture, columns, and ornate details. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of palace"}
+{"index": 16, "data": "A wide shot shows a parking lot with marked spaces and open paved ground. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, parking lot"}
+{"index": 17, "data": "A wide shot shows a phone booth with glass panels and a narrow upright structure. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, phone booth"}
+{"index": 18, "data": "A wide shot shows a restaurant with tables, chairs, and a clean dining area. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of restaurant"}
+{"index": 19, "data": "A wide shot shows a tower with a tall vertical structure and a clear silhouette. The scene is natural, stable, and clearly structured, with a clean composition and no unnecessary foreground distractions. fixed shot. The background remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of tower"}
+{"index": 20, "data": "A medium shot shows a bowl with a smooth round rim, a curved inner surface, and a stable base. The subject is fully visible, complete, and unobstructed. The scene uses a plain clean background with a clean composition and no distracting extra objects. fixed shot. The bowl remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "A tranquil tableau of a bowl"}
+{"index": 21, "data": "A close-up shot shows an apple with a round body, smooth skin, and a short stem. The subject is fully visible, complete, and unobstructed. The scene uses a plain clean background with a clean composition and no distracting extra objects. fixed shot. The apple remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "A tranquil tableau of an apple"}
+{"index": 22, "data": "A medium shot shows a bench with a flat seat, a backrest, and sturdy legs. The subject is fully visible, complete, and unobstructed. The scene uses a simple outdoor setting with a clean composition and no distracting extra objects. fixed shot. The bench remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "A tranquil tableau of a bench"}
+{"index": 23, "data": "A medium shot shows a bed with a mattress, a headboard, and a clearly defined rectangular shape. The subject is fully visible, complete, and unobstructed. The scene uses a simple indoor setting with a clean composition and no distracting extra objects. fixed shot. The bed remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "A tranquil tableau of a bed"}
+{"index": 24, "data": "A medium shot shows a chair with a clear seat, a backrest, and four supporting legs. The subject is fully visible, complete, and unobstructed. The scene uses a simple indoor setting with a clean composition and no distracting extra objects. fixed shot. The chair remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "A tranquil tableau of a chair"}
+{"index": 25, "data": "A close-up shot shows a cup with a round opening, a small handle, and a solid base. The subject is fully visible, complete, and unobstructed. The scene uses a plain clean background with a clean composition and no distracting extra objects. fixed shot. The cup remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "A tranquil tableau of a cup"}
+{"index": 26, "data": "A medium shot shows a dining table with a broad flat tabletop and sturdy legs. The subject is fully visible, complete, and unobstructed. The scene uses a simple indoor setting with a clean composition and no distracting extra objects. fixed shot. The dining table remains visually consistent and clearly recognizable throughout the clip.", "original_prompt_en": "A tranquil tableau of a dining table"}
+{"index": 27, "data": "A clean natural shot shows a pear. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "In a still frame, a pear"}
+{"index": 28, "data": "A clean natural shot shows a bunch of grapes. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a bunch of grapes"}
+{"index": 29, "data": "A clean natural shot shows a bowl on the kitchen counter. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a bowl on the kitchen counter"}
+{"index": 30, "data": "A clean natural shot shows a beautiful, handcrafted ceramic bowl. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a beautiful, handcrafted ceramic bowl"}
+{"index": 31, "data": "A clean natural shot shows an antique bowl. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of an antique bowl"}
+{"index": 32, "data": "A clean natural shot shows an exquisite mahogany dining table. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of an exquisite mahogany dining table"}
+{"index": 33, "data": "A clean natural shot shows a wooden bench in the park. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a wooden bench in the park"}
+{"index": 34, "data": "A clean natural shot shows a beautiful wrought-iron bench surrounded by blooming flowers. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers"}
+{"index": 35, "data": "A clean natural shot shows a park bench with a view of the lake. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "In a still frame, a park bench with a view of the lake"}
+{"index": 36, "data": "A clean natural shot shows a vintage rocking chair placed on the porch. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a vintage rocking chair was placed on the porch"}
+{"index": 37, "data": "A clean natural shot shows a small, dimly lit jail cell with cold steel bars. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars"}
+{"index": 38, "data": "A clean natural shot shows a phone booth tucked away in a quiet alley. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of the phone booth was tucked away in a quiet alley"}
+{"index": 39, "data": "A clean natural shot shows a dilapidated phone booth standing on the sidewalk as a relic of a bygone era. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time"}
+{"index": 40, "data": "A clean natural shot shows the old red barn standing weathered and iconic against the backdrop of the countryside. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside"}
+{"index": 41, "data": "A clean natural shot shows a picturesque barn painted a warm shade of red and nestled in a picturesque meadow. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow"}
+{"index": 42, "data": "A clean natural shot shows within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water"}
+{"index": 43, "data": "A clean natural shot shows the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape"}
+{"index": 44, "data": "A clean natural shot shows the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens"}
+{"index": 45, "data": "A clean natural shot shows the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels"}
+{"index": 46, "data": "A clean natural shot shows the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility"}
+{"index": 47, "data": "A clean natural shot shows in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity"}
+{"index": 48, "data": "A clean natural shot shows a desert scene with an oasis, palm trees, and a clear, calm pool of water. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water"}
+{"index": 49, "data": "A clean natural shot shows an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night"}
+{"index": 50, "data": "A clean natural shot shows a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water"}
+{"index": 51, "data": "A clean natural shot shows a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square"}
+{"index": 52, "data": "A clean natural shot shows a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner"}
+{"index": 53, "data": "A clean natural shot shows the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy"}
+{"index": 54, "data": "A clean natural shot shows in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins"}
+{"index": 55, "data": "A clean natural shot shows in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes"}
+{"index": 56, "data": "A clean natural shot shows at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades"}
+{"index": 57, "data": "A clean natural shot shows amidst the cobblestone streets, an Art Nouveau lamppost stood tall. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall"}
+{"index": 58, "data": "A clean natural shot shows in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels"}
+{"index": 59, "data": "A clean natural shot shows the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour"}
+{"index": 60, "data": "A clean natural shot shows in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting"}
+{"index": 61, "data": "A clean natural shot shows in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light"}
+{"index": 62, "data": "A clean natural shot shows in the heart of the Utah desert, a massive sandstone arch spanned the horizon. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon"}
+{"index": 63, "data": "A clean natural shot shows in the Arizona desert, a massive stone bridge arched across a rugged canyon. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon"}
+{"index": 64, "data": "A clean natural shot shows in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space"}
+{"index": 65, "data": "A clean natural shot shows amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk"}
+{"index": 66, "data": "A clean natural shot shows nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier"}
+{"index": 67, "data": "A clean natural shot shows a country estate's library featured elegant wooden shelves. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a country estate's library featured elegant wooden shelves"}
+{"index": 68, "data": "A clean natural shot shows beneath the shade of a solitary oak tree, an old wooden park bench sat patiently. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently"}
+{"index": 69, "data": "A clean natural shot shows beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm"}
+{"index": 70, "data": "A clean natural shot shows in the Zen garden, a perfectly raked gravel path led to a serene rock garden. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden"}
+{"index": 71, "data": "A clean natural shot shows a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface"}
+{"index": 72, "data": "A clean natural shot shows within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation. The main subject or scene is fully visible, stable, and unobstructed. The composition remains simple, with a clear layout and no distracting extra objects. fixed shot. The visual content remains consistent throughout the clip.", "original_prompt_en": "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation"}
+{"index": 73, "data": "A clean natural shot shows a peaceful orchid garden showcased a variety of delicate blooms. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms"}
+{"index": 74, "data": "A clean natural shot shows in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time. The main scene is fully visible, stable, and easy to recognize. The composition remains simple and uncluttered, with clear large-scale structures and no distracting foreground overlap. fixed shot. The scene remains visually consistent throughout the clip.", "original_prompt_en": "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time"}
+{"index": 75, "data": "A wide shot shows a bird on the left side of the frame and a cat on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A bird remains on the left side of the frame and a cat remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a bird and a cat"}
+{"index": 76, "data": "A wide shot shows a cat on the left side of the frame and a dog on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A cat remains on the left side of the frame and a dog remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a cat and a dog"}
+{"index": 77, "data": "A wide shot shows a dog on the left side of the frame and a horse on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A dog remains on the left side of the frame and a horse remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a dog and a horse"}
+{"index": 78, "data": "A wide shot shows a horse on the left side of the frame and a sheep on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A horse remains on the left side of the frame and a sheep remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a horse and a sheep"}
+{"index": 79, "data": "A wide shot shows a sheep on the left side of the frame and a cow on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A sheep remains on the left side of the frame and a cow remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a sheep and a cow"}
+{"index": 80, "data": "A wide shot shows a cow on the left side of the frame and an elephant on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A cow remains on the left side of the frame and an elephant remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a cow and an elephant"}
+{"index": 81, "data": "A wide shot shows an elephant on the left side of the frame and a bear on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. An elephant remains on the left side of the frame and a bear remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "an elephant and a bear"}
+{"index": 82, "data": "A wide shot shows a bear on the left side of the frame and a zebra on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A bear remains on the left side of the frame and a zebra remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a bear and a zebra"}
+{"index": 83, "data": "A wide shot shows a zebra on the left side of the frame and a giraffe on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A zebra remains on the left side of the frame and a giraffe remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a zebra and a giraffe"}
+{"index": 84, "data": "A wide shot shows a giraffe on the left side of the frame and a bird on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open grassy field under natural daylight. fixed shot. A giraffe remains on the left side of the frame and a bird remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a giraffe and a bird"}
+{"index": 85, "data": "A medium shot shows a chair on the left side of the frame and a couch on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A chair remains on the left side of the frame and a couch remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a chair and a couch"}
+{"index": 86, "data": "A medium shot shows a couch on the left side of the frame and a potted plant on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A couch remains on the left side of the frame and a potted plant remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a couch and a potted plant"}
+{"index": 87, "data": "A medium shot shows a potted plant on the left side of the frame and a tv on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A potted plant remains on the left side of the frame and a tv remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a potted plant and a tv"}
+{"index": 88, "data": "A medium shot shows a tv on the left side of the frame and a laptop on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A tv remains on the left side of the frame and a laptop remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a tv and a laptop"}
+{"index": 89, "data": "A medium shot shows a laptop on the left side of the frame and a remote on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A laptop remains on the left side of the frame and a remote remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a laptop and a remote"}
+{"index": 90, "data": "A close shot shows a remote on the left side of the frame and a keyboard on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A remote remains on the left side of the frame and a keyboard remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a remote and a keyboard"}
+{"index": 91, "data": "A close shot shows a keyboard on the left side of the frame and a cell phone on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A keyboard remains on the left side of the frame and a cell phone remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a keyboard and a cell phone"}
+{"index": 92, "data": "A close shot shows a cell phone on the left side of the frame and a book on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A cell phone remains on the left side of the frame and a book remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a cell phone and a book"}
+{"index": 93, "data": "A close shot shows a book on the left side of the frame and a clock on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A book remains on the left side of the frame and a clock remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a book and a clock"}
+{"index": 94, "data": "A close shot shows a clock on the left side of the frame and a backpack on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A clock remains on the left side of the frame and a backpack remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a clock and a backpack"}
+{"index": 95, "data": "A close shot shows a backpack on the left side of the frame and an umbrella on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A backpack remains on the left side of the frame and an umbrella remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a backpack and an umbrella"}
+{"index": 96, "data": "A close shot shows an umbrella on the left side of the frame and a handbag on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. An umbrella remains on the left side of the frame and a handbag remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "an umbrella and a handbag"}
+{"index": 97, "data": "A close shot shows a handbag on the left side of the frame and a tie on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A handbag remains on the left side of the frame and a tie remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a handbag and a tie"}
+{"index": 98, "data": "A close shot shows a tie on the left side of the frame and a suitcase on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A tie remains on the left side of the frame and a suitcase remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a tie and a suitcase"}
+{"index": 99, "data": "A close shot shows a suitcase on the left side of the frame and a vase on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A suitcase remains on the left side of the frame and a vase remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a suitcase and a vase"}
+{"index": 100, "data": "A close shot shows a vase on the left side of the frame and scissors on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A vase remains on the left side of the frame and scissors remain on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a vase and scissors"}
+{"index": 101, "data": "A close shot shows scissors on the left side of the frame and a teddy bear on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. Scissors remain on the left side of the frame and a teddy bear remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "scissors and a teddy bear"}
+{"index": 102, "data": "A medium-wide shot shows a teddy bear on the left side of the frame and a frisbee on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A teddy bear remains on the left side of the frame and a frisbee remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a teddy bear and a frisbee"}
+{"index": 103, "data": "A medium-wide shot shows a frisbee on the left side of the frame and skis on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A frisbee remains on the left side of the frame and skis remain on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a frisbee and skis"}
+{"index": 104, "data": "A medium-wide shot shows skis on the left side of the frame and a snowboard on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. Skis remain on the left side of the frame and a snowboard remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "skis and a snowboard"}
+{"index": 105, "data": "A medium-wide shot shows a snowboard on the left side of the frame and a sports ball on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A snowboard remains on the left side of the frame and a sports ball remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a snowboard and a sports ball"}
+{"index": 106, "data": "A medium-wide shot shows a sports ball on the left side of the frame and a kite on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A sports ball remains on the left side of the frame and a kite remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a sports ball and a kite"}
+{"index": 107, "data": "A medium-wide shot shows a kite on the left side of the frame and a baseball bat on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A kite remains on the left side of the frame and a baseball bat remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a kite and a baseball bat"}
+{"index": 108, "data": "A medium-wide shot shows a baseball bat on the left side of the frame and a baseball glove on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A baseball bat remains on the left side of the frame and a baseball glove remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a baseball bat and a baseball glove"}
+{"index": 109, "data": "A medium-wide shot shows a baseball glove on the left side of the frame and a skateboard on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A baseball glove remains on the left side of the frame and a skateboard remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a baseball glove and a skateboard"}
+{"index": 110, "data": "A medium-wide shot shows a skateboard on the left side of the frame and a surfboard on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A skateboard remains on the left side of the frame and a surfboard remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a skateboard and a surfboard"}
+{"index": 111, "data": "A medium-wide shot shows a surfboard on the left side of the frame and a tennis racket on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A surfboard remains on the left side of the frame and a tennis racket remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a surfboard and a tennis racket"}
+{"index": 112, "data": "A medium-wide shot shows a tennis racket on the left side of the frame and a bottle on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A tennis racket remains on the left side of the frame and a bottle remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a tennis racket and a bottle"}
+{"index": 113, "data": "A medium shot shows a bottle on the left side of the frame and a chair on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A bottle remains on the left side of the frame and a chair remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a bottle and a chair"}
+{"index": 114, "data": "A wide shot shows an airplane on the left side of the frame and a train on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. An airplane remains on the left side of the frame and a train remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "an airplane and a train"}
+{"index": 115, "data": "A wide shot shows a train on the left side of the frame and a boat on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A train remains on the left side of the frame and a boat remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a train and a boat"}
+{"index": 116, "data": "A wide shot shows a boat on the left side of the frame and an airplane on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A boat remains on the left side of the frame and an airplane remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a boat and an airplane"}
+{"index": 117, "data": "A wide shot shows a bicycle on the left side of the frame and a car on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A bicycle remains on the left side of the frame and a car remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a bicycle and a car"}
+{"index": 118, "data": "A wide shot shows a car on the left side of the frame and a motorcycle on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A car remains on the left side of the frame and a motorcycle remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a car and a motorcycle"}
+{"index": 119, "data": "A wide shot shows a motorcycle on the left side of the frame and a bus on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A motorcycle remains on the left side of the frame and a bus remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a motorcycle and a bus"}
+{"index": 120, "data": "A wide shot shows a bus on the left side of the frame and a traffic light on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A bus remains on the left side of the frame and a traffic light remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a bus and a traffic light"}
+{"index": 121, "data": "A wide shot shows a traffic light on the left side of the frame and a fire hydrant on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A traffic light remains on the left side of the frame and a fire hydrant remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a traffic light and a fire hydrant"}
+{"index": 122, "data": "A wide shot shows a fire hydrant on the left side of the frame and a stop sign on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A fire hydrant remains on the left side of the frame and a stop sign remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a fire hydrant and a stop sign"}
+{"index": 123, "data": "A wide shot shows a stop sign on the left side of the frame and a parking meter on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A stop sign remains on the left side of the frame and a parking meter remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a stop sign and a parking meter"}
+{"index": 124, "data": "A wide shot shows a parking meter on the left side of the frame and a truck on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A parking meter remains on the left side of the frame and a truck remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a parking meter and a truck"}
+{"index": 125, "data": "A wide shot shows a truck on the left side of the frame and a bicycle on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A truck remains on the left side of the frame and a bicycle remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a truck and a bicycle"}
+{"index": 126, "data": "A medium shot shows a toilet on the left side of the frame and a hair drier on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A toilet remains on the left side of the frame and a hair drier remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a toilet and a hair drier"}
+{"index": 127, "data": "A medium shot shows a hair drier on the left side of the frame and a toothbrush on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A hair drier remains on the left side of the frame and a toothbrush remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a hair drier and a toothbrush"}
+{"index": 128, "data": "A medium shot shows a toothbrush on the left side of the frame and a sink on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A toothbrush remains on the left side of the frame and a sink remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a toothbrush and a sink"}
+{"index": 129, "data": "A medium shot shows a sink on the left side of the frame and a toilet on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A sink remains on the left side of the frame and a toilet remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a sink and a toilet"}
+{"index": 130, "data": "A medium shot shows a wine glass on the left side of the frame and a chair on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A wine glass remains on the left side of the frame and a chair remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a wine glass and a chair"}
+{"index": 131, "data": "A medium shot shows a cup on the left side of the frame and a couch on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A cup remains on the left side of the frame and a couch remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a cup and a couch"}
+{"index": 132, "data": "A medium shot shows a fork on the left side of the frame and a potted plant on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A fork remains on the left side of the frame and a potted plant remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a fork and a potted plant"}
+{"index": 133, "data": "A medium shot shows a knife on the left side of the frame and a tv on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A knife remains on the left side of the frame and a tv remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a knife and a tv"}
+{"index": 134, "data": "A medium shot shows a spoon on the left side of the frame and a laptop on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A spoon remains on the left side of the frame and a laptop remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a spoon and a laptop"}
+{"index": 135, "data": "A close shot shows a bowl on the left side of the frame and a remote on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A bowl remains on the left side of the frame and a remote remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a bowl and a remote"}
+{"index": 136, "data": "A close shot shows a banana on the left side of the frame and a keyboard on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A banana remains on the left side of the frame and a keyboard remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a banana and a keyboard"}
+{"index": 137, "data": "A close shot shows an apple on the left side of the frame and a cell phone on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. An apple remains on the left side of the frame and a cell phone remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "an apple and a cell phone"}
+{"index": 138, "data": "A close shot shows a sandwich on the left side of the frame and a book on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A sandwich remains on the left side of the frame and a book remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a sandwich and a book"}
+{"index": 139, "data": "A close shot shows an orange on the left side of the frame and a clock on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. An orange remains on the left side of the frame and a clock remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "an orange and a clock"}
+{"index": 140, "data": "A close shot shows broccoli on the left side of the frame and a backpack on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. Broccoli remains on the left side of the frame and a backpack remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "broccoli and a backpack"}
+{"index": 141, "data": "A close shot shows a carrot on the left side of the frame and an umbrella on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A carrot remains on the left side of the frame and an umbrella remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a carrot and an umbrella"}
+{"index": 142, "data": "A close shot shows a hot dog on the left side of the frame and a handbag on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A hot dog remains on the left side of the frame and a handbag remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a hot dog and a handbag"}
+{"index": 143, "data": "A close shot shows a pizza on the left side of the frame and a tie on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A pizza remains on the left side of the frame and a tie remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a pizza and a tie"}
+{"index": 144, "data": "A close shot shows a donut on the left side of the frame and a suitcase on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A donut remains on the left side of the frame and a suitcase remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a donut and a suitcase"}
+{"index": 145, "data": "A close shot shows a cake on the left side of the frame and a vase on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. They are placed on a clean flat surface with a softly blurred background and natural light. fixed shot. A cake remains on the left side of the frame and a vase remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a cake and a vase"}
+{"index": 146, "data": "A medium shot shows an oven on the left side of the frame and scissors on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. An oven remains on the left side of the frame and scissors remain on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "an oven and scissors"}
+{"index": 147, "data": "A medium shot shows a toaster on the left side of the frame and a teddy bear on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. A toaster remains on the left side of the frame and a teddy bear remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a toaster and a teddy bear"}
+{"index": 148, "data": "A medium-wide shot shows a microwave on the left side of the frame and a frisbee on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A microwave remains on the left side of the frame and a frisbee remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a microwave and a frisbee"}
+{"index": 149, "data": "A medium-wide shot shows a refrigerator on the left side of the frame and skis on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in an open outdoor area under clear daylight, with a simple background. fixed shot. A refrigerator remains on the left side of the frame and skis remain on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a refrigerator and skis"}
+{"index": 150, "data": "A wide shot shows a bicycle on the left side of the frame and an airplane on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A bicycle remains on the left side of the frame and an airplane remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a bicycle and an airplane"}
+{"index": 151, "data": "A wide shot shows a car on the left side of the frame and a train on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A car remains on the left side of the frame and a train remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a car and a train"}
+{"index": 152, "data": "A wide shot shows a motorcycle on the left side of the frame and a boat on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a spacious outdoor area with a clean open background and clear daylight. fixed shot. A motorcycle remains on the left side of the frame and a boat remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a motorcycle and a boat"}
+{"index": 153, "data": "A medium shot shows a person on the left side of the frame and a toilet on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A person remains on the left side of the frame and a toilet remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a person and a toilet"}
+{"index": 154, "data": "A medium shot shows a person on the left side of the frame and a hair drier on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A person remains on the left side of the frame and a hair drier remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a person and a hair drier"}
+{"index": 155, "data": "A medium shot shows a person on the left side of the frame and a toothbrush on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A person remains on the left side of the frame and a toothbrush remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a person and a toothbrush"}
+{"index": 156, "data": "A medium shot shows a person on the left side of the frame and a sink on the right side of the frame, with a clear gap between them. Both subjects are fully visible and do not overlap or occlude each other. The scene is set in a bright clean bathroom with a simple uncluttered background. fixed shot. A person remains on the left side of the frame and a sink remains on the right side of the frame. Both stay clearly separated, complete, and unobstructed throughout the scene.", "original_prompt_en": "a person and a sink"}
+{"index": 157, "data": "A wide shot shows one adult person and one bicycle in an open outdoor area. The full person and the full bicycle are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is riding the bicycle forward, holding the handlebars with both hands and pedaling with both feet.", "original_prompt_en": "A person is riding a bike"}
+{"index": 158, "data": "A wide shot shows one adult person fully visible in an open area with a simple background. The body is unobstructed and shown in high clarity. fixed shot. The person is marching forward, lifting the knees high and swinging both arms in a steady rhythm.", "original_prompt_en": "A person is marching"}
+{"index": 159, "data": "A wide shot shows one adult person wearing one pair of roller skates on a smooth open surface. The full body and both skates are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is roller skating forward with alternating leg movement.", "original_prompt_en": "A person is roller skating"}
+{"index": 160, "data": "A medium shot shows one adult person holding one glass of beer in a simple indoor setting. The person and the glass are fully visible, unobstructed, and shown in high clarity. fixed shot. The person raises the glass to the mouth, tastes the beer, and lowers the glass.", "original_prompt_en": "A person is tasting beer"}
+{"index": 161, "data": "A medium shot shows one adult person against a simple indoor background. Both hands are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is clapping both hands together repeatedly in front of the chest.", "original_prompt_en": "A person is clapping"}
+{"index": 162, "data": "A medium shot shows one adult person at a desk with one sheet of paper and one pencil. The person, the paper, and the pencil are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is drawing on the paper with the pencil.", "original_prompt_en": "A person is drawing"}
+{"index": 163, "data": "A medium shot shows one adult person beside one dog in a simple outdoor setting. The full person and the full dog are clearly separated, fully visible, unobstructed, and shown in high clarity. fixed shot. The person is petting the dog on its back with one hand.", "original_prompt_en": "A person is petting animal (not cat)"}
+{"index": 164, "data": "A medium shot shows one adult person holding one slice of watermelon. The person and the watermelon slice are fully visible, unobstructed, and shown in high clarity. fixed shot. The person lifts the watermelon slice to the mouth and eats it with visible bites.", "original_prompt_en": "A person is eating watermelon"}
+{"index": 165, "data": "A wide shot shows one adult person seated with one harp in a simple music room. The full person and the full harp are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is playing the harp by plucking the strings with both hands.", "original_prompt_en": "A person is playing harp"}
+{"index": 166, "data": "A wide shot shows two adult people on one wrestling mat. Both people are fully visible, unobstructed, and shown in high clarity. fixed shot. The two people are wrestling, gripping and pushing each other in close contact.", "original_prompt_en": "A person is wrestling"}
+{"index": 167, "data": "A wide shot shows one adult person riding one kick scooter in an open outdoor area. The full person and the full scooter are completely visible, unobstructed, and shown in high clarity. fixed shot. The person holds the handlebar with both hands and rides the scooter forward while pushing with one foot.", "original_prompt_en": "A person is riding scooter"}
+{"index": 168, "data": "A wide shot shows one adult person standing on an indoor floor with one broom. The person and the broom are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is sweeping the floor with broad side to side strokes.", "original_prompt_en": "A person is sweeping floor"}
+{"index": 169, "data": "A wide shot shows one adult person on one skateboard in an open outdoor area. The full person and the full skateboard are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is skateboarding forward with balanced body movement.", "original_prompt_en": "A person is skateboarding"}
+{"index": 170, "data": "A wide shot shows one adult person, one basketball hoop, and one basketball on an indoor court. The person, the hoop, and the ball are fully visible, unobstructed, and shown in high clarity. fixed shot. The person jumps upward and dunks the basketball through the hoop with one hand.", "original_prompt_en": "A person is dunking basketball"}
+{"index": 171, "data": "A medium shot shows one adult person holding one flute in a simple room. The person and the full flute are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is playing the flute with both hands while blowing into it.", "original_prompt_en": "A person is playing flute"}
+{"index": 172, "data": "A wide shot shows one adult person on a simple exercise surface. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person is stretching one leg with a visible extended leg pose.", "original_prompt_en": "A person is stretching leg"}
+{"index": 173, "data": "A medium shot shows one adult person wearing one shirt and one tie. The person and the tie are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is tying the tie at the collar with both hands.", "original_prompt_en": "A person is tying tie"}
+{"index": 174, "data": "A wide shot shows one adult person in one skydiving suit during freefall. The full body is completely visible, unobstructed, and shown in high clarity against a simple sky background. fixed shot. The person is skydiving with arms and legs spread in the air.", "original_prompt_en": "A person is skydiving"}
+{"index": 175, "data": "A wide shot shows one adult person, one soccer ball, and one goal on a soccer field. The person, the ball, and the goal are fully visible, unobstructed, and shown in high clarity, with the ball positioned between the person and the goal. fixed shot. The person runs forward and shoots the soccer ball toward the goal.", "original_prompt_en": "A person is shooting goal (soccer)"}
+{"index": 176, "data": "A wide shot shows one adult person seated at one piano in a simple room. The full person and the full piano are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is playing the piano with both hands on the keys.", "original_prompt_en": "A person is playing piano"}
+{"index": 177, "data": "A medium shot shows one adult person against a simple background. One hand is clearly visible near the body, unobstructed, and shown in high clarity. fixed shot. The person is snapping the fingers with a visible finger snapping motion.", "original_prompt_en": "A person is finger snapping"}
+{"index": 178, "data": "A wide shot shows one adult person sitting in one kayak on calm water with one paddle. The full person, the full kayak, and the paddle are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is kayaking by dipping the paddle into the water on alternating sides.", "original_prompt_en": "A person is canoeing or kayaking"}
+{"index": 179, "data": "A medium shot shows one adult person against a simple background. The face is fully visible, unobstructed, and shown in high clarity. fixed shot. The person is laughing, with the mouth open and the body showing natural laughing movement.", "original_prompt_en": "A person is laughing"}
+{"index": 180, "data": "A wide shot shows one adult person standing on soil with one shovel. The person and the shovel are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is digging the ground with the shovel.", "original_prompt_en": "A person is digging"}
+{"index": 181, "data": "A medium shot shows one adult person seated at one pottery wheel with one piece of clay. The person, the wheel, and the clay are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is making clay pottery with both hands shaping the clay on the spinning wheel.", "original_prompt_en": "A person is clay pottery making"}
+{"index": 182, "data": "A wide shot shows one adult person, one basketball, and one basketball hoop on a court. The person, the ball, and the hoop are fully visible, unobstructed, and shown in high clarity. fixed shot. The person raises the basketball and shoots it toward the hoop.", "original_prompt_en": "A person is shooting basketball"}
+{"index": 183, "data": "A wide shot shows one adult person on a simple exercise surface. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person bends the upper body backward in a clear back bending pose.", "original_prompt_en": "A person is bending back"}
+{"index": 184, "data": "A medium shot shows two adult people facing each other. Both people and both right hands are fully visible, unobstructed, and shown in high clarity. fixed shot. The two people are shaking hands.", "original_prompt_en": "A person is shaking hands"}
+{"index": 185, "data": "A medium shot shows one adult person with one bandage roll near one forearm. The person, the bandage roll, and the forearm are fully visible, unobstructed, and shown in high clarity. fixed shot. The person wraps the bandage around the forearm with both hands.", "original_prompt_en": "A person is bandaging"}
+{"index": 186, "data": "A wide shot shows one adult person on the floor in a simple exercise area. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person is doing push ups, lowering the chest toward the floor and pushing back up.", "original_prompt_en": "A person is push up"}
+{"index": 187, "data": "A wide shot shows one adult person and one frisbee in an open outdoor area. The full person and the frisbee are fully visible, unobstructed, and shown in high clarity. fixed shot. The person throws the frisbee forward with one arm.", "original_prompt_en": "A person is catching or throwing frisbee"}
+{"index": 188, "data": "A medium shot shows one adult person holding one trumpet in a simple room. The person and the full trumpet are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is playing the trumpet with both hands while blowing into it.", "original_prompt_en": "A person is playing trumpet"}
+{"index": 189, "data": "A wide shot shows one adult person holding one kite line, with one kite flying above in open sky. The person, the line, and the kite are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is flying the kite and controlling it with both hands.", "original_prompt_en": "A person is flying kite"}
+{"index": 190, "data": "A medium close shot shows one adult person in front of a simple mirror area with one eyebrow pencil. The face, both eyebrows, and the pencil are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is filling in both eyebrows with the pencil.", "original_prompt_en": "A person is filling eyebrows"}
+{"index": 191, "data": "A medium shot shows one adult person holding one deck of cards. The hands and the deck are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is shuffling the cards with both hands.", "original_prompt_en": "A person is shuffling cards"}
+{"index": 192, "data": "A medium shot shows one adult person standing at a table with two pieces of clothing. The person and both pieces of clothing are fully visible, unobstructed, and shown in high clarity. fixed shot. The person folds the clothes neatly with both hands.", "original_prompt_en": "A person is folding clothes"}
+{"index": 193, "data": "A medium shot shows one adult person holding one cigarette. The person and the cigarette are fully visible, unobstructed, and shown in high clarity. fixed shot. The person brings the cigarette to the mouth and smokes.", "original_prompt_en": "A person is smoking"}
+{"index": 194, "data": "A wide shot shows one adult person standing on a simple exercise surface. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person is performing tai chi with slow, controlled arm and leg movements.", "original_prompt_en": "A person is tai chi"}
+{"index": 195, "data": "A wide shot shows one adult person standing on a simple exercise surface. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person performs a squat by lowering the body and rising back up.", "original_prompt_en": "A person is squat"}
+{"index": 196, "data": "A medium shot shows one adult person holding one game controller. The person and the controller are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is playing with the controller, pressing buttons with both thumbs.", "original_prompt_en": "A person is playing controller"}
+{"index": 197, "data": "A wide shot shows one adult person holding one axe in front of one target in an open area. The person, the axe, and the target are fully visible, unobstructed, and shown in high clarity, with the target positioned in front of the person. fixed shot. The person throws the axe toward the target with one arm.", "original_prompt_en": "A person is throwing axe"}
+{"index": 198, "data": "A medium shot shows two adult people facing each other with one award between them. Both people and the award are fully visible, unobstructed, and shown in high clarity. fixed shot. One person hands the award to the other person, and both hold it briefly.", "original_prompt_en": "A person is giving or receiving award"}
+{"index": 199, "data": "A medium shot shows one adult person against a simple background with empty hands fully visible. The upper body and both arms are unobstructed and shown in high clarity. fixed shot. The person is air drumming with both hands as if striking invisible drums.", "original_prompt_en": "A person is air drumming"}
+{"index": 200, "data": "A medium shot shows one adult person standing under one shower head in a simple bathroom. The person and the shower head are fully visible, unobstructed, and shown in high clarity. fixed shot. Water falls from the shower head while the person is taking a shower and washing the body.", "original_prompt_en": "A person is taking a shower"}
+{"index": 201, "data": "A wide shot shows one adult person planting two young trees in soil with one shovel. The person, both trees, and the shovel are fully visible, unobstructed, and shown in high clarity, with the two trees clearly separated. fixed shot. The person places the two young trees into the ground and covers their bases with soil using the shovel.", "original_prompt_en": "A person is planting trees"}
+{"index": 202, "data": "A medium shot shows one adult person holding two knives and one sharpening tool. The person, both knives, and the sharpening tool are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is sharpening the knives by moving each blade along the sharpening tool.", "original_prompt_en": "A person is sharpening knives"}
+{"index": 203, "data": "A wide shot shows one adult person standing in an open area with a simple background. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person is robot dancing with stiff, angular arm and leg movements.", "original_prompt_en": "A person is robot dancing"}
+{"index": 204, "data": "A wide shot shows one adult person climbing on one rock wall. The full person and the rock wall are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is rock climbing upward using both hands and both feet.", "original_prompt_en": "A person is rock climbing"}
+{"index": 205, "data": "A wide shot shows one adult person with one hula hoop around the waist in an open area. The full person and the full hula hoop are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is hula hooping by rotating the hips to keep the hoop moving.", "original_prompt_en": "A person is hula hooping"}
+{"index": 206, "data": "A medium shot shows one adult person at a desk with one sheet of paper and one pen. The person, the paper, and the pen are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is writing on the paper with the pen.", "original_prompt_en": "A person is writing"}
+{"index": 207, "data": "A wide shot shows one adult person attached to one bungee cord in an outdoor jumping area. The full person and the bungee cord are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is bungee jumping downward while attached to the cord.", "original_prompt_en": "A person is bungee jumping"}
+{"index": 208, "data": "A wide shot shows one adult person pushing one cart in an open area. The full person and the full cart are completely visible, unobstructed, and shown in high clarity. fixed shot. The person pushes the cart forward with both hands on the handle.", "original_prompt_en": "A person is pushing cart"}
+{"index": 209, "data": "A medium shot shows one adult person beside two window panes holding one cleaning cloth. The person, both window panes, and the cloth are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is cleaning the windows with repeated wiping motions.", "original_prompt_en": "A person is cleaning windows"}
+{"index": 210, "data": "A medium shot shows one adult person, one half watermelon on a table, and one knife. The person, the watermelon, and the knife are fully visible, unobstructed, and shown in high clarity. fixed shot. The person cuts the watermelon with the knife.", "original_prompt_en": "A person is cutting watermelon"}
+{"index": 211, "data": "A wide shot shows one adult person standing in an open area holding two pom poms. The full person and both pom poms are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is cheerleading, lifting and shaking both pom poms with energetic arm movements.", "original_prompt_en": "A person is cheerleading"}
+{"index": 212, "data": "A medium shot shows one adult person at one sink with both hands under running water. The person, both hands, and the sink area are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is washing the hands by rubbing them together under the water.", "original_prompt_en": "A person is washing hands"}
+{"index": 213, "data": "A medium shot shows one adult person standing at an ironing board with one item of clothing and one iron. The person, the clothing, and the iron are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is ironing the clothing by moving the iron across the fabric.", "original_prompt_en": "A person is ironing"}
+{"index": 214, "data": "A close shot shows one adult person holding one hand with the other hand and using one nail clipper. The fingers, several nails, and the nail clipper are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is cutting the nails with the nail clipper.", "original_prompt_en": "A person is cutting nails"}
+{"index": 215, "data": "A medium shot shows two adult people standing close to each other. Both people are fully visible, unobstructed, and shown in high clarity. fixed shot. The two people are hugging each other with both arms.", "original_prompt_en": "A person is hugging"}
+{"index": 216, "data": "A medium shot shows one adult person in front of a mirror holding one razor near the beard area. The face, the beard area, and the razor are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is shaving the beard with the razor.", "original_prompt_en": "A person is trimming or shaving beard"}
+{"index": 217, "data": "A wide shot shows one adult person moving through an open outdoor area. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person is jogging forward at a steady pace.", "original_prompt_en": "A person is jogging"}
+{"index": 218, "data": "A wide shot shows one adult person beside one bed with bedding fully visible. The person and the full bed are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is making the bed by pulling and smoothing the blanket over the mattress.", "original_prompt_en": "A person is making bed"}
+{"index": 219, "data": "A medium shot shows one adult person at one sink with two dishes and one sponge. The person, both dishes, and the sponge are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is washing the dishes by scrubbing them at the sink.", "original_prompt_en": "A person is washing dishes"}
+{"index": 220, "data": "A medium shot shows one adult person beside one dog holding one grooming brush. The person, the dog, and the brush are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is grooming the dog by brushing its fur.", "original_prompt_en": "A person is grooming dog"}
+{"index": 221, "data": "A medium shot shows one adult person at one washing machine holding two pieces of clothing. The person, the washing machine, and both pieces of clothing are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is doing laundry by placing the clothes into the washing machine.", "original_prompt_en": "A person is doing laundry"}
+{"index": 222, "data": "A medium shot shows one adult person holding two knitting needles and one piece of yarn work. The hands, both needles, and the yarn work are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is knitting with repeated hand movements.", "original_prompt_en": "A person is knitting"}
+{"index": 223, "data": "A medium shot shows one adult person holding one open book. The person and the full book are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is reading the book and looking down at the pages.", "original_prompt_en": "A person is reading book"}
+{"index": 224, "data": "A medium shot shows one baby lying on one bed. The baby and the bed are fully visible, unobstructed, and shown in high clarity. fixed shot. The baby wakes up, opens the eyes, and moves the arms and legs.", "original_prompt_en": "A person is baby waking up"}
+{"index": 225, "data": "A medium shot shows one adult person seated with both legs exposed and reachable by both hands. The person and both legs are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is massaging the legs with both hands.", "original_prompt_en": "A person is massaging legs"}
+{"index": 226, "data": "A medium shot shows one adult person holding one toothbrush at one sink. The person, the toothbrush, and the mouth area are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is brushing the teeth with the toothbrush.", "original_prompt_en": "A person is brushing teeth"}
+{"index": 227, "data": "A wide shot shows one baby on a flat indoor floor. The full baby is completely visible, unobstructed, and shown in high clarity. fixed shot. The baby is crawling forward on hands and knees.", "original_prompt_en": "A person is crawling baby"}
+{"index": 228, "data": "A wide shot shows one adult person riding one motorcycle on a simple road. The full person and the full motorcycle are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is motorcycling forward with both hands on the handlebars.", "original_prompt_en": "A person is motorcycling"}
+{"index": 229, "data": "A medium wide shot shows one adult person seated in the driver's seat of one car, with both hands on the steering wheel clearly visible. The person and the car interior needed for the action are unobstructed and shown in high clarity. fixed shot. The person is driving the car and turning the steering wheel slightly while looking forward.", "original_prompt_en": "A person is driving car"}
+{"index": 230, "data": "A close shot shows one adult person facing the camera. The face is fully visible, unobstructed, and shown in high clarity. fixed shot. The person sticks the tongue out clearly.", "original_prompt_en": "A person is sticking tongue out"}
+{"index": 231, "data": "A medium shot shows one adult person facing the camera against a simple background. The head and shoulders are fully visible, unobstructed, and shown in high clarity. fixed shot. The person shakes the head from side to side.", "original_prompt_en": "A person is shaking head"}
+{"index": 232, "data": "A wide shot shows two adult people, each holding one sword, facing each other in an open area. Both people and both swords are fully visible, unobstructed, and shown in high clarity. fixed shot. The two people are sword fighting with visible swinging and blocking motions.", "original_prompt_en": "A person is sword fighting"}
+{"index": 233, "data": "A wide shot shows one adult person on a simple exercise surface. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person is doing aerobics with energetic repeated arm and leg movements.", "original_prompt_en": "A person is doing aerobics"}
+{"index": 234, "data": "A medium shot shows one adult person holding one guitar. The person and the full guitar are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is strumming the guitar strings with one hand while the other hand holds the neck.", "original_prompt_en": "A person is strumming guitar"}
+{"index": 235, "data": "A wide shot shows one adult person beside one horse in an open outdoor area. The full person and the full horse are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is walking alongside the horse while holding it close.", "original_prompt_en": "A person is riding or walking with horse"}
+{"index": 236, "data": "A wide shot shows one adult person holding one bow and one arrow in front of one target. The person, the bow, the arrow, and the target are fully visible, unobstructed, and shown in high clarity, with the target positioned in front of the person. fixed shot. The person draws the bow and aims the arrow toward the target.", "original_prompt_en": "A person is archery"}
+{"index": 237, "data": "A wide shot shows one adult person and one baseball in an open field. The full person and the baseball are fully visible, unobstructed, and shown in high clarity. fixed shot. The person throws the baseball forward with one arm.", "original_prompt_en": "A person is catching or throwing baseball"}
+{"index": 238, "data": "A medium shot shows two adult people seated across one chessboard on one table. Both people and the chessboard are fully visible, unobstructed, and shown in high clarity. fixed shot. One person moves one chess piece on the board while both look at the game.", "original_prompt_en": "A person is playing chess"}
+{"index": 239, "data": "A medium shot shows two adult people facing each other with one hand from each person raised between them. Both people and both hands are fully visible, unobstructed, and shown in high clarity. fixed shot. The two people are playing rock scissors paper with a clear final hand sign.", "original_prompt_en": "A person is rock scissors paper"}
+{"index": 240, "data": "A medium shot shows one adult person seated at one computer with one keyboard. The person, the computer, and the keyboard are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is using the computer by looking at the screen and typing on the keyboard.", "original_prompt_en": "A person is using computer"}
+{"index": 241, "data": "A medium shot shows one adult person arranging five flowers in one vase on a table. The person, all five flowers, and the vase are fully visible, unobstructed, and shown in high clarity. fixed shot. The person adjusts the flowers with both hands to arrange them in the vase.", "original_prompt_en": "A person is arranging flowers"}
+{"index": 242, "data": "A medium shot shows one adult person holding one metal bar with both hands. The person and the metal bar are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is bending the metal bar with visible force from both hands.", "original_prompt_en": "A person is bending metal"}
+{"index": 243, "data": "A wide shot shows one adult person ice skating on an ice surface. The full person and both ice skates are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is ice skating forward smoothly with gliding steps.", "original_prompt_en": "A person is ice skating"}
+{"index": 244, "data": "A wide shot shows one adult person climbing one rope in an open training area. The full person and the full rope are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is climbing the rope upward using both hands and both feet.", "original_prompt_en": "A person is climbing a rope"}
+{"index": 245, "data": "A medium shot shows one adult person against a simple background. The face is fully visible, unobstructed, and shown in high clarity. fixed shot. The person is crying, with visible tears or a crying facial expression and body movement.", "original_prompt_en": "A person is crying"}
+{"index": 246, "data": "A wide shot shows one adult person on a simple dance floor. The full body is completely visible, unobstructed, and shown in high clarity. fixed shot. The person is dancing ballet with controlled arm positions and pointed footwork.", "original_prompt_en": "A person is dancing ballet"}
+{"index": 247, "data": "A medium shot shows two adult people, one seated and one standing as a barber, with one pair of scissors near the seated person's hair. Both people and the hair cutting action are fully visible, unobstructed, and shown in high clarity. fixed shot. The standing person is cutting the seated person's hair.", "original_prompt_en": "A person is getting a haircut"}
+{"index": 248, "data": "A wide shot shows one adult person on one treadmill in a gym setting with only the treadmill visible as the needed object. The full person and the full treadmill are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is running on the treadmill.", "original_prompt_en": "A person is running on treadmill"}
+{"index": 249, "data": "A medium shot shows two adult people facing each other closely. Both faces and upper bodies are fully visible, unobstructed, and shown in high clarity. fixed shot. The two people lean in and kiss.", "original_prompt_en": "A person is kissing"}
+{"index": 250, "data": "A medium shot shows one adult person holding a stack of money with both hands. The hands and the money are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is counting the money by separating the bills one by one.", "original_prompt_en": "A person is counting money"}
+{"index": 251, "data": "A medium shot shows one adult person standing at one barbecue grill holding one food item with one tool. The person, the grill, the food item, and the tool are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is barbequing by turning the food on the grill.", "original_prompt_en": "A person is barbequing"}
+{"index": 252, "data": "A medium shot shows one adult person holding one apple and one peeler over a table. The person, the apple, and the peeler are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is peeling the apple with the peeler.", "original_prompt_en": "A person is peeling apples"}
+{"index": 253, "data": "A medium shot shows one adult person beside one cow holding the udder area with both hands. The person and the cow are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is milking the cow with repeated hand squeezing motions.", "original_prompt_en": "A person is milking cow"}
+{"index": 254, "data": "A medium shot shows one adult person holding two shoes and one polishing brush. The person, both shoes, and the brush are fully visible, unobstructed, and shown in high clarity. fixed shot. The person is shining the shoes by brushing or polishing their surfaces.", "original_prompt_en": "A person is shining shoes"}
+{"index": 255, "data": "A wide shot shows one adult person beside one snowman in a snowy outdoor area. The full person and the full snowman are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is making the snowman by shaping and placing snow onto it with both hands.", "original_prompt_en": "A person is making snowman"}
+{"index": 256, "data": "A wide shot shows one adult person on one sailboat with one sail clearly visible on open water. The full person, the sailboat, and the sail are completely visible, unobstructed, and shown in high clarity. fixed shot. The person is sailing the boat forward.", "original_prompt_en": "A person is sailing"}
+{"index": 257, "data": "A medium shot captures a person swimming in the ocean. The individual, with light skin and short dark hair, wears a blue swimsuit. They perform a freestyle stroke, arms slicing rhythmically through the turquoise water, while legs kick beneath the surface, creating small splashes. The ocean is calm with gentle waves, and the sky above is clear blue, dotted with fluffy white clouds. In the background, the horizon blends with the sky, and a few seagulls soar. The camera stays fixed, capturing the swimmer’s smooth, steady movements as they glide through the water.", "original_prompt_en": "a person swimming in ocean"}
+{"index": 258, "data": "A medium shot captures a professionally dressed individual (with neatly styled dark hair, wearing a charcoal - gray suit and a light - blue dress shirt) standing at the front of a conference room, giving a presentation to a room full of colleagues. The presenter holds a laser pointer in their right hand, gesturing towards a projection screen that displays slides with data charts and bullet - pointed text. The conference room has cream - colored walls, a long wooden table surrounded by ergonomic office chairs. Colleagues (dressed in business casual or formal attire) are seated around the table—some are jotting down notes in notebooks, while others are focusing intently on the speaker or the screen. In the background, there are shelves with binders and a whiteboard with faded markings. The presenter occasionally shifts their weight and uses animated hand movements to emphasize key points, while the colleagues show engaged postures: a few nod in agreement, and one colleague with glasses rests their chin on their hand, deep in thought.", "original_prompt_en": "a person giving a presentation to a room full of colleagues"}
+{"index": 259, "data": "A medium shot captures a person washing dishes at a kitchen sink. The individual, with short brown hair and wearing a blue apron over a white shirt, is holding a white ceramic plate coated in soapy bubbles, rinsing it under the steady stream of water from a silver faucet. The kitchen background showcases white - tiled walls, a wooden cabinet with silver handles, and a drying rack filled with other dishes—a clear glass, a blue - rimmed bowl, and a stainless - steel pot—beside the sink. The person’s hands move gently yet efficiently: first scrubbing the plate with a yellow sponge to eliminate food remnants, then rinsing it thoroughly, and carefully placing the clean dish into the rack. The warm - toned lighting casts soft shadows on the countertop, which has a few scattered utensils (a fork and a spoon) and a red - and - white checkered towel draped nearby.", "original_prompt_en": "a person washing the dishes"}
+{"index": 260, "data": "Medium shot captures a young Asian person with short black hair, dressed in a blue T - shirt and black jeans, sitting at a wooden table in a bustling fast - food restaurant. The person holds a sizable beef burger—adorned with crisp lettuce, melted cheese, and a succulent patty—in their right hand, lifting it toward their mouth to take a hearty bite. In front of them, a red - and - white striped paper cup with a straw (likely containing a fizzy beverage) rests on the table, alongside a few crumpled napkins. The background is lined with vibrant burger - themed posters on the wall and other diners engrossed in their meals. The individual chews slowly, a faint smudge of sauce visible at the corner of their mouth, then reaches for a fresh napkin from the table.", "original_prompt_en": "a person eating a burger"}
+{"index": 261, "data": "Long shot captures a person walking in the fierce snowstorm. The sky is overcast, and thick snowflakes swirl down, blanketing the ground in a layer of white. The person, bundled in a heavy dark coat with a fur - lined hood, blue gloves, and thick boots, hunches slightly forward, arms bent at the elbows as if bracing against the wind. Snowflakes cling to their hair and shoulders, and the background is a blurred, snow - covered landscape with faint outlines of trees barely visible through the storm. The wind howls, and the person takes slow, deliberate steps, head bowed to shield their face from the driving snow. The camera remains fixed, emphasizing the solitary figure against the overwhelming, swirling snowstorm.", "original_prompt_en": "a person walking in the snowstorm"}
+{"index": 262, "data": "A medium shot captures a young woman in a cozy café. She has shoulder - length brown hair and is dressed in a light gray blouse and a black skirt. She holds a white ceramic coffee cup with her right hand, bringing it to her lips to take a sip, and a thin layer of steam is rising from the cup. The café is furnished with wooden tables and soft - cushioned chairs, and warm - toned pendant lights hang from the ceiling. In the background, there is a wall decorated with vintage - style paintings, and a large window reveals a rainy street outside, with raindrops pattering on the glass. Other customers are quietly enjoying their drinks, and the gentle clink of cups and soft chatter fill the space.", "original_prompt_en": "a person drinking coffee in a cafe"}
+{"index": 263, "data": "Medium shot captures a person with shoulder - length blonde hair, dressed in a white linen shirt and beige trousers, playing a classical guitar with a cedar top and rosewood fretboard. The background is a sun - lit garden, with blooming flowers and a wooden bench. The person plucks the guitar strings gently with their right hand, fingers of the left hand forming delicate chords on the fretboard. The camera stays fixed, capturing the relaxed posture and the way the guitar’s body casts a shadow on the grass.", "original_prompt_en": "a person playing guitar"}
+{"index": 264, "data": "Fixed shot of a bicycle leaning against a tree. The bicycle, with a metallic silver frame and black wheels, rests against a tall, leafy green tree with a brown, textured trunk. The ground beneath is a patch of lush green grass, and the background reveals a serene, sunlit park with scattered colorful flowers and a clear blue sky dotted with soft white clouds. The bicycle remains still, its kickstand touching the grass, while gentle shadows from the tree’s branches dapple its frame, and a few fallen leaves lie nearby.", "original_prompt_en": "a bicycle leaning against a tree"}
+{"index": 265, "data": "Panoramic shot of a black bicycle gliding smoothly across a vast snowy field. The bicycle, with its metallic frame, moves over a blanket of pristine white snow that stretches endlessly, leaving subtle tracks behind. The background reveals snow - capped trees standing silently under an overcast sky, their bare branches laden with snow. The camera follows the bicycle, capturing its steady glide as it traverses the tranquil, snow - covered landscape, with the frosty air adding to the serene atmosphere.", "original_prompt_en": "a bicycle gliding through a snowy field"}
+{"index": 266, "data": "A medium shot captures a silver bicycle with a metallic frame and black tires on a city street. The sky is overcast, casting a dim light over the scene. The bicycle is slowing down to stop: its wheels rotate at a decreasing speed, the front brake engages slightly, and the bicycle’s body tilts a bit as it decelerates. The background features a sidewalk with scattered fallen leaves, a row of brown brick buildings with white window frames, and a few pedestrians in the distance. The camera remains fixed, focusing on the bicycle’s gradual deceleration—first, the wheels’ motion becomes sluggish, then the bicycle comes to a complete halt, its wheels motionless and the bicycle upright on the asphalt road.", "original_prompt_en": "a bicycle slowing down to stop"}
+{"index": 267, "data": "A medium shot captures a sleek black bicycle with silver spokes on a sunlit asphalt road. The bicycle is accelerating to gain speed, its wheels spinning rapidly, the rubber tires gripping the smooth asphalt as it surges forward. The background reveals a clear blue sky with fluffy white clouds, and on either side of the road, green grass and small wildflowers dot the landscape. The bicycle’s frame leans slightly forward, showcasing the dynamic motion of acceleration, with the chain moving swiftly and the wheels creating a faint blur due to the rapid movement.", "original_prompt_en": "a bicycle accelerating to gain speed"}
+{"index": 268, "data": "A medium shot captures a gray sedan stuck in the congested traffic during rush hour. The street is packed with various vehicles—buses, cars, and motorcycles—either moving sluggishly or at a complete standstill. In the background, tall office buildings with glass facades line the street, and a red traffic light hangs above the intersection, signaling a halt. Pedestrians in business attire hurry along the sidewalks, some checking their watches. The sky is overcast, amplifying the sense of urgency in the bustling urban scene. The car remains stationary as adjacent vehicles slowly edge forward, highlighting the heavy traffic typical of rush hour.", "original_prompt_en": "a car stuck in traffic during rush hour"}
+{"index": 269, "data": "Medium shot of a silver sedan on a city street. The car, with a glossy exterior and black tires, is slowly turning a corner—its front wheels angled toward the left of the frame, producing a faint screech against the asphalt. The background includes brick buildings with storefronts, a few pedestrians on the sidewalk, and a street lamp under a green traffic light. The camera follows the car’s movement, panning to capture it as it completes the turn and enters a side alley, where a red bicycle rests by the curb and a small tree with green leaves sways gently in the breeze.", "original_prompt_en": "a car turning a corner"}
+{"index": 270, "data": "A medium shot captures a black sedan on an urban street, its brake lights illuminating as it gradually slows down. The sedan, with a sleek body and tinted windows, is surrounded by a cityscape: on the left, a sidewalk with pedestrians in casual wear, some glancing at the car; in the background, skyscrapers with reflective facades under a partly cloudy sky. The car’s speed decreases steadily—wheels rotating slower, engine noise fading—until it stops at a crosswalk, the vehicle slightly rocking from deceleration. A cyclist in a yellow helmet passes by, and a white SUV behind also slows to a halt.", "original_prompt_en": "a car slowing down to stop"}
+{"index": 271, "data": "A medium shot captures a black sedan on a paved road. The car, with shiny chrome accents and black tires, is accelerating to gain speed—its body leans slightly forward as the engine revs, and the wheels spin with growing momentum, pushing it forward at an increasing pace. The background features a suburban street with neatly trimmed lawns on either side, a few residential houses with sloped roofs, and a clear blue sky overhead. The camera stays steady, highlighting the sedan’s dynamic motion as it surges ahead, leaving a faint blur of its rear lights as it picks up speed, while a bicycle and a parked car are visible in the foreground, emphasizing the contrast between the sedan’s rapid movement and the stillness of its surroundings.", "original_prompt_en": "a car accelerating to gain speed"}
+{"index": 272, "data": "The sky is clear and blue. A long shot captures a black motorcycle with a glossy finish cruising smoothly along a coastal highway. The highway is flanked by a sun - drenched sandy beach on the left, where gentle ocean waves lap against the shore, creating delicate white foam. On the right, lush green coastal plants and scattered palm trees line the road, their leaves swaying in the light sea breeze. The motorcycle maintains a steady pace, its tires humming on the asphalt as it follows the curving coastline. In the background, the vast, shimmering blue ocean stretches to the horizon, with a few faint white sails visible in the distance. The camera remains fixed, capturing the motorcycle’s effortless journey along the scenic road, with the sun casting bright reflections off the motorcycle’s shiny bodywork.", "original_prompt_en": "a motorcycle cruising along a coastal highway"}
+{"index": 273, "data": "A medium shot captures a sleek black motorcycle with silver accents, ridden by a person in a black helmet and blue riding jacket, as it navigates a sharp corner. The road is paved with smooth asphalt, bordered on the left by gray brick buildings with large glass windows and on the right by a small park with green trees and a wooden bench. The sky is clear with scattered white clouds. The motorcycle leans into the curve, its front wheel angled into the bend, while the rider adjusts their posture, leaning slightly to maintain balance. The fixed camera captures the fluid motion of the motorcycle as it transitions from a straight path to the curved road, with the background blending urban architecture and greenery.", "original_prompt_en": "a motorcycle turning a corner"}
+{"index": 274, "data": "Long shot of a black motorcycle with a glossy body and silver handlebars on an asphalt road. The motorcycle, its engine’s hum softening as it decelerates, slows down gradually—its wheels rotating more sluggishly, the speedometer’s needle dropping—until it comes to a complete stop. The background features a suburban street lined with green trees and brick houses, the sky overcast. The camera remains fixed, capturing the motorcycle’s smooth transition from motion to stillness, its kickstand yet to be deployed as it halts.", "original_prompt_en": "a motorcycle slowing down to stop"}
+{"index": 275, "data": "Panoramic shot of a black motorcycle gliding smoothly across a vast, snow - covered field. The motorcycle, with its sleek metal frame and black tires lightly speckled with snow, moves steadily over the pristine white snow, leaving a faint, narrow trail in its wake. The background features a serene winter landscape with leafless trees, their branches heavy with snow, stretching into the distance under an overcast, pale gray sky. The camera follows the motorcycle from a side perspective, capturing its graceful glide as it cuts through the silent, snowy expanse, with delicate snowflakes drifting gently around.", "original_prompt_en": "a motorcycle gliding through a snowy field"}
+{"index": 276, "data": "A long shot captures a motorcycle with a glossy, dark-colored body on a paved road. The motorcycle accelerates, its speed increasing as the wheels spin faster, creating a blur that signifies the growing velocity. The background is a clear sky with a few clouds, and the road is empty, stretching ahead. The camera follows the motorcycle, panning slightly to keep it in frame, capturing the motorcycle gaining speed as it moves forward with increasing momentum.", "original_prompt_en": "a motorcycle accelerating to gain speed"}
+{"index": 277, "data": "Long shot of a silver passenger airplane with sleek wings and multiple windows soaring through a clear, vibrant blue sky. The airplane’s fuselage glistens under the bright sunlight, its wings slightly angled as it cuts through the air with steady, smooth motion. The sky is entirely clear, devoid of clouds, showcasing a vast expanse of deep blue that contrasts with the airplane’s metallic sheen. The camera remains fixed, capturing the airplane’s graceful flight as it moves steadily across the frame, emphasizing the freedom and serenity of its journey through the open sky.", "original_prompt_en": "an airplane soaring through a clear blue sky"}
+{"index": 278, "data": "A long shot captures a white commercial airplane with blue fuselage stripes positioned on a gray asphalt runway. The sky above is clear, dotted with a few fluffy white clouds, and the background reveals a sprawling airport landscape with distant control towers and other stationary aircraft. The airplane begins to accelerate, its engines emitting a powerful roar as it moves swiftly along the runway. Gradually, it lifts its nose, the wheels gently leaving the ground, and ascends into the sky. The camera follows the aircraft’s upward trajectory, capturing the moment it soars into the clear sky, with the runway and airport structures shrinking below.", "original_prompt_en": "an airplane taking off"}
+{"index": 279, "data": "Long shot captures a silver commercial airplane with a sleek fuselage and white winglets landing smoothly on an asphalt runway marked with white and yellow lines. The airplane’s black-tired landing gear touches the runway first, followed by a gentle touchdown of the fuselage as it decelerates, with its wings slightly tilted to maintain balance. The background reveals a clear blue sky with scattered white clouds, and in the distance, airport control towers, green navigation beacons, and a few parked aircraft on adjacent taxiways are visible. The camera remains fixed, documenting the airplane’s smooth descent, the faint smoke from the tires upon initial contact, and its gradual slowdown as it rolls along the runway.", "original_prompt_en": "an airplane landing smoothly on a runway"}
+{"index": 280, "data": "A long shot captures a silver commercial airplane with a streamlined fuselage and white wingtips positioned on a gray asphalt runway. The airplane is accelerating to gain speed, its powerful engines roaring as blue-white exhaust billows from the nozzles, while the black landing gear tires grip the runway, producing faint smoke from friction. The background reveals a clear blue sky dotted with fluffy white clouds, and the runway is lined with white and yellow directional markings. The camera follows the airplane from a side angle, capturing its sleek metal body glinting in the sunlight as it gradually lifts its nose, preparing for takeoff. The airplane’s wings, adorned with red navigation lights, slice through the air as it builds up speed, with distant airport buildings and control towers visible in the blurred background.", "original_prompt_en": "an airplane accelerating to gain speed"}
+{"index": 281, "data": "A medium shot captures a blue city bus with white stripes on its side (featuring multiple windows and a front route display) as it slowly turns a corner at an urban intersection. The background reveals a bustling street lined with gray brick buildings, green-leafed trees, and pedestrians—including a woman in a red coat walking a brown dog, and a cyclist in a yellow helmet. The road (gray asphalt, marked with white lines) has parked cars along the curb and a silver sedan waiting at a red traffic light. The sky is partly cloudy, casting soft shadows. As the bus completes the turn, the camera follows its movement, panning slightly to keep the bus centered, capturing the smooth rotation of its wheels and the gentle sway of its body.", "original_prompt_en": "a bus turning a corner"}
+{"index": 282, "data": "A long shot captures a yellow city bus stuck in heavy rush-hour traffic. The sky is overcast, and the street is jam-packed with various vehicles—multicolored cars, motorcycles, and bicycles—all inching forward at a snail's pace. Pedestrians stroll or hurry along the sidewalks, some glued to their phones, others with urgent strides. In the background, tall buildings with glass facades mirror the gloomy sky. Red traffic lights hang above the intersection, and street lamps with road signs line the street. The bus, with its large windows, reveals some passengers inside, their faces etched with impatience as they look out. The camera stays fixed, documenting the bustling yet stagnant chaos of the rush-hour traffic jam.", "original_prompt_en": "a bus stuck in traffic during rush hour"}
+{"index": 283, "data": "Long shot of a blue city bus with multiple glass windows and a company logo on its side, positioned on an asphalt road under a clear sky with scattered white clouds. The bus, initially stationary, begins to accelerate—its engine emits a low rumble as the wheels grip the road, the body leaning slightly forward as it gains speed. The background shows a street lined with buildings, trees, and parked cars; as the bus speeds up, the surrounding scenery blurs, emphasizing its increasing velocity. The camera remains fixed, capturing the bus’s smooth motion as it swiftly travels down the road, leaving stationary foreground objects behind.", "original_prompt_en": "a bus accelerating to gain speed"}
+{"index": 284, "data": "Long shot of a silver passenger train with multiple carriages speeding down the railway tracks. The train’s side faces the camera, showcasing its sleek metal exterior and evenly spaced windows. The railway track is lined with gray gravel, and the ground beside it is a mix of barren soil and sparse vegetation. The background reveals a clear blue sky dotted with fluffy white clouds. The camera remains fixed, capturing the train as it hurtles from the left to the right of the frame, its wheels clattering rapidly against the rails, emphasizing the swift pace of its journey.", "original_prompt_en": "a train speeding down the tracks"}
+{"index": 285, "data": "Long shot captures a silver passenger train with multiple carriages crossing over a tall reinforced-concrete bridge. The train, with a sleek metallic exterior and evenly spaced windows, moves steadily across the bridge which is equipped with sturdy metal railings. Below the bridge, a wide river flows, its surface shimmering under the clear blue sky. The background presents a picturesque landscape of rolling green hills. The camera remains fixed, focusing on the train as it travels from the right to the left of the frame, highlighting the bridge's height and grandeur against the scenic backdrop.", "original_prompt_en": "a train crossing over a tall bridge"}
+{"index": 286, "data": "Long shot of a gray passenger train with multiple carriages on the railway track. The train’s side, with windows lining both sides and a row of white letters, faces the camera. The track is surrounded by barren soil with sparse small plants, and the sky is clear blue with large white clouds. The train accelerates, gaining speed as it moves forward along the track; the fixed camera captures the train’s gradual increase in velocity, its carriages vibrating slightly as it picks up speed.", "original_prompt_en": "a train accelerating to gain speed"}
+{"index": 287, "data": "A medium shot captures a blue delivery truck with a white emblem on its side, turning a corner on a bustling urban street. The truck, with a rectangular cargo bed and black side mirrors, has a slightly scratched front fender, suggesting years of delivery work. The street is paved with smooth black asphalt, marked with white lane lines. On the corner, a brick building with a large glass window houses a bakery, where a baker in a white hat is placing golden-brown loaves on a wooden shelf. A few pedestrians animate the scene: a woman in a pink dress pushing a stroller (the baby inside, wrapped in a blue blanket, reaches for a toy), a man in a gray hoodie sipping a steaming cup of coffee, and a teenager in a black t-shirt skateboarding past, headphones on. Behind the truck, a sleek black sedan waits patiently, and a cyclist in a purple jacket rides by, ringing a silver bell. The traffic light above the intersection glows green for the truck’s direction, its metal pole adorned with a faded “No Parking” sign. As the truck turns right, its front wheels pivot sharply, the rear wheels tracing a wide arc, and the driver—partially visible through the windshield, wearing a blue cap and a focused expression—signals with the right turn indicator, the orange light blinking steadily. The background reveals a row of colorful storefronts: a café with a green awning, a bookstore with a “New Arrivals” sign, and a flower shop where a florist in a floral apron arranges roses. A stray dog with a white patch on its head trots along the sidewalk, sniffing at a discarded coffee cup, while a street vendor’s cart, loaded with fresh oranges and apples, sits nearby, the vendor calling out prices in a cheerful tone. The camera remains steady, capturing the truck’s smooth maneuver against the vibrant city backdrop, with sunlight filtering through the buildings, casting long shadows on the road.", "original_prompt_en": "a truck turning a corner"}
+{"index": 288, "data": "Long shot of a deep blue truck with a boxy cargo body anchored in a tranquil bay. The truck’s dark metal exterior glistens subtly under soft sunlight, its large black tires resting on smooth, wet sand near the water’s edge. The bay’s calm turquoise water stretches out, reflecting the pale blue sky dotted with fluffy white clouds. In the background, small sailboats drift lazily, and slender palm trees sway gently along the sandy shore, enhancing the serene atmosphere. Fixed shot, capturing the truck’s stillness against the peaceful coastal landscape.", "original_prompt_en": "a truck anchored in a tranquil bay"}
+{"index": 289, "data": "During rush hour, the sky is overcast with a hint of evening’s approach. A long shot captures a large gray freight truck—with a rugged metal exterior and black tires—stuck in heavy traffic. The truck is surrounded by a chaotic mix of vehicles: white sedans, a yellow taxi honking impatiently, and bicycles navigating narrow gaps between cars. All vehicles move at a crawl, their brake lights forming a red river of light. The road is flanked by tall concrete buildings with glass facades, and a traffic light above the intersection glows red, enforcing the standstill. The camera remains fixed, emphasizing the truck’s immobility as pedestrians in business attire or casual wear hurry along the sidewalks, some checking their phones. In the background, the city skyline is dotted with illuminated windows, and the sky transitions from gray to deep blue, signaling the end of the workday.", "original_prompt_en": "a truck stuck in traffic during rush hour"}
+{"index": 290, "data": "A fixed long shot captures a blue freight truck on a gray asphalt road. The truck, with a large metal cargo bed and black tires, is slowing down—its red brake lights glowing as it gradually reduces speed. The background reveals a clear blue sky, a few passing passenger cars, and distant urban buildings. The truck eases to a stop, its engine’s rumble softening until it rests motionless on the road, the surrounding traffic continuing to flow.", "original_prompt_en": "a truck slowing down to stop"}
+{"index": 291, "data": "A medium shot captures a gray cargo truck with a large metal container on its back, positioned on an asphalt road flanked by green trees. The sky above is clear and blue. The truck accelerates, its wheels spinning more rapidly as it gains speed, moving forward with increasing velocity. The camera follows the truck’s movement, capturing the vehicle as it speeds down the road, with dust lightly kicking up from the tires.", "original_prompt_en": "a truck accelerating to gain speed"}
+{"index": 292, "data": "Wide shot captures a small wooden boat with a weathered brown hull sailing smoothly on a calm lake. The lake’s surface is perfectly still, reflecting the clear blue sky dotted with a few fluffy white clouds. Along the shoreline, lush green willow trees with drooping branches sway gently in the breeze, their leaves brushing the water. In the background, misty gray-blue mountains rise faintly against the sky. The boat glides forward steadily, creating a delicate, rippling wake, and the camera remains fixed, framing the serene scene as the boat moves toward the center of the lake.", "original_prompt_en": "a boat sailing smoothly on a calm lake"}
+{"index": 293, "data": "Long shot captures a brown wooden boat with a white stripe along its side moving on a calm, clear lake. The water around it is smooth, reflecting the blue sky with scattered white clouds. In the background, lush green trees line the shore, and a faint outline of distant mountains is visible. The boat gradually reduces its speed, the ripples behind it diminishing, until it comes to a complete stop, floating gently on the water's surface.", "original_prompt_en": "a boat slowing down to stop"}
+{"index": 294, "data": "A long shot captures a blue wooden boat on a calm lake. The boat, with a sleek hull and a folded white sail, accelerates to gain speed, creating ripples and small splashes behind it. The background features a clear blue sky with a few white clouds, and the water glistens under the sunlight. The camera follows the boat as it moves smoothly across the frame, its bow slightly lifting as it picks up pace, showcasing the boat’s steady acceleration.", "original_prompt_en": "a boat accelerating to gain speed"}
+{"index": 295, "data": "A long shot captures a sleek bird with glossy, brown-hued feathers soaring gracefully in the sky. The bird spreads its wings wide in a fluid, rhythmic motion, showcasing the intricate patterns of its plumage as it glides effortlessly on gentle air currents. The sky is a vivid blue, dotted with a few wispy white clouds. The camera follows the bird’s elegant trajectory, capturing its smooth, effortless flight against the vast, open sky.", "original_prompt_en": "a bird soaring gracefully in the sky"}
+{"index": 296, "data": "A medium shot captures a small brown bird with fluffy plumage busily building a nest from slender twigs and fresh green leaves. The background reveals a dense, leafy tree branch, with patches of golden sunlight filtering through the foliage and a few other nests hidden among the greenery. The bird flutters its wings gently, using its sharp yellow beak to pick up a twig, carefully positioning it in the growing nest, then adding a soft green leaf to line the interior, repeating the process with focused precision.", "original_prompt_en": "a bird building a nest from twigs and leaves"}
+{"index": 297, "data": "Long shot of a small bird with brown and white feathers flying gracefully over a snowy forest. The forest is dense with tall trees, their branches heavy with fresh snow, creating a pristine white landscape. The sky above is clear and blue, contrasting with the white snow below. The bird flaps its wings steadily as it glides above the snow-covered treetops, and the camera remains fixed, capturing the serene scene of the bird in flight against the wintry forest backdrop.", "original_prompt_en": "a bird flying over a snowy forest"}
+{"index": 298, "data": "A close-up shot captures a gray short-haired cat with sleek fur grooming itself meticulously. The cat is seated on a light brown wooden floor, with a few scattered cat toys in the background. It lowers its head, using its rough, pink tongue to repeatedly lick the fur on its side, carefully smoothing out tangles. The background features a cozy living room with a plush sofa and a potted plant. The camera remains fixed, focusing on the cat’s delicate grooming movements as it occasionally pauses, adjusts its posture with a gentle paw flick, and then resumes licking, demonstrating its thorough self-care routine.", "original_prompt_en": "a cat grooming itself meticulously with its tongue"}
+{"index": 299, "data": "A medium shot captures a tabby cat with white paws playing in the park. The cat is energetically chasing a small red ball, its tail upright and body crouched low to the lush green grass, showcasing its sleek fur glistening in the sunlight. The background features tall green trees with leaves swaying in the gentle breeze, a clear blue sky with a few fluffy white clouds, and a wooden bench with a person sitting in the distance. The camera follows the cat as it pounces on the ball, then darts toward the left of the frame, occasionally pausing to bat at the ball with its paw. The cat’s ears are perked, and its eyes remain fixed on the ball, exhibiting a playful and lively demeanor.", "original_prompt_en": "a cat playing in park"}
+{"index": 300, "data": "Close-up shot of a white domestic cat with soft, fluffy fur. The cat is crouched slightly, its head lowered as its pink tongue repeatedly dips into a small, transparent glass of water, creating tiny ripples on the water's surface. Its eyes are focused on the water, and its ears are perked up, with a few water droplets glistening on its fur around the mouth. The background reveals a cozy living room corner, with a light-colored carpet beneath the glass and a potted plant with green leaves blurred in the distance. The cat continues to drink, occasionally lifting its head for a brief moment, showing the wet fur on its muzzle, before lowering its head again to lap at the water, and the camera remains fixed to capture this gentle drinking motion.", "original_prompt_en": "a cat drinking water"}
+{"index": 301, "data": "A medium shot captures a fluffy orange cat with soft, striped fur running happily across a sunlit green lawn. The cat’s tail is raised high, and its ears are perked up, showing a joyful expression with bright, alert eyes. The background features vibrant green grass dotted with colorful wildflowers, and the sky is clear and blue with a few fluffy white clouds. The camera follows the cat as it bounds forward, its paws landing lightly on the grass, occasionally pausing to sniff a flower before continuing to run with enthusiasm.", "original_prompt_en": "a cat running happily"}
+{"index": 302, "data": "Panoramic shot of a brown dog with a fluffy coat enjoying a peaceful walk. The dog ambles slowly along a winding path in a serene park, its tail wagging softly and its eyes calmly taking in the surroundings. The path is lined with tall, green trees whose leaves rustle in the gentle breeze, and the ground is covered with soft grass and a few scattered stones. The sky above is overcast, adding a tranquil atmosphere to the scene. The camera moves to the left, following the dog as it walks forward, capturing the peaceful moment of the dog’s leisurely stroll.", "original_prompt_en": "a dog enjoying a peaceful walk"}
+{"index": 303, "data": "A medium shot captures a golden retriever with a shiny, light-brown coat playing in a vibrant park. The park has lush green grass, colorful flowers, and tall trees with green leaves. The sky is clear and blue with a few white clouds. The dog is chasing a red frisbee, running in circles, leaping to catch it mid-air, its tail wagging wildly. The camera follows the dog as it drops the frisbee, sniffs a flower, then dashes off to fetch it again, with other people walking their dogs in the background.", "original_prompt_en": "a dog playing in park"}
+{"index": 304, "data": "Medium shot captures a brown dog with short, glossy fur standing on a lush green lawn. It lowers its head, its pink tongue lapping at the clear water in a blue plastic bowl placed on the ground. The background features a few scattered trees with vibrant green leaves and a white picket fence, under a sunny sky with soft, white clouds. The dog continues to drink, occasionally lifting its head slightly to lick its nose before resuming, with its tail relaxed and gently wagging.", "original_prompt_en": "a dog drinking water"}
+{"index": 305, "data": "A medium shot captures a brown dog with short glossy fur running happily across a lush green grassland. The dog’s tail wags energetically, its mouth open in a joyful pant, and its paws lift high with each lively stride. The background reveals a clear blue sky dotted with fluffy white clouds, while the grass beneath is interspersed with tiny wildflowers swaying gently in the breeze. In the distance, a few scattered trees stand against the horizon. The camera follows the dog’s movement, panning left as it sprints from the right to the left of the frame, showcasing its carefree and energetic run.", "original_prompt_en": "a dog running happily"}
+{"index": 306, "data": "A medium shot captures a brown horse with a sleek black mane bending down at the river’s edge. Its muscular neck curves elegantly as it lowers its head, lips gently parting to sip the clear, gently flowing river water, which creates small, circular ripples around its mouth. The riverbank is covered in lush, emerald-green grass interspersed with vibrant wildflowers, and the background reveals a sprawling meadow with tall grasses swaying in the breeze, under a partly cloudy sky. The horse remains in this bent posture, calmly drinking, with its tail hanging relaxed and occasionally twitching, while the camera stays fixed to capture the tranquil scene of the animal hydrating.", "original_prompt_en": "a horse bending down to drink water from a river"}
+{"index": 307, "data": "Panoramic shot of a brown horse galloping across a vast open field. The horse, with a sleek and muscular build, has its mane and tail flowing in the wind, and its four hooves are off the ground in mid-gallop, exhibiting a dynamic running posture. The field is blanketed with lush green grass, and in the background, rolling light-brown hills stretch under a clear blue sky dotted with a few white clouds. The camera follows the horse's movement to the right, capturing the horse's swift and powerful gallop across the field, with the grass swaying beneath its hooves as it surges forward.", "original_prompt_en": "a horse galloping across an open field"}
+{"index": 308, "data": "Wide shot of a brown horse with a glossy coat and a flowing black mane. The horse, in a relaxed posture with its head slightly lowered, takes a peaceful walk across a lush green meadow. The meadow is carpeted with soft grass, dotted with small wildflowers in purple and yellow. The background shows a clear blue sky with a few white clouds, and distant green hills with tall trees. The horse moves slowly, hooves lightly touching the ground, tail swishing gently. The camera remains steady, capturing the serene scene as the horse strolls calmly, with the breeze swaying the grass and its mane.", "original_prompt_en": "a horse taking a peaceful walk"}
+{"index": 309, "data": "Long shot captures a brown horse with a flowing black mane and tail, its muscular frame rippling as it sprints across a lush green grassland. The horse’s hooves kick up tufts of grass and soil, ears pricked forward in focus as it heads toward a herd of horses in the distance. The herd—composed of horses with coats in varying shades of brown, white, and black—gathers near a cluster of low shrubs, some grazing, others standing alert. The background unfolds as a vast, sunlit grassland stretching to the horizon, dotted with scattered trees and a clear blue sky. The horse runs from the left of the frame toward the herd on the right, its pace steady and determined, while the camera follows its movement, highlighting the contrast between the solitary runner and the assembled group. As the horse approaches, the herd shifts slightly, welcoming its arrival.", "original_prompt_en": "a horse running to join a herd of its kind"}
+{"index": 310, "data": "The sky is partly cloudy. A medium shot captures a white sheep with thick, curly wool bending its neck down to drink clear water from a gently flowing river. The riverbank is covered with lush green grass, dotted with a few wildflowers in soft pastel hues. The background features rolling green hills stretching into the distance. As the sheep drinks, its ears twitch occasionally, and its fluffy white tail rests calmly against its body. The water in the river ripples gently around its muzzle.", "original_prompt_en": "a sheep bending down to drink water from a river"}
+{"index": 311, "data": "A panoramic shot captures a white sheep with thick, fluffy wool taking a peaceful walk across a lush green meadow. The sheep moves at a relaxed pace, its head occasionally drooping to nuzzle the grass, as if savoring the fresh scent of the meadow. The ground beneath is a carpet of vibrant green grass, sprinkled with delicate wildflowers in shades of yellow and purple. The sky overhead is a clear, bright blue, with a few wispy white clouds floating idly. The background stretches into a vast, open landscape of rolling green hills, and a gentle breeze causes the grass to sway softly. The camera remains steady, focusing on the sheep’s tranquil journey as it ambles slowly to the right of the frame.", "original_prompt_en": "a sheep taking a peaceful walk"}
+{"index": 312, "data": "Long shot captures a white sheep with thick, fluffy wool running swiftly across a lush green meadow, heading toward a herd of its kind. The sheep, marked with a few light brown patches on its back, moves rapidly with its legs pumping, approaching a dozen other sheep—some grazing on the grass, others standing idly, their white and brown fleeces contrasting with the verdant landscape. The background reveals a vast expanse of the meadow stretching toward the horizon, under a clear blue sky dotted with fluffy white clouds. The camera follows the running sheep, capturing its swift movement as it nears the herd, where the other sheep lift their heads to notice its arrival.", "original_prompt_en": "a sheep running to join a herd of its kind"}
+{"index": 313, "data": "A medium shot captures a brown cow with short black horns and thick fur bending down by the river. Its neck arches gently as it lowers its head, allowing its mouth to meet the clear, rippling river water to drink. The riverbank is covered with lush green grass, and in the background, tall trees with vibrant green leaves sway softly in the breeze. The sky above is a bright blue, dotted with a few fluffy white clouds. The cow maintains its bent posture, steadily sipping the cool water, while small ripples spread from the point where its mouth touches the water’s surface.", "original_prompt_en": "a cow bending down to drink water from a river"}
+{"index": 314, "data": "A medium shot captures a brown cow with short, smooth fur resting in a tranquil barn. The cow, with drooping ears and half-closed eyes, chews cud rhythmically, the cud in its mouth exhibiting a soft, moist texture. It stands on a bed of hay, its tail hanging lazily. The barn’s interior is peaceful: neatly stacked hay bales occupy the corner, wooden beams criss-cross the ceiling, and farming tools like a pitchfork and a scythe hang on the walls. Soft light filters through a small window, casting gentle light spots on the hay-covered ground. A wooden trough filled with feed sits beside the cow, and a faint hay-like scent seems to fill the air. Fixed shot. The cow continues to chew cud, occasionally pausing as if savoring the taste, fully immersed in its restful moment in this serene barn.", "original_prompt_en": "a cow chewing cud while resting in a tranquil barn"}
+{"index": 315, "data": "A panoramic shot captures a brown cow with a glossy coat running swiftly across a lush green grassland. The cow, tail slightly raised, kicks up patches of grass as it sprints toward a herd of its kind in the distance. The herd, composed of several similarly colored cows, is either grazing or standing, their forms scattered across the expansive grassy plain. The background features a wide, sunlit grassland with tall grasses swaying gently in the breeze, and a clear blue sky dotted with a few white clouds. The cow continues to run leftward toward the herd, and the camera follows its movement, panning left to capture the cow’s approach. As it nears, some cows in the herd lift their heads, seemingly noticing the incoming cow.", "original_prompt_en": "a cow running to join a herd of its kind"}
+{"index": 316, "data": "A medium shot captures a gray elephant with rough, wrinkled skin standing on a grassy plain under the bright sun. Its long trunk dips into a nearby water puddle, then lifts to spray a stream of water onto its back and head, creating glistening droplets that roll down its skin. The background shows a few scattered trees and a clear blue sky, emphasizing the hot, sunny day as the elephant repeatedly uses its trunk to splash water, visibly cooling itself down. The camera remains fixed, focusing on the elephant’s deliberate movements to regulate its body temperature.", "original_prompt_en": "an elephant spraying itself with water using its trunk to cool down"}
+{"index": 317, "data": "Panoramic shot of a gray elephant with thick wrinkled skin taking a peaceful walk on a vast grassland. Its large ears flap gently, and the long trunk sways with each step. The ground is a mix of green grass and patches of soil, with wildflowers scattered. The background shows a clear blue sky with fluffy white clouds. The camera follows the elephant as it moves slowly to the right of the frame, capturing its tranquil gait.", "original_prompt_en": "an elephant taking a peaceful walk"}
+{"index": 318, "data": "A panoramic shot captures a gray elephant with large flapping ears and wrinkled thick skin running swiftly across a vast golden grassland. Its long trunk sways with each stride, and powerful legs move in a coordinated urgent rhythm as it heads toward a herd of fellow elephants in the distance. The herd, consisting of several elephants of varying sizes, gathers near a cluster of scattered acacia trees, their gray forms contrasting against the yellowish-brown grass that sways gently in the breeze. The sky above is clear with a few fluffy white clouds, and the camera follows the running elephant, panning to keep it in frame as it approaches the herd, which appears to be calmly grazing or interacting before the elephant joins them.", "original_prompt_en": "an elephant running to join a herd of its kind"}
+{"index": 319, "data": "Medium shot captures a brown bear with thick, shaggy fur standing on a moss-covered rock in a swiftly flowing river. The bear, using its powerful front paws to grip the slippery stone, catches a silver-gray salmon with pinkish tones in its massive, muscular jaws. The salmon thrashes wildly, sending splashes of water flying around. The background is a dense forest with tall evergreen trees, and the sky is partly cloudy. The bear holds the salmon tightly, its jaws clamping down as the fish’s tail keeps flicking, while the camera stays fixed, capturing the raw power of the predator during the hunt.", "original_prompt_en": "a bear catching a salmon in its powerful jaws"}
+{"index": 320, "data": "Medium shot captures a brown bear with thick, shaggy fur standing on a forest floor blanketed with fallen leaves. The bear, with a robust and muscular build, raises its head slightly, its nose twitching rapidly as it sniffs the air intently, searching for scents of food. The background reveals a dense forest with tall green trees and a canopy that filters the sunlight, casting dappled shadows on the ground. The sky above is partly cloudy, with patches of blue peeking through. The bear remains still, focused on detecting any traces of food in the surrounding air, its ears perked up in alertness.", "original_prompt_en": "a bear sniffing the air for scents of food"}
+{"index": 321, "data": "A medium shot captures a brown bear with thick, fluffy fur climbing a tall, rugged tree with rough, textured bark. The background reveals a dense forest filled with lush green foliage, and the sky is partially overcast. The bear grips the tree trunk firmly with its sharp claws, moving steadily upward, while the camera follows its ascent to emphasize the animal’s strength and agility as it navigates the tree’s rugged surface.", "original_prompt_en": "a bear climbing a tree"}
+{"index": 322, "data": "Medium shot of a brown bear with thick, shaggy fur hunting for prey. The bear, with a robust build and dark brown coat, stands on a forest floor covered in fallen leaves and twigs. It lowers its head, intently sniffing the ground to detect prey scents (nose twitching), then slowly moves forward, lifting its head occasionally to scan the surroundings with keen eyes. The background features a dense forest with tall green trees, dappled sunlight filtering through the canopy. The camera captures its deliberate, focused movements as it follows a scent trail, showcasing the bear’s hunting behavior.", "original_prompt_en": "a bear hunting for prey"}
+{"index": 323, "data": "A medium shot captures a zebra with striking black-and-white vertical stripes bending down to drink water from a calm river. Its neck is gracefully arched as it lowers its head toward the water’s surface, where gentle ripples spread. The riverbank is lined with lush green grass and smooth stones, while the background reveals a vast, golden-brown grassland with sparse trees swaying in the breeze. The sky is overcast, casting a soft light over the scene. The camera remains fixed, focusing on the zebra’s deliberate, steady motion as it quenches its thirst.", "original_prompt_en": "a zebra bending down to drink water from a river"}
+{"index": 324, "data": "A medium shot captures a zebra with striking black-and-white stripes running swiftly across a sunlit grassland, its legs moving in rapid strides as it heads to join a herd of its kind scattered across the grassy plain. The background reveals a vast expanse of green grass swaying gently in the breeze, with a clear blue sky overhead. The herd, composed of several zebras with matching black-and-white patterns, moves slowly across the landscape. The camera follows the zebra’s movement, emphasizing its agile approach toward the group.", "original_prompt_en": "a zebra running to join a herd of its kind"}
+{"index": 325, "data": "Panoramic shot of a zebra on the grassland. The zebra, with its distinctive black-and-white vertical stripes covering the body and a short, upright black mane, is taking a peaceful walk. Its legs move slowly and gracefully, and its tail sways gently. The ground is a lush green grassland, dotted with patches of wildflowers. The background is a vast, open savannah with scattered acacia trees, and the sky is clear with a few fluffy white clouds. The zebra continues to walk slowly towards the left of the frame, and the camera follows its movement, capturing the serene environment around it.", "original_prompt_en": "a zebra taking a peaceful walk"}
+{"index": 326, "data": "A medium shot captures a giraffe with a long neck and distinctive brown-and-white patchwork patterns on its body bending down gracefully. The giraffe spreads its legs slightly to maintain balance, lowering its head to drink water from a calm and clear river. The surface of the river ripples gently as the giraffe's mouth touches the water, and its long tongue can be seen lapping up the liquid. The riverbank is lined with green grass and scattered stones, while the background features a vast savannah landscape with tall acacia trees and a clear sky dotted with a few white clouds. As the giraffe drinks, its slender legs remain steady, showcasing the elegant curve of its neck against the natural backdrop.", "original_prompt_en": "a giraffe bending down to drink water from a river"}
+{"index": 327, "data": "Panoramic shot of a giraffe on the grassland. The giraffe, with a light-brown body adorned with irregular dark-brown patches, takes a peaceful walk. Its long neck stretches elegantly, and its slender legs move steadily, each step being deliberate and calm. The ground is covered with lush green grass, and in the background, there is a vast grassland dotted with a few acacia trees. The sky is clear with a few wispy clouds floating. The camera follows the giraffe's movement, capturing its tranquil gait as it moves forward. The surrounding grass sways gently in the breeze, and the scene radiates a sense of serenity, emphasizing the giraffe's unhurried and graceful stroll across the open landscape.", "original_prompt_en": "a giraffe taking a peaceful walk"}
+{"index": 328, "data": "Panoramic shot of a giraffe with a long neck and distinctive brownish-yellow fur patterned with dark brown patches, running swiftly towards the right side of the frame. Its legs are bent in a running posture, and its tail sways slightly with the movement. In the background, a herd of giraffes with similar patterned fur is gathered on a wide, light-green grassland dotted with scattered acacia trees. The sky is clear and blue, with a few white clouds floating. The giraffe, with its head held high, runs to join the herd, and the camera follows its movement to the right, capturing the dynamic scene of it approaching its kind.", "original_prompt_en": "a giraffe running to join a herd of its kind"}
+{"index": 329, "data": "A medium shot shows only one main object: a person. The human figure is shown with a complete body, clearly visible head, torso, arms, and legs, and a natural upright pose. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a simple natural environment under soft daylight, with a clean softly blurred background. fixed shot. Exactly one person remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a person"}
+{"index": 330, "data": "A wide shot shows only one main object: a bicycle. The bicycle shows two clearly visible wheels, handlebars, a seat, and a distinct frame. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one bicycle remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bicycle"}
+{"index": 331, "data": "A wide shot shows only one main object: a car. The car shows a clear body shape, four wheels, windows, and headlights. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one car remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a car"}
+{"index": 332, "data": "A wide shot shows only one main object: a motorcycle. The motorcycle shows two clearly visible wheels, handlebars, a seat, and a compact body. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one motorcycle remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a motorcycle"}
+{"index": 333, "data": "A wide shot shows only one main object: an airplane. The airplane shows clearly visible wings, a fuselage, and a tail. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one airplane remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "an airplane"}
+{"index": 334, "data": "A wide shot shows only one main object: a bus. The bus shows a long rectangular body, large windows, and clearly visible wheels. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one bus remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bus"}
+{"index": 335, "data": "A wide shot shows only one main object: a train. The train shows a long body with clearly visible windows and a distinct front section. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one train remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a train"}
+{"index": 336, "data": "A wide shot shows only one main object: a truck. The truck shows a large cab, a clear cargo body, and clearly visible wheels. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one truck remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a truck"}
+{"index": 337, "data": "A wide shot shows only one main object: a boat. The boat shows a clearly visible hull and a complete body shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly one boat remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a boat"}
+{"index": 338, "data": "A medium shot shows only one main object: a traffic light. The traffic light shows a tall pole and a clear signal box with stacked circular lights. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a simple outdoor environment under clear daylight, with a clean open background. fixed shot. Exactly one traffic light remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a traffic light"}
+{"index": 339, "data": "A medium shot shows only one main object: a fire hydrant. The fire hydrant shows a short upright body with clear side outlets and a top cap. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a simple outdoor environment under clear daylight, with a clean open background. fixed shot. Exactly one fire hydrant remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a fire hydrant"}
+{"index": 340, "data": "A medium shot shows only one main object: a stop sign. The stop sign shows a clearly visible red octagonal sign mounted on a vertical pole. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a simple outdoor environment under clear daylight, with a clean open background. fixed shot. Exactly one stop sign remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a stop sign"}
+{"index": 341, "data": "A medium shot shows only one main object: a parking meter. The parking meter shows a slim upright post and a clearly visible meter head. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a simple outdoor environment under clear daylight, with a clean open background. fixed shot. Exactly one parking meter remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a parking meter"}
+{"index": 342, "data": "A medium shot shows only one main object: a bench. The bench shows a clearly visible seat, backrest, and supporting legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a simple outdoor environment under clear daylight, with a clean open background. fixed shot. Exactly one bench remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bench"}
+{"index": 343, "data": "A wide shot shows only one main object: a bird. The bird shows a complete body with a clearly visible head, beak, wings, tail, and legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one bird remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bird"}
+{"index": 344, "data": "A wide shot shows only one main object: a cat. The cat shows a complete body with clearly visible ears, face, legs, and tail. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one cat remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a cat"}
+{"index": 345, "data": "A wide shot shows only one main object: a dog. The dog shows a complete body with clearly visible head, legs, and tail. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one dog remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a dog"}
+{"index": 346, "data": "A wide shot shows only one main object: a horse. The horse shows a complete body with clearly visible head, mane, legs, and tail. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one horse remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a horse"}
+{"index": 347, "data": "A wide shot shows only one main object: a sheep. The sheep shows a complete body with a clearly visible woolly torso, head, and legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one sheep remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a sheep"}
+{"index": 348, "data": "A wide shot shows only one main object: a cow. The cow shows a complete body with clearly visible head, torso, legs, and tail. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one cow remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a cow"}
+{"index": 349, "data": "A wide shot shows only one main object: an elephant. The elephant shows a complete body with a clearly visible trunk, large ears, and legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one elephant remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "an elephant"}
+{"index": 350, "data": "A wide shot shows only one main object: a bear. The bear shows a complete body with clearly visible head, torso, and legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one bear remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bear"}
+{"index": 351, "data": "A wide shot shows only one main object: a zebra. The zebra shows a complete body with clearly visible black-and-white stripes, head, legs, and tail. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one zebra remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a zebra"}
+{"index": 352, "data": "A wide shot shows only one main object: a giraffe. The giraffe shows a complete body with a clearly visible very long neck, head, torso, and legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly one giraffe remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a giraffe"}
+{"index": 353, "data": "A medium shot shows only one main object: a backpack. The backpack shows a clear bag shape with shoulder straps and a main compartment. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean simple environment with soft natural light and a softly blurred background. fixed shot. Exactly one backpack remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a backpack"}
+{"index": 354, "data": "A medium shot shows only one main object: an umbrella. The umbrella shows a clearly visible canopy and handle. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean simple environment with soft natural light and a softly blurred background. fixed shot. Exactly one umbrella remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "an umbrella"}
+{"index": 355, "data": "A medium shot shows only one main object: a handbag. The handbag shows a clearly visible main body and handle or strap. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean simple environment with soft natural light and a softly blurred background. fixed shot. Exactly one handbag remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a handbag"}
+{"index": 356, "data": "A medium shot shows only one main object: a tie. The tie shows a clearly visible long narrow shape with a wider pointed end. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean simple environment with soft natural light and a softly blurred background. fixed shot. Exactly one tie remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a tie"}
+{"index": 357, "data": "A medium shot shows only one main object: a suitcase. The suitcase shows a clearly visible boxy body and handle. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean simple environment with soft natural light and a softly blurred background. fixed shot. Exactly one suitcase remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a suitcase"}
+{"index": 358, "data": "A medium-wide shot shows only one main object: a frisbee. The frisbee shows a clearly visible round flat disc shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one frisbee remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a frisbee"}
+{"index": 359, "data": "A medium-wide shot shows only one main object: skis. The skis appear as exactly one clear pair of long narrow skis, fully visible from tip to tail. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one pair of skis remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "skis"}
+{"index": 360, "data": "A medium-wide shot shows only one main object: a snowboard. The snowboard shows a clearly visible long single board shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one snowboard remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a snowboard"}
+{"index": 361, "data": "A medium-wide shot shows only one main object: a sports ball. The sports ball shows a clearly visible complete spherical shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one sports ball remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a sports ball"}
+{"index": 362, "data": "A medium-wide shot shows only one main object: a kite. The kite shows a clearly visible kite-shaped body and tail string. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one kite remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a kite"}
+{"index": 363, "data": "A medium-wide shot shows only one main object: a baseball bat. The baseball bat shows a clearly visible long tapered shape with a thicker hitting end. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one a baseball bat remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a baseball bat"}
+{"index": 364, "data": "A medium-wide shot shows only one main object: a baseball glove. The baseball glove shows a clearly visible glove shape with a pocket and finger sections. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one a baseball glove remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a baseball glove"}
+{"index": 365, "data": "A medium-wide shot shows only one main object: a skateboard. The skateboard shows a clearly visible deck and wheels. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one a skateboard remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a skateboard"}
+{"index": 366, "data": "A medium-wide shot shows only one main object: a surfboard. The surfboard shows a clearly visible long smooth board shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one a surfboard remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a surfboard"}
+{"index": 367, "data": "A medium-wide shot shows only one main object: a tennis racket. The tennis racket shows a clearly visible oval string area and handle. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly one a tennis racket remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a tennis racket"}
+{"index": 368, "data": "A close shot shows only one main object: a bottle. The bottle shows a clearly visible body, neck, and opening. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a bottle remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bottle"}
+{"index": 369, "data": "A close shot shows only one main object: a wine glass. The wine glass shows a clearly visible bowl, stem, and base. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a wine glass remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a wine glass"}
+{"index": 370, "data": "A close shot shows only one main object: a cup. The cup shows a clearly visible cup body and handle. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a cup remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a cup"}
+{"index": 371, "data": "A close shot shows only one main object: a fork. The fork shows a clearly visible handle and pronged head. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a fork remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a fork"}
+{"index": 372, "data": "A close shot shows only one main object: a knife. The knife shows a clearly visible handle and blade. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a knife remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a knife"}
+{"index": 373, "data": "A close shot shows only one main object: a spoon. The spoon shows a clearly visible handle and rounded bowl. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a spoon remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a spoon"}
+{"index": 374, "data": "A close shot shows only one main object: a bowl. The bowl shows a clearly visible round open container shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a bowl remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bowl"}
+{"index": 375, "data": "A close shot shows only one main object: a banana. The banana shows a clearly visible curved elongated shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a banana remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a banana"}
+{"index": 376, "data": "A close shot shows only one main object: an apple. The apple shows a clearly visible round shape with a stem. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one an apple remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "an apple"}
+{"index": 377, "data": "A close shot shows only one main object: a sandwich. The sandwich shows a clearly visible layered bread shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a sandwich remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a sandwich"}
+{"index": 378, "data": "A close shot shows only one main object: an orange. The orange shows a clearly visible round citrus shape. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one an orange remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "an orange"}
+{"index": 379, "data": "A close shot shows only one main object: broccoli. The broccoli shows a clearly visible branching stalk and clustered florets. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one broccoli remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "broccoli"}
+{"index": 380, "data": "A close shot shows only one main object: a carrot. The carrot shows a clearly visible tapered root shape and green top area. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a carrot remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a carrot"}
+{"index": 381, "data": "A close shot shows only one main object: a hot dog. The hot dog shows a clearly visible bun and sausage. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a hot dog remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a hot dog"}
+{"index": 382, "data": "A close shot shows only one main object: a pizza. The pizza shows a clearly visible round flat shape with toppings. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a pizza remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a pizza"}
+{"index": 383, "data": "A close shot shows only one main object: a donut. The donut shows a clearly visible ring shape with a center hole. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a donut remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a donut"}
+{"index": 384, "data": "A close shot shows only one main object: a cake. The cake shows a clearly visible complete cake shape with smooth edges. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a cake remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a cake"}
+{"index": 385, "data": "A medium shot shows only one main object: a chair. The chair shows a clearly visible seat, backrest, and legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a chair remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a chair"}
+{"index": 386, "data": "A medium shot shows only one main object: a couch. The couch shows clearly visible seat cushions, a backrest, and armrests. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a couch remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a couch"}
+{"index": 387, "data": "A medium shot shows only one main object: a potted plant. The potted plant shows a clearly visible pot and complete plant leaves. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a potted plant remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a potted plant"}
+{"index": 388, "data": "A medium shot shows only one main object: a bed. The bed shows a clearly visible mattress, bed frame, and overall rectangular form. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a bed remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a bed"}
+{"index": 389, "data": "A medium shot shows only one main object: a dining table. The dining table shows a clearly visible tabletop and supporting legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a dining table remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a dining table"}
+{"index": 390, "data": "A medium shot shows only one main object: a toilet. The toilet shows a clearly visible tank, seat, and base. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a toilet remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a toilet"}
+{"index": 391, "data": "A medium shot shows only one main object: a tv. The tv shows a clearly visible screen and rectangular body. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a tv remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a tv"}
+{"index": 392, "data": "A medium shot shows only one main object: a laptop. The laptop shows a clearly visible screen and keyboard area. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a laptop remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a laptop"}
+{"index": 393, "data": "A medium shot shows only one main object: a remote. The remote shows a clearly visible slim rectangular body and buttons. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a remote remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a remote"}
+{"index": 394, "data": "A medium shot shows only one main object: a keyboard. The keyboard shows a clearly visible rectangular body and key layout. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a keyboard remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a keyboard"}
+{"index": 395, "data": "A medium shot shows only one main object: a cell phone. The cell phone shows a clearly visible rectangular body and screen. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a cell phone remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a cell phone"}
+{"index": 396, "data": "A medium shot shows only one main object: a microwave. The microwave shows a clearly visible rectangular body, door, and control area. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a microwave remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a microwave"}
+{"index": 397, "data": "A medium shot shows only one main object: an oven. The oven shows a clearly visible rectangular body, front door, and control area. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one an oven remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "an oven"}
+{"index": 398, "data": "A medium shot shows only one main object: a toaster. The toaster shows a clearly visible compact body and top slots. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a toaster remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a toaster"}
+{"index": 399, "data": "A medium shot shows only one main object: a sink. The sink shows a clearly visible basin and faucet area. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a sink remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a sink"}
+{"index": 400, "data": "A medium shot shows only one main object: a refrigerator. The refrigerator shows a clearly visible tall rectangular body and door. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a refrigerator remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a refrigerator"}
+{"index": 401, "data": "A close shot shows only one main object: a book. The book shows a clearly visible rectangular cover and page block. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a book remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a book"}
+{"index": 402, "data": "A close shot shows only one main object: a clock. The clock shows a clearly visible face with markers and a complete outline. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a clock remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a clock"}
+{"index": 403, "data": "A close shot shows only one main object: a vase. The vase shows a clearly visible container shape with an opening at the top. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a vase remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a vase"}
+{"index": 404, "data": "A close shot shows only one main object: scissors. The scissors show clearly visible two blades and two handles. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one scissors remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "scissors"}
+{"index": 405, "data": "A close shot shows only one main object: a teddy bear. The teddy bear shows a clearly visible plush body, head, ears, arms, and legs. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a teddy bear remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a teddy bear"}
+{"index": 406, "data": "A medium shot shows only one main object: a hair drier. The hair drier shows a clearly visible handle and nozzle. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly one a hair drier remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a hair drier"}
+{"index": 407, "data": "A close shot shows only one main object: a toothbrush. The toothbrush shows a clearly visible handle and bristle head. The object is placed near the center of the frame with comfortable margins from all image borders, so it is not too close to the edges. It is fully visible, complete, and unobstructed, with no cropping or occlusion in any frame. No additional prominent objects appear in the scene, so the target object remains the only clearly recognizable subject. The object is placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly one a toothbrush remains in the scene throughout, and every frame shows its appearance clearly and consistently for precise recognition.", "original_prompt_en": "a toothbrush"}
+{"index": 408, "data": "A wide shot shows only one main object: a red bicycle. The bicycle is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red bicycle"}
+{"index": 409, "data": "A wide shot shows only one main object: a green bicycle. The bicycle is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green bicycle"}
+{"index": 410, "data": "A wide shot shows only one main object: a blue bicycle. The bicycle is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue bicycle"}
+{"index": 411, "data": "A wide shot shows only one main object: a yellow bicycle. The bicycle is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow bicycle"}
+{"index": 412, "data": "A wide shot shows only one main object: an orange bicycle. The bicycle is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange bicycle"}
+{"index": 413, "data": "A wide shot shows only one main object: a purple bicycle. The bicycle is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple bicycle"}
+{"index": 414, "data": "A wide shot shows only one main object: a pink bicycle. The bicycle is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink bicycle"}
+{"index": 415, "data": "A wide shot shows only one main object: a black bicycle. The bicycle is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black bicycle"}
+{"index": 416, "data": "A wide shot shows only one main object: a white bicycle. The bicycle is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one bicycle with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white bicycle"}
+{"index": 417, "data": "A wide shot shows only one main object: a red car. The car is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red car"}
+{"index": 418, "data": "A wide shot shows only one main object: a green car. The car is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green car"}
+{"index": 419, "data": "A wide shot shows only one main object: a blue car. The car is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue car"}
+{"index": 420, "data": "A wide shot shows only one main object: a yellow car. The car is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow car"}
+{"index": 421, "data": "A wide shot shows only one main object: an orange car. The car is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange car"}
+{"index": 422, "data": "A wide shot shows only one main object: a purple car. The car is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple car"}
+{"index": 423, "data": "A wide shot shows only one main object: a pink car. The car is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink car"}
+{"index": 424, "data": "A wide shot shows only one main object: a black car. The car is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black car"}
+{"index": 425, "data": "A wide shot shows only one main object: a white car. The car is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean open setting under clear daylight. fixed shot. Exactly one car with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white car"}
+{"index": 426, "data": "A medium shot shows only one main object: a red bird. The bird is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red bird"}
+{"index": 427, "data": "A medium shot shows only one main object: a green bird. The bird is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green bird"}
+{"index": 428, "data": "A medium shot shows only one main object: a blue bird. The bird is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue bird"}
+{"index": 429, "data": "A medium shot shows only one main object: a yellow bird. The bird is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow bird"}
+{"index": 430, "data": "A medium shot shows only one main object: an orange bird. The bird is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange bird"}
+{"index": 431, "data": "A medium shot shows only one main object: a purple bird. The bird is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple bird"}
+{"index": 432, "data": "A medium shot shows only one main object: a pink bird. The bird is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink bird"}
+{"index": 433, "data": "A medium shot shows only one main object: a black bird. The bird is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black bird"}
+{"index": 434, "data": "A medium shot shows only one main object: a white bird. The bird is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one bird with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white bird"}
+{"index": 435, "data": "A medium shot shows only one main object: a black cat. The cat is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one cat with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black cat"}
+{"index": 436, "data": "A medium shot shows only one main object: a white cat. The cat is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one cat with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white cat"}
+{"index": 437, "data": "A medium shot shows only one main object: an orange cat. The cat is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one cat with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange cat"}
+{"index": 438, "data": "A medium shot shows only one main object: a yellow cat. The cat is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment under clear daylight. fixed shot. Exactly one cat with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow cat"}
+{"index": 439, "data": "A medium shot shows only one main object: a red umbrella. The umbrella is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red umbrella"}
+{"index": 440, "data": "A medium shot shows only one main object: a green umbrella. The umbrella is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green umbrella"}
+{"index": 441, "data": "A medium shot shows only one main object: a blue umbrella. The umbrella is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue umbrella"}
+{"index": 442, "data": "A medium shot shows only one main object: a yellow umbrella. The umbrella is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow umbrella"}
+{"index": 443, "data": "A medium shot shows only one main object: an orange umbrella. The umbrella is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange umbrella"}
+{"index": 444, "data": "A medium shot shows only one main object: a purple umbrella. The umbrella is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple umbrella"}
+{"index": 445, "data": "A medium shot shows only one main object: a pink umbrella. The umbrella is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink umbrella"}
+{"index": 446, "data": "A medium shot shows only one main object: a black umbrella. The umbrella is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black umbrella"}
+{"index": 447, "data": "A medium shot shows only one main object: a white umbrella. The umbrella is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one umbrella with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white umbrella"}
+{"index": 448, "data": "A medium shot shows only one main object: a red suitcase. The suitcase is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red suitcase"}
+{"index": 449, "data": "A medium shot shows only one main object: a green suitcase. The suitcase is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green suitcase"}
+{"index": 450, "data": "A medium shot shows only one main object: a blue suitcase. The suitcase is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue suitcase"}
+{"index": 451, "data": "A medium shot shows only one main object: a yellow suitcase. The suitcase is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow suitcase"}
+{"index": 452, "data": "A medium shot shows only one main object: an orange suitcase. The suitcase is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange suitcase"}
+{"index": 453, "data": "A medium shot shows only one main object: a purple suitcase. The suitcase is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple suitcase"}
+{"index": 454, "data": "A medium shot shows only one main object: a pink suitcase. The suitcase is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink suitcase"}
+{"index": 455, "data": "A medium shot shows only one main object: a black suitcase. The suitcase is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black suitcase"}
+{"index": 456, "data": "A medium shot shows only one main object: a white suitcase. The suitcase is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one suitcase with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white suitcase"}
+{"index": 457, "data": "A close shot shows only one main object: a red bowl. The bowl is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red bowl"}
+{"index": 458, "data": "A close shot shows only one main object: a green bowl. The bowl is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green bowl"}
+{"index": 459, "data": "A close shot shows only one main object: a blue bowl. The bowl is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue bowl"}
+{"index": 460, "data": "A close shot shows only one main object: a yellow bowl. The bowl is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow bowl"}
+{"index": 461, "data": "A close shot shows only one main object: an orange bowl. The bowl is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange bowl"}
+{"index": 462, "data": "A close shot shows only one main object: a purple bowl. The bowl is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple bowl"}
+{"index": 463, "data": "A close shot shows only one main object: a pink bowl. The bowl is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink bowl"}
+{"index": 464, "data": "A close shot shows only one main object: a black bowl. The bowl is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black bowl"}
+{"index": 465, "data": "A close shot shows only one main object: a white bowl. The bowl is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one bowl with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white bowl"}
+{"index": 466, "data": "A medium shot shows only one main object: a red chair. The chair is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red chair"}
+{"index": 467, "data": "A medium shot shows only one main object: a green chair. The chair is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green chair"}
+{"index": 468, "data": "A medium shot shows only one main object: a blue chair. The chair is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue chair"}
+{"index": 469, "data": "A medium shot shows only one main object: a yellow chair. The chair is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow chair"}
+{"index": 470, "data": "A medium shot shows only one main object: an orange chair. The chair is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange chair"}
+{"index": 471, "data": "A medium shot shows only one main object: a purple chair. The chair is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple chair"}
+{"index": 472, "data": "A medium shot shows only one main object: a pink chair. The chair is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink chair"}
+{"index": 473, "data": "A medium shot shows only one main object: a black chair. The chair is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black chair"}
+{"index": 474, "data": "A medium shot shows only one main object: a white chair. The chair is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a clean indoor setting with a plain uncluttered background. fixed shot. Exactly one chair with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white chair"}
+{"index": 475, "data": "A close shot shows only one main object: a red clock. The clock is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red clock"}
+{"index": 476, "data": "A close shot shows only one main object: a green clock. The clock is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green clock"}
+{"index": 477, "data": "A close shot shows only one main object: a blue clock. The clock is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue clock"}
+{"index": 478, "data": "A close shot shows only one main object: a yellow clock. The clock is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow clock"}
+{"index": 479, "data": "A close shot shows only one main object: an orange clock. The clock is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange clock"}
+{"index": 480, "data": "A close shot shows only one main object: a purple clock. The clock is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple clock"}
+{"index": 481, "data": "A close shot shows only one main object: a pink clock. The clock is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink clock"}
+{"index": 482, "data": "A close shot shows only one main object: a black clock. The clock is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black clock"}
+{"index": 483, "data": "A close shot shows only one main object: a white clock. The clock is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one clock with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white clock"}
+{"index": 484, "data": "A close shot shows only one main object: a red vase. The vase is shown with a clear red color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear red color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a red vase"}
+{"index": 485, "data": "A close shot shows only one main object: a green vase. The vase is shown with a clear green color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear green color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a green vase"}
+{"index": 486, "data": "A close shot shows only one main object: a blue vase. The vase is shown with a clear blue color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear blue color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a blue vase"}
+{"index": 487, "data": "A close shot shows only one main object: a yellow vase. The vase is shown with a clear yellow color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear yellow color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a yellow vase"}
+{"index": 488, "data": "A close shot shows only one main object: an orange vase. The vase is shown with a clear orange color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear orange color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "an orange vase"}
+{"index": 489, "data": "A close shot shows only one main object: a purple vase. The vase is shown with a clear purple color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear purple color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a purple vase"}
+{"index": 490, "data": "A close shot shows only one main object: a pink vase. The vase is shown with a clear pink color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear pink color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a pink vase"}
+{"index": 491, "data": "A close shot shows only one main object: a black vase. The vase is shown with a clear black color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear black color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a black vase"}
+{"index": 492, "data": "A close shot shows only one main object: a white vase. The vase is shown with a clear white color, and the whole object is fully visible, complete, and unobstructed in every frame. It is placed near the center of the frame with comfortable margins from all image borders, and no additional prominent objects appear in the scene. The scene is set in a simple natural environment with a clean softly blurred background. fixed shot. Exactly one vase with a clear white color remains in the scene throughout, and every frame shows it clearly and consistently for precise recognition.", "original_prompt_en": "a white vase"}
+{"index": 493, "data": "A panoramic shot of a beautiful coastal beach in spring, styled in Van Gogh’s artistic manner with bold, swirling brushstrokes. Gentle waves, dyed in turquoise and sapphire hues that ripple like animated paint, lap rhythmically against the golden sand—its surface textured like thickly applied pigment, each grain a vivid dab of color. The sky is a dynamic mix of soft blues and warm yellows, mirroring Van Gogh’s dreamy vibrancy, with wispy clouds formed by swirling strokes. In the background, the horizon blends sea and sky in harmonious hues, and the shoreline has delicate, brush - like green vegetation, hinting at spring’s freshness. The camera stays fixed, capturing waves caressing the sand in this painterly scene, all elements glowing with Van Gogh’s intense, emotive color palette.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, Van Gogh style"}
+{"index": 494, "data": "A medium shot, rendered in the style of an oil painting, captures a beautiful coastal beach in spring. The golden sand stretches smoothly, with gentle waves rhythmically lapping against it—their frothy white crests contrasting the deep blue seawater. The air carries a fresh, breezy feel, and the background reveals a hazy horizon where the pale blue sky merges with the calm, distant sea, all depicted with rich, textured brushstrokes typical of an oil painting. The waves’ lapping is the sole motion, crafting a serene, timeless mood across the sun - kissed, still shore.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, oil painting"}
+{"index": 495, "data": "Panoramic shot of a beautiful coastal beach in spring, where gentle turquoise waves lap rhythmically against the pale golden sand. The scene is rendered in the style of Ukiyo - e, reminiscent of Hokusai’s masterful works, with delicate brushstrokes capturing the tranquil motion of the waves and the stillness of the sandy shore. The sky above is a soft, muted blue with wispy clouds, while scattered seashells and smooth pebbles adorn the beach, enhancing the serene, artistic ambiance. The camera remains steady, focusing on the harmonious interplay between the moving waves and the static shore, evoking the timeless beauty of traditional Japanese ukiyo - e art.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo"}
+{"index": 496, "data": "A fixed shot of a beautiful coastal beach in spring, rendered in black and white. Gentle waves with frothy crests lap rhythmically against the fine sand (appearing in varying shades of gray in the monochrome view). The shoreline stretches into the distance, with faint outlines of distant rock formations or coastal vegetation visible in the hazy background. The waves continuously roll in, creating subtle ripples that spread across the sand before receding, while the overall scene exudes a serene, timeless quality due to the black - and - white aesthetic.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, black and white"}
+{"index": 497, "data": "Long shot in pixel - art style, presenting a beautiful coastal beach in spring. The golden - colored sand extends along the shoreline, and gentle waves with frothy white edges lap against the sand rhythmically, producing small ripples that spread and then recede. The sky is bright blue with a few fluffy white clouds, and in the distance, swaying palm trees line the beach. The scene, rendered in pixel - art, has noticeable pixelated textures on the waves and sandy ground. Fixed shot, capturing the peaceful wave movement and the tranquil spring beach beauty.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pixel art"}
+{"index": 498, "data": "Panoramic shot of a beautiful coastal beach in spring, styled in cyberpunk aesthetics. The golden sandy shore stretches along the coastline, with gentle waves lapping the sand in rhythmic motions—their frothy white crests glistening under the ambient, neon - tinged light. The background showcases a skyline of futuristic skyscrapers, their facades embedded with glowing neon strips in electric purple and cyan, alongside holographic billboards projecting flickering advertisements against the hazy, technologically altered spring sky. The beach’s natural tranquility contrasts sharply with the high - tech, dystopian charm of the cyberpunk cityscape behind, where flying vehicles (drones or hovercars) occasionally zip across the sky. The camera holds steady, capturing the continuous motion of waves caressing the sand while the cyberpunk metropolis buzzes with artificial light and futuristic activity in the distance.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in cyberpunk style"}
+{"index": 499, "data": "Wide shot of a beautiful coastal beach in spring, rendered in an animated style. The shoreline is lined with soft golden - yellow sand, and gentle light - blue waves lap rhythmically against it, creating frothy white crests that dissolve into the sand. The sky above is a clear and vibrant blue with fluffy white clouds drifting lazily. In the distance, slender palm trees with lush green fronds sway gently in the breeze, their trunks casting delicate shadows on the sand. The water shimmers with bright, cartoonish hues, and the waves’ motion is fluid and stylized, typical of animated visuals. The scene exudes a cheerful and lively atmosphere, with the animated style enhancing the vivid colors and smooth, playful movement of the waves.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, animated style"}
+{"index": 500, "data": "A panoramic shot captures a beautiful coastal beach in spring, rendered in a watercolor - painting style with soft and blended hues that give the scene a dreamy and ethereal quality. Gentle waves, their surfaces glistening like liquid aquamarine under the mild spring sunlight, lap rhythmically against the golden, fine - grained sand, leaving subtle and shimmering ripples in their wake. The sky above is a pale, misty blue, with wispy clouds drifting lazily. On the distant horizon, faint silhouettes of rocky cliffs or islands can be seen, adding depth to the tranquil seascape. Scattered across the sand are delicate seashells with iridescent surfaces that catch the light, and patches of seaweed in muted greens and browns, gently swaying with the ebb and flow of the tide. The camera remains fixed, capturing the serene motion of the waves as they caress the shore. The entire scene is bathed in the soft and diffused light characteristic of a spring day by the coast.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, watercolor painting"}
+{"index": 501, "data": "Long shot captures a beautiful coastal beach in spring, rendered in surrealism style. Gentle waves with iridescent hues (typical of surrealist imagery) lap rhythmically against the fine, ivory - hued sand that glitters as if sprinkled with stardust. The sky above is a dreamlike blend of pastel pinks and blues, with clouds shaped like floating sea creatures, enhancing the surreal atmosphere. The camera remains steady, emphasizing the otherworldly stillness of the scene while the waves’ motion contrasts with the dreamy, static beauty of the beachscape.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, surrealism style"}
+{"index": 502, "data": "Panoramic shot of The Bund in Shanghai, presented in a Van Gogh - style. The historic waterfront buildings feature intricate facades rendered with bold, swirling brushstrokes of warm yellows and oranges, echoing Van Gogh’s expressive artistic technique. The Huangpu River below mirrors the vibrant, distorted colors of the sky—swirling blues and golden yellows that capture the dynamism of Van Gogh’s iconic brushwork, reminiscent of *Starry Night*. Pedestrians stroll along the promenade, their forms outlined with thick, expressive lines, while boats on the river float with shapes exaggerated by the artistic style. The camera pans slowly across the scene, capturing the interplay of light and color on the buildings’ surfaces and the rippling water, as the sky (reimagined in Van Gogh’s signature palette of vivid, swirling hues) looms overhead, enhancing the dreamy, painterly atmosphere.", "original_prompt_en": "The bund Shanghai, Van Gogh style"}
+{"index": 503, "data": "A panoramic shot in an oil - painting style captures The Bund in Shanghai. Grand colonial - style buildings with elaborate architectural details line the riverside, their facades rendered in rich, warm hues characteristic of oil paintings. The Huangpu River flows serenely in the foreground, with a few cruise ships gliding on the water, their reflections on the river surface blurred in a way that mimics the brushstrokes of an oil painting. The sky above is a harmonious mix of soft oranges and purples, evoking the glow of a sunset, with clouds depicted in gentle, flowing forms. Along the riverside walkway, silhouettes of pedestrians amble, their figures softened by the oil - painting effect, infusing the scene with a nostalgic and artistic atmosphere. The camera stays fixed, enabling a full appreciation of this picturesque, painterly portrayal of the iconic Shanghai landmark.", "original_prompt_en": "The bund Shanghai, oil painting"}
+{"index": 504, "data": "Panoramic shot of The Bund in Shanghai, rendered in the Ukiyo - e style of Hokusai. The scene showcases riverside buildings with Hokusai’s distinctive Ukiyo - e details: bold black outlines, delicate brushstrokes, and a muted yet vivid color palette. The Huangpu River flows calmly in the foreground, with traditional - styled boats (adorned with Ukiyo - e - like patterns) floating on the water. Along the riverbank, figures in attire blending local and Ukiyo - e influences move leisurely, some gazing at the water. The sky above is a soft gradient of blues and whites, with wispy clouds drawn in Hokusai’s characteristic lines. The camera remains steady, capturing the serene urban landscape reimagined through Hokusai’s Ukiyo - e artistry, merging historical artistry with Shanghai’s iconic waterfront scene.", "original_prompt_en": "The bund Shanghai by Hokusai, in the style of Ukiyo"}
+{"index": 505, "data": "Panoramic shot in black - and - white, showcasing the Bund in Shanghai. The scene is dominated by retro - styled European - architectural structures with stone - textured facades, their elaborate window patterns discernible in the monochromatic palette. The Huangpu River stretches before these buildings, with a few dark - toned vessels cruising on the tranquil water, their hulls and masts forming stark, elegant silhouettes. Along the riverside walkway, pedestrians in diverse outfits stroll leisurely—some in groups chatting, others pausing to admire the river view. The background sky, rendered in grayscale, exhibits subtle tonal variations, hinting at a clear or mildly overcast day. The camera slowly pans from left to right, capturing the continuous expanse of historic architecture and the bustling yet serene ambiance of the Bund.", "original_prompt_en": "The bund Shanghai, black and white"}
+{"index": 506, "data": "Panoramic shot in pixel art style of The Bund in Shanghai. The scene is composed of vibrant pixelated color blocks, with the Huangpu River in the foreground showing pixelated ripples. On the opposite bank, iconic buildings like the Oriental Pearl Tower stand with their pixelated outlines, displaying a retro video - game - like aesthetic. Along the riverside promenade, pixel - styled pedestrians, some strolling and some taking photos, move about. The sky is a clear blue with pixelated white clouds floating. In the river, pixelated boats with simple geometric shapes sail slowly. The camera remains fixed, capturing the nostalgic pixel - art - rendered bustling scene of Shanghai's Bund, where the pixelated architecture and people create a unique visual experience reminiscent of classic 8 - bit games.", "original_prompt_en": "The bund Shanghai, pixel art"}
+{"index": 507, "data": "Panoramic shot of The Bund in Shanghai, presented in cyberpunk style. It’s nighttime, with the sky a deep indigo hue, faintly illuminated by scattered neon glows that tint the humid air. The historic riverfront buildings are adorned with futuristic cyberpunk enhancements: holographic advertisements in neon pink and electric blue hover above their facades, while glowing LED strips trace the architectural outlines. The Huangpu River below mirrors the vibrant, chaotic light display, ripples distorting the reflections of illuminated skyscrapers and occasional futuristic drones hovering in the distance. On the promenade, pedestrians in cyberpunk - inspired attire—some with glowing cybernetic implants, others in sleek, neon - lined jackets—move slowly, their shadows elongated by multicolored streetlights. The camera pans from left to right, capturing the stark contrast between the antique architecture and the high - tech, dystopian - futuristic additions. Neon signs flicker, and digital billboards cycle through vivid, glitchy animations, blending The Bund’s old - world charm with a gritty, futuristic aesthetic.", "original_prompt_en": "The bund Shanghai, in cyberpunk style"}
+{"index": 508, "data": "Panoramic shot of The Bund in Shanghai, presented in an animated style. The scene showcases cartoon - styled historical buildings with vibrant, saturated colors lining the riverside, their architectural details simplified into playful, exaggerated shapes. The Huangpu River below has a smooth, stylized surface with gentle, animated ripples shimmering in bright hues. Cartoon - like boats, with rounded edges and bold color schemes, float slowly on the river, moving from the right to the left of the frame. The sky is a clear, bright blue, dotted with fluffy, white, cartoon clouds featuring soft, rounded outlines. In the foreground, cartoon pedestrians with exaggerated features and colorful clothing stroll along the riverside promenade. The camera pans slowly from left to right, capturing the lively, whimsical atmosphere of the animated Bund, where the iconic skyline and animated elements together create a cheerful, cartoonish scene.", "original_prompt_en": "The bund Shanghai, animated style"}
+{"index": 509, "data": "Panoramic shot of The Bund in Shanghai, presented in a watercolor painting style. The scene features the iconic European - styled buildings of the Bund, their facades adorned with soft, blended watercolor hues, their reflections gently rippling on the tranquil Huangpu River. The sky is dotted with light, wispy clouds, rendered with the delicate brushstrokes characteristic of watercolors. Small boats with softly - outlined forms float on the river, and faint, mist - like figures of pedestrians wander along the riverside promenade, their shapes slightly blurred to fit the watercolor aesthetic. The overall color scheme is delicate, dominated by pastel tones, creating a dreamy and artistic atmosphere that encapsulates the charm of Shanghai's Bund in the watercolor medium.", "original_prompt_en": "The bund Shanghai, watercolor painting"}
+{"index": 510, "data": "Panoramic shot of The Bund in Shanghai, presented in a surrealism style. The historic buildings lining the Bund exhibit dreamlike distortions—their stone facades curve and flow like liquid, merging into the sky in surreal, undulating silhouettes. The Huangpu River below mirrors a kaleidoscopic array of neon glows and impossible colors, its surface rippling with glowing, otherworldly patterns that ripple without natural logic. The sky is a surreal canvas of swirling, multicolored clouds, defying natural hues, and a hazy, shimmering mist shrouds the scene in an otherworldly blur. In the foreground, pedestrians move in slow, dreamlike motions, their forms slightly warped as if in a waking dream. The camera pans gently across the scene, capturing the surreal fusion of Shanghai’s iconic architecture and fantastical elements: some buildings hover above the ground, the river’s surface undulates with impossible wave - like textures, and light casts surreal, shifting patterns. The atmosphere is ethereal, with reality and dreamscape merging—architectural details melt into the air, and the river glows with unearthly, pulsating light, crafting a scene both familiar (The Bund) and utterly surreal.", "original_prompt_en": "The bund Shanghai, surrealism style"}
+{"index": 511, "data": "A medium shot captures a shark swimming in the ocean, rendered in Vincent van Gogh’s iconic artistic style. The shark, with a sleek form, is depicted with swirling, vibrant blues and yellows across its gray - toned body, echoing the dynamic brushstrokes of Van Gogh’s works. The surrounding ocean water bursts with chaotic, painterly patterns: deep cobalt waves intermingle with golden - hued currents, mimicking the turbulent, textured skies of *The Starry Night*. The background dissolves into a dreamlike expanse of stylized, brushstroke - filled water, as the shark glides gracefully forward, its fins slicing through the vividly rendered sea.", "original_prompt_en": "a shark is swimming in the ocean, Van Gogh style"}
+{"index": 512, "data": "A medium long shot in an oil - painting style captures a grayish - blue shark with a sleek, streamlined body swimming in the deep blue ocean. The shark glides gracefully, its tail fin undulating rhythmically as it moves, with the ocean water depicted in rich, brush - stroked turquoise and deep blue tones, showing gentle waves. In the distance, faint, painterly outlines of coral reefs and small fish add to the scene’s depth. The oil - painting effect enhances the vivid colors and gives the shark’s fluid motion a dreamy, artistic quality as it swims towards the left of the frame.", "original_prompt_en": "a shark is swimming in the ocean, oil painting"}
+{"index": 513, "data": "A medium shot captures a sleek gray shark with a streamlined body swimming gracefully in the ocean, rendered in the Ukiyo - e style reminiscent of Hokusai’s masterful works. The shark’s smooth, gray skin glistens as it undulates its powerful tail fin, moving slowly towards the right of the frame. The ocean around it showcases the characteristic Ukiyo - e aesthetics: waves with bold, curvilinear brushstrokes in varying shades of blue, evoking the traditional woodblock - print texture, and the water surface is dotted with white, stylized foam that mirrors Hokusai’s iconic wave depictions. The background reveals a vast expanse of the sea, with hints of distant, misty horizons rendered in the soft, muted tones typical of Ukiyo - e, while the shark continues its elegant swim, embodying the dynamic yet serene essence of Hokusai’s oceanic visions.", "original_prompt_en": "a shark is swimming in the ocean by Hokusai, in the style of Ukiyo"}
+{"index": 514, "data": "A medium shot captures a black - and - white shark swimming in the ocean. The shark has a streamlined body, and its black - and - white patterned skin stands out against the deep - blue ocean water. Gentle waves roll around it, and some light rays penetrate the water surface, creating a shimmering effect. The shark moves smoothly, with its body undulating rhythmically as it swims towards the right of the frame, and the camera follows its movement to keep it in the center of the shot.", "original_prompt_en": "a shark is swimming in the ocean, black and white"}
+{"index": 515, "data": "A medium shot in pixel art style captures a gray shark with blocky, pixelated features swimming in the ocean. The shark has a streamlined body with pixelated dorsal, pectoral, and caudal fins, its mouth slightly open to reveal pixelated white teeth. The ocean background consists of pixelated blue water with pixelated waves and distant pixelated seaweed. The retro pixel art style gives the scene a low - resolution, square - pixel appearance. The shark swims steadily toward the right of the frame, and the camera follows its movement, maintaining focus on the shark as it glides through the pixelated ocean.", "original_prompt_en": "a shark is swimming in the ocean, pixel art"}
+{"index": 516, "data": "The visual style is cyberpunk, with dimness and neon glows. A panoramic shot captures a gray shark with a streamlined body swimming in the cyberpunk - themed ocean. The ocean shimmers with blue - purple neon reflections, and the background reveals submerged futuristic city ruins, metallic structures, and flickering neon lights cutting through the dark water. The shark’s skin has faint, glowing circuit - like patterns, and it swims forward smoothly, tail fin swaying rhythmically. The camera follows the shark from a side angle, capturing its fluid motion as it navigates the neon - lit, dystopian underwater space, with floating digital particles dancing around.", "original_prompt_en": "a shark is swimming in the ocean, in cyberpunk style"}
+{"index": 517, "data": "An animated - style medium shot captures a cartoonish gray shark with a streamlined body and sharp fins swimming gracefully in the deep blue ocean. The water around it is filled with bubbly trails and faint silhouettes of colorful tropical fish darting in the background. The shark swings its tail side to side, propelling itself forward with smooth, exaggerated motions typical of animation. The camera follows the shark’s movement, panning slightly to keep it centered in the frame, while the ocean’s surface above shows gentle, stylized waves reflecting bright, vibrant colors characteristic of the animated art style.", "original_prompt_en": "a shark is swimming in the ocean, animated style"}
+{"index": 518, "data": "A medium shot in a watercolor painting style captures a grayish - blue shark with a streamlined body and a triangular dorsal fin swimming gracefully in the deep blue ocean. The ocean water shows soft, blended light - blue waves, and there are a few floating seaweeds and tiny silver - colored fish in the background, creating a tranquil marine scene. The shark moves smoothly from the right to the left of the frame, its tail fin swaying rhythmically, and the camera follows its movement to keep the shark in focus.", "original_prompt_en": "a shark is swimming in the ocean, watercolor painting"}
+{"index": 519, "data": "A medium shot in a surrealism style captures a gray - blue shark with a streamlined body swimming in the ocean. The shark’s dorsal fin slices through the water, which shimmers with surreal hues of deep purple and turquoise, dotted with bioluminescent jellyfish emitting faint blue light. The ocean floor is lined with distorted, otherworldly coral formations in neon pink and green, twisting in impossible shapes. The shark moves gracefully, its tail fin undulating in a fluid, almost dreamlike motion as it glides from the right to the left of the frame. The camera follows the shark’s movement, panning left to keep it centered, while the surreal background—with floating, translucent geometric shapes and a sky (visible through the water’s surface) painted in pastel oranges and purples—enhances the dreamlike atmosphere.", "original_prompt_en": "a shark is swimming in the ocean, surrealism style"}
+{"index": 520, "data": "Medium shot captures a giant panda in a Van Gogh - styled Parisian café, sipping coffee from a white ceramic cup. The panda’s black - and - white fur is recreated with Van Gogh’s signature bold, swirling brushstrokes, interspersed with vivid yellows and deep blues. The café’s interior is full of color: the walls are decorated with starry - night - inspired swirling patterns, the wooden tables and chairs have exaggerated, brushstroke - like edges, and a window shows a Paris street in impressionistic, sun - lit hues. The panda holds the cup with its right paw, head slightly tilted as it drinks, and the scene is rendered with Van Gogh’s characteristic thick, textured strokes, creating a dreamy, painterly atmosphere of a Parisian café.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, Van Gogh style"}
+{"index": 521, "data": "In an oil - painting style, a medium shot depicts a giant panda with its iconic black - and - white fur seated at a rustic wooden table in a quaint Parisian café. The panda, with one paw curled around a delicate porcelain coffee cup adorned with golden floral patterns, is gently sipping the rich, dark coffee. The café, rendered with thick, vibrant oil - paint brushstrokes, features plush velvet chairs in deep red, walls decorated with impressionistic scenes of Parisian boulevards, and warm, amber light filtering through lace - curtained windows. Outside, the faint, painterly outlines of Parisian architecture—like stone buildings with wrought - iron balconies and the distant silhouette of the Eiffel Tower—add to the scene’s charm. The entire composition, with its textured brushwork and dreamy color palette, captures the panda’s tranquil moment of enjoying coffee in the heart of Paris, evoking the timeless beauty of an oil painting.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, oil painting"}
+{"index": 522, "data": "Medium shot in the style of Ukiyo - e (evoking Hokusai's artistic style) captures a panda with black - and - white fur seated at a wooden table in a Parisian café. The panda, with a rounded body and a relaxed demeanor, holds a white coffee cup adorned with delicate Ukiyo - e - style patterns in its right paw, sipping the dark brown coffee. The café’s interior is warmly lit, featuring wooden furniture, a vintage bar counter, and walls decorated with retro Paris street - scene prints. Outside the window, the silhouettes of Parisian buildings with European - style facades and a softly hued sky (in the muted, elegant color palette characteristic of Ukiyo - e) are visible. The scene is rendered with the delicate linework and soft color gradients of traditional Ukiyo - e prints, blending Eastern artistic aesthetics with the charm of a Parisian café. The camera remains fixed, capturing the panda’s leisurely coffee - drinking moment, while blurred figures of other patrons in the background enhance the dreamy, woodblock - print - like atmosphere.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris by Hokusai, in the style of Ukiyo"}
+{"index": 523, "data": "Medium shot captures a black - and - white giant panda in a cozy Parisian café. The panda, with its distinctive black - and - white fur (black patches around the eyes, ears, and limbs, white on the body), sits on a wooden chair, holding a white ceramic coffee cup with both paws, sipping the brown coffee as steam gently drifts up. The café’s interior features warm yellow lighting, wooden tables, and framed Parisian street - scene paintings on the walls. Outside the window, the backdrop reveals Paris’s iconic cobblestone streets, pastel - hued buildings, and a few pedestrians strolling. The camera remains fixed, documenting the panda’s relaxed demeanor as it savors the coffee.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, black and white"}
+{"index": 524, "data": "A medium shot in pixel - art style depicts a panda with blocky black - and - white pixels. It is seated at a wooden table in a Parisian café. The café's interior has pixel - styled French - inspired decor: floors with pixel - patterned tiles, a counter with pixelated coffee machines, and framed pixel - art pictures of Parisian streets on the walls. The panda, with its pixel - rendered fur (black patches on a white base, made up of square pixels), holds a pixelated brown coffee cup in its right paw, bringing it to its mouth as if sipping coffee. Outside the café's large pixelated windows, the background shows pixelated Parisian architecture with stone - faced buildings and wrought - iron balconies. The scene has a crisp, nostalgic pixel - art look, and the panda's sipping motion is animated in smooth, block - based frames, typical of classic pixel - art aesthetics.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, pixel art"}
+{"index": 525, "data": "Medium shot of a giant panda in a cyberpunk - styled Parisian café. The panda, with its iconic black - and - white fur, is seated at a sleek, metallic table. It holds a transparent glass coffee cup with its paw, gently bringing it to its mouth as if savoring the coffee. The café is filled with cyberpunk elements: neon lights in vibrant hues of purple and blue illuminate the space, holographic menus float in the air, and the walls are adorned with futuristic graffiti. Outside the large, tinted glass windows, the Parisian street is transformed into a cyberpunk scene, with rain - slicked roads reflecting the neon glow, tall buildings covered in digital billboards, and flying vehicles hovering in the overcast sky. The panda remains seated, slowly sipping the coffee, while the camera subtly pans to capture the blend of classic Parisian architecture and cyberpunk aesthetics in the background.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, in cyberpunk style"}
+{"index": 526, "data": "An animated medium shot depicts a panda drinking coffee in a charming Parisian café. The panda, with its classic black - and - white fur, round black eye patches, and fluffy ears, holds a white coffee cup (with a wisp of steam rising) in its paw, sipping leisurely. The café’s interior is cozy and vintage - inspired, with wooden tables and chairs, floral - patterned curtains, and a framed print of the Eiffel Tower on the wall. Outside the window, the silhouette of the Eiffel Tower is visible against a soft, pastel - colored sky, which is characteristic of the animated style. The animation features vibrant colors, smooth lines, and a playful tone, highlighting the panda’s relaxed and adorable demeanor as it savors its coffee in this Parisian setting.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, animated style"}
+{"index": 527, "data": "In a watercolor painting style with soft, blended edges, a medium shot reveals a panda with distinctive black - and - white fur seated at a rustic wooden table in a charming Parisian café. The panda, holding a delicate white coffee cup in its paw, is gently sipping the coffee, and its posture is relaxed. The café’s interior is warm and inviting, with brown leather chairs, vintage - style lamps, and framed posters adorning the walls. Outside the large, paned windows, the iconic Parisian streetscape unfolds—stone buildings with wrought - iron balconies, cobblestone streets, and distant tree - lined avenues. The watercolor technique lends a dreamy, artistic quality to the scene, emphasizing the whimsical contrast of a panda enjoying coffee in a quintessentially French setting.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, watercolor painting"}
+{"index": 528, "data": "A medium shot in surrealism style captures a giant panda with distinctive black - and - white fur, large black eye patches, and a rounded belly, seated at a wooden table in a charming Parisian café. The café features vintage wooden chairs, soft warm lighting, and walls adorned with classic French art prints. The panda holds a delicate white porcelain coffee cup in its paw, sipping the dark brown coffee slowly; steam from the cup twists into surreal, cloud - like shapes. Outside the café’s glass window, the Eiffel Tower stands in the distance, bathed in a dreamy, pastel - colored light that blends reality and fantasy. The panda’s expression is calm and content, and its striking fur contrasts with the café’s warm, cozy tones, amplifying the surreal atmosphere.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, surrealism style"}
+{"index": 529, "data": "A medium shot captures a cute, happy Corgi with short, fluffy tricolor fur playing energetically in a park, bathed in the warm, swirling hues of a Van Gogh - style sunset. The Corgi, with perky ears and a wagging tail, bounds across the grassy area, chasing a fallen leaf. The park’s background features tall trees with twisting branches (echoing Van Gogh’s brushstrokes), and the sky blazes with vibrant oranges, yellows, and deep purples, mimicking Van Gogh’s dynamic, textured style. The ground is a patchwork of green grass and golden sunset light. As the Corgi plays, it occasionally pauses to gaze at the colorful sky, its eyes bright with joy. The camera follows its playful antics, capturing the whimsical, painterly quality of the scene, with long, expressive shadows echoing the artistic style.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, Van Gogh style"}
+{"index": 530, "data": "The image has an oil - painting style with soft, brush - like textures. A medium shot captures a cute and happy Corgi with a fluffy body, short legs, and a coat of warm brown (with white patches) playing in a park. The park is filled with lush green grass, and colorful flowers are scattered here and there. Tall trees stand in the background, their leaves gently swaying in the evening breeze. As the sun sets, the sky is painted in warm shades of orange and pink, casting a golden glow over the entire scene. The Corgi jumps around, chasing a small butterfly, its tail wagging joyfully, fully enjoying the playful moment under the beautiful sunset in the park.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, oil painting"}
+{"index": 531, "data": "Panoramic shot of a cute, happy Corgi with short brown - and - white fur, stubby legs, and perky ears playing in a park. The Corgi bounds around, chasing its fluffy tail and wagging it energetically. The background features a Hokusai - style sunset, with the sky dyed in warm oranges and reds, clouds depicted with the delicate, flowing lines typical of Ukiyo - e. The park has lush green grass, scattered cherry blossom trees with pink petals, and a winding stone path. Rendered in the style of Ukiyo - e, the scene boasts soft, muted colors and elegant brushstrokes. The camera follows the Corgi’s movements, capturing its playful antics against the picturesque sunset - lit park.", "original_prompt_en": "A cute happy Corgi playing in park, sunset by Hokusai, in the style of Ukiyo"}
+{"index": 532, "data": "The picture is in black and white, with a warm sunset ambiance implied by the lighting. A medium shot captures a cute, happy Corgi with short, fluffy fur and a stubby tail playing energetically in a park. The Corgi bounds across a grassy area, occasionally stopping to wag its tail or nuzzle at the grass. The background includes park trees with dark silhouettes, a few benches, and the sky—where the sunset’s glow translates to soft light - dark contrasts in the monochrome frame. The camera follows the Corgi as it runs, capturing its joyful leaps and spins. The Corgi looks toward the camera with a cheerful expression, its ears perked, embodying a playful spirit. The park’s serene environment, with the sunset casting long shadows in grayscale, enhances the black - and - white aesthetic.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, black and white"}
+{"index": 533, "data": "Pixel art style, medium shot: A cute, happy Corgi with short, fluffy brown - white fur plays energetically in a park at sunset. The Corgi wags its stumpy tail, bounces around the grassy field, and occasionally pauses to nuzzle the green grass dotted with vibrant wildflowers. The background shows a park scene bathed in the warm glow of sunset: the sky is a gradient of orange, pink, and deep purple, and tall trees with leaves glowing golden in the light line the edge of the park. A wooden bench is on the left, and a winding stone - tiled path curves through the grass. The ground is lush green grass with patches of exposed soil. The camera stays fixed, capturing the Corgi as it joyfully chases a fluttering leaf or runs in circles. The pixel art style gives a retro, blocky aesthetic, enhancing the nostalgic sunset ambiance with sharp, colorful pixel details.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, pixel art"}
+{"index": 534, "data": "Panoramic shot of a cute, happy Corgi with short yellow - white fur and a plump rump playing in a cyberpunk - styled park. The sky glows with warm orange and deep purple during sunset, while the park is filled with neon - lit holographic billboards, futuristic metallic structures, and glowing neon pathways. The Corgi, with its short legs bouncing, joyfully chases a cyan - hued light spot (from a neon sign) on the ground, its tail wagging rapidly. In the background, towering buildings with cascading LED strips and floating drones hover, blending the sunset’s warmth with the park’s cool cyberpunk aesthetics. The camera follows the Corgi as it prances left, capturing its playful movements against the sci - fi - infused park and the colorful sunset sky.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, in cyberpunk style"}
+{"index": 535, "data": "An animated - style medium full shot captures a cute and happy Corgi with a short, fluffy tricolor (brown, white, and black) coat and a stumpy tail playing in a park. The Corgi has bright, sparkling eyes and a wide, joyful grin, bounding around on its short legs, chasing a colorful butterfly that flutters near the grass. The park background features lush green lawns, a few tall trees with leaves gently swaying in the breeze, and a stunning sunset painting the sky in vibrant hues of orange, pink, and purple, with golden - hued clouds drifting slowly. The animation style brings vivid, saturated colors, smooth, exaggerated movements (like the Corgi’s body stretching comically as it leaps), and adorable, cartoon - like facial expressions (its eyes turning into star shapes when excited). The camera follows the Corgi’s playful movements, panning slightly to the right as it darts across the grass, with the warm glow of the sunset casting long, whimsical shadows on the ground.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, animated style"}
+{"index": 536, "data": "A medium shot with a watercolor painting aesthetic captures a cute, happy Corgi—with short, fluffy tricolor fur (predominantly brown with white markings) and a cheerfully wagging stumpy tail—playing energetically in a park at sunset. The scene is rendered in soft, blended watercolor strokes: the sky glows with warm orange and pink hues, casting a golden glow over the park’s lush green grass and scattered, silhouette - like trees. The Corgi bounces around, chasing a fallen leaf or pouncing on dappled sunlight, its ears perked and tongue lolling in joy. The background features a serene park landscape with gentle grassy slopes, a few distant flower beds, and the sunset’s radiant light reflecting off a small, shimmering pond (styled as a watercolor wash). The camera follows the Corgi’s playful movements, panning left as it darts right, preserving the dreamy, artistic watercolor effect throughout. The sunset’s warm tones and the watercolor style combine to create a whimsical, cheerful scene of the Corgi’s park adventure.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, watercolor painting"}
+{"index": 537, "data": "A medium full shot captures a cute happy Corgi with short brown fur and white markings on its chest and face, playing energetically in a park. The Corgi bounds across lush green grass dotted with vibrant, surreal flowers that glow faintly under the warm sunset light. The sky is ablaze with vivid oranges, pinks, and a dreamy purple hue as the sun sets, creating a surreal, painterly atmosphere. The park’s trees have twisted, whimsical shapes, and the grass ripples like water in patches, enhancing the surrealist style. The Corgi chases a large, iridescent butterfly that floats in mid - air, its short legs moving quickly as it bounds around. The camera follows its playful movements, capturing the long, distorted shadows cast by the surreal sunset light on the ground. In the background, the trees’ leaves shimmer, and the clouds morph into fantastical forms, further emphasizing the surreal aesthetic. The warm golden light of the setting sun bathes the scene, blending reality and dream as the Corgi continues its joyful play.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, surrealism style"}
+{"index": 538, "data": "Medium shot captures Gwen Stacy, a young woman with shoulder - length blonde hair, dressed in a blue - and - white striped dress, seated as she reads a book. Her gaze is fixed on the pages, and her fingers gently hold the book’s edge. The background is infused with Van Gogh’s signature style: swirling, vividly colored brushstrokes form a sky of deep indigo and blazing golden - yellow spirals (reminiscent of “The Starry Night”), creating a dreamy, painterly atmosphere that contrasts with her still, focused posture. The camera stays still, highlighting her quiet concentration as she reads, while the Van Gogh - inspired backdrop throbs with dynamic, expressive color and texture, merging the real - life scene with the artist’s iconic vision.", "original_prompt_en": "Gwen Stacy reading a book, Van Gogh style"}
+{"index": 539, "data": "A medium shot captures Gwen Stacy reading a book, rendered in the style of an oil painting. She has long, wavy blonde hair and wears a delicate white blouse with lace trims. The book, with a dark brown leather cover, rests on her lap as her fingers lightly trace the pages. The background is filled with soft, impressionistic brushstrokes—warm amber and muted blue hues blend across the canvas - like scene, with a vintage wooden chair and a faded floral tapestry adding to the painterly, nostalgic atmosphere. Her posture is relaxed yet focused, eyes fixed on the text, capturing the serene, timeless quality of an oil painting.", "original_prompt_en": "Gwen Stacy reading a book, oil painting"}
+{"index": 540, "data": "Medium shot captures Gwen Stacy deeply absorbed in reading a book by Hokusai, presented in the Ukiyo - e style. She sports her characteristic blonde hair, styled in gentle, cascading waves, and dons a flowing outfit with subtle patterns evoking traditional Japanese textiles. The book she holds boasts a cover decorated with Hokusai’s renowned artwork—maybe the dynamic Great Wave or a peaceful landscape—with its pages showcasing intricate woodblock - print - like illustrations. The backdrop is a serene, traditional Japanese interior: tatami mats lie beneath her, shoji doors softly diffuse the natural light, and framed Ukiyo - e prints adorn the walls, their vivid hues and detailed scenes harmonizing with the book’s aesthetic. Gwen sits in a relaxed yet attentive stance, her eyes meticulously following the text and images, occasionally lingering on a striking illustration. The soft, diffused light bathes her and the book in a warm radiance, amplifying the Ukiyo - e - inspired atmosphere that permeates the scene.", "original_prompt_en": "Gwen Stacy reading a book by Hokusai, in the style of Ukiyo"}
+{"index": 541, "data": "A medium shot in black - and - white captures Gwen Stacy engrossed in reading a book. She has shoulder - length hair (appearing as soft gray in the monochrome style) and is dressed in a simple, dark - toned blouse (presented in shades of black and white). The book, a hard - covered volume with faint text on its cover, is held in her hands as she focuses intently on the pages. The background features a quiet room with wooden bookshelves filled with various books, their spines creating a patterned backdrop. Gwen sits upright, her eyes scanning the text, occasionally moving her fingers to trace the lines or turn a page. The black - and - white aesthetic casts gentle shadows on the plain walls, lending a classic, serene atmosphere to the scene.", "original_prompt_en": "Gwen Stacy reading a book, black and white"}
+{"index": 542, "data": "A medium close - up shot in pixel art style captures Gwen Stacy. She has long blonde hair and wears her iconic blue - white checkered school uniform. She sits upright, holding a pixelated book with a simple block - patterned cover in both hands, her eyes intently fixed on the pages as she reads. The background is a pixel - constructed room, with walls composed of vibrant, neatly arranged pixel blocks, a pixel - art table with a tiny pixel - styled plant, and other retro - pixel decorative items. The entire scene exhibits the classic pixel art aesthetic, with distinct blocky edges and a nostalgic, colorful palette.", "original_prompt_en": "Gwen Stacy reading a book, pixel art"}
+{"index": 543, "data": "Medium shot captures Gwen Stacy, with her blonde hair styled in a sleek bob, seated on a metallic bench amidst a cyberpunk cityscape. She dons a black leather jacket etched with neon - blue circuit - like patterns, futuristic goggles perched on her forehead, and fingerless gloves. In her hands, a book with a glowing magenta cover rests, her eyes fixed on the pages as she traces them with her right index finger. The backdrop erupts with neon - lit skyscrapers, holographic ads flickering with garbled text, and rain - drenched streets reflecting vibrant hues—purple, cyan, and red neon signs cast glows on the wet pavement. A hovering drone drifts by in the distance, and the air thrums with the low hum of hovercars. Gwen remains still, engrossed in her reading, as the cyberpunk world’s neon chaos swirls around her, neon light streaks from passing vehicles intermittently illuminating the scene.", "original_prompt_en": "Gwen Stacy reading a book, in cyberpunk style"}
+{"index": 544, "data": "Medium shot in an animated style captures Gwen Stacy reading a book. She has long blonde hair with a white headband, wearing a blue - and - white striped top. Her eyes are fixed on the book, fingers gently holding the pages, and a faint smile on her face. The background is a cozy animated room with pastel - colored walls, a window revealing a sunny sky with fluffy clouds, and a wooden bookshelf filled with colorful books. The camera remains fixed, emphasizing her calm reading posture, with soft, vibrant animation lines and bright, saturated colors characteristic of the animated style.", "original_prompt_en": "Gwen Stacy reading a book, animated style"}
+{"index": 545, "data": "Medium shot of Gwen Stacy, her long blonde hair flowing over her shoulders, clad in a white blouse and a blue skirt, seated upright on a wooden chair. She holds a hardcover book with both hands, her gaze fixed on the pages, a faint smile gracing her lips. The backdrop is a cozy art studio: a wooden easel with a half - completed watercolor painting of a blooming rose stands to her right, a palette brimming with vivid watercolor shades (blues, pinks, yellows) lies on a cluttered desk, along with scattered paintbrushes and sheets of watercolor paper. Sunlight streams through a window, casting a warm radiance over the scene, and the walls are decorated with framed watercolor artworks. As she reads, she occasionally looks up, her attention momentarily drawn to the watercolor painting on the easel before she resumes reading.", "original_prompt_en": "Gwen Stacy reading a book, watercolor painting"}
+{"index": 546, "data": "A medium shot presents Gwen Stacy, with long golden hair cascading down, clad in a white button - up shirt and a blue plaid skirt (evoking her iconic school uniform look), deeply engrossed in reading a book. The book boasts an ornate, vintage - styled cover adorned with swirling, iridescent patterns that emit a faint, otherworldly glow, perfectly suiting the surrealism aesthetic. The background unfolds as a surreal realm: books of all sizes float weightlessly around her, the walls undulate like rippling water, and vivid, neon - colored light beams intersect in the air, crafting a dreamlike ambiance. Gwen’s gaze is fixed on the pages, her right hand steadying the book while her left hand occasionally flicks to turn a page. The camera stays in a fixed shot, highlighting the surreal elements surrounding her as she remains motionless, fully absorbed in her reading.", "original_prompt_en": "Gwen Stacy reading a book, surrealism style"}
+{"index": 547, "data": "Long shot, rendered in Van Gogh’s signature style with vivid, swirling brushstrokes and richly saturated colors. A wooden boat, its hull adorned with warm, earthy hues, sails leisurely along the Seine River—where the water shimmers with dynamic patches of deep blue and golden yellow, echoing the textural vibrancy of Van Gogh’s landscapes. In the background, the Eiffel Tower rises, its iron framework rendered with bold, expressive lines that mirror the artist’s distinctive technique. The sky above is a tumult of swirling clouds in fiery oranges and moody purples, emblematic of Van Gogh’s celestial scenes. The camera holds steady, capturing the boat’s gentle glide across the river, while the Eiffel Tower stands as a striking, artistically stylized backdrop.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, Van Gogh style"}
+{"index": 548, "data": "A medium long shot captures a small boat with a rustic wooden hull sailing leisurely along the calm Seine River, its gentle movement creating soft, silvery ripples on the water’s surface. In the background, the iconic Eiffel Tower rises majestically, its lattice - like iron framework rendered with thick, vibrant brushstrokes that are typical of an oil painting, standing out against a sky brushed with warm, painterly shades of orange and purple. Crafted in an oil - painting style, the scene features rich, textured brushstrokes that enhance the tranquil atmosphere. The boat glides slowly towards the right of the frame, and the camera remains fixed, allowing the viewer to fully absorb the serene, artistic depiction of the river, the tower, and the surrounding landscape.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, oil painting"}
+{"index": 549, "data": "A medium full shot captures a traditional wooden boat, styled in the delicate Ukiyo - e manner of Hokusai, sailing leisurely along the Seine River. The boat, with its curved hull and understated decorative patterns, glides smoothly over the rippling, turquoise water. In the background, the iconic Eiffel Tower rises, its iron latticework silhouetted against a pale, cloud - dotted sky. The scene is imbued with Ukiyo - e’s signature flat colors and flowing brushstrokes: the boat moves steadily toward the right of the frame, while the fixed camera lets the viewer take in the peaceful fusion of the historic vessel, the tranquil river, and the stately Eiffel Tower, all filtered through Hokusai - inspired artistic sensibilities.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Hokusai, in the style of Ukiyo"}
+{"index": 550, "data": "A long - shot in black - and - white captures a small boat with a simple, classic design sailing leisurely along the Seine River. The boat glides smoothly over the river's surface, where gentle ripples spread out in the monochromatic tones. In the background, the iconic Eiffel Tower, with its delicate iron - framed structure, stands tall and prominent, its form presented in striking black - and - white contrasts. The riverbanks are lined with the shadowy outlines of historic buildings, which blend into the grayscale sky above. As the boat continues its unhurried journey along the Seine, the black - and - white aesthetic gives the scene a timeless, nostalgic atmosphere, with the Eiffel Tower serving as a majestic, constant backdrop.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, black and white"}
+{"index": 551, "data": "Pixel art style, a medium shot captures a small boat with blocky, pixelated textures sailing leisurely along the calm, blue waters of the Seine River. The boat’s hull, rendered in simple pixelated hues, moves smoothly, leaving gentle, pixel - formed ripples on the water. In the background, the iconic Eiffel Tower stands tall, its metal framework stylized into distinct pixel blocks, silhouetted against a softly colored sky. The camera stays steady, focusing on the boat’s tranquil journey as it glides, with the Eiffel Tower’s pixel - art - like form offering a picturesque, nostalgic backdrop characteristic of retro pixel art.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pixel art"}
+{"index": 552, "data": "Panoramic shot of a cyberpunk - styled boat sailing leisurely along the Seine River. The boat, with a sleek metallic body adorned with neon - colored (blue and purple) light strips, glides smoothly on the river, whose surface reflects the colorful lights from the boat and the surroundings. In the background, the Eiffel Tower, reimagined in cyberpunk style, is wrapped with holographic projections and neon light tubes, emitting a cool - toned glow against the dark, misty sky typical of cyberpunk aesthetics. The boat continues its leisurely journey forward, and the camera remains fixed, capturing the harmonious blend of the futuristic vessel, the iconic tower, and the cyberpunk - infused river scene.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in cyberpunk style"}
+{"index": 553, "data": "A medium long shot (animated style, with vibrant colors and smooth lines) captures a charming cartoon - styled boat with a sleek white hull and blue accents sailing leisurely along the calm, glistening Seine River. In the background, the iconic Eiffel Tower stands tall, its iron lattice structure rendered in warm golden hues, with fluffy white clouds drifting across a bright blue sky. The river’s surface reflects the vivid colors of the scene, and the boat gently moves from the right to the left of the frame, while the camera remains fixed, emphasizing the relaxed pace of the voyage against the picturesque Parisian backdrop.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, animated style"}
+{"index": 554, "data": "A long shot in a watercolor painting style captures a small boat with a wooden hull sailing leisurely along the Seine River. The boat glides smoothly on the calm, reflective water that shimmers with soft, blended hues of blue and green, characteristic of watercolor’s delicate color - blending. In the background, the iconic Eiffel Tower stands tall, its iron lattice structure rendered in gentle, muted tones, partially veiled by a light, misty atmosphere that enhances the dreamy watercolor effect. The sky above is a wash of pale grayish - blue with faint, wispy clouds. The boat continues its tranquil journey downstream, and the camera remains steady to emphasize the serene, painterly quality of the scene.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, watercolor painting"}
+{"index": 555, "data": "The sky is suffused with surreal, pastel - hued clouds, blending lavender and gold. A long shot captures a sleek, white sailboat gliding leisurely along the calm, reflective waters of the Seine River. The boat’s sails, billowing gently, carry faint, surreal patterns as if painted by an otherworldly brush. In the background, the iconic Eiffel Tower stands, its iron latticework distorted in subtle, dreamlike ways—edges blurred, colors shifting in a surreal play of light and shadow, as if the structure is both solid and ethereal. The river’s surface mirrors the sky’s surreal palette, rippling with iridescent waves that defy natural physics. The camera remains steady, framing the boat’s tranquil journey and the Eiffel Tower’s surreal silhouette, emphasizing the scene’s otherworldly, artistic (surrealism) atmosphere.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, surrealism style"}
+{"index": 556, "data": "Medium full shot, styled in Van Gogh’s signature swirling, vivid brushwork. A couple in elegant formal evening wear—he in a tailored black tuxedo with a crisp white shirt and bow tie, she in a flowing, rich - hued gown trailing to the ground—are caught in a heavy downpour while heading home, each holding a dark umbrella. Raindrops splash dynamically against the umbrellas, forming a colorful, turbulent pattern. The background reveals a city street lit by hazy, warm streetlights, with the sky swirling in deep blue and purple tones, echoing Van Gogh’s stormy skies. The couple walks slowly, their attire glistening with rain, the fabric of their evening wear clinging slightly as the downpour soaks through. The camera stays steady, capturing the contrast between their refined outfits and the wildly painterly storm, with the wet pavement reflecting lights in a Van Gogh - esque play of light and reflection.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, Van Gogh style"}
+{"index": 557, "data": "Medium full shot of a couple in elegant formal evening wear—he in a sharp black tuxedo with a white dress shirt and bow tie, she in a flowing satin gown with delicate lace trims—caught in a heavy downpour as they head home. Both hold dark umbrellas, raindrops hammering the canopies and splattering onto the glistening wet street. The scene has an oil - painting texture: rich, brush - stroked colors, soft edges, and a dreamy, painterly glow. Behind them, a dim street lined with vintage street lamps casts amber light through the rain, buildings with wet, reflective facades looming in the mist. The couple huddles under their umbrellas, walking slowly; their attire glistens with moisture, the gown’s fabric clinging slightly, the tuxedo’s lapels damp. The camera stays fixed, framing their poised figures against the dramatic rain - soaked urban backdrop—street lamps, shadowed buildings, and the wild downpour—emphasizing the contrast between their refined elegance and the tempestuous weather, all rendered in the lush, textured style of an oil painting.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, oil painting"}
+{"index": 558, "data": "Panoramic shot capturing a couple in formal evening wear—the man in a tailored black tuxedo with a crisp white shirt and bow tie, the woman in an elegant floor - length gown with delicate lace trimmings—on their way home, caught in a heavy downpour. They each hold an ornate umbrella, the umbrella fabric patterned in the Ukiyo - e style, evoking the artistry of Hokusai. The sky is overcast with dark storm clouds, and dense raindrops pound the ground, creating ripples in the puddles that reflect the faint glow of street lamps. The background features traditional Japanese - style buildings with tiled roofs, their forms softened by the misty rain, mirroring the atmospheric aesthetics of Ukiyo - e. The couple huddles closer under their umbrellas, carefully navigating the water - logged street, and the camera remains fixed, capturing the dramatic contrast between their refined attire and the tempestuous weather, all rendered in the vivid, stylized manner characteristic of Hokusai's Ukiyo - e.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas by Hokusai, in the style of Ukiyo"}
+{"index": 559, "data": "In black - and - white, a medium shot captures a couple in formal evening wear — the man in a tailored black tuxedo, the woman in a flowing white gown — heading home, caught in a heavy downpour. They hold umbrellas: his is black, sleekly shielding him from the rain; hers is white, gently deflecting the cascading raindrops. The street is slick with pooled water, and the background reveals blurred city buildings, their outlines softened by the relentless rain. The couple presses on, their formal attire dampening, as the camera holds on their resolute steps through the storm.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, black and white"}
+{"index": 560, "data": "Pixel - art style (with low resolution and block - like textures). A medium full shot captures a couple dressed in elegant formal evening wear — the man in a black tuxedo (featuring a white dress shirt and a black bow - tie) and the woman in a long, flowing dark - hued evening gown adorned with delicate lace trims. They are on their way home and get caught in a heavy downpour, where raindrops are rendered as tiny square pixels splashing all around them. Both of them hold black umbrellas with pixelated patterns, trying to shield themselves from the rain: the man leans towards the woman, tilting the umbrella more to cover her, while the woman clutches her gown to prevent it from getting soaked. The background presents a dimly lit city street in pixel art, with block - shaped street lamps, low - resolution buildings with glowing window blocks, and a dark, overcast sky (composed of gray and black pixel blocks). The couple walks slowly, their steps cautious on the pixel - styled wet pavement. The fixed camera captures the scene of them navigating through the heavy rain while wearing their formal attire.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pixel art"}
+{"index": 561, "data": "A medium wide shot in cyberpunk aesthetic captures a couple in sophisticated formal evening wear—he in a tailored black tuxedo with silver - lined lapels, she in a floor - length emerald - green gown with subtle, glowing circuitry - inspired patterns—on their way home, suddenly caught in a heavy downpour. They hold futuristic umbrellas: his umbrella has a neon - red metallic frame and a canopy that displays shifting cyberpunk cityscapes, hers features a cobalt - blue LED - lit edge and a translucent surface reflecting the rain - slicked street. The background reveals a cyberpunk metropolis: towering buildings with holographic advertisements, neon - lit alleyways, and wet pavement glistening under the city’s vibrant, chaotic lights. The sky is stormy gray, rain pouring down in sheets. The couple, huddled slightly under their umbrellas, walks slowly toward home, and the camera follows them, capturing the gritty, luminous ambiance of the rainy cyberpunk night.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in cyberpunk style"}
+{"index": 562, "data": "An animated medium full shot captures a couple in formal evening wear—he in a sleek black tuxedo with a crisp white dress shirt, she in a flowing, jewel - toned gown with delicate lace detailing—hurrying home as they’re caught in a heavy downpour. They clutch matching black umbrellas with silver handles, yet the relentless rain (raindrops dense and glistening like crystal beads) cascades over the edges, soaking the hems of their attire: her gown’s silk skirt clings softly to her legs, while his trousers glisten with water. The backdrop is a nighttime city street, with warm - glowing streetlamps casting halos through the rain, tall buildings’ silhouettes looming in the distance, and puddles reflecting the faint neon of shopfronts. The couple moves briskly, shoulders close, the man subtly angling his umbrella to shield the woman as they navigate the wet pavement (raindrops splashing around their feet in playful, animated arcs). The animation style is vibrant: colors are rich, lines are smooth, raindrops have a translucent, shimmering quality, and the formal garments’ folds are rendered in a charmingly exaggerated, cartoonish manner—emphasizing the contrast between their refined attire and the chaotic downpour.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, animated style"}
+{"index": 563, "data": "Medium full shot in watercolor painting style. The sky is overcast with dark clouds, and a heavy downpour drenches the scene. A couple in formal evening wear—he in a black tuxedo with a white bow tie, she in a flowing white evening gown adorned with delicate lace—are caught in the rain, holding black umbrellas glistening with raindrops. The man’s tuxedo pants and the woman’s gown hem are speckled with mud from the wet street. They walk briskly toward home, steps hurried as rain splatters against their umbrellas (drooping slightly under the water’s weight). The background reveals a misty street lined with street lamps, their light diffused into warm yellow halos through the watercolor’s soft, blurred textures. Distant buildings fade into the rainy haze, and the scene’s edges are tinged with watercolor washes—blending the downpour into the painting’s dreamy, muted palette, with colors diffused to mimic the fluid, painterly texture of rain merging with the canvas.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, watercolor painting"}
+{"index": 564, "data": "In a surrealism - styled medium shot, a couple dressed in sophisticated formal evening wear is captured— the man in a tailored black tuxedo with a crisp white shirt and a glossy bow tie, the woman in a flowing, floor - length gown of deep emerald—on their way home and caught in a heavy, surreal downpour. They grip umbrellas with distorted, almost melting frames (the fabric of the umbrellas rippling as if alive), while the rain falls in twisting, gravity - defying streams. The background presents a warped urban street: buildings lean at impossible angles, streetlights cast eerie, elongated glows, and the sky churns with stormy gray and surreal, swirling hues. The couple huddles under the umbrellas, moving slowly through the downpour, and the camera pans to follow their steps, amplifying the disorienting, dreamlike atmosphere of the rain - soaked scene.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, surrealism style"}
+{"index": 565, "data": "A medium shot captures an astronaut clad in a white spacesuit with a reflective helmet, floating gracefully in space, rendered in a Van Gogh - style aesthetic. The astronaut’s body is slightly angled, one arm extended as if gliding through the cosmos. The backdrop is a mesmerizing Van Gogh - inspired space: deep blue voids swirl with golden - yellow, brushstroke - like nebulae, bright yellow stars speckle the expanse, and wispy, swirling stardust clouds mimic the dynamic textures of Van Gogh's art. The astronaut drifts slowly, and the camera tracks his movement, highlighting the dreamy, painterly quality of the cosmic scene around him.", "original_prompt_en": "An astronaut flying in space, Van Gogh style"}
+{"index": 566, "data": "A medium shot, rendered in the style of an oil painting, captures an astronaut floating in the vast expanse of space. The astronaut is clad in a crisp white spacesuit, with the helmet’s visor reflecting faint, swirling hues of cosmic colors—deep blues, purples, and hints of gold, all rendered with the thick, textured brushstrokes characteristic of oil paint. Their body drifts gently, limbs relaxed in a weightless posture, as if suspended by the void’s embrace. The background unfolds as a deep, inky black canvas, speckled with pinprick stars that glow with soft, diffused light, and wisps of nebulous clouds in vibrant, blended tones of pink and orange, echoing the layered, painterly look of oil on canvas. The astronaut’s form, defined by expressive, heavy brushwork, slowly rotates, emphasizing the serene, dreamlike quality of this celestial scene. The overall composition bears the hallmarks of an oil painting: rich, saturated colors, visible brushstrokes, and a soft, ethereal blur around the edges, enhancing the otherworldly atmosphere of space exploration.", "original_prompt_en": "An astronaut flying in space, oil painting"}
+{"index": 567, "data": "A stylized medium shot (in the Ukiyo - e style, evoking Hokusai’s artistic vision) depicts an astronaut floating weightlessly in the boundless cosmos. The astronaut wears a spacesuit adorned with intricate, wood - block - print - like patterns (in line with Ukiyo - e aesthetics), their form angled as if gliding through space. The backdrop is a star - filled expanse, with sections rendered in Hokusai’s signature bold, wavy lines and muted yet vivid color palettes—suggesting celestial “waves” or nebulae stylized like his famed ocean surf. The astronaut’s helmet reflects faint glimmers of this cosmic scene, and their posture conveys serene motion, as if traversing a surreal, Edo - era - inspired vision of space. The camera stays fixed, highlighting the fusion of futuristic spaceflight with classic Ukiyo - e art: the astronaut’s flight mirrors the dynamic movement of traditional Ukiyo - e figures, while the starry void is reimagined with Hokusai - esque flourishes.", "original_prompt_en": "An astronaut flying in space by Hokusai, in the style of Ukiyo"}
+{"index": 568, "data": "Long shot, in black - and - white quality. An astronaut in a black - and - white space suit is floating in space. The astronaut’s body is slightly tilted, with arms gently outstretched as if navigating through the void. The background is a vast, dark expanse of space, dotted with faint, blurry star - like specks due to the monochromatic and low - clarity visual style. The astronaut moves slowly, maintaining a weightless posture typical of spaceflight, and the camera remains fixed, capturing the serene yet solitary motion of the astronaut in the endless black - and - white cosmos.", "original_prompt_en": "An astronaut flying in space, black and white"}
+{"index": 569, "data": "A medium shot in pixel art style captures an astronaut flying in space. The astronaut wears a white spacesuit with colorful pixelated patterns, and the helmet reflects pixelated starlight (stars appear as square - shaped blocks). The background is a deep black space dotted with pixelated stars (square blocks) and distant pixelated planets (block - like clusters in light blue or red). The astronaut floats with slightly bent limbs, as if moving slowly in a weightless environment. The pixelated outline is clear, and the colors are presented in a retro 8 - bit style.", "original_prompt_en": "An astronaut flying in space, pixel art"}
+{"index": 570, "data": "A panoramic long shot in a cyberpunk aesthetic captures an astronaut floating and flying in the vast expanse of space. The astronaut is clad in a futuristic cyberpunk - styled spacesuit, adorned with glowing neon - colored lines (shades of electric blue and magenta) running along the contours, and metallic armor plates with intricate circuit - like patterns. The helmet features a sleek, reflective visor with holographic data streams flickering across its surface, and a pair of mechanical - looking, cybernetic arm enhancements with exposed, glowing circuitry. Behind the astronaut, a set of jet - black thrusters emit bursts of bright blue - purple energy, propelling them forward. The backdrop is a breathtaking cyberpunk - themed space: a deep, inky blackness of the universe is interspersed with swirling nebulae in hues of violet and cyan, dotted with floating cyberpunk - style satellite debris that have neon - lit panels and rusted, industrial - looking metal frames. In the distance, a colossal space station looms, its surface crisscrossed with glowing neon pipelines and holographic billboards projecting advertisements in an otherworldly script. The astronaut glides slowly through the void, their body surrounded by floating, translucent holographic navigation interfaces that display complex data. Occasionally, they adjust their posture, and the thrusters on their back pulse rhythmically, sending out ripples of light that illuminate the nearby floating cybernetic fragments, which have sharp, angular designs and glowing red or green accent lights. The sky (space) is a canvas of cosmic darkness, punctuated by the faint, pulsing lights of distant cyberpunk - inspired orbital structures, creating a striking contrast with the astronaut’s vividly lit, high - tech gear as they continue their flight through this stylized, futuristic cosmos.", "original_prompt_en": "An astronaut flying in space, in cyberpunk style"}
+{"index": 571, "data": "An animated - style medium shot depicts an astronaut flying in space. The astronaut is dressed in a streamlined white spacesuit adorned with silver accents, and a transparent helmet encloses their head, revealing a portion of their face. The backdrop is the boundless cosmos, sprinkled with twinkling stars of differing luminosities and faint, vibrant nebulae drifting leisurely. The astronaut’s body is slightly angled, and their arms and legs are arranged in a manner that conveys a gentle, floating movement as they traverse the cosmic void. The overall visual style is vivid and cartoon - like, with smooth lines and bright colors accentuating the animated space - themed scene.", "original_prompt_en": "An astronaut flying in space, animated style"}
+{"index": 572, "data": "A medium shot rendered in a watercolor painting style captures an astronaut floating weightlessly in the vast, inky blackness of space. The astronaut, dressed in a white spacesuit with subtle blue - gray watercolor - hued details, has their arms gently outstretched in a relaxed, weightless posture. The helmet’s visor reflects faint, softly blended starlight and distant, pastel - toned nebulas, all rendered with the translucent, fluid quality of watercolor. The background is a deep black expanse dotted with delicate, watercolor - like stars that bleed into the darkness, and wispy, muted - colored cosmic clouds drift in the distance, imbuing the scene with the dreamy, airy texture of watercolor art. The astronaut drifts slowly, as if suspended in the gentle flow of watercolor pigments, and the entire scene exudes a delicate, ethereal charm, with colors bleeding gently at the edges, characteristic of a watercolor painting.", "original_prompt_en": "An astronaut flying in space, watercolor painting"}
+{"index": 573, "data": "Long shot in surrealism style, capturing an astronaut floating in space. The astronaut is dressed in a white spacesuit with detailed textures, the helmet visor reflecting swirling, iridescent nebulae. His body drifts slowly, arms relaxed yet slightly bent, as if navigating through a dreamlike cosmic realm. The background is a surreal expanse of deep black space, dotted with pulsating, multicolored stars and wispy, luminous nebulae that twist in impossible patterns. Floating fragments of metallic debris and ethereal, translucent space - like creatures (in the surreal style) drift around him. The camera follows the astronaut’s movement, panning gently to emphasize the weightless, otherworldly motion. The astronaut’s suit glows with faint, shifting hues, enhancing the surreal atmosphere as he floats deeper into the surreal cosmic landscape.", "original_prompt_en": "An astronaut flying in space, surrealism style"}
+{"index": 574, "data": "A panoramic shot captures snow - blanketed rocky mountain peaks surrounding and casting shadows over deep canyons. The canyons twist and bend sinuously through the high - elevated, rugged mountain peaks, with their rocky surfaces partially exposed beneath the thick, pristine snow blankets. Rendered in a Van Gogh - style, the scene bursts with swirling, vibrant brushstrokes — the blues and whites of the snow - capped peaks merge with the earthy browns and grays of the rocky outcrops, while the depths of the canyons ripple with dark, undulating shadows that echo the dynamic, emotional textures of Van Gogh’s masterpieces. The high - elevated mountain peaks, with their snow - laden slopes glistening, frame the twisting canyons that carve through the mountainous landscape, all bathed in the passionate, turbulent vision of Van Gogh’s distinctive artistic style.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, Van Gogh style"}
+{"index": 575, "data": "A panoramic shot presents an oil - painting - like scene of snow - blanketed rocky mountain peaks and deep canyons. The snow - blanketed rocky mountains, with rugged gray - brown rock faces partially hidden under thick layers of glistening white snow, surround the deep canyons and cast long dark shadows over them. The canyons, with steep rocky walls, twist and bend sinuously through the high - elevated, snow - capped mountain peaks, creating a dramatic and majestic landscape.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, oil painting"}
+{"index": 576, "data": "Panoramic shot of snow - blanketed rocky mountains surrounding and shadowing deep canyons that twist and bend through high - elevated peaks, presented in the Ukiyo - e style reminiscent of Hokusai’s art. Snow clings to the rugged rock faces, with the canyons’ dark depths contrasting sharply against the bright, wind - swept mountain summits. The scene captures the dramatic, undulating landscape as if crafted with delicate brushstrokes, emphasizing the natural drama of the mountains and canyons interweaving in a still, majestic setting, true to the artistic vision of Ukiyo - e.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks by Hokusai, in the style of Ukiyo"}
+{"index": 577, "data": "A long shot captures a breathtaking landscape of snow - blanketed rocky mountain peaks and deep canyons. The snow - blanketed rocky mountains surround and cast shadows over the deep canyons, which twist and bend sinuously through the high - elevated mountain peaks. The entire scene is rendered in black - and - white tones, emphasizing the stark contrast between the white snow, the dark rocky surfaces, and the shadowed depths of the canyons.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, black and white"}
+{"index": 578, "data": "Panoramic shot of a pixel - art landscape showcasing snow - blanketed rocky mountain peaks and deep canyons. The snow - blanketed rocky mountains surround the area, casting shadows on the deep canyons that twist and bend through the high - elevated mountain peaks. The scene is rendered in pixel art, with distinct pixelated textures on the snow - covered rocks and the meandering canyon paths.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pixel art"}
+{"index": 579, "data": "A panoramic shot in cyberpunk style captures snow - blanketed rocky mountains surrounding and casting shadows over deep canyons. These canyons twist and bend through the high - elevated mountain peaks, with the snow - capped rocky mountain peaks exhibiting rugged textures and the deep canyons, veiled in the mountains' shadows, winding sinuously amidst the lofty peaks. The cyberpunk - styled backdrop adds a futuristic ambiance, highlighting the stark contrast between the snow - covered rocky mountains and the shadow - filled, twisting canyons within the elevated terrain.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in cyberpunk style"}
+{"index": 580, "data": "Panoramic shot of snow - blanketed rocky mountain peaks and deep canyons. The snow - capped rocky mountains, with gray, rugged rock faces partially covered in pristine white snow, surround and cast shadows over the deep canyons below. These canyons twist and bend sinuously through the high - elevated, jagged mountain peaks, all rendered in an animated style. The background, consistent with the animated aesthetic, presents a stylized landscape where the mountains’ rugged textures and the canyons’ winding paths are vividly depicted, and the camera holds steady to capture this dramatic, stylized scene of snow - clad peaks enclosing the shadowed, twisting canyons.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, animated style"}
+{"index": 581, "data": "Panoramic shot of snow - blanketed rocky mountain peaks enclosing deep canyons. The rocky mountains are covered with a thick layer of glistening white snow, and the rugged gray stone surfaces are partially exposed from beneath the snow. The deep canyons, shrouded in the mountains' shadows, twist and bend sinuously through the high - elevated, snow - capped peaks. The scene has a watercolor - painting - like quality, with soft and blended hues of white (snow), gray (rock), and blue (shadows) creating a dreamy and artistic atmosphere. The overcast sky in the background adds to the ethereal and painterly feel of the landscape. The dark and winding canyons contrast sharply with the bright snow - covered peaks, emphasizing the dramatic topography as they wind through the mountain range.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, watercolor painting"}
+{"index": 582, "data": "Panoramic shot of snow - blanketed rocky mountain peaks and deep canyons. Snow - covered rocky mountains surround the deep canyons, casting shadows over them. The canyons twist and bend sinuously through the high - elevated mountain peaks, creating a surrealistic scene. The background features rugged, snow - capped mountain ridges with sharp edges, and the sky above is a pale, misty gray, enhancing the surreal atmosphere. The camera stays fixed, capturing the otherworldly landscape where snow - blanketed mountains and winding canyons merge in a dreamlike, surrealist style.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, surrealism style"}
+{"index": 583, "data": "A long shot captures a beautiful coastal beach in spring. The shore is covered with golden sand that glistens softly under the light. Clear turquoise waves, topped with delicate white foam, lap against the sand in super slow motion, their gentle movement unfolding in a leisurely, serene manner. The background features a bright blue sky dotted with a few fluffy white clouds, while the distant horizon merges softly with the calm sea, enhancing the tranquil and picturesque atmosphere of the scene.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, in super slow motion"}
+{"index": 584, "data": "A medium shot transitions to a zoom - in view of a beautiful coastal beach in spring. The sky is a clear pastel blue with wispy white clouds drifting lazily. The golden sand, with fine and glistening grains, stretches along the shore. Gentle waves, with crests tinted pale turquoise, lap rhythmically against the sand, forming delicate foamy patterns that vanish swiftly into the shore. In the background, distant rocky outcrops and swaying palm trees line the coastline, adding to the tropical spring ambiance. As the camera zooms in, it captures the intricate details of the waves’ movement—each wave curls, breaks, and caresses the sand, while the shore reveals subtle indentations from the receding tide, with the soft spring sunlight enhancing the sand’s golden sheen and the waves’ sparkling surface.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom in"}
+{"index": 585, "data": "A long shot captures a beautiful coastal beach in spring. The golden sand stretches along the shore, with gentle waves lapping rhythmically on the sand, creating tiny ripples that glisten under the soft spring sunlight. The sky above is clear and blue, dotted with a few fluffy white clouds, while the deep blue sea meets the horizon in the distance. Patches of green coastal plants sway slightly in the breeze along the beach. As the camera zooms out, a broader view of the serene coastline unfolds, revealing more of the sandy expanse, the undulating waves, and the calm ocean under the spring sky.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, zoom out"}
+{"index": 586, "data": "Panoramic shot of a beautiful coastal beach in spring. The golden sand stretches smoothly, with delicate ripples tracing the shore where waves recede. Gentle, turquoise - hued waves lap rhythmically against the sand, their crests breaking into frothy white as they reach the shore. The sky above is a clear, vibrant blue with a few wispy white clouds, and in the distance, the deep blue ocean merges with the horizon. Sparse palm trees line the beach, their fronds swaying gently in the spring breeze. As the camera pans left, it reveals more of the tranquil coastline—tiny seashells scattered across the sand and a few seagulls gliding low over the water, enhancing the peaceful springtime coastal scene with the soothing sound of waves lapping on the shore.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan left"}
+{"index": 587, "data": "Panoramic shot of a beautiful coastal beach in spring. The golden sandy shore stretches along the coastline, with gentle turquoise waves lapping rhythmically against the sand, creating small foamy ripples that recede back into the ocean. The sky is clear and blue, with a few fluffy white clouds drifting by, and in the distance, faint silhouettes of distant landmasses or rocky formations are visible. Sparse patches of green coastal plants sway softly in the light breeze. The camera pans right, slowly unveiling more of the serene shoreline, capturing the continuous, soothing motion of the waves caressing the sandy beach as the scene extends.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, pan right"}
+{"index": 588, "data": "Long shot of a beautiful coastal beach in spring. The creamy - white sand lines the shore, and gentle blue waves with frothy edges lap rhythmically on the sand, forming tiny ripples that spread and fade. The camera tilts up: starting from the wave - kissed sand, it moves upward to reveal the clear blue sky with fluffy white clouds, and in the distance, slender coconut trees sway in the spring breeze, their fronds rustling. A few sailboats dot the calm sea, adding serenity to the lively spring beach.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt up"}
+{"index": 589, "data": "The sun shines gently over a picturesque coastal beach in spring. A tilt - down shot captures the scene: fine golden sand stretches along the shore, and gentle waves, frothy with white foam, lap rhythmically against the sand, creating small, glistening ripples that spread and then ebb. The background reveals an expansive, deep - blue sea merging with a clear, cloud - dotted sky at the horizon, with a few seagulls gliding leisurely. Scattered across the beach are seashells of assorted shapes and hues, and the distant coastline is lined with lush green coastal plants swaying softly in the spring breeze. As the camera tilts down, it focuses on the delicate interplay between the waves and the sand, showcasing the wet sand glimmering under the warm sunlight, with the waves’ gentle motion leaving intricate patterns on its surface.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, tilt down"}
+{"index": 590, "data": "A panoramic shot of a beautiful coastal beach in spring. The golden sandy shore stretches along the coastline, and gentle waves lap rhythmically against the sand. An intense shaking effect permeates the scene, causing the waves and the glistening sand to tremble visibly, as if the entire beach is quivering. The background showcases a clear blue sky dotted with soft white clouds, while the distant sea blends seamlessly with the horizon. The shaking effect persists, infusing the tranquil spring beach with a dynamic, almost unsettling vibrancy, as the waves continue to caress the shore amidst the rhythmic trembling.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, with an intense shaking effect"}
+{"index": 591, "data": "A panoramic shot of a beautiful coastal beach in spring. Gentle waves lap smoothly against the golden, fine - grained sand, their translucent crests glistening under the soft spring sunlight. The sky above is a clear blue, dotted with fluffy white clouds, and the distant sea merges seamlessly with the horizon. This scene is captured with a steady and smooth perspective, showcasing the rhythmic motion of the waves and the tranquil, picturesque beauty of the spring coastal landscape.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, featuring a steady and smooth perspective"}
+{"index": 592, "data": "A medium shot with racking focus captures a beautiful coastal beach in spring. The golden sand stretches along the shore, with gentle waves lapping rhythmically on it, creating small, foamy ripples that glisten under the soft spring sunlight. The sky above is a clear, vibrant blue with a few fluffy white clouds drifting lazily. In the background, the deep blue ocean meets the horizon, and some distant seagulls can be seen soaring. The racking focus shifts subtly, first highlighting the delicate texture of the wet sand where the waves recede, then bringing the rolling waves into sharp focus, and finally emphasizing the expansive, serene beachscape that extends into the distance.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand, racking focus"}
+{"index": 593, "data": "Panoramic shot of The Bund in Shanghai, captured in super slow motion. The riverside is lined with iconic colonial - style buildings, their intricate facades (featuring arched windows and decorative moldings) facing the Huangpu River. The river flows gently, with a few white - hulled cruise ships (adorned with colorful accents) moving at a slowed - down pace, their wakes spreading slowly across the water. On the wide promenade, pedestrians in diverse outfits stroll—their motions elongated, some pausing to take photos of the riverside view, others chatting leisurely. In the distance, Lujiazui’s modern skyscrapers rise, their glass curtain walls reflecting the overcast sky. The camera pans slightly, capturing the bustling yet serene scene, where every movement (from the pedestrians’ steps to the ships’ glide) is prolonged, showcasing The Bund’s unique charm in this super slow - motion perspective.", "original_prompt_en": "The bund Shanghai, in super slow motion"}
+{"index": 594, "data": "A zoom - in shot of The Bund in Shanghai. The scene presents a line of magnificent, historic buildings with European - style architecture along the waterfront. Their facades are decorated with elaborate carvings and large windows. In front, the Huangpu River flows gently, with ripples glistening under the light. Along the promenade, tourists in different clothes stroll leisurely, some taking photos. A few vehicles, like sightseeing buses and cars, move slowly on the nearby road. The sky above is clear with a few white clouds. As the camera zooms in, it focuses on the detailed decorations of the building facades, capturing the bustling atmosphere of this iconic landmark.", "original_prompt_en": "The bund Shanghai, zoom in"}
+{"index": 595, "data": "A zoom - out shot of The Bund in Shanghai. Initially, the frame presents a section of the classic European - style buildings along the Bund, with their elaborate facades and historical architectural details. As the camera zooms out, the view expands to include the Huangpu River flowing calmly in front of the buildings, with several cruise ships and cargo vessels sailing on the water. On the riverside promenade, numerous tourists, some taking photos and others strolling, are visible. The background reveals the modern skyscrapers of Lujiazui on the opposite bank, including the Oriental Pearl Tower and other high - rise buildings with glass curtain walls, standing tall against the sky. The sky above is partly cloudy, with white clouds floating in the blue expanse. The camera continues to zoom out, gradually presenting the entire scene of the Bund, the river, and the contrasting architectural styles of the two riverbanks, highlighting the blend of history and modernity in Shanghai.", "original_prompt_en": "The bund Shanghai, zoom out"}
+{"index": 596, "data": "A panoramic shot of The Bund in Shanghai. The sky is clear with a few white clouds. Along the Huangpu River, the historic European - style buildings with elaborate architectural details line the waterfront, their vibrant facades reflecting the sunlight. On the river, several cruise ships with colorful decorations sail gently. In the background, the modern skyscrapers of Lujiazui, including the Oriental Pearl Tower and the Shanghai Tower, rise against the sky. The camera pans left, capturing the busy street with tourists strolling, taking photos, and cyclists riding, as well as cars and sightseeing buses moving along the road. As the camera pans left, it reveals more of the iconic buildings, the flowing river with boats, and the lively mix of historical architecture and modern city life.", "original_prompt_en": "The bund Shanghai, pan left"}
+{"index": 597, "data": "Panoramic shot of The Bund in Shanghai. The sky is clear with a few white clouds floating. Along the Huangpu River’s edge, grand historical buildings with intricate European - style facades line the promenade, their stone surfaces glistening under the sunlight. The river flows gently, with a couple of cruise ships cruising on the water. On the opposite bank, the modern skyline of Lujiazui unfolds, showcasing the Oriental Pearl Tower and sleek skyscrapers with reflective glass exteriors. The promenade is crowded with tourists: some are posing for photos in front of the buildings, others are leisurely walking or chatting in small groups. The camera pans right, capturing more of the Bund’s magnificent architecture, the bustling riverfront activities—like street performers and vendors selling souvenirs—and the continuous flow of pedestrians and cyclists along the path.", "original_prompt_en": "The bund Shanghai, pan right"}
+{"index": 598, "data": "A medium shot of The Bund in Shanghai. The camera tilts up, starting from the Huangpu River with rippling blue - green water and a few white - hulled cruise ships, then moving up to reveal the iconic European - style buildings with elaborate facades—some featuring arched windows, decorative cornices, and red - tiled roofs. The promenade beside the river is lined with pedestrians in casual and formal wear, strolling along the stone walkway. As the tilt - up progresses, the camera captures the upper stories of the buildings, their ornate architectural details, and the overcast (or clear) sky above, with soft clouds floating. The buildings’ grand silhouettes against the sky, along with the river’s gentle flow below, highlight the Bund’s historic charm.", "original_prompt_en": "The bund Shanghai, tilt up"}
+{"index": 599, "data": "A tilt - down shot of The Bund in Shanghai. The scene starts with a view of the iconic colonial - style buildings, their facades decorated with intricate details and roofs in colors like red and gray. As the camera tilts down, the Huangpu River comes into sight, its surface rippling gently under the sunlight. Along the riverside promenade, pedestrians stroll—some take photos of the scenery, while others chat in groups. A few colorful cruise ships are docked at the pier, their hulls reflecting on the blue water. In the distance, the Lujiazui skyline rises, with skyscrapers such as the Oriental Pearl Tower and Shanghai Tower piercing a clear sky with scattered white clouds. The ground shows stone - paved walkways, greenery, street lamps, and decorative sculptures. The camera’s tilt - down motion reveals more of the bustling riverside, combining historical architecture with modern city life.", "original_prompt_en": "The bund Shanghai, tilt down"}
+{"index": 600, "data": "Panoramic shot of The Bund in Shanghai, with an intense shaking effect causing the image to jolt. The Bund’s historic buildings with ornate facades line the Huangpu River, their light - colored exteriors contrasting with the overcast sky. The Huangpu River’s water shimmers, with a few ships sailing slowly. In the foreground, tourists in various outfits stroll or take photos on the waterfront promenade, their figures swaying as the camera trembles. The background features modern skyscrapers like the Oriental Pearl Tower, standing against the sky. The camera shakes vigorously, making the buildings, river, and people in the scene jolt and blur intermittently, creating a sense of dynamic instability.", "original_prompt_en": "The bund Shanghai, with an intense shaking effect"}
+{"index": 601, "data": "Panoramic shot of The Bund in Shanghai, featuring a steady and smooth perspective. The scene presents the historic Western - style buildings with various architectural details, such as arched windows and decorative facades, lining the Huangpu River on the near side. On the opposite bank, modern skyscrapers including the Oriental Pearl Tower with its distinctive spherical structures stand tall against the clear sky dotted with a few white clouds. The Huangpu River below has several cruise ships and cargo vessels moving slowly, their reflections glimmering on the water’s surface. Along the riverside promenade, pedestrians in casual and tourist - like attire stroll, some stopping to take photos of the scenic view. Sightseeing buses and private cars also move along the road adjacent to the buildings. The camera maintains a steady and smooth movement, panning slowly to capture the harmonious combination of historical architecture, modern skyline, and the bustling riverside activity.", "original_prompt_en": "The bund Shanghai, featuring a steady and smooth perspective"}
+{"index": 602, "data": "A racking focus shot of The Bund in Shanghai. The sky is clear and blue with a few white clouds drifting. In the foreground, the iconic European - style buildings of The Bund stand, their facades adorned with intricate carvings and in various colors like beige and white, with neatly arranged windows. As the focus shifts, the glistening Huangpu River comes into view, with several colorful cruise ships, some white and some red, either moored or sailing slowly on its surface. Then the focus moves to the bustling promenade, where pedestrians in diverse outfits, from casual wear to windbreakers, stroll leisurely or stop to take photos, capturing the vibrant scene of The Bund that combines historical architecture, the flowing river, and lively human activity.", "original_prompt_en": "The bund Shanghai, racking focus"}
+{"index": 603, "data": "Super slow - motion long shot. A gray shark with a sleek, streamlined body and a pointed dorsal fin is swimming in the deep blue ocean. The ocean water is clear, with some small, silvery fish darting around and delicate seaweed gently swaying in the water current. The shark’s tail fin moves slowly from side to side, and its pectoral fins glide smoothly through the water as it progresses forward. The camera remains fixed, capturing every subtle movement of the shark in this slowed - down sequence, while the vast, blue expanse of the ocean stretches out in the background, with light shimmering on the water’s surface.", "original_prompt_en": "a shark is swimming in the ocean, in super slow motion"}
+{"index": 604, "data": "A long shot captures a gray shark with a streamlined body swimming in the deep - blue ocean. Its tail sways rhythmically from side to side, and its dorsal fin slices through the water, creating subtle ripples. The ocean water is a rich blue, with sunlight filtering down from the surface, and some floating seaweed can be seen in the background. The camera zooms in, gradually bringing the shark into a closer view, capturing the texture of its rough skin and the gentle movement of its gills as it propels itself forward with steady and powerful strokes.", "original_prompt_en": "a shark is swimming in the ocean, zoom in"}
+{"index": 605, "data": "A medium shot captures a gray shark with a sleek, streamlined body swimming in the deep blue ocean. Its tail fin undulates rhythmically, propelling it forward through the water that shimmers with light reflections and has faint traces of floating seaweed. As the camera zooms out, the shark becomes a smaller figure against the vast expanse of the ocean, revealing the broader marine environment with gentle water currents and the distant, misty horizon where the sea meets the sky.", "original_prompt_en": "a shark is swimming in the ocean, zoom out"}
+{"index": 606, "data": "A medium shot captures a gray shark with a streamlined body and a prominent dorsal fin swimming in the deep blue ocean. Its tail fin sways rhythmically, propelling it forward with smooth, fluid motions. The ocean water is a rich, deep blue, with faint beams of sunlight filtering through from the surface, casting dappled light on the shark’s scales. Subtle hints of underwater vegetation can be seen swaying gently in the background. The shark swims steadily toward the left of the frame, and the camera pans left to follow its movement, capturing the graceful, powerful motion of its body cutting through the water.", "original_prompt_en": "a shark is swimming in the ocean, pan left"}
+{"index": 607, "data": "A medium shot captures a gray shark with a sleek, streamlined body swimming in the deep turquoise ocean. Its dorsal fin slices through the water’s surface, and its powerful tail undulates rhythmically, propelling it forward with smooth, fluid motions. The surrounding ocean is filled with faint sunlit water columns, and small silver fish dart in the distance, their scales glinting under the filtered sunlight from above. As the shark swims steadily toward the right of the frame, the camera pans right, following its graceful movement and capturing the gentle ripples and bubbles trailing behind it.", "original_prompt_en": "a shark is swimming in the ocean, pan right"}
+{"index": 608, "data": "A medium long shot captures a gray shark with a streamlined body and a sharp dorsal fin swimming gracefully in the deep blue ocean. The water, tinted turquoise near the surface, reveals faint coral reefs and small silver fish darting in the background. As the shark moves forward with rhythmic tail strokes, the camera tilts up, revealing the calm ocean surface with gentle ripples and an overcast sky, where a few seagulls glide in the distance.", "original_prompt_en": "a shark is swimming in the ocean, tilt up"}
+{"index": 609, "data": "Medium full shot with a tilt - down movement captures a gray shark with a streamlined body swimming gracefully in the deep blue ocean. The shark’s tail fin sways rhythmically as it moves, and its dorsal fin cuts through the water surface. Within the ocean, some light - colored aquatic plants drift gently in the water column, and sunlight filters through the water, creating a shimmering effect. The camera executes a tilt - down movement, initially framing the upper part of the shark and then gradually lowering to reveal the shark’s full body as it navigates through the ocean depths.", "original_prompt_en": "a shark is swimming in the ocean, tilt down"}
+{"index": 610, "data": "A medium shot captures a gray shark with a streamlined body and a prominent dorsal fin swimming in the deep blue ocean. The surrounding water holds faint shadows of swaying seaweed and a few small, darting fish. The shark propels forward, its tail swaying rhythmically, while an intense shaking effect ripples through the scene—making the entire frame quiver, as if the camera is buffeted by strong underwater currents. The shark continues its steady, fluid motion, cutting through the water, with the shaking persisting to amplify the ocean’s turbulence.", "original_prompt_en": "a shark is swimming in the ocean, with an intense shaking effect"}
+{"index": 611, "data": "A medium long shot captures a gray shark with a sleek, streamlined body and a prominent dorsal fin swimming in the deep blue ocean. The water is clear, with sunlight filtering through the surface, casting dappled light on the shark’s scales. In the background, faint silhouettes of coral reefs and small schools of fish dart around. The shark moves forward with steady, smooth motions, its tail fin undulating rhythmically, while the camera maintains a consistent perspective, following the shark’s graceful journey through the ocean.", "original_prompt_en": "a shark is swimming in the ocean, featuring a steady and smooth perspective"}
+{"index": 612, "data": "A medium long shot captures a gray shark with a streamlined body and a sharp dorsal fin swimming gracefully in the deep blue ocean. The background reveals faint silhouettes of colorful coral reefs and small fish darting about. Employing a racking focus technique, the camera shifts focus between the shark and the surrounding marine life, capturing the shark’s smooth, undulating movements as it glides through the water, its tail fin swaying rhythmically to propel itself forward.", "original_prompt_en": "a shark is swimming in the ocean, racking focus"}
+{"index": 613, "data": "Super slow - motion medium close - up shot captures a giant panda with its iconic black - and - white fur seated at a wooden table in a charming Parisian café. The café’s interior boasts warm, yellow lighting, neatly arranged chairs with plush cushions, and walls decorated with impressionist - style paintings. The panda, with a relaxed posture, holds a small, white coffee cup in its paw, the super slow - motion effect elongating its gentle sipping motion—any coffee droplets or the liquid’s flow appear in a delicate, stretched manner. Around the panda, a few human customers in casual clothing engage in quiet conversation, and a sleek, black coffee machine with silver accents rests on the countertop. The panda’s deliberate, slow movements as it enjoys the coffee emphasize the surreal yet endearing scene of a panda partaking in a coffee - drinking ritual in the heart of Paris.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, in super slow motion"}
+{"index": 614, "data": "A medium shot captures a giant panda with black - and - white fur sitting at a wooden table in a cozy Parisian café. The café is filled with warm yellow light, and there are vintage - style wooden chairs and tables around. A few Parisian patrons in casual clothes are chatting in the background. The panda holds a white ceramic coffee cup with a rich brown liquid inside and gently sips the coffee in a relaxed posture. Its round black ears and distinctive black eye patches are clearly visible, and its fluffy fur looks soft. The background also has a French - themed poster on the wall and a window that offers a glimpse of the Parisian street outside. Then the camera zooms in, focusing on the panda’s hands holding the cup and its face as it enjoys the coffee, highlighting the contrast between the panda’s unique appearance and the typical café setting.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, zoom in"}
+{"index": 615, "data": "Wide shot captures a giant panda with distinctive black - and - white fur seated at a wooden table in a charming Parisian café. The panda, with its round black ears and signature black eye - patches, holds a white ceramic coffee cup with both front paws, leisurely sipping the coffee. The café’s interior is warm and inviting, with soft yellow lighting, wooden chairs with elegant curves, and framed impressionist artworks on the walls. Through the café’s large glass windows, the bustling Parisian street outside is visible—cobblestone pavement, pedestrians in fashionable outfits walking by, and classic French buildings with wrought - iron balconies and green shutters lining the street. As the camera zooms out, it reveals the café’s full cozy setup, the adjacent street corner with a quaint boulangerie, and the distant outline of the Eiffel Tower against the clear sky, highlighting the surreal yet charming scene of a panda enjoying coffee in the city of Paris.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, zoom out"}
+{"index": 616, "data": "The sky is softly overcast, casting a gentle light over the scene. A medium shot captures a giant panda with striking black - and - white fur, seated at a wooden table in a charming Parisian café. The café’s interior boasts classic French touches: wrought - iron furniture, a marble counter, and vintage Paris posters on the walls. The panda holds a white ceramic coffee cup with its black paws, sipping the dark coffee slowly, its round black eyes focused on the drink. Outside the large windows, Paris’s cobblestone streets and pastel - colored buildings are visible. As the camera pans left, it reveals more of the café: a barista in a striped apron working behind the counter, and other patrons (humans and a few whimsical creatures) chatting. The panda continues to enjoy its coffee, occasionally glancing up, while the camera’s leftward movement showcases the café’s cozy ambiance and the panda’s endearing presence in this Parisian scene.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, pan left"}
+{"index": 617, "data": "Medium shot of a panda with black - and - white fur drinking coffee in a cozy Parisian café. The panda, seated at a wooden table, holds a white ceramic cup filled with brown coffee and sips it gently. The café is decorated with warm - colored lights, wooden chairs, and French - style wall art in the background. The camera pans right, revealing more of the café's interior, including a small potted plant, and a glimpse of the Parisian street outside with a few pedestrians and bicycles. The panda continues to drink, its relaxed posture highlighted against the charming café ambiance.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, pan right"}
+{"index": 618, "data": "A medium shot (tilt up) captures a giant panda with black - and - white fur and a plump figure sitting at a wooden table in a cozy Parisian café. The panda holds a white ceramic coffee cup with a delicate handle in its paw, sipping the dark - brown coffee slowly. The café’s interior is decorated with vintage floral - patterned wallpaper, and soft yellow light from pendant lamps fills the space. A small window beside the panda reveals the Paris street outside, with cobblestone roads and elegant buildings. As the camera tilts up, it shows the panda’s relaxed posture, its black ears standing upright, and the café’s ceiling with exposed wooden beams. The panda keeps drinking the coffee, occasionally pausing to glance around the charming café.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, tilt up"}
+{"index": 619, "data": "A tilt down shot captures a panda with distinctive black - and - white fur and a plump body seated on a wooden chair in a charming cafe in Paris. The cafe is adorned with warm - yellow pendant lights, rustic wooden tables and chairs, and framed paintings of Parisian streets on the walls. The panda, using its right paw to hold a white ceramic coffee cup filled with rich, dark - brown coffee, is gently sipping the beverage. To the panda's left, a pair of human patrons are engaged in lively conversation, and the background reveals a bar counter lined with aromatic coffee beans and a sleek silver espresso machine. As the camera tilts down, it reveals the panda's black - hued, furry feet resting on the light - colored, polished wooden floor, while through the cafe's window, the faint outline of the Eiffel Tower stands against the clear blue sky, adding a touch of Parisian charm to the scene.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, tilt down"}
+{"index": 620, "data": "A medium shot (with an intense shaking effect) captures a giant panda with distinctive black - and - white fur sitting in a cozy Parisian café. The panda, with its round black ears and eye patches, is holding a white ceramic coffee cup with both paws, gently sipping the coffee inside. The café’s interior has warm wooden furniture, soft yellow lighting, and French - style decor like paintings on the walls and a small potted plant on the table beside the panda. Outside the café window, the iconic Parisian streetscape with cobblestone roads and ornate street lamps is faintly visible through the glass. Throughout the scene, an intense shaking effect creates a sense of visual instability, as if the camera or the environment is trembling vigorously.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, with an intense shaking effect"}
+{"index": 621, "data": "A medium shot with a steady and smooth perspective captures a giant panda with black - and - white fur, a round and plump body, and distinct black eye patches, sitting leisurely on a wooden chair in a cozy Parisian café. The panda holds a delicate white coffee cup with its paw, sipping the dark brown coffee, and a wisp of steam is rising from the cup. The interior of the café is warm - toned, with vintage wooden tables, soft yellow lighting, and Impressionist - style paintings adorning the walls. Outside the large glass windows, Parisian streets with pedestrians and the faint silhouette of the Eiffel Tower can be seen. The camera, maintaining a steady and smooth perspective, smoothly follows the panda's relaxed posture as it enjoys the coffee, while other patrons in the café are either chatting or reading, creating a tranquil and whimsical Parisian café scene.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, featuring a steady and smooth perspective"}
+{"index": 622, "data": "A medium shot with rack focus captures a giant panda with distinctive black - and - white fur sitting at a wooden table in a cozy Parisian café. The panda holds a white ceramic cup filled with brown coffee, sipping the coffee gently. The café’s interior is decorated with vintage French posters on the walls, and through the large window, the iconic silhouette of the Eiffel Tower in the Parisian streetscape can be seen. As the rack focus shifts, the initial sharp focus on the panda’s face transitions to highlight the brown coffee in the cup, and then to the blurred Parisian street outside the window, emphasizing the whimsical scene of a panda drinking coffee in a Parisian café.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris, racking focus"}
+{"index": 623, "data": "A medium long shot captures a cute, happy Welsh Corgi with light - brown fur and white chest markings playing in a park at sunset, filmed in super slow motion. The Corgi, with its short legs and a fluffy tail wagging enthusiastically, bounds across the lush green grass. It occasionally pauses to sniff the ground or chase a dandelion seed floating in the air. Its ears flop gently with each movement, and its tongue lolls out in joyful excitement. The background features a serene park: tall trees with leaves gilded by the setting sun, a wooden bench partially visible, and a sky ablaze with warm orange and pink hues as the sun dips low, casting soft, elongated shadows. In super slow motion, details like the Corgi’s fur rippling, its tail’s gentle arc, and its paws’ delicate press into the grass are vividly highlighted. The fixed camera captures the Corgi’s playful antics against the picturesque sunset backdrop, emphasizing the relaxed and joyful atmosphere of the scene.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, in super slow motion"}
+{"index": 624, "data": "A medium shot captures a cute, happy Corgi with short, fluffy tricolor fur (brown, white, and black) playing energetically in a park during sunset. The sky is ablaze with warm oranges, pinks, and purples, casting a golden glow over the green grass dotted with fallen leaves. In the background, a wooden bench sits near a cluster of trees with branches swaying gently in the evening breeze. The Corgi, with its short legs bounding and fluffy tail wagging enthusiastically, chases a small, red ball or frolics freely, its mouth open in a joyful pant, ears perked, and eyes shining with excitement. As the camera zooms in, it focuses on the Corgi’s delighted expression, highlighting its playful demeanor against the backdrop of the glowing sunset.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, zoom in"}
+{"index": 625, "data": "A long shot (with a zoom - out) captures a cute, happy Corgi playing in a park during sunset. The Corgi has a short, fluffy coat with white and light - brown fur, its ears perked up and its stubby tail wiggling as it frolics on the green grass. The park is dotted with scattered trees, a few benches, and a small playground, all bathed in the warm, orange - hued light of the sunset. The sky is painted with a beautiful gradient of orange, pink, and purple as the sun sets, casting long shadows on the ground. As the camera zooms out, it reveals a wider view of the park, with a few other people either enjoying the sunset or walking their pets, and the distant horizon where the sun is partially hidden behind low - lying clouds.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, zoom out"}
+{"index": 626, "data": "The sun is setting, painting the sky with warm orange and pink hues. A medium full shot captures a cute, happy Corgi with fluffy brown and white fur—its bushy tail wagging enthusiastically, ears perked up—playing in a lush park. The Corgi frolics on the green grass, chasing a fallen leaf, sniffing vibrant flower beds, and bounding toward a fluttering butterfly. The park’s backdrop features scattered trees with golden leaves, a small stone pathway, and blooming flowers glowing in the sunset’s golden light. The camera pans left, following the Corgi’s lively movements as it explores, occasionally pausing to nuzzle a daisy or chase a rolling acorn, while the serene sunset casts long shadows across the grass.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, pan left"}
+{"index": 627, "data": "The sun is setting, casting a warm orange glow across the sky. A medium full shot captures a cute, happy Corgi with short brown and white fur playing in a park. The Corgi has perked ears, a wagging tail, and is joyfully running and jumping on the green grass, chasing a small ball. The park is filled with tall trees, colorful flowers, and a few benches scattered around. As the Corgi moves to the right of the frame, the camera pans right to follow its playful movements, capturing the serene sunset backdrop with soft light illuminating the scene.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, pan right"}
+{"index": 628, "data": "A tilt - up shot captures a cute and happy Corgi with short, fluffy brown - and - white fur playing in a park. The Corgi has a cheerful expression, its short legs moving briskly as it frolics on the green grass dotted with tiny flowers. The background shows a park scene with tall trees casting long shadows, and the sky is painted with warm orange and pink hues from the setting sun, with a few scattered clouds glowing softly. As the camera tilts up, it first focuses on the Corgi’s playful movements and then gradually reveals the beautiful sunset - lit sky above the park.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, tilt up"}
+{"index": 629, "data": "A medium - long shot captures a cute and happy Corgi with short legs, a fluffy coat in white and brown, and a stumpy tail playing in the park. The Corgi bounces on the lush green grass, occasionally wagging its short tail and seemingly holding a fallen leaf in its mouth. The background shows a sky dyed warm orange - red by the sunset. In the park, trees cast long shadows, there are wooden benches, and some wildflowers are scattered in the distance. The camera tilts down, following the Corgi's movements. It slowly pans from the Corgi's cheerful face down to its nimble legs, showing the Corgi nimbly moving across the grass under the golden glow of the sunset.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, tilt down"}
+{"index": 630, "data": "The sky glows with warm orange and pink hues at sunset. A medium shot, featuring an intense shaking effect, captures a cute, happy Corgi with short, fluffy fur playing in a park. The park’s background includes lush green grass, scattered trees with leaves tinted golden by the sunset, and a pathway adorned with blooming flowers. The Corgi, with its characteristic short legs and a wildly wagging tail, joyfully frolics—running in circles, chasing a fluttering leaf, or pausing to nuzzle the grass—while the intense shaking effect amplifies the lively, energetic atmosphere. The sunset casts long, soft shadows on the ground, and gentle breezes rustle the tree branches, complementing the Corgi’s playful movements in this vibrant, shaking scene.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, with an intense shaking effect"}
+{"index": 631, "data": "A medium shot with a steady and smooth perspective captures a cute, happy Corgi with short, fluffy white - and - brown fur (its stumpy tail wagging excitedly) playing in a park during sunset. The Corgi frolics on the lush green grass, leaping to chase a floating dandelion seed or bounding after its own shadow, with its ears perked and eyes bright with joy. The background reveals a park bathed in the warm glow of the setting sun: tall trees with golden - tinged leaves stand against a sky painted in vibrant oranges and soft pinks, while a few scattered clouds glow like embers. The camera follows the Corgi’s playful movements smoothly, maintaining a steady perspective as the dog trots toward the left of the frame, then turns to dash back, its short legs carrying it quickly across the grass, and the sunset casts a golden hue over the entire scene.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, featuring a steady and smooth perspective"}
+{"index": 632, "data": "A medium shot with racking focus captures a cute, happy Corgi—with short stubby legs, a fluffy white - and - brown coat, and perked - up ears—playing in a park. The Corgi bounds across a grassy area dotted with dandelions, its mouth open in a joyful grin as it chases a butterfly. The background features a park landscape: tall oak trees with golden leaves, a winding path, and a sunset sky ablaze with warm amber and lavender hues. The camera’s focus racks: initially sharp on the Corgi as it leaps and twists, then shifting to the glowing sunset behind the trees (where the sun dips below the horizon), before refocusing on the Corgi as it sits, wagging its thick tail and panting lightly at the camera.", "original_prompt_en": "A cute happy Corgi playing in park, sunset, racking focus"}
+{"index": 633, "data": "A medium close - up shot in super slow motion captures Gwen Stacy. She has long blonde hair and is dressed in her classic white - blue outfit. Seated, she holds an open book with dark covers, her eyes intently fixed on the pages. In the super slow motion, the gentle movement of her fingers resting on the book or the slight shift of her posture is exaggeratedly slow, emphasizing the tranquil, drawn - out nature of her reading. The background is a softly lit room with light - colored walls, and a few subtle decorations add to the scene’s ambiance.", "original_prompt_en": "Gwen Stacy reading a book, in super slow motion"}
+{"index": 634, "data": "A medium close - up shot captures Gwen Stacy, with her long blonde hair cascading over her shoulders and dressed in a blue - and - white striped dress, sitting upright and engrossed in reading a book with a dark - colored cover that has golden text on it. The background shows a cozy room with a window letting in soft light, a wooden bookshelf filled with various volumes, and a plush armchair beside her. As the camera zooms in, it focuses more closely on her concentrated expression and the intricate details of the book's pages, highlighting her gentle finger movements as she turns the pages.", "original_prompt_en": "Gwen Stacy reading a book, zoom in"}
+{"index": 635, "data": "Medium shot initially captures Gwen Stacy, a young woman with long blonde hair, seated and engrossed in reading a book with a brown cover. She wears a blue plaid shirt and denim jeans, her eyes focused on the pages. The background reveals a cozy room: a beige armchair, a small wooden side table with a white lamp, and a window with white curtains filtering soft natural light. As the camera zooms out, more of the room unfolds: a patterned rug on the wooden floor, a bookshelf brimming with books against the wall, and a potted plant in the corner. Gwen remains absorbed in her reading, posture relaxed, while the zoom - out expands the view to showcase the warm, inviting interior, including a framed poster on the wall and a plush throw blanket draped over the armchair.", "original_prompt_en": "Gwen Stacy reading a book, zoom out"}
+{"index": 636, "data": "Medium shot of Gwen Stacy reading a book. She has long blonde hair and wears a blue dress with white polka dots, seated on a wooden chair with a plush cushion. The book in her hands features a dark cover with golden lettering. The background reveals a cozy room with bookshelves filled with various books, a window with white curtains letting in soft light, and a small potted plant on a side table. The camera pans left, following her gentle head movements as she reads, revealing more of the room’s decor, including a framed artwork on the wall and a stack of magazines beneath the table, while she remains engrossed in her reading.", "original_prompt_en": "Gwen Stacy reading a book, pan left"}
+{"index": 637, "data": "A medium shot captures Gwen Stacy, a young woman with long wavy blonde hair, seated in a cozy armchair as she reads a book with a dark green cover. She wears a white blouse and a blue plaid skirt, her expression focused as she traces the text with her finger. The background features a sunlit room with wooden bookshelves filled with colorful books, a window with sheer curtains, and a small table holding a steaming cup of coffee and a vase of white flowers. The camera pans right, slowly revealing more of the room—including a potted fern on the windowsill and a framed painting on the wall—while Gwen remains absorbed in her reading, occasionally turning a page with a gentle flick of her wrist.", "original_prompt_en": "Gwen Stacy reading a book, pan right"}
+{"index": 638, "data": "A medium close - up shot captures Gwen Stacy, a young woman with blonde hair, dressed in a casual outfit, sitting in a quiet room. She holds a book open, her eyes fixed on the pages, deeply engrossed in reading. The background features a wall with framed pictures and a soft - lit lamp. Then, the camera tilts up, slowly moving upward to reveal the upper part of the book, the ceiling with a simple design, and a small window showing a glimpse of the overcast sky. Gwen remains focused on her reading, her fingers occasionally brushing against the book's edges as she continues to absorb the words.", "original_prompt_en": "Gwen Stacy reading a book, tilt up"}
+{"index": 639, "data": "A medium shot captures Gwen Stacy, a young woman with shoulder - length blonde hair, engrossed in reading a book. She is dressed in a white button - down shirt and dark blue jeans, holding the book with both hands, her eyes scanning the text intently. The background features a softly lit room with a wooden bookshelf filled with various books and a potted plant with green leaves. As the camera tilts down, it reveals her black leather shoes resting on a light brown carpet with a subtle geometric pattern, and the lower part of her jeans, which have a slight crease at the knees.", "original_prompt_en": "Gwen Stacy reading a book, tilt down"}
+{"index": 640, "data": "A medium shot captures Gwen Stacy, with her blonde hair flowing, intently reading a book with a dark - colored cover. She is dressed in a white blouse, and her eyes are fixed on the pages. Throughout the scene, an intense shaking effect is applied, making the entire frame, including Gwen and the book she holds, tremble vigorously. The background reveals a cozy room with wooden bookshelves filled with various books, and soft light filters through a window with white curtains, casting gentle shadows on the floor.", "original_prompt_en": "Gwen Stacy reading a book, with an intense shaking effect"}
+{"index": 641, "data": "A medium shot with a steady, smooth perspective captures Gwen Stacy engrossed in reading a book. She has shoulder - length blonde hair and is dressed in a white - collared blue dress. Seated on a wooden chair with a cushioned seat, she holds the book with both hands, her eyes moving across the pages. The background reveals a sun - lit room with a large window adorned with white curtains, through which soft light streams in, casting gentle shadows on the light - colored carpeted floor. A small potted plant with green leaves sits on the wooden side table beside her, and a few framed pictures hang on the light - colored wall. The camera maintains a steady, smooth perspective, following her subtle movements as she reads, creating a calm and focused atmosphere.", "original_prompt_en": "Gwen Stacy reading a book, featuring a steady and smooth perspective"}
+{"index": 642, "data": "Medium shot of Gwen Stacy, with long blonde hair and a concentrated expression, seated and reading a book with a dark embossed cover. She wears a white blouse, and the background features a sunlit room with wooden bookshelves lined with colorful volumes and a potted plant on a side table. The rack focus effect is applied: initially, Gwen’s face is sharply in focus, then the focus shifts to the book’s pages, revealing fine printed text, before refocusing on her as she gently turns a page, her eyes scanning the words with intent.", "original_prompt_en": "Gwen Stacy reading a book, racking focus"}
+{"index": 643, "data": "The sky is clear. A long shot captures a white leisure boat sailing leisurely along the Seine River, with the iconic Eiffel Tower standing tall in the background. In super slow motion, the boat glides smoothly over the calm, rippling water, its hull reflecting the soft sunlight. The riverbanks are lined with charming Parisian buildings, their facades adding a picturesque touch to the scene. The camera remains steady, focusing on the boat as it moves gently, emphasizing the tranquil and graceful motion enhanced by the super - slow - motion effect.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, in super slow motion"}
+{"index": 644, "data": "A medium long shot captures a sleek, white tourist boat sailing leisurely along the calm, turquoise waters of the Seine River. The iconic Eiffel Tower, with its intricate iron lattice structure glistening under an overcast sky, stands majestically in the background, framed by quaint stone buildings and leafy trees lining the riverbanks. The river’s surface ripples gently as the boat glides, creating soft reflections. As the camera zooms in, it brings the boat and the Eiffel Tower into sharper focus, highlighting the boat’s leisurely movement and the tower’s towering, intricate details against the overcast sky. The boat continues its slow, graceful journey along the river, with the Eiffel Tower’s silhouette becoming more prominent as the zoom - in progresses.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom in"}
+{"index": 645, "data": "A zoom - out shot captures a boat sailing leisurely along the Seine River, gliding smoothly on the calm water surface with faint ripples spreading out. In the background, the iconic Eiffel Tower, with its intricate iron - lattice structure, stands tall, its silhouette distinct against the sky. As the camera zooms out, more of the Seine River’s scenic surroundings are revealed: the riverbanks are lined with old - style stone buildings with warm - hued facades, and lush green trees sway gently by the water, adding a touch of natural beauty to the urban landscape.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, zoom out"}
+{"index": 646, "data": "The sky is clear with a few wispy white clouds. A medium - long shot captures a sleek white tourist boat with a blue stripe along its hull sailing leisurely on the calm, silver - hued waters of the Seine River. In the background, the iconic Eiffel Tower, with its distinctive iron lattice structure and pointed peak, stands tall against the bright sky. The riverbanks are lined with elegant stone buildings and lush green trees, their reflections rippling on the water's surface. As the boat glides smoothly forward, the camera pans left, following its movement and revealing more of the picturesque river scenery, including other small boats bobbing gently and the distant outline of Parisian landmarks emerging on the horizon.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan left"}
+{"index": 647, "data": "A medium shot captures a white - painted leisure boat sailing leisurely along the calm, turquoise - hued Seine River. The boat's bow gently cuts through the water, creating faint ripples. A few passengers can be seen on the deck: some are leaning against the railing, while others are seated, all immersed in the beautiful scenery. In the background, the iconic Eiffel Tower stands tall, its iron lattice structure glinting under a partly cloudy sky, with patches of blue peeking through the clouds. The riverbanks are lined with historic stone buildings, their ornate facades reflecting in the water, and lush green trees swaying gently in the breeze. The camera pans right, following the boat's peaceful journey and revealing more of the picturesque riverfront, including quaint stone bridges, bustling riverside promenades, and charming cafes where people gather to watch the passing vessels.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, pan right"}
+{"index": 648, "data": "A tilt - up shot captures a boat with a sleek hull sailing leisurely along the Seine River, its hull gently moving through the calm and reflective water. In the background, the iconic Eiffel Tower, with its distinctive iron - lattice structure, stands tall against a pale blue sky dotted with wispy white clouds. As the camera tilts up, it reveals more of the tower’s upper sections and the expansive sky, emphasizing the serene journey of the boat on the river with the magnificent Eiffel Tower as a backdrop.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt up"}
+{"index": 649, "data": "Long shot—clear blue sky overhead. A white sailboat with a sleek blue hull glides leisurely along the calm, sun - dappled Seine River, its bow cutting through the water to create soft ripples. In the background, the iconic Eiffel Tower rises majestically, its intricate iron framework catching the sunlight. The riverbanks are lined with historic stone buildings, their facades adorned with arched windows and green - shuttered balconies, while a few pedestrians stroll along the tree - lined promenade. The camera tilts down from the Eiffel Tower’s silhouette, slowly revealing the boat as it drifts serenely, with the city’s charming architecture framing the scene.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, tilt down"}
+{"index": 650, "data": "Long shot of a white leisure boat sailing leisurely along the Seine River, with the iconic Eiffel Tower, its iron - lattice structure distinct, standing tall in the background against a blue sky dotted with white clouds. The Seine’s surface ripples gently, and an intense shaking effect permeates the scene, as if the camera trembles, imbuing the tranquil river view with a dynamic, unstable touch.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, with an intense shaking effect"}
+{"index": 651, "data": "Panoramic shot of a white leisure boat with a polished hull sailing leisurely along the calm, glistening Seine River. The Eiffel Tower, with its iconic iron lattice structure, stands majestically in the background, silhouetted against a clear blue sky with a few wispy clouds. The river’s surface is smooth, reflecting the tower’s outline and the surrounding historic stone buildings lined with lush green trees. The camera maintains a steady and smooth perspective, capturing the boat’s gentle glide as it moves past the picturesque riverbanks, with the tower’s intricate design adding a touch of grandeur to the serene scene.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, featuring a steady and smooth perspective"}
+{"index": 652, "data": "A medium long shot captures a sleek white boat sailing leisurely along the calm, shimmering Seine River, its hull gently cutting through the water and creating faint, silvery ripples. In the background, the iconic Eiffel Tower—with its intricate iron latticework and a warm, golden - brown sheen—rises majestically against a pale blue sky dotted with soft, wispy clouds. The camera utilizes a racking focus technique: initially, the boat is in sharp focus, emphasizing its smooth, relaxed movement, while the Eiffel Tower appears softly blurred. Gradually, the focus shifts, blurring the boat to bring the tower’s detailed architecture into crisp clarity, showcasing its towering structure and the subtle play of light on its metal surfaces. The riverbanks are lined with elegant stone buildings and lush green trees, enhancing the picturesque Parisian ambiance.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background, racking focus"}
+{"index": 653, "data": "Super slow - motion medium shot captures a couple dressed in elegant formal evening wear — the man in a tailored black tuxedo with a crisp white shirt and bow - tie, the woman in a flowing navy evening gown adorned with delicate lace trims — as they make their way home. They are caught in a heavy downpour, holding black umbrellas that tilt slightly under the force of the rain, with raindrops suspended in mid - air like glistening pearls due to the slow - motion effect. The background reveals a dimly lit city street at night, with street lamps casting warm, blurry halos through the rain, and tall buildings with glowing windows lining the sidewalk. The couple’s hair is slightly damp at the edges, and their shoes splash through the puddles forming on the pavement as they walk, the camera maintaining a steady focus on their slow, deliberate movements amidst the cascading rain.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, in super slow motion"}
+{"index": 654, "data": "A medium shot that gradually zooms in captures a couple in elegant formal evening wear — the man in a black tuxedo with a bow - tie, the woman in a floor - length, glossy dark - colored gown — heading home. They are caught in a heavy downpour, each holding a black umbrella with curved handles. The ground is wet and reflective, with raindrops splashing forcefully. The background shows a nighttime city street, where streetlights cast hazy halos through the rain, and the sky is covered with dark, overcast clouds. As the camera zooms in, it focuses on their hurried steps and slightly anxious expressions: the man holds the umbrella over the woman protectively, while the woman clutches her gown to keep it from getting soaked, both hastening their pace in the torrential rain.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom in"}
+{"index": 655, "data": "A zoom - out shot captures a couple dressed in elegant formal evening wear—the man in a sleek black tuxedo with a crisp white shirt and a bow - tie, the woman in a flowing, floor - length gown of deep navy blue—on their way home, caught in a heavy downpour. They each hold a black umbrella with silver trim, struggling slightly against the wind as thick, relentless raindrops batter the umbrellas and splash onto the wet, glistening pavement. The background reveals a city street lined with tall, dimly lit buildings, their windows reflecting the rain’s sheen, and street lamps casting hazy halos through the storm. As the camera zooms out, the couple becomes a small figure amidst the vast, rain - soaked urban landscape, with puddles forming rapidly on the road and a few distant cars with headlights on, navigating the stormy streets.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, zoom out"}
+{"index": 656, "data": "The sky is overcast, and a heavy downpour suddenly unleashes. A medium full shot captures a couple in formal evening wear—the man in a sleek black tuxedo with a crisp white shirt and black bow tie, the woman in a flowing, floor - length gown (with intricate beading that glimmers faintly through the rain)—hurrying home, each gripping a black umbrella with silver accents, though raindrops still splatter their elegant hems. They walk arm - in - arm along a wet, glistening city street at night, where street lamps cast warm, blurry light on the rain - soaked pavement, and distant buildings with lit windows blur against the stormy sky. A few other pedestrians dash by, some huddling under awnings, others clutching small umbrellas. The camera pans left, following the couple as they navigate puddle - ridden sidewalks, their shoes squelching in the water, while the downpour relentlessly drenches the scene.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan left"}
+{"index": 657, "data": "The sky is dark and overcast, unleashing a heavy downpour. A medium full shot captures a couple dressed in elegant formal evening wear—he in a tailored black tuxedo with a crisp white shirt, she in a flowing navy - blue gown with delicate lace details—hurrying home while clutching black umbrellas that struggle against the wind. The wet city street beneath them reflects the warm glow of streetlights, with vintage European - style buildings lining both sides, their shop windows emitting a cozy yellow light. Their formal attire is dampened by the rain; her gown’s hem and his trousers’ cuffs are speckled with water droplets, and the umbrella surfaces are slightly flipped by the gusts. The two walk side by side, their steps hasty yet striving to maintain grace. The camera pans right, following them through the rain curtain, as distant streets blur in the mist and a few cars splash through the water as they pass.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, pan right"}
+{"index": 658, "data": "A medium full shot captures a couple dressed in elegant formal evening wear—the man in a sharp black tuxedo with a crisp white shirt and bow tie, the woman in a flowing dark - colored gown with delicate embroidery—hurrying home while caught in a heavy downpour. They each hold a black umbrella, with raindrops densely hitting the ground and splashing around them, creating small puddles that reflect the dim light of the street lamps. The background shows a dimly lit urban street with tall buildings standing silently, their outlines faintly visible in the rain. The camera tilts up, starting from the rain - soaked hems of their clothes and the rain - covered ground, slowly rising to frame their determined expressions and the overcast, rain - filled sky above, capturing their urgent rush against the storm.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt up"}
+{"index": 659, "data": "A medium full shot captures a couple dressed in elegant formal evening wear—the man in a black tuxedo with a crisp white dress shirt and bow tie, the woman in a floor - length satin gown adorned with delicate lace trims—making their way home. They are caught in a heavy downpour, where dense, steely - gray raindrops cascade like a curtain. Both clutch black umbrellas with curved wooden handles, attempting to shield themselves from the relentless rain. The woman huddles slightly closer to the man, the hem of her gown dampening at the edges, while the man adjusts his umbrella to better cover her. The background reveals a dimly lit city street, the wet pavement reflecting the warm glow of street lamps, and a few blurred buildings with glowing windows in the distance. The camera tilts down, shifting focus from their upper bodies to the wet ground beneath their feet—water splashes around the man’s polished black dress shoes and the woman’s high - heeled shoes, now glistening with rainwater.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, tilt down"}
+{"index": 660, "data": "Medium full shot captures a couple in elegant formal evening wear—the man in a black tuxedo, the woman in a flowing evening gown—heading home, caught in a heavy downpour. They hold black umbrellas that strain against the gusty wind, with an intense shaking effect (as if the storm buffets the camera). The background shows a nocturnal street: rain pours down, forming puddles on the wet pavement that reflect the faint glow of street lamps. Rain - soaked buildings with lit windows loom in the distance, and raindrops streak the camera lens, enhancing the chaotic atmosphere. The couple, with damp hair and wet hems on their formal attire, hurry forward, hunching against the rain, as the shaking effect persists, emphasizing the ferocity of the storm.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, with an intense shaking effect"}
+{"index": 661, "data": "A medium shot with a steady and smooth perspective captures a couple in formal evening wear—the man in a black tailored tuxedo with a bow tie, the woman in a flowing dark - hued evening gown with delicate lace trims—heading home and caught in a heavy downpour. Each holds a black umbrella, yet the relentless rain still dampens the edges of their elegant outfits. The background shows a dimly lit urban street at night, with wet pavement reflecting streetlights and warm glows from nearby building windows. The camera maintains a steady, smooth follow - shot, gliding alongside them as they carefully navigate the rain - slicked road, steps slow to avoid slipping. The downpour’s roar and wet fabric rustles fill the scene, while distant car headlights pierce the rain, enhancing the atmospheric nighttime city setting.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, featuring a steady and smooth perspective"}
+{"index": 662, "data": "A medium full shot captures a couple in formal evening wear—the man in a sleek black tuxedo paired with a crisp white dress shirt and a black bow tie, the woman in a floor - length navy - blue evening gown featuring delicate lace trimmings—getting caught in a heavy downpour as they head home. Each holds a black umbrella with a silver handle, trying to fend off the relentless rain. The backdrop is a dimly lit city street at night, where the wet pavement reflects the warm glow of street lamps, and faint silhouettes of tall buildings loom in the distance. The scene uses racking focus: at first, the couple’s resolute yet slightly disheveled postures are sharply focused, and then the focus shifts gently to the torrential rain, capturing the dense raindrops striking the umbrellas and the water splashing from the puddle - strewn ground.", "original_prompt_en": "A couple in formal evening wear going home get caught in a heavy downpour with umbrellas, racking focus"}
+{"index": 663, "data": "Super slow - motion long shot of an astronaut floating in space. The astronaut wears a white spacesuit with silver accents, and the transparent helmet reflects distant starlight. The background is the boundless, dark cosmos, dotted with twinkling stars and wispy nebulae. In super slow motion, the astronaut’s body drifts leisurely—limbs moving in an almost imperceptible, graceful rhythm, as if suspended in weightlessness. The spacesuit’s fabric and tubes shift subtly with each tiny motion. The camera holds steady, capturing every delicate movement, while the star - filled backdrop flows slowly in sync with the slowed - down action.", "original_prompt_en": "An astronaut flying in space, in super slow motion"}
+{"index": 664, "data": "A medium shot gradually zooms in to capture an astronaut floating in the vast, dark expanse of space. The astronaut is clad in a pristine white spacesuit with a reflective visor, which faintly mirrors the twinkling stars and nebulous clouds scattered across the cosmic backdrop. Their body slowly rotates, hands gently grasping the suit’s handles as they drift weightlessly—every movement emphasizing the serene, gravity - free environment of space. The background is a boundless canvas of inky blackness, dotted with distant, shimmering stars and wispy, colorful nebulae that stretch into the infinite distance.", "original_prompt_en": "An astronaut flying in space, zoom in"}
+{"index": 665, "data": "A zoom - out shot captures an astronaut flying in space. The astronaut is dressed in a silvery - white spacesuit, with the helmet reflecting faint starlight. The astronaut floats in a relaxed posture, limbs gently moving to control the direction. The background is the profound black universe, dotted with countless twinkling stars like scattered diamonds, and a hazy nebula or the outline of a blue planet can be seen in the distance. As the camera slowly zooms out, the astronaut appears tiny yet distinct against the vast expanse of space, and more details of the space environment, such as additional celestial bodies and cosmic dust, are revealed.", "original_prompt_en": "An astronaut flying in space, zoom out"}
+{"index": 666, "data": "A long shot captures an astronaut flying in space. The astronaut, clad in a white spacesuit with a visor reflecting the faint glimmer of distant stars, floats gracefully with limbs gently extended in the weightless void. The background reveals a deep black cosmos dotted with countless twinkling stars, while a faint blue nebula looms distantly. The camera pans left, following the astronaut as he slowly drifts toward the left side of the frame, capturing his solitary yet majestic flight through the vast universe.", "original_prompt_en": "An astronaut flying in space, pan left"}
+{"index": 667, "data": "A long shot captures an astronaut clad in a white spacesuit with a reflective helmet floating in the vast, dark expanse of space. The backdrop is dotted with twinkling stars and faint, wispy nebulae, evoking the boundless cosmos. The astronaut’s body is slightly angled, limbs positioned as if navigating the zero - gravity realm. The camera pans right, following the astronaut’s smooth drift, revealing more of the star - speckled cosmic landscape.", "original_prompt_en": "An astronaut flying in space, pan right"}
+{"index": 668, "data": "A tilt - up shot captures an astronaut flying in space. The astronaut, clad in a white spacesuit with subtle reflective details, floats gracefully with a slightly angled posture, as if navigating the weightless void. The background is the vast, inky blackness of space, dotted with twinkling stars and the faint glow of distant celestial bodies— a partial blue - and - white planet (likely Earth) peeks from the lower left, lending scale to the scene. The camera executes a tilt - up, gradually revealing more of the astronaut’s upper body and the expansive cosmos above, while the astronaut continues to drift, limbs relaxed in the cosmic environment.", "original_prompt_en": "An astronaut flying in space, tilt up"}
+{"index": 669, "data": "A tilt - down shot captures an astronaut flying in space. The astronaut, clad in a pristine white spacesuit with a glossy, reflective helmet, floats in a horizontal posture, his body slightly angled forward, and gloved hands held in a relaxed yet purposeful manner as if maneuvering through the weightless void. The backdrop is the boundless, pitch - black expanse of space, speckled with twinkling stars and a distant, softly glowing blue planet peeking from the darkness. As the camera executes a tilt - down motion, it gradually reveals more of the astronaut’s gentle, floating movement amidst the cosmic scenery, with the star - studded void and the far - off celestial body emphasizing the solitude and vastness of his spaceflight.", "original_prompt_en": "An astronaut flying in space, tilt down"}
+{"index": 670, "data": "Long shot of an astronaut in a white spacesuit with a reflective helmet flying through the vast, dark cosmos. The background showcases distant twinkling stars, faint nebulae, and the curved silhouette of a blue - and - white planet (likely Earth) in the distance. The scene features an intense shaking effect, as if the astronaut’s craft or the camera is buffeted by turbulent forces, causing the astronaut’s form to jolt erratically against the celestial backdrop.", "original_prompt_en": "An astronaut flying in space, with an intense shaking effect"}
+{"index": 671, "data": "Long shot of an astronaut flying in space. The astronaut, clad in a white spacesuit with a reflective helmet visor and subtle technical details, moves with steady, smooth motion. The backdrop is the vast, inky - black expanse of space, sprinkled with twinkling stars and wispy nebulae. The perspective remains steady and smooth, capturing the astronaut’s controlled, floating movement—arms slightly curved, legs relaxed—as the camera maintains a fixed, steady angle to follow their flight, highlighting the serene, boundless cosmos.", "original_prompt_en": "An astronaut flying in space, featuring a steady and smooth perspective"}
+{"index": 672, "data": "A medium long shot captures an astronaut in a white spacesuit floating in the vast, inky - black expanse of space. The background is a deep black void dotted with faint, twinkling stars, and a distant blue - and - white celestial body (resembling Earth) is partially visible. The camera employs a racking focus technique: initially, the focus is on the astronaut, showing the detailed textures of the spacesuit, the reflective visor of the helmet, and the subtle, controlled movements of their limbs as they drift in the zero - gravity environment. Then, the focus shifts to the starry backdrop or the distant celestial body, emphasizing the depth of space. The astronaut’s posture is relaxed yet controlled, with limbs slightly extended as they drift, conveying the weightlessness of the space environment.", "original_prompt_en": "An astronaut flying in space, racking focus"}
+{"index": 673, "data": "Panoramic shot of snow - blanketed rocky mountain peaks surrounding deep canyons. The snow - covered rocky mountains cast shadows over the canyons, and the canyons twist and bend through the high - elevated mountain peaks. The scene is captured in super slow motion, with the camera maintaining a steady perspective to highlight the majestic, winding landscape of the snow - capped mountains and their shadowed canyons.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, in super slow motion"}
+{"index": 674, "data": "Panoramic shot of snow - blanketed rocky mountain peaks and canyons. Snow - blanketed rocky mountains surround and cast shadows over the deep canyons, which twist and bend through the high - elevated mountain peaks. The mountains display rugged rock faces partially covered by smooth, white snow, while the canyons reveal their deep, shadowy depths. The background consists of a vast range of snow - capped peaks with sharp rocky outcrops. The sky is clear with a pale blue tint. As the camera zooms in, it captures the intricate winding paths of the canyons and the textured snow - covered rocks, emphasizing the depth and grandeur of the landscape.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom in"}
+{"index": 675, "data": "A panoramic shot captures snow - blanketed rocky mountains surrounding and casting shadows over deep canyons. The canyons twist and bend through the high - elevated mountain peaks, with their rugged rock faces contrasting against the smooth snow. The sky above is a clear blue, which enhances the stark beauty of the landscape. As the camera zooms out, a more expansive view of the mountainous terrain and the winding canyons within it is revealed.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, zoom out"}
+{"index": 676, "data": "The sky is clear with a light blue tint. A panoramic shot of snow - blanketed rocky mountains surrounding deep canyons. The rocky mountains, covered in thick snow, cast shadows over the deep canyons that twist and bend through the high - elevated, jagged mountain peaks. The canyon floors are a mix of rocks and patches of snow. The camera pans left, capturing the vast expanse of the snow - capped mountains and the winding canyons stretching into the distance.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan left"}
+{"index": 677, "data": "A panoramic shot captures snow - blanketed rocky mountain peaks and deep canyons. The snow - blanketed rocky mountains, with rugged gray - brown rock surfaces partially covered in smooth white snow, surround the deep canyons and cast dark shadows over them. The canyons twist and bend sinuously through the high - elevated mountain peaks, revealing the steep, rocky walls of the canyons as they cut through the mountainous terrain. The sky above is clear and pale blue, enhancing the scene’s vastness. The camera pans right, revealing more of the magnificent mountain - canyon landscape, with additional snow - capped peaks and winding canyons coming into view.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, pan right"}
+{"index": 678, "data": "A panoramic shot captures snow - blanketed rocky mountains with rugged, grayish - white rock textures. These mountains surround and cast deep shadows over the deep canyons that twist and bend through the high - elevated mountain peaks. The camera tilts up, capturing the majestic height of the mountain peaks as the snow - covered rocky mountains enclose the sinuous canyons, with the canyons winding their way through the lofty mountain peaks.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt up"}
+{"index": 679, "data": "Panoramic shot of snow - blanketed rocky mountain peaks surrounding deep, twisting canyons. The snow - white peaks, with exposed gray rocky textures, cast long shadows over the canyons that snake and bend through the high - elevated mountains. The canyons’ rugged walls reveal layered rock formations, and the camera tilts down to capture the depth of the canyons as they wind amidst the mountainous landscape. The sky above is clear with a few wispy clouds, highlighting the stark contrast between the white snow, gray rocks, and the shadowed, twisting canyons below.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, tilt down"}
+{"index": 680, "data": "Panoramic shot of snow - blanketed rocky mountain peaks and deep canyons. The snow - covered rocky mountains surround the canyons, casting shadows over the deep, twisting gorges that wind through the high - elevated peaks. An intense shaking effect is applied to the camera, emphasizing the rugged and dynamic nature of the mountainous canyon landscape.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, with an intense shaking effect"}
+{"index": 681, "data": "Panoramic shot of snow - blanketed rocky mountain peaks and deep canyons. The snow - blanketed rocky mountains surround the area, casting shadows over the deep canyons that twist and bend through the high - elevated mountain peaks. The perspective is steady and smooth, capturing the rugged texture of the rocky peaks peeking out from beneath the thick, white snow. The canyons, with their winding paths, cut through the mountainous landscape, while the snow - covered peaks display a mix of white snow and gray - brown rock. The background reveals the expansive mountainous terrain, with the peaks rising sharply into a clear blue sky that emphasizes the snow’s brightness. The camera maintains a steady, smooth perspective, showcasing the majestic and serene scene of the snow - capped mountains and the meandering canyons.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, featuring a steady and smooth perspective"}
+{"index": 682, "data": "A panoramic shot captures snow - blanketed rocky mountains surrounding and casting shadows over deep canyons. The canyons twist and bend through the high - elevated mountain peaks. During the scene, a racking focus effect is applied, alternating the focus to highlight the rugged textures of the rocky mountains and the winding forms of the canyons. The sky above is clear, with bright light enhancing the contrast between the white snow and the dark rock surfaces of the mountains, while the deep canyons lie in their shadows, adding a sense of depth and mystery to the landscape.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks, racking focus"}
+{"index": 683, "data": "A close - up shot captures a bunch of vibrant purple, plump and juicy grapes resting on a smooth wooden rotating table. The table spins slowly clockwise, causing the grapes to sway gently, their natural waxy bloom glistening under soft ambient light. The background is minimal, revealing only the dark wood grain of the table’s edge, while the camera remains fixed in a close - up, emphasizing the grapes’ succulent texture and the subtle rotational motion of the table.", "original_prompt_en": "Close up of grapes on a rotating table."}
+{"index": 684, "data": "A long shot captures a turtle swimming in the ocean. The turtle has a dark - green shell with brown patterns, and its limbs are paddling rhythmically. The ocean water is a deep blue, with sunlight filtering through, creating shimmering spots. Small fish can be seen darting by occasionally in the background. The turtle moves gracefully, its body swaying slightly with the gentle current, as it navigates through the water.", "original_prompt_en": "Turtle swimming in ocean."}
+{"index": 685, "data": "A medium shot captures a storm trooper clad in the iconic white armor with black visor detailing, standing on a golden - sand beach. He holds a silver vacuum cleaner with a flexible hose, methodically vacuuming the sand—small grains and tiny debris are being sucked into the device. Behind him, turquoise ocean waves gently crash onto the shore, and a clear blue sky stretches overhead. The trooper remains focused, his posture steady as he moves the vacuum across the beach, with seagulls occasionally flying in the distant background.", "original_prompt_en": "A storm trooper vacuuming the beach."}
+{"index": 686, "data": "A medium shot captures a panda standing upright on a light - colored surfboard in the ocean at sunset. The panda, with its distinctive black - and - white fur, maintains a steady posture as gentle waves lap at the surfboard. The sky glows with warm orange and pink hues from the setting sun, with a few scattered clouds, and the vast ocean stretches out to the horizon, its surface shimmering under the golden light of the sunset.", "original_prompt_en": "A panda standing on a surfboard in the ocean in sunset."}
+{"index": 687, "data": "A medium shot captures an astronaut in a white spacesuit with reflective accents feeding ducks on a sunny afternoon. The scene unfolds by a calm pond, where sunlight creates gentle ripples and shimmering reflections on the water’s surface. Several ducks—including mallards with iridescent green heads and brown bodies—gather around the astronaut’s feet, eagerly pecking at the food he offers from his gloved hands. The background features a lush green park with tall trees casting dappled shadows, and a clear blue sky dotted with fluffy white clouds. The astronaut bends slightly, his posture relaxed yet focused, as the ducks quack softly and paddle in the shallow water, their reflections dancing alongside them amidst the sunlit ripples.", "original_prompt_en": "An astronaut feeding ducks on a sunny afternoon, reflection from the water."}
+{"index": 688, "data": "Medium shot of two giant pandas in a bamboo - fringed study nook. The panda on the left, with its iconic black - and - white fur, sits upright on a wooden bench, holding a white sheet (an academic paper) with black text in its right paw, pointing at a section with its left paw. The panda on the right leans in, head tilted toward the paper, eyes fixed on the content, its right paw resting on the table as if deeply engaged in discussion. The background features lush green bamboo swaying gently, a wooden table between them holding a pen holder, a stack of books, and a steaming bamboo - shoot - shaped teacup. Sunlight filters through bamboo leaves, casting dappled shadows. The pandas occasionally shift their postures—the left one gesturing at the paper while the right one nods thoughtfully—fully immersed in their academic exchange.", "original_prompt_en": "Two pandas discussing an academic paper."}
+{"index": 689, "data": "Time - lapse shot of a beach at sunset. The sky is ablaze with a dynamic palette of colors—swirling oranges, soft pinks, and deep purples—while fluffy clouds drift gracefully across the horizon, their edges tinted by the setting sun. The beach below features golden sand stretching along the shore, with gentle waves lapping at the water’s edge. As the time - lapse unfolds, the clouds move steadily, and the sky’s hues shift and blend, capturing the serene yet vibrant transition of the sunset over the tranquil beach.", "original_prompt_en": "Sunset time lapse at the beach with moving clouds and colors in the sky."}
+{"index": 690, "data": "Medium shot of a plump rabbit with fluffy white fur, clad in a flowing purple robe that billows gently as it moves. The rabbit walks with a slight waddle—its round belly jiggling softly with each step—through a surreal fantasy landscape. Its long, floppy ears (tipped in pale gray) sway, and bright, curious eyes scan the surroundings. The background unfolds as a dreamlike realm: glowing bioluminescent plants carpet the mossy ground, twisted rainbow - hued trees stretch toward a sky of swirling pastel clouds, and floating crystal shards glimmer in the air. The camera follows the rabbit’s amble, panning to capture the whimsical scenery—from iridescent foliage to distant floating islands—as the rabbit ventures deeper into the enchanted terrain.", "original_prompt_en": "A fat rabbit wearing a purple robe walking through a fantasy landscape."}
+{"index": 691, "data": "Medium shot of a koala bear with thick gray fur, round ears, and a black nose, sitting on a grassy patch in a lush forest. The koala is positioned in front of a small wooden piano with black - and - white keys, its front paws gently pressing the keys as if playing a tune. The background is filled with tall green trees with broad leaves, and patches of sunlight filter through the tree canopy onto the forest floor which is covered with fallen leaves and vibrant green grass. A few white clouds are scattered in the partly visible sky above. The koala keeps playing the piano, occasionally moving its head, and the camera remains fixed, capturing the delightful scene of the koala engrossed in its musical activity.", "original_prompt_en": "A koala bear playing piano in the forest."}
+{"index": 692, "data": "Long shot of an astronaut flying in space. The astronaut is clad in a white spacesuit with a reflective helmet visor and mission patches on the chest, floating gently—their body slowly rotating, arms slightly bent as if balancing in the weightless void. The background reveals the vast, dark expanse of space, dotted with twinkling stars and faint, colorful nebulae. The camera remains steady, capturing the astronaut’s smooth, drifting movements amid the serene, boundless cosmos.", "original_prompt_en": "An astronaut flying in space."}
+{"index": 693, "data": "A panoramic night shot captures vibrant fireworks bursting in the dark, star - dotted sky. The fireworks, in colors of bright red, vivid blue and golden yellow, spread into intricate patterns—some like blooming flowers, others like cascading sparks—illuminating the inky night. The background has the silhouettes of tall city buildings with glowing windows, framing the scene. Initially fixed, the camera tilts upward to follow the ascending trails of the fireworks as they explode into dazzling bursts and then fade. Some fireworks crackle with sharp pops, while others bloom softly, creating a dynamic contrast in the serene night.", "original_prompt_en": "Fireworks."}
+{"index": 694, "data": "Long shot of an animated painting - style scene showcasing fluffy white clouds with a cotton - like, billowy texture moving slowly across the sky. The sky, a soft pastel - hued expanse, forms the backdrop, complementing the whimsical, painterly aesthetic of the animation. The clouds drift gracefully, their edges blending gently as they glide, embodying the serene motion typical of such animated cloud depictions.", "original_prompt_en": "An animated painting of fluffy white clouds moving in sky."}
+{"index": 695, "data": "Long shot of a perspective flying through a fantasy landscape. The landscape is filled with floating islands covered in luminescent purple plants, cascading waterfalls glowing turquoise, and towering twisted trees with pink foliage. The sky is a gradient of deep blue to vibrant orange, dotted with iridescent clouds that shift colors as the perspective moves forward. Below, winding rivers of liquid gold and fields of bioluminescent flowers light the ground. The perspective glides forward, passing a massive crystal - like structure emitting soft white light, then a misty area where giant winged creatures with iridescent scales soar in the distance. The fantasy landscape, with ever - changing wonders, unfolds as the camera continues its flight through this otherworldly realm.", "original_prompt_en": "Flying through fantasy landscapes."}
+{"index": 696, "data": "A medium shot captures a bigfoot walking steadily through a ferocious snowstorm. The bigfoot, covered in thick, dark - brown fur and with a massive, hunched frame, takes heavy, deliberate steps that leave deep imprints in the snow. Snowflakes swirl densely around it, driven by strong winds that buffet its form, while the ground is blanketed with a thick layer of fresh, powdery snow. In the background, snow - laden trees bend under the storm’s force, and the sky is a gloomy gray, merging with the swirling snow to create a near - whiteout. The bigfoot’s head is slightly bowed against the driving snow, and its breath forms faint white clouds in the frigid air as it continues its slow, determined walk.", "original_prompt_en": "A bigfoot walking in the snowstorm."}
+{"index": 697, "data": "Medium shot of a brown squirrel with a fluffy tail eating a burger. The burger features a golden - brown bun, a juicy meat patty, and green lettuce peeking out. The squirrel holds the burger with its small, nimble front paws, taking small bites while sitting on a patch of green grass dotted with fallen leaves. The background shows scattered tree branches, and sunlight filters through, casting a warm glow. The squirrel chews slowly, its whiskers twitching with each bite, and occasionally pauses to glance around before continuing to enjoy the burger.", "original_prompt_en": "A squirrel eating a burger."}
+{"index": 698, "data": "A medium shot captures a sleek black cat dressed as a lifeguard, wearing stylish black sunglasses with reflective lenses. The cat stands atop a white plastic lifeguard chair by a sparkling blue swimming pool, its short, smooth fur glistening under the bright sun. The pool’s water shimmers with gentle ripples, and colorful inflatable floaties dot the surface. In the background, vibrant beach umbrellas in shades of pink and yellow line the poolside, while a few people in swimsuits playfully splash in the water. The cat maintains a vigilant stance, its tail flicking slightly as it “patrols” the pool area, embodying the role of a dedicated lifeguard.", "original_prompt_en": "A cat wearing sunglasses and working as a lifeguard at a pool."}
+{"index": 699, "data": "Panoramic shot of snow - blanketed rocky mountain peaks and deep canyons. The snow - blanketed rocky mountains, with rugged gray rock textures peeking out from beneath the thick, glistening white snow, surround and cast long shadows over the deep canyons. The deep canyons, with their steep, shadowy rock walls, twist and bend sinuously through the high - rising, snow - capped mountain peaks. The sky above is a vivid, cloudless blue, highlighting the stark contrast between the pristine white snow, the rugged gray rocks, and the dark, winding canyons. The camera stays fixed, capturing the motionless yet magnificent scene of the mountains enclosing the twisting canyons, as the canyons snake their way through the lofty, snow - covered peaks.", "original_prompt_en": "Snow rocky mountains peaks canyon. snow blanketed rocky mountains surround and shadow deep canyons. the canyons twist and bend through the high elevated mountain peaks."}
+{"index": 700, "data": "Extreme slow - motion close - up shot of a splash of turquoise water. The water droplets, with a semi - transparent look (indicating the inclusion of an alpha channel), scatter in mid - air, glistening. The background is blurred, highlighting the vivid turquoise hue of the water and the delicate, prolonged motion of the splash captured in extreme slow motion.", "original_prompt_en": "Splash of turquoise water in extreme slow motion, alpha channel included."}
+{"index": 701, "data": "A close - up shot captures a vanilla ice - cream cone with light - brown chocolate drizzle on top, placed on a white porcelain plate over a wooden table. The ice cream is gradually melting: its creamy top softens, and pale - colored liquid trickles down the cone, forming small droplets that land on the table. The background shows a simple kitchen setting with a glass of water and a few napkins on the table. Fixed shot, documenting the slow melting process as the ice cream’s texture turns more liquid, with the chocolate drizzle subtly blending into the melting cream.", "original_prompt_en": "an ice cream is melting on the table."}
+{"index": 702, "data": "Aerial long shot captures a gray drone with rapidly spinning propellers flying steadily over a vast snowy forest. The forest below is blanketed in thick, pristine white snow, with tall evergreen trees—their dark green branches heavy with snow—stretching to the horizon. The sky is a clear pale blue, and the drone moves forward smoothly, the camera following its path to showcase the serene, snow - laden expanse, where snowdrifts nestle between tree trunks and bare branches occasionally peek through the snow.", "original_prompt_en": "a drone flying over a snowy forest."}
+{"index": 703, "data": "Long shot of a gray shark swimming in the ocean. The shark has a sleek, torpedo - shaped body and a tall dorsal fin cutting through the deep blue water. Sunlight filters from the surface, creating shimmering light patterns on its gray scales. In the background, there are coral reefs and small schools of colorful fish. The shark moves smoothly, its tail undulating rhythmically, and the camera tracks its movement to keep it centered.", "original_prompt_en": "a shark is swimming in the ocean."}
+{"index": 704, "data": "Aerial panoramic shot from a drone captures a fantasy land. The landscape is dotted with glowing crystal forests, where tall iridescent trees emit a soft blue light. Floating islands linked by rainbow bridges hover in the sky, which displays a dreamy gradient of purple and pink. In the distance, gothic - style castles adorned with neon runes stand alongside futuristic skyscrapers with sleek metallic surfaces. A herd of winged unicorns gallops across a meadow of bioluminescent grass, their golden manes glowing. The ground in some regions is a shimmering silver - hued liquid, while others are covered with giant luminous mushrooms swaying gently. The background is a vast pastel - hued cloudscape, and magical particles shimmer in the air. The drone camera slowly pans right, revealing a river of liquid starlight winding through the land and strange winged creatures with iridescent wings soaring above the castles.", "original_prompt_en": "Aerial panoramic video from a drone of a fantasy land."}
+{"index": 705, "data": "Long shot of a brown teddy bear with soft plush fur swimming in the deep blue ocean. The teddy bear faces forward, its limbs moving in a paddling motion to stay afloat. The ocean water ripples with gentle waves, glistening under the bright sunlight. The background reveals a clear blue sky with a few fluffy white clouds above the distant horizon where the sea meets the sky. The camera follows the teddy bear as it swims steadily toward the right side of the frame, capturing its playful, buoyant movement against the vast, open ocean.", "original_prompt_en": "a teddy bear is swimming in the ocean."}
+{"index": 706, "data": "Time - lapse shot of sunrise on Mars. A long - shot view presents the Martian landscape, where the red, rocky, and dusty surface, dotted with scattered rocks and sand dunes, stretches to the horizon. The sky, tinted orange - red due to the dust and carbon dioxide in Mars' atmosphere, serves as the backdrop. The Sun, appearing smaller than it does from Earth, slowly rises from the Martian horizon. As it ascends, sunlight gradually bathes the terrain, illuminating the red rocks and dunes which shift from dark crimson to bright red under the light. Fine dust particles drift in the Martian wind, glistening in the sunlight and creating a hazy, glowing effect. The camera remains fixed, capturing the Sun’s slow climb, the subtle change in the sky’s color, and the gradual clarity of the landscape as the shadows of rocks and dunes shift with the light.", "original_prompt_en": "time lapse of sunrise on mars."}
+{"index": 707, "data": "A long shot captures a golden fish with shiny golden scales swimming in the ocean. The water is deep blue, with seaweed swaying in the current and small rocky formations on the seabed. The fish moves its tail gently, creating ripples as it swims toward the right of the frame. The camera remains fixed, focusing on the fish as its golden body glints in the light filtering through the water.", "original_prompt_en": "golden fish swimming in the ocean."}
+{"index": 708, "data": "A close - up shot captures an artist’s brush, with dark bristles and a smooth wooden handle, painting on a white canvas with a slightly textured surface. The brush moves in gentle, deliberate strokes, applying vibrant pigment that spreads across the canvas, while the background remains indistinct, emphasizing the intricate motion of the brush as it creates artwork.", "original_prompt_en": "An artist brush painting on a canvas close up."}
+{"index": 709, "data": "A drone shot captures a festive celebration scene. In the foreground, a beautifully decorated Christmas tree, adorned with twinkling lights and colorful ornaments, stands tall. Vibrant fireworks burst in the night sky, creating streaks of red, green, and gold. The background is a clear, starry sky with countless stars twinkling gently. The drone - mounted camera pans across the scene, showcasing the dazzling fireworks illuminating the Christmas tree and the serene, star - dotted sky beyond.", "original_prompt_en": "A drone view of celebration with Christmas tree and fireworks, starry sky - background."}
+{"index": 710, "data": "A medium close - up portrait shot in a studio. A happy dog, dressed in a yellow turtleneck, stands facing the camera with a cheerful expression. The background is dark, highlighting the dog’s vibrant attire and joyful demeanor.", "original_prompt_en": "happy dog wearing a yellow turtleneck, studio, portrait, facing camera, dark background"}
+{"index": 711, "data": "Studio shot of 3D - rendered origami dancers crafted from white paper, performing modern dance against a pristine white background. The dancers, with their angular paper forms, move fluidly—bending, twisting, and extending their paper limbs in sync with the choreography. The camera remains fixed, capturing the crisp, minimalist aesthetic of the white - on - white scene, highlighting the delicate folds and dynamic poses of the origami figures as they dance gracefully.", "original_prompt_en": "Origami dancers in white paper, 3D render, on white background, studio shot, dancing modern dance."}
+{"index": 712, "data": "Wide shot of a campfire burning brightly at night in a snowy forest. The forest floor is blanketed with fresh, glistening snow, and tall evergreen trees—their branches heavy with snow—surround the fire. The campfire’s orange flames flicker and dance, casting warm amber light that contrasts sharply with the cold, white snow. In the background, the sky is a deep inky black, dotted with countless twinkling stars, forming a starry canopy above the silent, snow - covered forest. The scene is still, with the only movement being the gentle sway of the fire’s flames and the subtle shift of snow - laden branches in the faint night breeze.", "original_prompt_en": "Campfire at night in a snowy forest with starry sky in the background."}
+{"index": 713, "data": "A panoramic shot of a fantasy landscape. The scene showcases towering, twisted rock formations with glowing blue veins coursing through them, emerging from a bed of iridescent purple moss that shimmers beneath an otherworldly, pink - hued sky. In the distance, floating islands drift leisurely, their undersides dotted with bioluminescent flora emitting a soft green glow. A delicate mist winds between the rock structures, and in the foreground, a small crystalline stream cascades over smooth, rainbow - colored stones, its water sparkling with magical particles. The camera slowly pans across the landscape, capturing the surreal beauty of this otherworldly realm. The floating islands move lazily toward the left of the frame, and the mist swirls gently around the rock formations.", "original_prompt_en": "a fantasy landscape"}
+{"index": 714, "data": "A close - up shot of a 3D model of an 1800s Victorian house. The model exhibits exquisite details: it has a steeply - pitched roof covered with dark brown shingles, and there are ornate wooden trims along the edges. The house is equipped with tall windows adorned with stained - glass panes in floral patterns, and its facade is painted white, with decorative brackets under the eaves. A small front porch with carved railings is also a part of the model, and there are chimney stacks with brick detailing. The model is placed against a minimalist white background, and soft, diffused lighting illuminates it to highlight its architectural details. The camera is fixed, presenting the 3D model in a static state, emphasizing the classic architectural style of the Victorian era in the 1800s.", "original_prompt_en": "A 3D model of a 1800s victorian house."}
+{"index": 715, "data": "A medium - close shot captures the speaker doing makeup in the morning. She is seated at a white vanity table with a mirror, in a cozy bedroom with light - colored curtains. Soft morning sunlight filters through the curtains, casting a warm glow over the room. On the vanity, various makeup products are placed, including a pink - cased foundation, a brown eyeshadow palette, and a red lipstick. She holds a makeup brush in her right hand, gently applying foundation to her face, while her left hand holds a compact powder case. The camera remains fixed, capturing her smooth and deliberate movements as she carries out her morning makeup routine, from applying the foundation to blending the eyeshadow, showing how she does her makeup in the morning.", "original_prompt_en": "this is how I do makeup in the morning."}
+{"index": 716, "data": "Medium shot of a digital art creation: a raccoon stylized to resemble a turtle. The raccoon has a black - masked face with white fur around the eyes, typical of a raccoon, but its body is armored with a turtle - like shell, patterned with brown and green to mimic a turtle’s carapace. Its furry raccoon - like limbs extend from the shell, and its ringed black - and - white tail curls behind. The background is a dreamy digital landscape with soft, neon - hued gradients (blues and pinks) and pixel - art - inspired textures, emphasizing the artistic, surreal nature of the piece. The raccoon stands still, showcasing the fusion of raccoon and turtle features in this digital artwork.", "original_prompt_en": "A raccoon that looks like a turtle, digital art."}
+{"index": 717, "data": "A panoramic shot captures a robot with a sleek silver metallic body and blue LED - lit joints dancing energetically in Times Square. The robot executes rhythmic dance moves—swinging its arms, rotating its torso, and stepping side to side—while the background teems with neon - illuminated billboards, crowds of pedestrians (some pausing to watch the performance), yellow taxis, and tall glass skyscrapers. The street is lined with colorful storefronts, and the air is filled with the hum of city traffic and lively chatter. The camera follows the robot’s movements, panning smoothly to capture the vibrant urban landscape, with Times Square’s bright lights glinting off the robot’s surface.", "original_prompt_en": "Robot dancing in Times Square."}
+{"index": 718, "data": "Long shot of a busy freeway at night. The sky is deep black, with faint city lights twinkling in the distance. The freeway is filled with vehicles: cars with bright white headlights and red taillights, trucks with glowing amber marker lights, all moving steadily or in slow traffic. Street lamps along the road cast yellow halos on the asphalt, and billboards with neon signs illuminate the sides. The camera pans slowly to the right, capturing the continuous stream of traffic, with some vehicles accelerating and others braking, their lights creating streaks against the night. In the background, tall buildings with lit windows stand, and the glow of city life reflects off the road surface. The scene is alive with the hum of engines and the flash of passing headlights, emphasizing the bustling nature of the freeway after dark.", "original_prompt_en": "Busy freeway at night."}
+{"index": 719, "data": "A high - speed (extreme slow - motion) close - up shot captures a transparent water - filled balloon. The balloon, round and taut with clear water inside, suddenly explodes in extreme slow motion. The water, in the form of countless tiny droplets and larger splashes, spreads outwards at an extremely slow pace. Each droplet seems to hang in the air, showcasing their smooth, glistening surfaces as they move along intricate trajectories. The background is a simple, light - colored space, which makes the dynamic process of the water balloon's explosion and the movement of the water droplets stand out vividly.", "original_prompt_en": "Balloon full of water exploding in extreme slow motion."}
+{"index": 720, "data": "Long shot in photorealistic style captures an astronaut riding a horse in space. The astronaut, clad in a white space suit with blue accents and a transparent helmet, sits upright on the horse’s back, gripping the reins with both hands. The horse, rendered with photorealistic detail—its coat sleek and dark, muscles taut—appears to “gallop” through the void, hooves suspended in the dark expanse. The background reveals a deep black cosmos dotted with twinkling stars, distant nebulas glowing in hues of purple and pink, and a faint, curved planet horizon. The scene maintains a surreal yet lifelike quality, emphasizing the photorealistic rendering of the astronaut and horse against the cosmic backdrop.", "original_prompt_en": "An astronaut is riding a horse in the space in a photorealistic style."}
+{"index": 721, "data": "Macro slow - motion, a cropped close - up captures roasted coffee beans (dark brown with fine cracks and a glossy surface) falling into an empty white ceramic bowl. The background is blurred, highlighting the sharp details of the beans and the bowl. In the slow - motion sequence, the coffee beans descend gracefully, their edges catching light as they tumble toward the bowl, while the empty bowl (with a smooth interior) awaits their arrival.", "original_prompt_en": "Macro slo-mo. Slow motion cropped closeup of roasted coffee beans falling into an empty bowl."}
+{"index": 722, "data": "Medium shot of an old sewing machine at work. The sewing machine has a dark, slightly worn metal body with a wooden base, showing signs of age like minor scratches and faded paint. Its needle moves rapidly up and down, while the thread spool beside it rotates steadily, guiding the thread through a piece of light - colored cotton cloth placed under the presser foot. The background reveals a cluttered worktable with scattered spools of colorful thread, a pair of silver scissors, and pieces of fabric in various patterns. The room is softly lit, with a vintage lamp on the table casting a warm glow, and the walls are adorned with old sewing patterns and posters, capturing the nostalgic atmosphere of a traditional sewing space. As the machine operates, the needle continues its rhythmic motion, and the fabric slowly advances, showcasing the smooth, mechanical workflow of the old sewing machine.", "original_prompt_en": "Sewing machine, old sewing machine working."}
+{"index": 723, "data": "A close - up shot captures the dynamic interaction of colorful ink with water. Initially, several drops of vividly colored ink (such as deep violet, lake blue, and tangerine) are gently dropped into the still, transparent water. The ink droplets, like dreamy colored pearls, slowly sink and then begin to swirl and spread in the water, forming a vortex - like ink cloud. The colors of the ink interweave and diffuse, resembling an abstract, fancy dream - like ink cloud dancing in the water. The clear water background makes the flow trajectory of the colors clearly visible, and soft light filters through the water, highlighting the magical dance of the colors. The ink continues to swirl and spread, eventually merging into a colorful abstract pattern. The camera remains fixed throughout, focusing on this enchanting display of color dynamics, as if transporting the viewer into a fantasy world woven with colors.", "original_prompt_en": "Motion colour drop in water, ink swirling in water, colourful ink in water, abstraction fancy dream cloud of ink."}
+{"index": 724, "data": "A close - up, macro shot captures a few big purple plums with smooth and glossy skins rotating slowly on a sleek white turntable. As they spin, tiny and glistening water droplets gradually form and adhere to their purple surfaces, highlighting the plums’ juicy texture. The scene is isolated against a pristine white background, which emphasizes the vivid color and fine details of the plums. The camera remains fixed, maintaining a sharp focus on the plums’ rotation and the delicate appearance of the water droplets.", "original_prompt_en": "Few big purple plums rotating on the turntable. water drops appear on the skin during rotation. isolated on the white background. close-up. macro."}
+{"index": 725, "data": "A close - up shot captures a beautiful girl with vampire - themed makeup. Her skin looks pale, and she has dramatic eye makeup with dark or red tones around her eyes, along with deep - red - painted lips. She wears vivid red contact lenses, giving her eyes an intense, otherworldly appearance. The background is softly blurred, drawing attention to her face decorated with the elaborate vampire makeup.", "original_prompt_en": "Vampire makeup face of beautiful girl, red contact lenses."}
+{"index": 726, "data": "Close - up shot of a glass ashtray brimming with yellowish - brown cigarette butts resting on a dark wooden table. Gentle streams of smoke drift slowly against a completely black background, the deep blackness highlighting the hazy, swirling movement of the smoke.", "original_prompt_en": "Ashtray full of butts on table, smoke flowing on black background, close-up"}
+{"index": 727, "data": "Panoramic shot of the Pacific coast at Carmel by the Sea. The deep blue ocean stretches across the frame, with white - crested waves rhythmically rolling toward the shore and gently crashing against the sandy coastline. The sky is clear with a few scattered clouds, casting a soft glow on the water. In the background, rugged coastal cliffs with patches of green vegetation rise. The camera remains fixed, capturing the serene yet dynamic motion of the ocean waves.", "original_prompt_en": "Pacific coast, carmel by the sea ocean and waves."}
+{"index": 728, "data": "A long shot captures a whimsical teddy bear—with soft brown fur and a red bow - tie—playing a shiny drum kit (equipped with a bass drum, snare, and cymbals) in the heart of NYC’s Times Square. The background is brimming with towering buildings decorated with dazzling LED billboards, throngs of pedestrians (some pausing to watch), yellow taxis weaving through the traffic, and the iconic red steps of the TKTS booth. The teddy bear, perched on a tiny stool, grips drumsticks with its paws: it pounds the bass drum with its right paw, taps the snare with its left, and flicks the cymbal with a swift motion, its head nodding rhythmically. The scene is bathed in evening neon, with the lights reflecting off the teddy’s fur, and the camera remains fixed, highlighting the playful contrast between the cuddly toy and the chaotic, luminous urban landscape.", "original_prompt_en": "A teddy bear is playing drum kit in NYC Times Square."}
+{"index": 729, "data": "A medium shot captures a fluffy corgi with brown and white fur, its short legs steady as it sits in front of a drum kit. The drum kit has a glossy snare drum, gleaming cymbals, and vibrant tom - toms. The corgi uses its front paws to rhythmically hit the drumhead and cymbals, its tail wagging excitedly. The background is a cozy living room with soft lighting, a plush rug beneath the drum kit, and framed pictures on the wall. The corgi keeps playing the drums, sometimes tilting its head as if savoring the music, while its tail sways with the beat.", "original_prompt_en": "A corgi is playing drum kit."}
+{"index": 730, "data": "Medium shot captures Iron Man, clad in his iconic red - and - gold armored suit with glowing white eye slits on the helmet, standing on a stage illuminated by neon lights. He is playing a high - styled electronic guitar—its body features a sleek, metallic design with blue LED accents and a high - position fretboard. The background reveals a dimly lit concert venue, with colorful spotlights flickering and a faint murmur of an audience cheering in the distance. Iron Man strums the guitar vigorously, his armored fingers moving nimbly across the strings, as the guitar emits high - energy, electrifying riffs that fill the air.", "original_prompt_en": "An Iron man is playing the electronic guitar, high electronic guitar."}
+{"index": 731, "data": "Medium shot captures a raccoon with brown fur and black facial stripes, seated upright. It grips an electronic guitar—with a glossy black body and metallic silver strings—using its front paws, skillfully strumming the strings with its nimble paws. The background is a cozy indoor space with wooden flooring, soft warm lighting, and a small table with a few books scattered atop. The raccoon’s tail, banded with black and white, curls behind it as it plays, occasionally tilting its head to the rhythm, fully engrossed in the music.", "original_prompt_en": "A raccoon is playing the electronic guitar."}
+{"index": 732, "data": "A long shot, rendered in the vivid, swirling brushwork characteristic of Vincent van Gogh’s art, captures a small wooden boat with a weathered hull and billowing white sails sailing leisurely along the Seine River. The river’s surface shimmers with iridescent blues and golds, rippling softly as the boat glides toward the right of the frame. In the background, the Eiffel Tower rises majestically, its intricate iron framework silhouetted against a sky alive with van Gogh’s signature swirling yellows, blues, and hints of purple. The camera follows the boat’s gentle drift, panning smoothly to maintain its position while the iconic tower remains a stately backdrop.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background by Vincent van Gogh"}
+{"index": 733, "data": "Close - up shot of a corgi's head, artistically depicted as an explosion of a nebula. The corgi's head retains its distinct outline (with recognizable ears and facial structure), while its interior bursts with swirling nebular matter—vibrant purple, cyan, and pink hues intermingle, dotted with twinkling star - like glimmers, mimicking the chaotic yet beautiful expansion of cosmic gas and dust. The background is a profound black void, faintly speckled with distant galaxy clusters and wispy dark matter halos, enhancing the celestial transformation of the corgi's head.", "original_prompt_en": "A corgi's head depicted as an explosion of a nebula"}
+{"index": 734, "data": "A panoramic shot of a fantasy landscape. The scene unfolds with floating islands suspended in a vibrant purple sky, their rocky surfaces dotted with bioluminescent plants that emit a soft blue glow. Below, a river of liquid silver winds through a valley of crystalline trees, their branches glistening like diamonds. In the distance, a castle with spiraling towers and iridescent walls stands atop a floating mountain, while winged creatures with iridescent scales soar between the islands. The camera slowly pans across the scene, capturing the gentle drift of the islands and the shimmering light from the glowing flora, immersing the viewer in this otherworldly realm.", "original_prompt_en": "A fantasy landscape"}
+{"index": 735, "data": "Panoramic shot of a futuristic urban plaza, where humans have achieved teleportation technology. Citizens in sleek, glowing attire interact with cylindrical teleportation devices: a woman steps onto a platform, vanishes in a golden shimmer, and reappears at another device (her hair still flowing from the teleport’s momentum). A man teleports a coffee cup to a friend—the cup disappears in a blue spark, materializing instantly in the friend’s grasp. The background features floating skyscrapers, flying cars, and holographic billboards. The camera pans to capture a child teleporting a toy, engineers adjusting devices, and families teleporting together, showcasing the technology’s seamless integration into daily life.", "original_prompt_en": "A future where humans have achieved teleportation technology"}
+{"index": 736, "data": "A medium shot captures a translucent jellyfish with a pale pink bell floating gracefully through the deep blue ocean. Its long, delicate tentacles, glowing with bioluminescent light, sway gently with the water currents, creating a trail of soft, ethereal illumination. The surrounding ocean water is filled with tiny, shimmering plankton, and the background reveals the dark, vast expanse of the sea with faint light filtering down from the surface. The jellyfish drifts slowly, its bioluminescent tentacles flickering like stardust in the water, while the camera follows its movement to highlight the mesmerizing glow and fluid motion of its tentacles.", "original_prompt_en": "A jellyfish floating through the ocean, with bioluminescent tentacles"}
+{"index": 737, "data": "Long shot of a silver - gray Mars rover, equipped with solar panels and a mechanical arm, moving slowly across the Martian surface. The rover’s angular body and rugged wheels kick up fine red dust as it travels toward the right of the frame. The background showcases a barren landscape of rust - hued rocks, undulating red sand dunes, and a hazy orange - red sky, characteristic of Mars’ atmosphere. The camera follows the rover’s movement, documenting its exploration of the rugged, dust - filled Martian terrain.", "original_prompt_en": "A Mars rover moving on Mars"}
+{"index": 738, "data": "Medium shot captures a giant panda with distinctive black - and - white fur (black patches around its eyes, ears, and limbs, white covering its round, plump body) sitting at a rustic wooden table in a charming Parisian café. The panda holds a white ceramic coffee cup with its paw, gently sipping the rich brown coffee, with droplets glistening on the rim. The café’s interior is warm and inviting, featuring exposed brick walls, vintage posters of Parisian landmarks, and plush velvet chairs. Outside the large window, the cobblestone street of Paris unfolds, with pedestrians in stylish attire and the silhouette of the Eiffel Tower peeking through the hazy afternoon light. The camera stays fixed, focusing on the panda’s leisurely sipping, and its tail occasionally twitches as it savors the drink.", "original_prompt_en": "A panda drinking coffee in a cafe in Paris"}
+{"index": 739, "data": "A long shot captures a white space shuttle with black thermal - protection tiles on its fuselage launching into orbit. Vivid orange flames and thick gray smoke billow vigorously from its engines. The shuttle steadily ascends against a backdrop of a clear blue sky dotted with scattered white clouds. The powerful flames from the engines light up the launch pad, and the smoke spreads in turbulent waves as the shuttle accelerates, gradually moving toward the orbital path.", "original_prompt_en": "A space shuttle launching into orbit, with flames and smoke billowing out from the engines"}
+{"index": 740, "data": "Long shot of a black steam train with multiple carriages moving along the mountainside. The train emits white steam from its chimney, and its side faces the camera, revealing a vintage design with large wheels and a cylindrical boiler. The background features rugged mountain slopes covered with dense green vegetation, with rocky outcrops scattered, and the sky is partly cloudy. The camera follows the train as it travels steadily along the winding railway track on the mountainside, capturing the train’s smooth movement against the majestic mountain scenery.", "original_prompt_en": "A steam train moving on a mountainside"}
+{"index": 741, "data": "Panoramic shot of a super - cool giant robot standing in cyberpunk - styled Beijing. The robot has a metallic silver - black body, with neon light strips of blue and purple crisscrossing it, exuding a strong futuristic atmosphere. Its head is equipped with a visor - like sensor, and mechanical arms with intricate mechanical structures hang down by its sides. The background is a bustling cyberpunk Beijing street: tall buildings are covered with holographic advertisement projections, neon signs of various colors flicker, the wet street surface reflects the colorful lights, and some pedestrians in futuristic clothing with mechanical prosthetics walk by. In the distance, flying vehicles shuttle through the air. The robot remains stationary, and the camera slowly pans to the right, capturing more of the cyberpunk cityscape around it, including the glowing billboards and the complex urban architecture.", "original_prompt_en": "A super cool giant robot in Cyberpunk Beijing"}
+{"index": 742, "data": "A panoramic shot at sunrise captures a tropical beach. In the foreground, tall palm trees with lush green fronds sway gently in the warm morning breeze, their slender brown trunks standing out. The crystal - clear ocean water shimmers with turquoise and golden hues, reflecting the soft light of the rising sun. The beach’s sand is a fine, creamy white, extending toward the horizon. In the background, the sky is a gradient of warm oranges, pinks, and purples as the sun slowly rises, casting a golden glow over everything. The camera stays fixed, letting the serene beauty of the tropical beach at sunrise be fully appreciated—with the palm trees swaying and the clear water glistening in the morning light.", "original_prompt_en": "A tropical beach at sunrise, with palm trees and crystal-clear water in the foreground"}
+{"index": 743, "data": "A cinematic medium shot captures a self - portrait of Vincent van Gogh, crafted in Van Gogh’s distinctive artistic style. Van Gogh, with his tousled golden hair and a thoughtful expression, wears a blue - gray jacket over a yellow shirt. The background is adorned with swirling, vibrant brushstrokes of deep blue and golden yellow, reminiscent of his famous “Starry Night” sky and sunflower - filled landscapes, rendered in thick, textured impasto. The portrait exudes the emotional intensity characteristic of Van Gogh’s works, with each brushstroke brimming with feeling.", "original_prompt_en": "Cinematic shot of Van Gogh's selfie, Van Gogh style"}
+{"index": 744, "data": "Medium shot captures Gwen Stacy, with her long blonde hair flowing over her shoulders, dressed in a white blouse and a blue plaid skirt, sitting gracefully on a wooden chair. She holds a paperback book with a colorful cover, her eyes intently scanning the pages as she gently flips them with her right hand. The background reveals a sunlit room with a large window, through which warm golden light streams in, illuminating the light brown wooden floor. A small vase with pink flowers rests on the table beside her, and a bookshelf filled with various book spines lines the wall behind her. Throughout the scene, Gwen remains absorbed in her reading, occasionally pausing to furrow her brow slightly before continuing to turn the pages.", "original_prompt_en": "Gwen Stacy reading a book"}
+{"index": 745, "data": "A medium shot captures Iron Man, clad in his iconic red - and - gold armored suit with a glowing circular arc reactor on his chest, flying through the sky. Blue energy jets from the repulsors on his back and legs, propelling his flight. The background shows a clear blue sky dotted with white clouds, with faint city skyscraper outlines below. He holds a streamlined stance, arms slightly bent, legs together, occasionally adjusting his path. The camera tracks his movement, capturing his figure gliding across the sky from a side angle, the armor shimmering under the sunlight.", "original_prompt_en": "Iron Man flying in the sky"}
+{"index": 746, "data": "A panoramic shot in an oil - painting style depicts The Bund in Shanghai. The European - styled buildings, with warm - yellow and dark - brown facades, have thick, textured brushstrokes on their exteriors, arranged in an orderly yet diverse manner along the Huangpu River's bank. Their vintage spires and arched windows/doors are distinctly visible. The Huangpu River, rendered like a delicately brushed canvas, has deep - blue water with subtle ripples, reflecting the warm - colored silhouettes of the buildings. A few retro - styled cruise ships, rich in vibrant hues, glide slowly on the river, their red - and - white hulls standing out strikingly against the oil - painting - like water. Along the riverside walkway, pedestrians (with figures softened by the oil - painting’s hazy brushstrokes) stroll or pause to admire the view, their varied - colored attire blending artistically. The sky, painted in a light blue with fluffy, brush - marked white clouds, merges with the distant building outlines in a dreamy, painterly haze. The entire scene, with high color saturation and bold, expressive brushstrokes, evokes a nostalgic, artistic ambiance, freezing Shanghai’s Bund in a “living” oil painting.", "original_prompt_en": "The bund Shanghai, oil painting"}
+{"index": 747, "data": "A medium shot captures Yoda, the green - skinned, big - eared Jedi Master, standing on a brightly lit stage. He is clad in his signature brown robe with a broad brown belt. In his hands rests a wooden acoustic guitar, its surface smooth and strings glinting under the stage lights. His long, green fingers skillfully pluck the strings as he plays, his eyes fixed on the instrument with a focused expression. The stage background is illuminated by vibrant stage lights—warm yellows and cool blues intermingle—while dark silhouettes of audience members fill the distance, suggesting a live performance atmosphere. The camera stays steady, capturing Yoda’s gentle sway as he plays the guitar, completely absorbed in the music.", "original_prompt_en": "Yoda playing guitar on the stage"}
+{"index": 748, "data": "A wide shot captures a beautiful coastal beach in spring, rendered in the Ukiyo - e style reminiscent of Hokusai’s works. The pale golden sand stretches smoothly, its fine grains glistening under soft sunlight. Gentle turquoise waves lap rhythmically against the shore, their crests frothing into delicate white foam before receding to leave wet, glistening patches on the sand. The sky above is a clear azure with wispy white clouds drifting lazily. In the distance, the hazy blue horizon merges with the sea, and a few small fishing boats dot the water, their sails faint silhouettes. The camera pans slowly along the shoreline, emphasizing the tranquil rhythm of the waves lapping the sand, while the Ukiyo - e - style rendering lends the scene a delicate, wood - block - print - like quality, with soft color gradients and elegant linework that echo Hokusai’s distinctive aesthetic.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Hokusai, in the style of Ukiyo"}
+{"index": 749, "data": "Panoramic shot of a beautiful coastal beach in spring, styled after Vincent van Gogh’s vivid, textured artwork. Light turquoise waves with frothy white edges lap rhythmically against the golden - beige sandy shore, where fine grains glisten as the tide recedes. The sky above is a striking cerulean, adorned with swirling white clouds that mimic Van Gogh’s dynamic brushstrokes. In the distance, the hazy horizon blends the deep blue sea and sky, with a small sailboat—its white sail rendered in bold, painterly strokes—drifting faintly. The camera remains fixed, capturing the gentle, repetitive motion of waves caressing the sand, while the entire scene radiates the dreamy, expressive quality emblematic of Van Gogh’s coastal scenes.", "original_prompt_en": "A beautiful coastal beach in spring, waves lapping on sand by Vincent van Gogh"}
+{"index": 750, "data": "A medium shot captures a white leisure boat with a blue stripe along its side sailing leisurely along the calm Seine River, the water’s surface glistening under the golden sunlight. In the background, the iconic Eiffel Tower, with its intricate iron lattice structure, stands proudly against a clear sky with a few scattered clouds. The riverbanks are lined with elegant Parisian buildings, their classic architecture and green - leafed trees creating a picturesque scene, while a few people are seen walking along the riverside walkway. The boat moves at a relaxed pace, its bow gently cutting through the water as it journeys past the scenic riverfront, with the Eiffel Tower providing a stunning backdrop throughout the scene.", "original_prompt_en": "A boat sailing leisurely along the Seine River with the Eiffel Tower in background"}
+{"index": 751, "data": "The sky is overcast with heavy rain. A long shot captures an empty street at evening, the wet asphalt glistening under dim streetlights. A dark gray sedan is moving slowly along the street, its windshield wipers swishing rhythmically to clear the rain. The background reveals shadowy silhouettes of tall buildings with sparse illuminated windows, and the air is filled with a misty rain haze. The camera remains fixed, focusing on the car’s deliberate, slow movement as raindrops create tiny splashes on the road. On either side of the street, leafless trees stand in the rain, their branches drooping, and a single streetlight casts a pale yellow glow, emphasizing the solitude of the rainy, empty street.", "original_prompt_en": "A car moving slowly on an empty street, rainy evening"}
+{"index": 752, "data": "Medium shot captures a gray - furred cat with a white chest eating food from a red ceramic bowl. The cat bends its head down, its small mouth opening and closing as it chews the food, and its tail sways gently behind it. The bowl is placed on a wooden floor, with some stacked cardboard boxes and a potted plant with green leaves in the background. The camera stays fixed, clearly showing the cat's focused eating posture; the cat occasionally lifts its head to look around before quickly lowering it to continue eating.", "original_prompt_en": "A cat eating food out of a bowl"}
+{"index": 753, "data": "Medium shot captures a gray domestic cat wearing sleek black sunglasses at a swimming pool. The cat is perched casually on the white - tiled pool edge, with its tail curled loosely behind it and its ears perked up. The pool water glistens under the bright sun, and a blue inflatable float drifts gently on the surface. In the background, beige lounge chairs with striped cushions are arranged neatly, a yellow sun umbrella casts a soft shadow, and tall palm trees with green fronds rustle in the breeze. The sky is clear and bright blue, dotted with a few fluffy white clouds. Fixed shot, the cat remains still, exuding a relaxed and cool demeanor as it soaks in the poolside atmosphere, while the camera stays steady to highlight its stylish sunglasses and the vibrant pool environment.", "original_prompt_en": "A cat wearing sunglasses at a pool"}
+{"index": 754, "data": "Medium shot of a confused black - and - white panda in a calculus classroom. The panda, seated at a wooden desk, has its right paw raised to its head (scratching in bewilderment) while its left paw rests on an open textbook filled with complex calculus formulas. Its round black eyes dart between the textbook and a crumpled sheet of paper covered in mathematical symbols, wide with confusion. The background reveals a classroom setting: a chalkboard with blurry calculus equations, wooden desks with other students (some glancing toward the panda), pale walls, and a window casting soft light. The camera remains fixed, capturing the panda’s perplexed posture as it fidgets with the paper, ears drooping slightly.", "original_prompt_en": "A confused panda in calculus class"}
+{"index": 755, "data": "A medium shot captures a cute, fluffy panda with black - and - white fur sitting at a wooden table in a cozy Chinese restaurant. The panda holds a pair of chopsticks in its right paw, delicately picking up a piece of steaming dumpling from a white porcelain plate with red patterns. The restaurant’s background features traditional red lanterns hanging from the ceiling, wooden chairs with cushioned seats, and walls adorned with Chinese calligraphy scrolls. Several ceramic bowls and chopstick rests are neatly arranged on the table, and soft ambient light from paper lanterns casts a warm glow. The panda chews slowly, savoring the dumpling, its round black eyes sparkling with contentment as it occasionally glances around the restaurant.", "original_prompt_en": "A cute fluffy panda eating Chinese food in a restaurant"}
+{"index": 756, "data": "The sun is setting, painting the sky with warm orange and pink hues. A medium full shot captures a cute, happy Corgi with a fluffy tricolor (brown, white, and black) coat and a short, stumpy tail wagging enthusiastically. The Corgi is playing in a lush green park, with grass under its paws and scattered autumn leaves around. In the background, there are tall trees with golden foliage, a few park benches, and distant playground equipment. The Corgi bounces around, chasing a fallen leaf, then pauses to lick its nose before dashing towards a small stick on the ground. The camera follows its movements, panning gently to keep the playful pup in frame, while the soft sunset light casts a golden glow over the scene.", "original_prompt_en": "A cute happy Corgi playing in park, sunset"}
+{"index": 757, "data": "A medium shot captures a cute raccoon with soft brown fur and distinctive black facial markings, seated in a small wooden boat that gently floats on the deep blue ocean’s rippling surface. The raccoon holds a small, light-brown acoustic guitar, strumming its strings with lively enthusiasm. The background reveals an expansive ocean, with gentle waves undulating under a clear, sunny sky dotted with a few fluffy white clouds on the horizon. The boat rocks slightly with the water’s movement as the raccoon continues to play the guitar, its bushy tail with alternating black and white rings curled around its body.", "original_prompt_en": "A cute raccoon playing guitar in a boat on the ocean"}
+{"index": 758, "data": "Panoramic shot of a happy fuzzy panda with black - and - white fur playing a wooden guitar beside a campfire. The panda sits on a snow - dotted grassy patch, its paws nimbly plucking the guitar strings with a wide - eyed, joyful look on its face. The campfire burns brightly, orange flames dancing and crackling, surrounded by stacked firewood. In the background, majestic snow - capped mountains rise, their white peaks glistening under a clear blue sky. The ground around the campfire is a mix of brown soil and snow patches, with a few dry branches scattered nearby. The camera remains fixed, capturing the panda’s cheerful strumming and the warm glow of the fire contrasting with the cold, snowy mountain backdrop.", "original_prompt_en": "A happy fuzzy panda playing guitar nearby a campfire, snow mountain in the background"}
+{"index": 759, "data": "A wide shot captures a bright white lightning bolt striking the top of the iconic Eiffel Tower in Paris. The Eiffel Tower, with its intricate metal lattice structure, stands prominently against a backdrop of heavy, lead - gray dark clouds that fill the entire sky, creating a gloomy and dramatic atmosphere. The lightning, jagged and vividly bright, momentarily illuminates the tower and the surrounding clouds as it hits the tower's apex. The camera remains fixed, focusing on the striking moment when the natural force interacts with the man - made landmark.", "original_prompt_en": "A lightning striking atop of eiffel tower, dark clouds in the sky"}
+{"index": 760, "data": "A panoramic shot of a modern art museum’s interior. The museum’s walls are painted a clean white, serving as a striking backdrop for the numerous colorful paintings on display—some are abstract works with swirling patterns in vivid red, electric blue, and golden yellow, while others are realistic landscapes brimming with rich, saturated hues like lush green and deep purple. These artworks, each framed in simple yet elegant black or white frames, are evenly spaced along the walls. The floor is made of polished light - brown wood, and soft, warm - toned lights from modern fixtures overhead cast a gentle glow over the space. In the background, a few sleek, metallic display stands hold smaller art pieces, and the camera slowly pans to the left, revealing more of the gallery with additional colorful paintings coming into view, all contributing to the vibrant, contemporary atmosphere of the museum.", "original_prompt_en": "A modern art museum, with colorful paintings"}
+{"index": 761, "data": "Medium shot captures a black - and - white panda cooking in a cozy kitchen. The panda, with its iconic black - and - white fur, stands upright in front of a wooden countertop. In its right hand, it holds a wooden spatula, carefully stirring a pot of steaming food on a stainless - steel gas stove. The countertop is cluttered with fresh ingredients: sliced green cucumbers, red tomatoes, and a white cutting board with a stainless - steel knife resting on it. The kitchen background showcases white tiled walls, wooden cabinets with silver handles, and a set of hanging stainless - steel utensils. The camera stays fixed, focusing on the panda’s concentrated demeanor as it continues to stir the food, occasionally adjusting the pot with its left paw.", "original_prompt_en": "A panda cooking in the kitchen"}
+{"index": 762, "data": "Medium shot captures a giant panda with distinctive black - and - white fur playing on a wooden swing set. The panda, with a round body and characteristic black eye patches, sits on the flat seat of the swing, gripping the sturdy brown ropes with its front paws as it gently sways back and forth. The swing set stands on a patch of green grass, and the background features a lush bamboo forest with tall, slender bamboo stalks and dense green foliage. The sky is partly cloudy, with soft white clouds drifting. The camera remains fixed, capturing the panda’s playful movements as it enjoys the swing, occasionally looking around with its curious black eyes.", "original_prompt_en": "A panda playing on a swing set"}
+{"index": 763, "data": "A medium shot captures a polar bear with thick, snow - white fur standing on a snow - covered ice floe. The bear holds a brown wooden guitar with both paws: its right paw strums the strings rhythmically while its left paw presses the frets. The background reveals an icy Arctic landscape, with towering icebergs and a clear blue sky. The camera stays steady, focusing on the polar bear as it seems immersed in playing the guitar, occasionally swaying its body slightly to the rhythm.", "original_prompt_en": "A polar bear is playing guitar"}
+{"index": 764, "data": "Medium shot of a raccoon dressed in a black suit with a white shirt and a red tie, standing on a stage. The raccoon holds a golden trumpet with both paws, pressing the valves, and its mouth rests on the mouthpiece, appearing to play the trumpet. The stage background features a dark curtain with colorful spotlights and decorative musical notes. The camera remains fixed, capturing the raccoon’s focused posture—its bushy tail, marked by black rings, curls behind it, and its masked face is intent on the performance.", "original_prompt_en": "A raccoon dressed in suit playing the trumpet, stage background"}
+{"index": 765, "data": "A medium shot captures a robot DJ with a metallic body and glowing blue circuitry operating a turntable. The scene is set on a futuristic Tokyo rooftop at night, drenched in heavy rain, with a cyberpunk aesthetic—neon lights from holographic billboards reflect off the wet surfaces, and towering skyscrapers with glowing neon lines and floating drones fill the background. The robot’s mechanical arms move precisely over the turntable, and raindrops streak down the camera’s view, enhancing the sci - fi fantasy atmosphere. The camera remains fixed, focusing on the DJ as rain pours, illuminating the scene with the vibrant, neon - lit cityscape behind it.", "original_prompt_en": "A robot DJ is playing the turntable, in heavy raining futuristic tokyo rooftop cyberpunk night, sci-fi, fantasy"}
+{"index": 766, "data": "A medium shot captures a gray shark with a sleek, streamlined body swimming gracefully in the crystal - clear Caribbean ocean. The shark’s dorsal fin slices through the bright turquoise water, and its tail undulates rhythmically as it moves forward. The ocean is exceptionally clear, revealing vibrant coral reefs in shades of red, orange, and green, along with small, colorful tropical fish darting about in the background. The camera follows the shark’s movement, smoothly panning to keep it in frame as it glides through the water, showcasing the serene yet lively underwater ecosystem of the Caribbean.", "original_prompt_en": "A shark swimming in clear Caribbean ocean"}
+{"index": 767, "data": "Panoramic shot of a super robot safeguarding a futuristic city. The robot, with a towering metallic silver frame and glowing red energy veins across its torso, stands amidst a city in turmoil. Its head is a sleek black dome with a vertical blue light strip for eyes, and mechanical wings unfold behind it, ready for flight. Around it, chaos reigns—smoke rises from a collapsing skyscraper, and a tentacled alien creature attacks a highway overpass. The super robot activates its arm - mounted laser cannons, firing red beams at the creature, while its left hand extends to catch a falling police car, placing it safely on the ground. The background features glass skyscrapers, hovering drones, and a sky with scattered gray clouds, emphasizing the high - tech crisis. The camera pans right, tracking the robot as it moves to intercept another threat, embodying the city’s last line of defense.", "original_prompt_en": "A super robot protecting city"}
+{"index": 768, "data": "Medium shot of a brown teddy bear with soft and fluffy fur, wearing a blue - white striped apron, standing in front of a white kitchen sink. The teddy bear holds a yellow sponge in its right paw, carefully scrubbing a white ceramic plate covered with soapy bubbles. Clear water flows from the silver faucet, and a green dish - soap bottle is placed on the left side of the sink. Behind it, wooden cabinets with silver handles and a window with checkered curtains can be seen. The teddy bear rinses the plate under the running water and then puts it on a drying rack with other colorful dishes. Its left paw gently holds the edge of the sink, maintaining a focused posture during the entire dish - washing process.", "original_prompt_en": "A teddy bear washing the dishes"}
+{"index": 769, "data": "A wide shot at night captures an epic tornado, made of swirling smoke, attacking above a glowing city. The city below radiates with vibrant neon lights and scattered building lights, painting the dark night sky with colorful glows. The tornado, with its churning gray - smoke structure, twists menacingly as it advances over the city, its smoky form expanding and roiling. The dark night sky serves as a backdrop, highlighting the stark contrast between the city’s luminous glow and the ominous, smoke - formed tornado, creating a dramatic and threatening scene.", "original_prompt_en": "An epic tornado attacking above a glowing city at night, the tornado is made of smoke"}
+{"index": 770, "data": "A medium full-shot oil painting depicts a couple in elegant formal evening wear. The man, with neatly combed dark hair, is dressed in a black tailcoat paired with a white dress shirt and a black bow tie. The woman is adorned in a floor-length, shimmering silver evening gown with delicate lace details at the neckline. They are caught in a heavy downpour, each holding a black umbrella with curved wooden handles. Raindrops hammer the umbrellas and splash onto their pristine attire, creating glistening wet patches. The background shows a dimly lit night street, with the blurred outlines of street lamps casting a warm glow through the rain, and puddles on the cobblestone road reflecting the faint light. The couple takes cautious steps forward, their postures tense as they try to shield themselves from the relentless rain, the fabric of the woman’s gown and the man’s tailcoat clinging slightly to their bodies due to the moisture.", "original_prompt_en": "An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with umbrellas"}
+{"index": 771, "data": "A medium shot captures a clown fish swimming through a vibrant coral reef. The clown fish, with its distinctive orange body adorned with white stripes and black outlines, moves gracefully, its fins gently flapping as it navigates the intricate, multicolored coral formations—corals in hues of red, purple, and green, with tiny, iridescent fish darting among the crevices. The clear blue seawater surrounds the reef, with sunlight filtering from the water’s surface, casting dappled light on the reef and the fish. The clown fish weaves in and out of the coral structures, occasionally pausing to examine a small anemone, before continuing its journey. The camera follows the fish’s movement, smoothly tracking its path through the reef.", "original_prompt_en": "Clown fish swimming through the coral reef"}
+{"index": 772, "data": "Panoramic shot captures a hyper - realistic spaceship landing on Mars. The spaceship has a sleek, metallic silver - gray exterior with blue - glowing thrusters at the bottom. As it descends, the thrusters emit faint orange flames, stirring up the red Martian dust below. The Martian surface is a vast expanse of rust - colored sand dotted with scattered gray rocks and small craters. The sky above is a hazy orange - red, with the silhouettes of distant, jagged Martian mountains in the background. The spaceship slowly lowers toward the ground, its landing gear gradually extending. The camera remains fixed, capturing the detailed process of the spaceship's landing, from the initial descent to the gentle contact of the landing gear with the Martian surface, while the dust around it swirls and then slowly settles.", "original_prompt_en": "Hyper-realistic spaceship landing on Mars"}
+{"index": 773, "data": "Panoramic shot of The Bund in Shanghai, brimming with vibrant colors. The iconic European - style buildings line the Huangpu River, their facades displaying a rich array of warm yellows, deep reds, and crisp whites, glistening under the bright sunlight. The Huangpu River reflects the vivid hues of the sky and buildings, with colorful cruise ships sailing leisurely on the water. Along the riverside promenade, pedestrians in brightly - colored clothing stroll, chat, or take photos, enhancing the lively ambiance. The sky is a vivid, clear blue with fluffy white clouds. The camera pans slowly from left to right, capturing the bustling yet picturesque scene of the Bund, where the vibrant colors of architecture, nature, and human activity blend harmoniously.", "original_prompt_en": "The bund Shanghai, vibrant color"}
+{"index": 774, "data": "A medium shot captures Vincent van Gogh, with his distinctive red beard and tousled brown hair, painting intently at an easel in a cluttered room. He wears a dark, worn coat and a wide - brimmed hat, his hand moving the brush across the canvas to lay down vibrant, swirling colors. The room is filled with art supplies: scattered paint tubes, a wooden palette with blobs of paint, and half - finished canvases propped against the walls. Sunlight filters through a small window, casting soft shadows on the wooden floor, which shows signs of wear. The camera remains fixed, focusing on Van Gogh as he loses himself in the act of painting, occasionally pausing to mix colors on his palette before continuing his work.", "original_prompt_en": "Vincent van Gogh is painting in the room"}
+{"index": 775, "data": "Panoramic shot of numerous yellow flowers with delicate golden petals and slender green stems. The flowers, each with a subtle brown center, are scattered across a lush green meadow. The background is a clear blue sky with fluffy white clouds drifting by. The flowers swing gently in the wind, their petals rustling softly, and the camera remains fixed, capturing the tranquil motion of the blossoms as they dance with the breeze.", "original_prompt_en": "Yellow flowers swing in the wind"}
+{"index": 776, "data": "Panoramic shot of a narrow alley. The ground is paved with uneven gray flagstones, patches of green moss clinging to their edges. On either side, aged brick walls rise, their surfaces dotted with peeling paint, faded graffiti, and a red cloth hanging from a clothesline. A rusty metal trash can and a bicycle with a flat tire lean against the left wall, while a stack of cardboard boxes rests near the right wall’s base. The sky is overcast, casting a dim, diffused light. A woman in a brown coat walks slowly from the alley’s far end toward the camera, her steps echoing on the flagstones, and the camera pans left to follow, revealing more of the brick walls’ cracks and a cat crouched near a drainpipe. In the background, tall buildings loom, their windows reflecting the gray sky.", "original_prompt_en": "alley"}
+{"index": 777, "data": "Panoramic shot of a vibrant amusement park. The scene is filled with colorful rides: a towering Ferris wheel with bright red and yellow cabins slowly rotating, a looping roller coaster with blue and white tracks where trains zoom by, and a classic carousel with intricately carved horses in pastel hues spinning gently. Crowds of visitors, including families with children in playful outfits, teenagers in casual clothes, and couples holding hands, move around the park. Some kids with balloon animals in their hands run toward the candy - colored game stalls, while a group of friends in matching T - shirts queues excitedly for the water ride. The background shows a clear blue sky with fluffy white clouds, and the park’s vibrant marquees and decorative lights add to the festive atmosphere. The camera pans across the park, capturing the joyful chaos: a clown in a rainbow - colored outfit juggles balls near the entrance, and a group of performers in shiny costumes dance on a small stage. Rides like the swinging pirate ship rock back and forth, and the bumper cars in the corner are filled with laughing drivers.", "original_prompt_en": "amusement park"}
+{"index": 778, "data": "Panoramic shot of an aquarium. The clear blue water hosts a vibrant array of marine life: red parrotfish with glossy scales, blue angelfish with delicate, flowing fins, and transparent jellyfish drifting lazily, their tentacles undulating like gossamer. The tank’s base features a lifelike coral reef—orange - pink corals with textured surfaces and lush green aquatic plants swaying gently with the water’s current. Beyond the aquarium’s glass wall, blurred silhouettes of visitors are visible, and soft white - and - blue lighting lends a dreamy ambiance. The camera pans right, capturing a school of golden - yellow fish swimming in unison from left to right, their scales shimmering. A sea turtle emerges leisurely from behind the coral, its flippers propelling it forward in a calm glide. Some fish hover motionless near the corals, while others dart swiftly between the plants, creating a lively contrast.", "original_prompt_en": "aquarium"}
+{"index": 779, "data": "A medium shot captures a weathered stone arch with intricate carvings, standing motionless amidst a lush green garden. The arch, grayish - brown with mossy patches, showcases smooth stone blocks with visible cracks, hinting at its aged structure. The background features tall leafy trees, a clear blue sky with scattered clouds, and a cobblestone path winding beneath the arch. The camera remains fixed, highlighting the arch’s stable presence as vibrant flowers bloom at its base and a small bird flutters near its stone ledge.", "original_prompt_en": "arch"}
+{"index": 780, "data": "Panoramic shot of an art gallery. The interior features white walls adorned with various artworks—vibrant oil paintings in dark wooden frames, delicate watercolors in silver metal frames, and bold abstract prints in black plastic frames. The floor is smooth marble, reflecting the warm glow of ceiling lights. In the background, display pedestals hold sculptures: a bronze statue with intricate details and a white plaster figure with flowing curves. Several visitors are present: a woman in a navy dress stands before a landscape painting, hands clasped behind her back, gazing intently; a man in a gray suit and a child in a red sweater examine a modern abstract piece, whispering. The camera slowly pans right, revealing more artworks (including a large portrait with bold brushstrokes) and a glass case showcasing a delicate ceramic artwork, as groups of visitors—some in casual attire, others formal—move between exhibits, discussing in low voices.", "original_prompt_en": "art gallery"}
+{"index": 781, "data": "A medium shot captures a neatly organized bathroom. The walls are clad in white and light - blue subway tiles, and the floor is covered with smooth light - gray ceramic tiles. At the center, a white sink with a sleek silver faucet is positioned beneath a rectangular mirror, which reflects the room’s warm and diffused lighting. A sky - blue towel hangs neatly on a chrome rack to the right of the sink, and a small potted succulent rests on the sink’s edge, adding a touch of greenery. To the left, a white toilet with a closed lid is in the corner, and a glass - enclosed shower with a rainfall showerhead (its glass panels glistening) stands at the far end. Adjacent to the shower, a white bathtub with a wooden bath tray—on which a lit candle and a folded towel are placed—invites relaxation. Wooden cabinets with white handles line the back wall, and their doors conceal neatly arranged toiletries. The camera remains fixed, capturing the serene and spotless space, where every element, from the polished faucet to the neatly hung towel, exudes order and calm.", "original_prompt_en": "bathroom"}
+{"index": 782, "data": "A medium shot of a cozy bakery shop on a bustling street. The shop features a wooden signboard with \"Bakery\" in golden letters, and large glass windows showcasing an array of freshly baked goods—flaky croissants in golden - brown, round loaves of wheat bread with a rustic crust, and colorful fruit tarts with glistening toppings. Inside, a young female shopkeeper, wearing a white apron over a light blue dress, is arranging a tray of cinnamon rolls on the display shelf, her hands moving gently as she adjusts each pastry. Outside, a few pedestrians pause to peek through the window, and a bicycle with a wicker basket is parked by the door. The background includes neighboring shops with vibrant awnings, and the street is lined with green trees, with sunlight filtering through the leaves and casting warm shadows on the pavement. The camera lingers, capturing the inviting scene and the gentle movements of the shopkeeper as she tidies the display.", "original_prompt_en": "bakery shop"}
+{"index": 783, "data": "Medium shot of a grand ballroom. The room is adorned with crystal chandeliers hanging from the ceiling, casting warm golden light. The walls are lined with mirrors, reflecting the dancers. The polished wooden dance floor occupies the center, where several couples in formal attire—women in elegant gowns, men in tailored suits—are dancing gracefully. A live band plays in the corner, with musicians in black uniforms. The background features a bar with glassware and a few guests chatting, and the walls are decorated with ornate paintings. The camera pans across the dance floor, capturing the couples twirling and gliding, their movements synchronized to the melodious music. Some dancers laugh as they spin, while others maintain a poised posture, showcasing the refined atmosphere of the ballroom.", "original_prompt_en": "ballroom"}
+{"index": 784, "data": "A medium shot reveals a cozy bar with dark wooden paneling and warm, yellow - toned lighting. The bar counter, polished to a shiny finish, occupies the lower half of the frame. On the back shelf, there is a row of crystal - clear glassware and an array of liquor bottles, ranging from deep amber whiskies to vibrant green liqueurs. A bartender with short black hair, dressed in a black shirt and a white apron, is in the middle of mixing a drink: his right hand grips a metal cocktail shaker, while his left hand holds a jigger, carefully measuring a clear liquid. To the left of the bartender, a patron in a gray suit sits, swirling the contents of a whiskey glass. The background features a brick wall with framed artwork, and soft jazz music plays subtly, enhancing the intimate atmosphere. The camera remains steady, capturing the bartender’s fluid movements as he shakes the cocktail, with the ice clinking rhythmically inside the shaker.", "original_prompt_en": "bar"}
+{"index": 785, "data": "Long shot of a rustic wooden barn standing in a sprawling green field. The barn features weathered brown planks, a sloped dark gray roof, and white - framed windows. A dirt path leads to its large wooden doors, which are slightly ajar. In the background, tall trees sway gently under a clear blue sky with fluffy white clouds. Fixed shot, the barn remains still, with a gentle breeze rustling the surrounding grass and a few birds perched on its roof.", "original_prompt_en": "barn"}
+{"index": 786, "data": "Long shot of a dimly lit basement. The walls are rough gray concrete, with aged pipes snaking along the ceiling. In the center, an old wooden workbench stands, cluttered with dusty cardboard boxes, a rusty metal toolbox, and scattered tools. To the left, a tall metal shelving unit holds stacked plastic crates and faded cardboard boxes, some slightly ajar. The concrete floor is dotted with small debris, and faint dust motes swirl in the sparse light. In the background, a single bare light bulb hangs from a wire, casting a faint, flickering yellow glow that barely reaches the far corners. The scene is static, with no visible movement—only the subtle drift of dust particles in the air, and the quiet creak of the wooden bench under the weight of its clutter.", "original_prompt_en": "basement"}
+{"index": 787, "data": "Panoramic shot of a beach. The golden sandy beach stretches along the shore, with fine grains glistening under the sunlight. The azure sea has gentle waves lapping the shore, creating small foamy ripples. On the beach, some people lie on colorful beach towels, sunbathing with relaxed postures, while a few children in swimsuits are building sandcastles, their hands busy shaping the wet sand. A couple in casual beachwear walks along the water’s edge, their feet occasionally touching the cool seawater. In the background, tall palm trees sway gently in the breeze, and a few white sailboats dot the distant horizon. The sky is clear and blue, with fluffy white clouds drifting lazily. The camera pans slowly to the right, capturing more of the lively beach scene—some swimmers frolicking in the waves, seagulls soaring overhead, and more sunbathers enjoying the warm sunshine.", "original_prompt_en": "beach"}
+{"index": 788, "data": "A medium shot of a cozy bedroom. The bed, with white and gray striped bedding, is centered against the back wall, flanked by two wooden bedside tables—each holding a small lamp with white lampshades. A plush beige rug covers the light wooden floor, and a tall wardrobe with mirrored doors stands on the right, reflecting the room’s soft, warm lighting. The walls are painted a calm light blue, and a framed landscape hangs above the bed. In the foreground, a white armchair with a colorful throw pillow sits near a window draped with sheer white curtains, through which soft natural light filters in. The camera slowly pans right, revealing a desk with a laptop and potted plant in the corner, and a laundry basket tucked beside it. The room exudes a peaceful, lived - in charm, with a few books stacked on the bedside table and a cozy blanket folded at the foot of the bed.", "original_prompt_en": "bedroom"}
+{"index": 789, "data": "A long shot captures a gray concrete bridge with a simple beam structure spanning a wide river. The bridge features metal railings on both sides, slightly rusted to show traces of weathering. Below, the river flows gently, its surface reflecting the overcast sky above. On the bridge, a few cars drive slowly, their tires creating soft sounds on the concrete. In the background, tall urban buildings stand to the left, while a lush green forest stretches to the right, all under a sky blanketed with thick gray clouds. The camera pans left to follow a car moving across the bridge, capturing the contrast between the urban architecture and natural greenery surrounding it.", "original_prompt_en": "bridge"}
+{"index": 790, "data": "A panoramic shot of a vibrant botanical garden. The scene is brimming with a diverse array of plants: colorful flowering shrubs with petals in shades of pink, purple, and yellow, tall green palm trees with broad fronds, and winding stone pathways meandering through the lush greenery. The ground features well - maintained lawns interspersed with patches of soil where small, delicate ferns thrive. In the background, glass greenhouses with metal frames stand, their transparent walls revealing rows of potted plants inside. The sky is clear and blue, with a few fluffy white clouds drifting by. Gentle breezes make the leaves and flowers sway softly. Along the pathways, a few visitors in casual attire stroll leisurely, some pausing to admire the plants or take photos. The camera slowly pans across the garden, capturing the lush vegetation and the tranquil atmosphere.", "original_prompt_en": "botanical garden"}
+{"index": 791, "data": "Medium shot of a bustling cafeteria. Neatly arranged wooden tables and chairs fill the space, with people in casual clothes seated, chatting and eating. In the background, a stainless - steel food counter displays trays of colorful meals like steamed rice, vivid stir - fries, and savory meat. Cafeteria staff in white uniforms and hairnets replenish food. Walls have bright nutrition posters, and warm fluorescent lights illuminate the scene. The camera stays still, capturing the lively vibe as diners pass trays and laugh.", "original_prompt_en": "cafeteria"}
+{"index": 792, "data": "Panoramic shot of a campsite nestled in a dense green forest. Three canvas tents—one green, one brown, and one blue—are pitched on a patch of grass mixed with earth. In the center, a campfire with orange flames dances within a circle of gray stones, sending wisps of smoke upward. Around the fire, two campers (a man with a beard in a gray hoodie and a woman with long brown hair in a blue jacket) sit on folding chairs, chatting while a golden retriever lies at their feet. Nearby, a backpack, a rolled - up sleeping bag, and a metal pot with steam rising from it rest on a wooden crate. The background is filled with tall pine trees swaying gently in the breeze, and the sky is clear with a few white clouds drifting. The camera pans slowly to the right, revealing more of the campsite: a clothesline with drying towels, a camping stove with a kettle, and a trail leading into the forest.", "original_prompt_en": "campsite"}
+{"index": 793, "data": "Panoramic shot of a vibrant campus. The campus is dotted with red - brick teaching buildings, a library with expansive glass windows, and dormitory buildings with balconies. Students, mostly young East Asian individuals, are spread across the scene—some stride briskly along paved pathways, backpacks in tow; others lounge on the lush green lawn, chatting or poring over books. The ground is a blend of neatly manicured grass and stone - laid paths. The background showcases a sports field with a running track, where a few students are jogging. The sky is clear and blue, with a handful of white clouds drifting. The camera pans slowly across the campus, first capturing a group of cyclists pedaling toward the left of the frame, then zeroing in on a cluster of students laughing and strolling near a flower bed brimming with colorful blossoms. Some students are heading into the teaching building, while others are exiting, crafting a lively ambiance of everyday campus life.", "original_prompt_en": "campus"}
+{"index": 794, "data": "Panoramic shot of a vibrant carrousel in a park. The carrousel features ornately decorated wooden horses with colorful manes and tails, some adorned with golden accents and painted in hues of red, blue, and white. The central structure is topped with a bright, striped canopy in shades of yellow and green, spinning slowly clockwise. Several children and adults are seated on the horses, holding onto the handles with excited expressions, while a few stand nearby, watching. The background shows a lush green park with tall trees, a clear blue sky with fluffy white clouds, and other amusement park rides visible in the distance. The camera remains fixed, capturing the carrousel’s gentle rotation as the horses move up and down in sync with the spinning motion.", "original_prompt_en": "carrousel"}
+{"index": 795, "data": "A panoramic shot of a majestic stone castle. The castle stands atop a grassy hill, with thick crenellated walls and several cylindrical towers capped with conical roofs—one tower displays a faded flag fluttering in the breeze. The surrounding landscape is a blend of lush green meadows, dense dark forests, and distant misty mountains under a partly cloudy sky. The camera slowly pans around the castle, revealing its intricate stone carvings and a narrow moat filled with still water at its base, while a few birds circle the tallest tower, their wings glinting in the sunlight.", "original_prompt_en": "castle"}
+{"index": 796, "data": "A panoramic shot of a solemn cemetery. The ground is blanketed with well - trimmed green grass, interspersed with gray stone tombstones of varying sizes—some bearing faded inscriptions, others adorned with small, withered flower bouquets. Tall, leafless trees with gnarled branches stand as silent sentinels around the cemetery, their skeletal limbs reaching toward the overcast sky, where thick gray clouds hang low, casting a somber hue over the scene. In the distance, a weathered iron fence and the silhouette of a small chapel with a pointed roof emerge faintly. A gentle breeze stirs the grass, causing it to ripple softly, while a crow perched on a tombstone flaps its wings and takes flight, momentarily disrupting the eerie stillness.", "original_prompt_en": "cemetery"}
+{"index": 797, "data": "A medium shot captures a bright classroom. Rows of wooden desks and chairs are neatly arranged, with several East Asian students seated, attentively facing the front. The student in the front row, a girl with long black hair tied in a ponytail, wears a white shirt and blue skirt, her eyes fixed on the blackboard. A young female teacher with short brown hair, dressed in a light blue blouse, stands at the blackboard, holding a piece of chalk and writing equations, her posture focused. The background shows large windows with white frames, through which sunlight streams in, illuminating the light - colored walls and a bookshelf filled with textbooks on the left. The camera remains fixed, capturing the quiet learning atmosphere as students take notes and the teacher continues writing.", "original_prompt_en": "classroom"}
+{"index": 798, "data": "Long shot of a steep cliff. The cliff face is composed of grayish - brown rocks with visible cracks and weathered textures, and some patches of green vegetation cling to the crevices. Below the cliff, a deep valley unfolds, filled with dense green trees, and a winding river glints in the sunlight. The sky above is clear blue with a few white clouds drifting. The camera slowly pans upward to reveal the full height of the cliff, while a few birds soar around the rocky edges, their wings catching the light as they navigate the air currents.", "original_prompt_en": "cliff"}
+{"index": 799, "data": "A medium shot of a city street intersection. A crosswalk with bold, white horizontal stripes stretches across the dark asphalt road, its surface subtly textured for traction. In the foreground, several pedestrians—among them a woman in a blue dress carrying a shopping bag, a man in a gray suit glancing at his watch, and a child in a red jacket hopping playfully—are midway through crossing, moving from the right to the left of the frame. To the left, a traffic light glows red, halting a line of vehicles: a sleek black SUV, a white delivery van, and a vibrant yellow taxi, all stationary. The background reveals tall, glass - clad buildings reflecting the overcast sky, with a row of leafy green trees lining the sidewalk, their branches swaying gently in the breeze. The camera pans right, capturing more of the crosswalk and the steady flow of pedestrians, while a cyclist in a bright helmet pedals along the adjacent bike lane, merging into the urban hustle.", "original_prompt_en": "crosswalk"}
+{"index": 800, "data": "Panoramic shot of a busy city street under an overcast sky. On the left side of the street lies a construction site, covered with green safety nets and supported by orange metal frames. Several cars are parked in the parking spots in front of a nearby building. Above the road, a traffic light glows red (three traffic lights in total), with vehicles—including an orange and a black car in the foreground—either moving or parked. Trees, billboards, and street lamps line both sides of the street, while tall buildings dominate the background. The camera pans right then left, capturing the bustling traffic: some vehicles park roadside, others drive forward (backs to the camera), and cyclists navigate the road too.", "original_prompt_en": "construction site"}
+{"index": 801, "data": "A medium shot captures a long corridor. The walls are painted in a clean white, and the floor is covered with smooth, light - colored tiles that reflect the soft, warm - hued light from the ceiling lights. Along both sides of the corridor, there are deep - brown wooden doors; some are slightly open, revealing a glimpse of the rooms inside. On the left - hand wall, a few colorful posters are hung, while the right - hand wall has a bulletin board filled with various notices. The camera moves forward slowly, and a person in a blue shirt walks from the right side of the frame to the left, with hands in pockets, heading toward the end of the corridor where a brighter area is visible. In the background, the corridor extends into the distance, with its end partially obscured by a gentle curve, and the faint outline of another section of the corridor can be seen.", "original_prompt_en": "corridor"}
+{"index": 802, "data": "Panoramic shot of a tranquil courtyard. The ground features winding paths paved with smooth gray flagstones, interspersed with patches of lush green grass dotted with vibrant pink peonies and golden marigolds. Encircling the courtyard are traditional wooden verandas with dark brown beams, from which red lanterns dangle, swaying gently in the breeze. In the background, white - walled buildings with dark gray tiled roofs stand, and several willow trees with drooping light green branches rustle as the wind blows. At the courtyard’s center, a stone table holds a blue - and - white porcelain tea set, while butterflies flit among the flowerbeds. The sky is clear, dotted with fluffy white clouds. The camera remains fixed, capturing the peaceful scene—occasional birds alight on the willow branches, chirping softly, and a cat with orange fur lazily strolls across the lawn.", "original_prompt_en": "courtyard"}
+{"index": 803, "data": "Panoramic shot of a vast desert. The ground is covered with golden - yellow sand, and the sand dunes show smooth undulations shaped by the wind. The sky is clear and blue, with a few white clouds floating. In the distance, there are some sparse desert shrubs, their branches and leaves withered and curled to adapt to the arid environment. The camera slowly pans to the right, capturing the boundless expanse of the desert. No animals or humans are in sight, only the silent sand extending towards the distant horizon.", "original_prompt_en": "desert"}
+{"index": 804, "data": "Panoramic shot of a bustling downtown. The streets are filled with a mix of vehicles: sleek sedans, chunky SUVs, and buses, all navigating the busy intersections, their headlights and taillights creating streaks of light. Pedestrians crowd the sidewalks—office workers in sharp suits, students in backpacks, and shoppers with overflowing bags—moving with purpose. Storefronts burst with color: a coffee shop with a chalkboard menu, a boutique with mannequins in the latest fashion, and a bookstore with stacks of bestsellers in the window. The background showcases towering skyscrapers, their steel and glass exteriors glinting under a bright, partly cloudy sky. The camera pans right, then left, capturing the dynamic flow: a delivery truck parks by the curb, a street performer plays a saxophone, and a group of tourists points at a historic building. Some cars are parked along the street, others speed forward, while cyclists dart between them. The atmosphere is electric, with the clatter of traffic, the murmur of conversations, and the distant rumble of a subway beneath the streets.", "original_prompt_en": "downtown"}
+{"index": 805, "data": "Panoramic shot of a driveway. The driveway is made of smooth light - gray asphalt, stretching from the foreground to the background with a gentle curve. On the left side, there are neatly trimmed green shrubs, and on the right, a white wooden fence. At the far end of the driveway, a black sedan is parked next to a garage with a brown door. The sky is overcast, casting a soft light. A person in a blue jacket walks from the right side of the frame toward the parked car, and the camera stays fixed, capturing the tranquil scene of the driveway. The ground around the driveway is a mix of green grass and small patches of soil, with a few fallen leaves near the shrubs.", "original_prompt_en": "driveway"}
+{"index": 806, "data": "A panoramic shot of a sun - drenched farm. In the foreground, a weathered wooden fence encloses a lush green pasture where several brown - and - white dairy cows graze leisurely, their tails swishing gently. To the left, a red - roofed barn with white trim stands, its large wooden doors slightly ajar, revealing neatly stacked hay bales inside. A middle - aged farmer, donning a straw hat, a blue plaid shirt, and denim overalls, operates a green tractor in the golden wheat field; the tractor’s wheels churn the soil as it moves slowly forward for harvesting. The background features rolling hills blanketed with dense green trees, and the sky is a clear blue with a few fluffy white clouds drifting. The camera pans right, capturing a group of chickens pecking at the ground near a wooden coop and a horse trotting along a dirt path beside the coop.", "original_prompt_en": "farm"}
+{"index": 807, "data": "Panoramic shot of a bustling food court. The space is lined with diverse food stalls, their vibrant signboards—some neon - lit, others wooden - made—advertising various cuisines from sushi to street - style pancakes. The polished, warm - toned tile floor reflects the overhead lights. In the foreground, a group of casually dressed young people gather around a metal table, laughing as they share colorful snacks. To the left, a chef in a white uniform and red apron flips golden - brown pancakes on a griddle, with a digital menu cycling above him. The background reveals a modern mall with glass railings and escalators, and the air is filled with the aromas of grilled meat and freshly brewed coffee. The camera pans right, capturing a noodle stall with steam rising, a juice bar with fresh fruit displays, and families navigating crowded aisles. Customers queue, walk with food trays, or point at dessert stalls, while children eye treats excitedly. Lighting combines skylight and warm lamps, creating a lively yet cozy atmosphere.", "original_prompt_en": "food court"}
+{"index": 808, "data": "Panoramic shot of a football field. The field is covered with lush bright - green natural grass, neatly trimmed, and white boundary lines, center circle, and goal area markings are clearly visible. At both ends of the field, white goalposts with black nets stand firmly. The background has a grand spectator stand with rows of blue seats, and some spectators can be vaguely seen. The sky above is clear and blue with a few white clouds floating. In the foreground, a soccer ball lies on the grass, and a group of players in colorful jerseys are jogging and warming up, full of energy as they prepare for the match. The camera remains steady, capturing the vibrant atmosphere of the football field.", "original_prompt_en": "football field"}
+{"index": 809, "data": "A panoramic shot of a forest road. The road is a narrow, winding path paved with uneven brown soil and scattered dry leaves, stretching into the distance. On both sides of the road stand tall, verdant trees with thick trunks, their green leaves forming a dense canopy that filters the sunlight into gentle patches on the ground. The underbrush alongside the road is lush with ferns and wildflowers, some in delicate shades of purple and white. The background reveals a deeper, more impenetrable part of the forest, with mist subtly lingering among the trees, giving a sense of mystery. The sky is overcast, casting a soft, diffused light over the scene. A small squirrel scampers across the road from right to left, pausing momentarily to sniff the air before vanishing into the left - side foliage. The camera remains fixed, capturing the tranquil, natural beauty of the forest road, with the rustling of leaves in the gentle breeze adding to the peaceful atmosphere.", "original_prompt_en": "forest road"}
+{"index": 810, "data": "A panoramic shot captures a fountain in a spacious plaza. The fountain has multiple tiers, with clear water jetting upward and cascading down, creating glistening splashes. The ground around it is paved with light - colored stone tiles. At the plaza’s edges, there are wooden benches and neatly trimmed green shrubs. In the background, several low - rise buildings with white facades and large glass windows are visible, and the sky is a bright blue with a few fluffy white clouds. The camera is fixed, showing the continuous water flow of the fountain as it cycles between spraying and falling. Occasionally, a few pedestrians enter the frame, pausing to admire the fountain before moving on.", "original_prompt_en": "fountain"}
+{"index": 811, "data": "Panoramic shot of a gas station. The station has a rectangular building with a blue roof and white walls, marked by bright red “GAS” signage. Two silver fuel dispensers with black nozzles stand on smooth black asphalt—one has a white sedan with a blue stripe parked, its engine off as it refuels. A man in a gray jacket checks his black SUV’s tire pressure, while a woman in a pink dress walks toward the station’s convenience store (with glass windows showing snack shelves). The sky is clear blue, and distant green trees line the horizon. The camera stays fixed: a few cars drive past on the adjacent road, and a worker in a yellow uniform wipes a dispenser.", "original_prompt_en": "gas station"}
+{"index": 812, "data": "Long shot of a massive glacier under a clear blue sky. The glacier, with its deep blue ice interspersed with white snow patches and dark, jagged cracks, occupies the central part of the frame. The surface glistens in the sunlight, revealing layers of compressed ice. At the base, a turquoise glacial lake reflects the glacier’s towering form, with small icebergs floating gently. The background features rugged, snow - capped mountain peaks stretching across the horizon. The camera slowly pans left, capturing the glacier’s expansive, frozen expanse and the serene, icy landscape surrounding it. A few birds can be seen soaring in the distance, adding a sense of scale to the glacier’s grandeur.", "original_prompt_en": "glacier"}
+{"index": 813, "data": "Panoramic shot of a golf course. The course is blanketed with lush, well - manicured green grass, dotted with golf holes topped with white flags fluttering gently in the breeze. In the foreground, a golfer with short brown hair, dressed in a navy - blue polo shirt and white shorts, is in the middle of a swing, his golf club slicing through the air as he aims for the green ahead. Nearby, a silver golf cart with black tires is parked beside a sand bunker, its driver, a woman in a pink visor, stepping out to retrieve a golf ball. The fairways stretch out towards the horizon, bordered by tall oak trees with golden leaves, and the sky above is a bright blue, dotted with wispy clouds. The camera pans slowly to the right, capturing more of the expansive course, where groups of golfers are scattered, some putting on the greens, others walking along the cart paths. The grass sways softly in the wind, and the distant mountains add a majestic backdrop to the tranquil scene.", "original_prompt_en": "golf course"}
+{"index": 814, "data": "Panoramic shot of an indoor gymnasium. The gymnasium features a polished wooden floor with distinct grain patterns, and its walls are painted light gray. In the foreground, several basketball hoops with orange rims and white nets are mounted on metallic poles, while blue exercise mats are neatly stacked along the walls. Treadmills with black frames and white digital displays are positioned near the large windows, which allow soft natural light to filter in. The high ceiling is fitted with bright white fluorescent lights, illuminating the space. In the background, a large scoreboard with red and blue digits is visible, and athletes in sportswear are actively dribbling basketballs across the court. The camera remains fixed, capturing the dynamic atmosphere as people engage in various fitness activities—some running on treadmills, others practicing basketball drills, and a few stretching on the mats—creating a lively energy within the gymnasium.", "original_prompt_en": "indoor gymnasium"}
+{"index": 815, "data": "A panoramic shot of a bustling harbor. The calm blue water fills the foreground, with various vessels: a large cargo ship loaded with colorful stacked containers, a white yacht with blue trim, and a small fishing boat with a red hull. Concrete docks line the water, equipped with tall yellow cranes standing still, their metal arms reaching toward the sky. The background features tall gray warehouse buildings with large windows, and the sky is overcast with gray clouds. The camera slowly pans right, capturing a small tugboat moving toward the dock, its black smoke billowing gently. Some seagulls fly over the water, and a few workers in orange vests are visible on the dock, walking briskly.", "original_prompt_en": "harbor"}
+{"index": 816, "data": "Panoramic shot of a highway. The smooth black asphalt road, marked with white lane lines, stretches into the distance. Multiple vehicles—cars, trucks, and motorcycles—move forward; some speed along, others maintain a steady pace. Metal guardrails line both sides, and distant road signs guide traffic. The sky is clear blue with scattered white clouds. The camera pans right, capturing the continuous traffic flow: vehicles’ headlights and taillights create streaks as they move, some overtake, and others enter or exit via ramps. The background features distant buildings or green landscapes, adding depth to the scene.", "original_prompt_en": "highway"}
+{"index": 817, "data": "A panoramic shot of a modern hospital building with a white facade and large glass windows, displaying a red cross emblem on the front. The hospital is surrounded by green trees and flower beds, with several white ambulances (adorned with red stripes) parked in front. The sky is overcast, and the ground is paved with gray tiles. In the foreground, patients in wheelchairs are being pushed by nurses in white uniforms, while visitors with concerned expressions walk in and out of the automatic glass doors. Medical staff in blue scrubs hurry past, carrying medical equipment. The background shows adjacent buildings and a cloudy sky. The camera pans right to capture the busy entrance: people move steadily (some entering, others exiting), a few cars park along the roadside, and a delivery truck with a “Medical Supplies” logo drives by.", "original_prompt_en": "hospital"}
+{"index": 818, "data": "A long shot captures a quaint single - story house with a sloped roof, painted in light yellow, standing on a lush green lawn. The house has a dark brown roof, and several rectangular windows with white frames are on its walls. A small wooden porch with a few potted plants is at the front. Around the house, there are tall trees with thick green leaves, and the sky is clear blue with some white clouds. The camera is fixed, quietly presenting the house's peaceful appearance in the natural surroundings.", "original_prompt_en": "house"}
+{"index": 819, "data": "Long shot of a massive iceberg with a rugged, white icy exterior and deep - blue submerged sections, floating on the dark - blue ocean. The iceberg’s surface displays intricate glacial patterns, with sharp ridges and smooth, snow - capped peaks. The background is a vast, clear blue sky dotted with a few wispy clouds, and the surrounding ocean water is a rich, deep blue. The camera stays fixed, capturing the iceberg’s stately presence as it slowly drifts with the gentle ocean currents. In the distance, a few seabirds soar, enhancing the tranquil, frigid atmosphere of the scene.", "original_prompt_en": "iceberg"}
+{"index": 820, "data": "Panoramic shot of an industrial area. Multiple gray factory buildings with tall chimneys dominate the scene, from which gray smoke drifts lazily into the overcast sky. On the wide concrete road, a blue freight truck moves steadily toward the right of the frame, while a red forklift busily loads metal containers near a warehouse with corrugated iron walls. The ground, dotted with oil stains and scattered metal scraps, stretches out. In the background, power transformers and tangled wires stand, with a few leafless trees lining the area’s edge. The camera pans right, capturing the continuous movement of vehicles and the steady emission of smoke from the chimneys, emphasizing the bustling industrial activity.", "original_prompt_en": "industrial area"}
+{"index": 821, "data": "Medium shot of a jail cell. The cell has gray concrete walls, a metal bunk bed with a thin, worn mattress on the left side, and a small metal sink - toilet unit in the corner. A prisoner in a black - and - white striped uniform sits on the bed, with his head bowed and hands resting on his knees in a still, pensive stance. The background reveals the cell’s metal - barred door, and dim light filters through from the corridor outside, casting soft shadows on the floor. Fixed shot: The prisoner remains mostly motionless, only occasionally shifting his weight slightly, capturing the somber and confined atmosphere of the jail cell.", "original_prompt_en": "jail cell"}
+{"index": 822, "data": "Panoramic shot of a junkyard. The ground is strewn with rusted metal scraps—twisted, dented, and piled haphazardly, glinting dully under the overcast sky. Scattered across the yard are dilapidated vehicles: a faded red sedan with a shattered windshield, a rusted gray van missing its wheels, and a battered blue pickup truck with a caved - in roof. Among the debris, old wooden furniture—like a splintered table, a tattered armchair with frayed upholstery—and broken appliances (a rusted refrigerator, a cracked TV set) lie in disarray. The background features a chain - link fence topped with barbed wire, leaning slightly, and the sky is a murky gray, heavy with clouds. A plastic bag tumbles across the dirt - strewn, oil - slicked ground, propelled by a faint wind. In the foreground, a scrawny stray cat with matted fur paws at a discarded tin can, while in the distance, a crow pecks at a rotting mattress. The camera remains fixed, capturing the still, cluttered expanse of the junkyard, where rusted metal and broken debris create a chaotic, somber scene.", "original_prompt_en": "junkyard"}
+{"index": 823, "data": "Medium shot of a cozy kitchen with white and gray checkered tiled walls. Wooden cabinets with silver handles line the space, and a stainless steel sink sits beneath a window with white lace curtains, where green potted plants sway gently. On the light brown countertop, a young Asian woman with long black hair, wearing a blue apron, is chopping a red tomato with a sharp knife. Beside her, a glass bowl holds sliced cucumbers, and a stainless steel pot on the stove releases a wisp of steam. The woman pauses to wipe her hands with a white towel, then resumes cutting, and the camera slightly zooms in to capture her focused expression. In the background, a clock on the wall ticks softly, and the faint sound of a refrigerator humming fills the air.", "original_prompt_en": "kitchen"}
+{"index": 824, "data": "A medium wide shot of an indoor library. Rows of tall wooden bookshelves, laden with a mix of colorful hardcover novels, academic textbooks, and glossy magazines, line the space—some shelves adorned with small potted ferns. In the center, several oak desks with brown leather chairs are arranged, where readers in casual and formal attire engage in quiet activities: one with black - rimmed glasses traces lines in a book with a finger, another types on a silver laptop, the screen glowing softly. Soft, diffused light from recessed ceiling lights and natural light streaming through large windows with white lace curtains bathes the room, highlighting the intricate patterns of the beige carpet. The background features a cozy reading nook with a plush green armchair, a small wooden side table, and a vintage floor lamp casting a warm halo. The camera slowly pans left, capturing a librarian in a navy uniform pushing a metal book cart, organizing books with precise movements, while a student in a gray hoodie browses the shelves, pulling out a book to examine its cover. On the far wall, framed literary prints in gold frames add elegance, and a silent clock ticks above a row of glass - fronted bookcases holding rare editions.", "original_prompt_en": "indoor library"}
+{"index": 825, "data": "A long shot captures a white lighthouse with red horizontal stripes standing tall on the rocky coastline. Its cylindrical body tapers to a lantern room at the top, designed to emit guiding light. The sky is partly overcast, with patches of blue peeking through the clouds, and soft sunlight casts subtle shadows on the lighthouse’s surface. Below, turbulent ocean waves crash against the gray, jagged rocks, forming frothy white crests. In the distance, a small fishing boat with a white hull and blue trim sails slowly across the water, while seagulls glide past the lighthouse, their wings outstretched. The camera remains fixed, highlighting the lighthouse’s steadfast presence against the dynamic coastal landscape, with the rhythmic motion of waves and the distant boat adding life to the serene scene.", "original_prompt_en": "lighthouse"}
+{"index": 826, "data": "A medium shot captures a laboratory scene. At the center, a wooden laboratory workbench is cluttered with scientific instruments: a silver microscope with an adjustable lens, a rack holding a dozen glass test tubes (some filled with blue and red liquids), and amber - colored glass bottles with dropper caps. A female researcher with brown hair tied back, wearing a white lab coat and purple nitrile gloves, stands in front of the bench. She holds a glass stirring rod with a smooth and cylindrical tip in her right hand, gently mixing a viscous light - green liquid in a beaker. The background is lined with metal shelves, which are stocked with neatly arranged reagent bottles, a digital scale, and stacks of lab notebooks. The researcher’s posture is focused, her eyes fixed on the beaker as she stirs, highlighting the precise and methodical nature of laboratory work.", "original_prompt_en": "laboratory"}
+{"index": 827, "data": "Panoramic shot of a grand mansion. The mansion showcases a classic architectural style with white stone walls, tall arched windows, and a steeply - pitched roof covered in dark slate tiles. Ivy creeps up the left facade, adding vibrant greenery. In front, a well - manicured lawn with neatly trimmed hedges and a decorative fountain (water gently splashing) occupies the space. The background is a lush forest with tall trees whose leaves rustle in the breeze, and the sky is clear with a few fluffy clouds. The camera slowly pans right, capturing the mansion’s elegant structure and the serene surrounding landscape in full.", "original_prompt_en": "mansion"}
+{"index": 828, "data": "A panoramic shot of a marsh. The ground is a patchwork of muddy soil and shallow water, covered with green aquatic plants and moss. Several white - feathered waterfowl with orange beaks wade in the shallow water, pecking at tiny aquatic organisms. The background shows low - lying shrubs and distant trees veiled in mist, under an overcast sky with a hazy light. Tall reeds sway softly in the wind, and the camera pans right to reveal more of the marsh, where water lilies dot the water and small mammals scurry among the vegetation.", "original_prompt_en": "marsh"}
+{"index": 829, "data": "Long shot captures a majestic mountain range. The central peak, bathed in soft sunlight, is cloaked in dense, dark - green forests, with rugged grayish - brown rock faces visible where the trees thin. At the mountain’s base, a vibrant green meadow spreads out, dotted with tiny, multicolored wildflowers that ripple in the gentle wind. A wispy layer of white mist coils around the mid - slope, lending an ethereal feel to the forested slopes. The background reveals more distant mountain peaks, their forms blurred by a light, bluish haze. The sky above is a clear, bright blue with a few fluffy white clouds drifting lazily. The camera pans slowly to the right, showcasing the continuous expanse of the mountain range, while a pair of eagles glides silently across the sky above the highest peak.", "original_prompt_en": "mountain"}
+{"index": 830, "data": "A panoramic shot of an indoor movie theater. Neat rows of plush black theater seats with red armrests stretch toward a large, illuminated white screen displaying a paused cinematic scene. The theater’s walls are clad in dark, sound - absorbing panels, and soft, warm light filters from recessed ceiling fixtures, casting gentle shadows. In the foreground, a handful of moviegoers—some in casual hoodies, others in dressier outfits—occupy the seats; a few scroll through their phones, while others sit quietly, anticipating the film’s continuation. The floor, carpeted in a deep blue - gray pattern, dampens noise, and in the background, the faint silhouette of a concession stand with glass cases of candy and a popcorn machine is visible. A fixed shot holds the stillness of the space, with the screen’s light reflecting subtly off the seat upholstery, and the theater remains hushed, save for the soft rustle of a patron’s jacket.", "original_prompt_en": "indoor movie theater"}
+{"index": 831, "data": "Panoramic shot of an indoor museum. The walls are painted in a soft off - white, adorned with framed artworks and informational plaques. Glass display cases with metallic frames line the corridors, showcasing ancient artifacts—some with intricate carvings, others glinting under the warm, recessed ceiling lights. The polished wooden floor reflects the subtle glow of the lighting, and a few visitors, including a family with children and a couple, wander leisurely, pausing to examine exhibits. In the background, a life - size bronze sculpture stands near a staircase with wrought - iron railings, leading to an upper gallery. The camera slowly pans across the space, capturing the quiet, contemplative atmosphere as people immerse themselves in the exhibits.", "original_prompt_en": "indoor museum"}
+{"index": 832, "data": "A medium shot of a music studio. The studio has soundproof walls lined with dark gray acoustic panels, and a polished wooden floor. On the left wall, a black electric guitar with a white pickguard hangs, while a white digital keyboard rests on a black stand in the center, its keys glistening under soft overhead lights. A silver microphone with a black pop filter is mounted on a boom arm, positioned above a black leather stool. In the background, a mixing console with colorful faders and two black studio monitors (with silver trim) occupy a wooden desk. A young musician with long brown hair, dressed in a black hoodie, leans over the console, carefully adjusting a knob as he monitors the audio levels. Warm light from a desk lamp casts gentle shadows on the workspace, and a poster of a classic rock band decorates the wall behind him. The camera remains fixed, capturing the studio’s organized, creative ambiance—instruments and equipment poised for a recording session, with the musician’s focused posture highlighting the space’s functional artistry.", "original_prompt_en": "music studio"}
+{"index": 833, "data": "Medium shot of a cozy nursery. The walls are adorned with cute cartoon - style animal stickers and painted in soft pastel hues. In the middle of the room, a group of East Asian children are playing on a light - colored, fluffy carpet. A little girl with wavy brown hair, dressed in a pink dress with white polka dots, sits cross - legged, holding a plush rabbit and laughing. A boy with short black hair, wearing a blue onesie, crawls towards a stack of colorful building blocks. Against the wall, there is a wooden crib with a white canopy, and a shelf full of picture books and plush toys. In the background, a large window with sheer white curtains allows soft natural light to fill the room. The camera pans to the right, capturing a teacher with long brown hair, wearing a light - green apron, gently helping a toddler stand up. Some children are clapping their hands, while others are sharing toys, creating a warm and lively scene.", "original_prompt_en": "nursery"}
+{"index": 834, "data": "Panoramic shot of the ocean. The ocean stretches out with a deep blue surface, where gentle ripples create subtle patterns, reflecting the clear blue sky with a few white clouds. In the distance, a light brown sandy coastline is faintly visible, and seagulls with white feathers and gray wings fly overhead. The camera is fixed, capturing the tranquil waves rolling toward the shore. A small white sailboat with a blue hull floats slowly to the right, its sails billowing in the breeze.", "original_prompt_en": "ocean"}
+{"index": 835, "data": "A medium shot captures an office space. The walls are white, and a wooden desk with a silver laptop occupies the center. A black leather office chair is positioned in front of the desk, and a bookshelf filled with documents and books stands against the back wall. A person in a blue shirt sits at the desk, typing on the laptop. To the left, a green potted plant and a framed photo decorate the space. The camera pans right, revealing a whiteboard with notes and a coffee mug on a side table. Soft light filters through a window with blinds, creating a calm, orderly office environment.", "original_prompt_en": "office"}
+{"index": 836, "data": "Panoramic shot of a magnificent Chinese palace with red walls and golden glazed tiles. The palace features upturned eaves with intricate carvings, and stone lions stand guard at the entrance. The sky above is clear blue with a few white clouds drifting. The foreground shows a paved courtyard with greenery, and in the background, more traditional buildings with matching architectural styles are visible. The camera slowly pans up to capture the full grandeur of the palace’s roof and the detailed decorations on the eaves, highlighting the vibrant colors of the structure against the serene sky.", "original_prompt_en": "palace"}
+{"index": 837, "data": "Panoramic shot of a parking lot. The ground is paved with gray asphalt, marked by white parking lines. Several cars of different colors—white sedans, black SUVs, and a red sports car—are parked neatly in the spaces, while a silver hatchback and a blue pickup truck move slowly toward the exit. The background features modern buildings with glass facades, and the sky is clear with scattered white clouds. The camera pans right, capturing a man in a blue shirt approaching his parked car, a woman in a yellow dress opening her trunk, and bicycles parked near a metal rack by the corner. A street lamp with a red sign stands at the entrance, and green shrubs line the perimeter, swaying gently in the breeze.", "original_prompt_en": "parking lot"}
+{"index": 838, "data": "A medium shot of a pharmacy. The interior is well - lit with white ceiling lights. Along the walls, shelves are packed with various pharmaceutical products: glass bottles with vibrant labels (blue, green, and white), small cardboard boxes of different sizes, and some blister - packed medications. A wooden counter with a glass surface occupies the middle of the room, its surface reflecting the lights. Behind the counter, a pharmacist in a white coat and a blue name tag is gently placing a bottle of painkillers onto the shelf, her hands moving with care. The walls are painted a soft beige, adorned with posters that provide health tips. The camera stays fixed, capturing the orderly arrangement of the pharmacy's inventory.", "original_prompt_en": "pharmacy"}
+{"index": 839, "data": "A medium shot captures a classic red phone booth with a glass front and white - framed doors standing on a paved city sidewalk. The booth’s domed top features black decorative trim, and a person inside is partially visible through the glass, holding a phone receiver as if in mid - call. The background includes a bustling street with pedestrians in casual clothing walking by, vehicles—such as a black sedan and a red bicycle—moving along the road, and multi - story buildings with shop signs lining the street. The sky is overcast, and the camera slowly pans to the right, revealing a bus stop and a green traffic light in the distance.", "original_prompt_en": "phone booth"}
+{"index": 840, "data": "A panoramic shot of a raceway. The raceway is a smooth, dark - colored asphalt track with white racing lines marking the lanes. On both sides of the track, there are metal guardrails. In the background, empty spectator stands and a clear blue sky can be seen. A red racing car speeds from the right to the left of the frame along the raceway, and the camera pans left to follow the car's movement. The surface of the raceway glistens under the sunlight, and the guardrails reflect the light, presenting a vivid scene of the racing circuit.", "original_prompt_en": "raceway"}
+{"index": 841, "data": "A medium shot of a cozy restaurant interior bathed in warm yellow lighting. Dark wooden tables, each dressed in crisp white tablecloths and set with gleaming silverware, are neatly arranged. A waiter in a white shirt and black apron glides across the floor, balancing a tray with a steaming bowl of pasta and a glass of red wine, heading toward a table where a couple— the man in a blue button - down, the woman in a floral dress— are smiling as they peruse the menu. The background reveals a rustic brick wall lined with framed food photographs, a bustling open kitchen where a chef in a white uniform skillfully flips a pizza, and a sleek counter with a barista frothing milk for a latte. The camera holds steady, capturing the gentle clink of utensils, soft chatter, and the aromatic haze of freshly cooked meals, with soft jazz music drifting through the air.", "original_prompt_en": "restaurant"}
+{"index": 842, "data": "Long shot of a winding river flowing through a lush green landscape. The river’s surface is smooth, reflecting the blue sky with scattered white clouds. On both riverbanks, dense green trees and grassy areas extend, with vibrant wildflowers blooming. The water meanders gently toward the lower right of the frame. Fixed shot captures the serene flow of the river, as it moves through the natural scenery, with the camera remaining still to emphasize the calm movement of the water.", "original_prompt_en": "river"}
+{"index": 843, "data": "A panoramic shot of the interior of a science museum. The spacious hall is adorned with diverse scientific exhibits: a large Newton's cradle with silver metal balls suspended, a transparent human skeleton model with detailed anatomical labels, and an interactive physics station featuring colorful levers and pulleys. Visitors of various ages and ethnicities are scattered around—children in vibrant clothing eagerly pressing buttons on a digital display, adults leaning in to read information panels with black text on white backgrounds. The ceiling has a grid of recessed lights, and the walls are lined with wooden display cabinets holding small scientific artifacts. The camera pans right to reveal a futuristic robotics exhibit, where a white robotic arm with metallic joints moves smoothly, demonstrating precision. In the background, a group of students in blue uniforms listens to a guide holding a red flag, while a couple in the foreground takes a photo of a glowing periodic table display.", "original_prompt_en": "science museum"}
+{"index": 844, "data": "A medium shot reveals a bathroom with white tiled walls and a silver showerhead mounted on the wall. A young East Asian woman with long, wet black hair stands beneath the running shower, water cascading down her bare body. She closes her eyes, tilting her head back as she uses her right hand to adjust the temperature knob on the chrome faucet, while her left hand gently runs through her hair, spreading the warm water. The background features a white sink with a round mirror, a blue towel hanging on a metallic rack, and a glass shower door dotted with water droplets. The camera remains fixed, capturing the steady stream of water from the showerhead and the woman’s relaxed movements—she occasionally shifts her weight, rubbing her arm to rinse off soap suds. The bathroom floor, covered in light gray non - slip tiles, has a small puddle of water forming near her feet.", "original_prompt_en": "shower"}
+{"index": 845, "data": "A panoramic shot of a ski slope. The slope is blanketed in fresh, glistening white snow, with visible ski tracks crisscrossing its smooth surface. Under a clear blue sky, the background features towering snow - capped mountains and clusters of evergreen trees dotting the landscape. In the foreground, a few skiers in colorful winter gear are gliding down the slope, their skis carving graceful arcs in the snow. The camera slowly pans to the right, capturing more of the expansive ski area, including a ski lift with chairs moving steadily up an adjacent slope, carrying skiers toward the summit. The crisp mountain air and bright sunlight enhance the vividness of the snowy scene, with the distant mountains and trees creating a picturesque backdrop for the dynamic skiing action.", "original_prompt_en": "ski slope"}
+{"index": 846, "data": "Panoramic shot of the sky. The sky is bright blue with a few scattered white clouds floating gently. The background is empty, with no visible ground or buildings, and the camera remains fixed, showcasing the vast and clear sky.", "original_prompt_en": "sky"}
+{"index": 847, "data": "A panoramic shot captures a towering skyscraper with a sleek glass curtain wall, reflecting the pale blue sky with scattered white clouds. The skyscraper stands prominently in a bustling urban cityscape, surrounded by shorter buildings with varied architectural styles—some with brick facades, others with modern metal exteriors. Below, the busy street is filled with vehicles: a red sedan, a white delivery truck, and cyclists in colorful helmets navigating through traffic. The sidewalks are lined with lush green trees and street lamps with classic black fixtures. In the background, more high - rises stretch towards the horizon, and the camera slowly pans upward to emphasize the skyscraper’s impressive height, while a gentle breeze rustles the tree leaves.", "original_prompt_en": "skyscraper"}
+{"index": 848, "data": "Panoramic shot of a baseball stadium. The field features vibrant green grass in the outfield and a reddish - brown dirt infield, with white chalk lines clearly marking the bases and the batter’s box. At the center, a pitcher in a white uniform with a dark - colored cap is in the middle of a pitch, his arm extended toward the batter. The batter, dressed in a navy - colored jersey, stands ready, firmly gripping the bat. The stands are packed with spectators; some are wearing team - colored shirts, some are waving foam fingers, and many are holding snacks or drinks. In the background, there are tall, light - colored stadium seats, and colorful advertisements are attached along the upper railings. The sky is clear, with a few fluffy white clouds floating. The camera slowly pans across the field, capturing the pitcher’s throw, the batter’s swing, and the ball flying toward the outfield. An outfielder in a gray uniform sprints to catch the ball.", "original_prompt_en": "baseball stadium"}
+{"index": 849, "data": "A medium shot captures a wooden staircase with dark - brown steps and light - brown handrails, standing against a white - painted wall. The floor at the base of the stairs is made of light - colored wood, and a small potted plant with green leaves is placed on the left side of the staircase. The background includes a hallway, and a framed picture hangs on the wall to the right of the stairs. The camera is initially fixed, and then it slowly pans upward to reveal more of the staircase, showing its neatly arranged steps and the smooth texture of the wood.", "original_prompt_en": "staircase"}
+{"index": 850, "data": "Panoramic shot of a bustling city street. The sky is clear and blue, dotted with a few fluffy white clouds. On both sides of the street, rows of buildings with diverse storefronts line the road—some are cafes with outdoor tables, others are vibrant retail shops with colorful signage. Several cars, including a red sedan and a silver SUV, drive along the asphalt road, while others are parked neatly by the curb. A traffic light above the intersection glows green, letting vehicles proceed. In the foreground, pedestrians stroll on the sidewalk—some chat, others hurry, and a few helmeted cyclists ride past. The background showcases tall glass - faced skyscrapers reflecting sunlight. The camera pans right, capturing more of the busy street with vehicles moving in both directions and pedestrians filling the sidewalks, then pans left to reveal the lively urban scene where people and traffic interweave. Vehicles on the road move steadily, some with their fronts toward the camera, others with their backs, and cyclists weave through the traffic.", "original_prompt_en": "street"}
+{"index": 851, "data": "Panoramic shot of a bustling supermarket interior. Bright white fluorescent lights illuminate the space, with neatly arranged shelves on both sides filled with colorful packaged goods—canned foods, snack bags, and fresh produce in transparent containers. In the center aisle, a middle - aged woman with brown curly hair, wearing a blue apron, is stocking shelves with boxes of cereal, carefully placing each box to ensure alignment. A young couple pushes a silver shopping cart; the man, in a gray hoodie, reaches for a bottle of laundry detergent from the top shelf, while the woman, in a pink dress, examines a jar of peanut butter. In the background, a row of checkout counters with glowing LED screens is visible, and a few customers are queuing, some holding baskets of groceries. The floor is a smooth, light - colored tile, and signs with bold black text hang from the ceiling, indicating different product sections like \"Dairy\" and \"Bakery\". The camera slowly pans right, capturing more shoppers browsing the aisles, including a child in a red hoodie sitting in a shopping cart, pointing excitedly at a display of colorful toys.", "original_prompt_en": "supermarket"}
+{"index": 852, "data": "Panoramic shot of an indoor swimming pool. The pool is rectangular with clear turquoise water, and the surrounding deck is tiled with light blue and white square tiles. Metal handrails line the pool’s edge, and a few colorful floating devices are placed nearby. In the background, white changing rooms with wooden benches stand against the wall, while warm - white LED lights on the ceiling cast a soft glow over the area. Several swimmers in vibrant swimsuits swim laps, their arms slicing through the water rhythmically, and some people in bathrobes sit on beige lounge chairs by the poolside, chatting. The camera slowly pans across the pool, capturing the ripples on the water’s surface and the relaxed atmosphere of the indoor pool.", "original_prompt_en": "indoor swimming pool"}
+{"index": 853, "data": "Panoramic shot of a gray stone tower with a pointed roof, standing still against the sky. The tower features several narrow, arched windows along its weathered stone facade, showcasing fine cracks and textures of aged masonry. The background is a clear blue sky dotted with fluffy white clouds, while the ground around the tower is a patch of green grass interspersed with a cobblestone path. A few birds glide past in the distance, and the camera remains fixed, capturing the tower’s stately, motionless presence.", "original_prompt_en": "tower"}
+{"index": 854, "data": "Panoramic shot of an outdoor track. The track, crafted from vibrant red rubber with crisp white lane markings, extends under a clear blue sky. Flanking the track are lush green grassy patches and scattered trees with swaying leaves. In the background, a grandstand with blue seats stands, and a few athletes in sportswear jog along the lanes, their steps steady as they circle the track. The camera remains fixed, capturing the rhythmic movement of the runners while the breeze rustles the surrounding foliage.", "original_prompt_en": "outdoor track"}
+{"index": 855, "data": "It's a long shot of a silver passenger train with multiple carriages parked on the railway track. The train’s side faces the camera, with rectangular windows lining both sides and a faint logo visible on the side. The ground around the track is a mix of barren soil and sparse grass, with a few small plants poking through. To the right of the frame, telegraph poles with wires stretch into the distance. The background showcases a clear blue sky dotted with fluffy white clouds. Fixed shot: the train remains still, while the camera holds steady, capturing the serene scene of the train resting on the rails.", "original_prompt_en": "train railway"}
+{"index": 856, "data": "A panoramic shot of a train station platform. The platform is paved with gray concrete tiles, featuring a yellow safety line along its edge. A red passenger train with white horizontal stripes is parked on the adjacent railway track, its doors closed. Several passengers—some carrying black suitcases, others with backpacks—walk briskly across the platform: one in a blue jacket hurries toward the train, while a few stand near the ticket vending machine, checking their phones. A station attendant in a blue uniform stands by a signboard displaying train schedules. The background reveals a modern station building with glass windows and a digital display showing departure times, and the sky outside is clear with scattered white clouds. The camera pans slowly to the right, capturing more of the platform as a group of travelers gathers near the train’s doors, preparing to board.", "original_prompt_en": "train station platform"}
+{"index": 857, "data": "A panoramic shot captures an underwater coral reef scene. The reef is composed of corals in various vibrant colors—bright red, vivid orange, and deep blue—with intricate shapes, some resembling antlers and others blooming like flowers. Clear turquoise water surrounds the reef, and sunlight filters through the water's surface, creating dappled light spots on the corals. Colorful tropical fish with striped or spotted patterns swim nimbly among the corals, while a green sea turtle glides slowly past, its flippers moving gracefully. Seaweed sways gently with the underwater current, and small shrimps scuttle across the coral surfaces. The background reveals a deep blue expanse of the ocean, with distant, blurry coral formations. The camera slowly pans to showcase different sections of the reef, capturing the lively marine life and the delicate, swaying sea plants.", "original_prompt_en": "underwater coral reef"}
+{"index": 858, "data": "Panoramic shot of a valley. The valley is enclosed by towering, forest - clad mountains, their slopes blanketed in dense green foliage that rustles in the gentle wind. The valley floor is a tapestry of tall, golden grasses interspersed with clusters of broad - leafed trees, their leaves glistening under the sunlight. A meandering stream with crystal - clear water cuts through the center, its surface rippling as it flows over smooth stones. The ground is a mix of soft, brown soil and patches of gray rock, dotted with vibrant wildflowers in shades of red and white. In the background, distant mountain ridges fade into a light haze, and the sky stretches out in a deep blue, dotted with a few wispy clouds. The camera remains fixed, capturing the serene landscape, while a group of colorful birds flits among the trees, and a deer pauses to drink from the stream at the edge of the frame.", "original_prompt_en": "valley"}
+{"index": 859, "data": "A panoramic shot of a volcano. The volcano, with a rugged dark - gray exterior and a wide crater at the summit, emits thick grayish - black smoke that rises slowly into the overcast sky. The surrounding landscape is a barren, ashen plain scattered with black volcanic rocks, and in the distance, hazy low - lying mountains blend into the gloomy atmosphere. The volcano remains mostly still, yet the smoke from its crater continues to billow and swirl gently in the wind. The camera is fixed, capturing the imposing presence of the volcano and the drifting smoke, while the overcast sky adds a somber tone to the scene.", "original_prompt_en": "volcano"}
+{"index": 860, "data": "Panoramic shot of a waterfall. The waterfall cascades down from a high rocky cliff, with white frothy water streaming down and creating a misty spray at the base. The rocks around the waterfall are gray with patches of green moss. The background features lush green mountains with dense trees, and the sky is clear blue with a few white clouds. The camera slowly pans down to capture the waterfall’s full height, from the top of the cliff to the turquoise pool below, where the water ripples and reflects the surrounding greenery.", "original_prompt_en": "waterfall"}
+{"index": 861, "data": "Panoramic shot of a traditional windmill standing on a vast, sun - drenched green field. The windmill features a sturdy, brown wooden tower with a slanted, dark gray roof, and its four large, white - trimmed blades are gracefully rotating as the wind caresses them. The sky overhead is a vibrant blue, with a few cotton - like white clouds lazily floating. In the background, there are quaint rural cottages with thatched roofs and expansive, golden - hued barley fields that ripple like waves in the wind. The camera stays fixed, capturing the windmill’s blades spinning smoothly, the surrounding wildflowers nodding gently, and the distant birds soaring across the sky.", "original_prompt_en": "windmill"}
+{"index": 862, "data": "A front-view wide shot shows only two main objects: a bicycle and a car. A bicycle is on the left of a car. A bicycle is placed in the center-left area of the frame, and a car is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a bicycle stays on the left of a car throughout.", "original_prompt_en": "a bicycle on the left of a car, front view"}
+{"index": 863, "data": "A front-view wide shot shows only two main objects: a car and a motorcycle. A car is on the right of a motorcycle. A motorcycle is placed in the center-left area of the frame, and a car is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a car stays on the right of a motorcycle throughout.", "original_prompt_en": "a car on the right of a motorcycle, front view"}
+{"index": 864, "data": "A front-view wide shot shows only two main objects: a motorcycle and a bus. A motorcycle is on the left of a bus. A motorcycle is placed in the center-left area of the frame, and a bus is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a motorcycle stays on the left of a bus throughout.", "original_prompt_en": "a motorcycle on the left of a bus, front view"}
+{"index": 865, "data": "A front-view wide shot shows only two main objects: a bus and a traffic light. A bus is on the right of a traffic light. A traffic light is placed in the center-left area of the frame, and a bus is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a bus stays on the right of a traffic light throughout.", "original_prompt_en": "a bus on the right of a traffic light, front view"}
+{"index": 866, "data": "A front-view wide shot shows only two main objects: a traffic light and a fire hydrant. A traffic light is on the left of a fire hydrant. A traffic light is placed in the center-left area of the frame, and a fire hydrant is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a traffic light stays on the left of a fire hydrant throughout.", "original_prompt_en": "a traffic light on the left of a fire hydrant, front view"}
+{"index": 867, "data": "A front-view wide shot shows only two main objects: a fire hydrant and a stop sign. A fire hydrant is on the right of a stop sign. A stop sign is placed in the center-left area of the frame, and a fire hydrant is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a fire hydrant stays on the right of a stop sign throughout.", "original_prompt_en": "a fire hydrant on the right of a stop sign, front view"}
+{"index": 868, "data": "A front-view wide shot shows only two main objects: a stop sign and a parking meter. A stop sign is on the left of a parking meter. A stop sign is placed in the center-left area of the frame, and a parking meter is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a stop sign stays on the left of a parking meter throughout.", "original_prompt_en": "a stop sign on the left of a parking meter, front view"}
+{"index": 869, "data": "A front-view wide shot shows only two main objects: a parking meter and a bench. A parking meter is on the right of a bench. A bench is placed in the center-left area of the frame, and a parking meter is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a parking meter stays on the right of a bench throughout.", "original_prompt_en": "a parking meter on the right of a bench, front view"}
+{"index": 870, "data": "A front-view wide shot shows only two main objects: a bench and a truck. A bench is on the left of a truck. A bench is placed in the center-left area of the frame, and a truck is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a bench stays on the left of a truck throughout.", "original_prompt_en": "a bench on the left of a truck, front view"}
+{"index": 871, "data": "A front-view wide shot shows only two main objects: a truck and a bicycle. A truck is on the right of a bicycle. A bicycle is placed in the center-left area of the frame, and a truck is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a truck stays on the right of a bicycle throughout.", "original_prompt_en": "a truck on the right of a bicycle, front view"}
+{"index": 872, "data": "A front-view wide shot shows only two main objects: a bird and a cat. A bird is on the left of a cat. A bird is placed in the center-left area of the frame, and a cat is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a bird stays on the left of a cat throughout.", "original_prompt_en": "a bird on the left of a cat, front view"}
+{"index": 873, "data": "A front-view wide shot shows only two main objects: a cat and a dog. A cat is on the right of a dog. A dog is placed in the center-left area of the frame, and a cat is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a cat stays on the right of a dog throughout.", "original_prompt_en": "a cat on the right of a dog, front view"}
+{"index": 874, "data": "A front-view wide shot shows only two main objects: a dog and a horse. A dog is on the left of a horse. A dog is placed in the center-left area of the frame, and a horse is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a dog stays on the left of a horse throughout.", "original_prompt_en": "a dog on the left of a horse, front view"}
+{"index": 875, "data": "A front-view wide shot shows only two main objects: a horse and a sheep. A horse is on the right of a sheep. A sheep is placed in the center-left area of the frame, and a horse is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a horse stays on the right of a sheep throughout.", "original_prompt_en": "a horse on the right of a sheep, front view"}
+{"index": 876, "data": "A front-view wide shot shows only two main objects: a sheep and a cow. A sheep is on the left of a cow. A sheep is placed in the center-left area of the frame, and a cow is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a sheep stays on the left of a cow throughout.", "original_prompt_en": "a sheep on the left of a cow, front view"}
+{"index": 877, "data": "A front-view wide shot shows only two main objects: a cow and an elephant. A cow is on the right of an elephant. An elephant is placed in the center-left area of the frame, and a cow is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a cow stays on the right of an elephant throughout.", "original_prompt_en": "a cow on the right of an elephant, front view"}
+{"index": 878, "data": "A front-view wide shot shows only two main objects: an elephant and a bear. An elephant is on the left of a bear. An elephant is placed in the center-left area of the frame, and a bear is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and an elephant stays on the left of a bear throughout.", "original_prompt_en": "an elephant on the left of a bear, front view"}
+{"index": 879, "data": "A front-view wide shot shows only two main objects: a bear and a zebra. A bear is on the right of a zebra. A zebra is placed in the center-left area of the frame, and a bear is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a bear stays on the right of a zebra throughout.", "original_prompt_en": "a bear on the right of a zebra, front view"}
+{"index": 880, "data": "A front-view wide shot shows only two main objects: a zebra and a giraffe. A zebra is on the left of a giraffe. A zebra is placed in the center-left area of the frame, and a giraffe is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a zebra stays on the left of a giraffe throughout.", "original_prompt_en": "a zebra on the left of a giraffe, front view"}
+{"index": 881, "data": "A front-view wide shot shows only two main objects: a giraffe and a bird. A giraffe is on the right of a bird. A bird is placed in the center-left area of the frame, and a giraffe is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open grassy field under natural daylight, with a clean and natural background. fixed shot. Exactly these two objects remain in the scene, and a giraffe stays on the right of a bird throughout.", "original_prompt_en": "a giraffe on the right of a bird, front view"}
+{"index": 882, "data": "A front-view close shot shows only two main objects: a bottle and a wine glass. A bottle is on the left of a wine glass. A bottle is placed in the center-left area of the frame, and a wine glass is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a bottle stays on the left of a wine glass throughout.", "original_prompt_en": "a bottle on the left of a wine glass, front view"}
+{"index": 883, "data": "A front-view close shot shows only two main objects: a wine glass and a cup. A wine glass is on the right of a cup. A cup is placed in the center-left area of the frame, and a wine glass is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a wine glass stays on the right of a cup throughout.", "original_prompt_en": "a wine glass on the right of a cup, front view"}
+{"index": 884, "data": "A front-view close shot shows only two main objects: a cup and a fork. A cup is on the left of a fork. A cup is placed in the center-left area of the frame, and a fork is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a cup stays on the left of a fork throughout.", "original_prompt_en": "a cup on the left of a fork, front view"}
+{"index": 885, "data": "A front-view close shot shows only two main objects: a fork and a knife. A fork is on the right of a knife. A knife is placed in the center-left area of the frame, and a fork is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a fork stays on the right of a knife throughout.", "original_prompt_en": "a fork on the right of a knife, front view"}
+{"index": 886, "data": "A front-view close shot shows only two main objects: a knife and a spoon. A knife is on the left of a spoon. A knife is placed in the center-left area of the frame, and a spoon is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a knife stays on the left of a spoon throughout.", "original_prompt_en": "a knife on the left of a spoon, front view"}
+{"index": 887, "data": "A front-view close shot shows only two main objects: a spoon and a bowl. A spoon is on the right of a bowl. A bowl is placed in the center-left area of the frame, and a spoon is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a spoon stays on the right of a bowl throughout.", "original_prompt_en": "a spoon on the right of a bowl, front view"}
+{"index": 888, "data": "A front-view close shot shows only two main objects: a bowl and a bottle. A bowl is on the left of a bottle. A bowl is placed in the center-left area of the frame, and a bottle is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a bowl stays on the left of a bottle throughout.", "original_prompt_en": "a bowl on the left of a bottle, front view"}
+{"index": 889, "data": "A front-view medium shot shows only two main objects: a potted plant and a remote. A potted plant is on the left of a remote. A potted plant is placed in the center-left area of the frame, and a remote is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a potted plant stays on the left of a remote throughout.", "original_prompt_en": "a potted plant on the left of a remote, front view"}
+{"index": 890, "data": "A front-view close shot shows only two main objects: a remote and a clock. A remote is on the right of a clock. A clock is placed in the center-left area of the frame, and a remote is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a remote stays on the right of a clock throughout.", "original_prompt_en": "a remote on the right of a clock, front view"}
+{"index": 891, "data": "A front-view close shot shows only two main objects: a clock and a vase. A clock is on the left of a vase. A clock is placed in the center-left area of the frame, and a vase is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a clock stays on the left of a vase throughout.", "original_prompt_en": "a clock on the left of a vase, front view"}
+{"index": 892, "data": "A front-view close shot shows only two main objects: a vase and scissors. A vase is on the right of scissors. Scissors is placed in the center-left area of the frame, and a vase is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a vase stays on the right of scissors throughout.", "original_prompt_en": "a vase on the right of scissors, front view"}
+{"index": 893, "data": "A front-view close shot shows only two main objects: scissors and a teddy bear. Scissors is on the left of a teddy bear. Scissors is placed in the center-left area of the frame, and a teddy bear is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and scissors stays on the left of a teddy bear throughout.", "original_prompt_en": "scissors on the left of a teddy bear, front view"}
+{"index": 894, "data": "A front-view medium shot shows only two main objects: a teddy bear and a potted plant. A teddy bear is on the right of a potted plant. A potted plant is placed in the center-left area of the frame, and a teddy bear is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a teddy bear stays on the right of a potted plant throughout.", "original_prompt_en": "a teddy bear on the right of a potted plant, front view"}
+{"index": 895, "data": "A front-view medium-wide shot shows only two main objects: a frisbee and a sports ball. A frisbee is on the left of a sports ball. A frisbee is placed in the center-left area of the frame, and a sports ball is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a frisbee stays on the left of a sports ball throughout.", "original_prompt_en": "a frisbee on the left of a sports ball, front view"}
+{"index": 896, "data": "A front-view medium-wide shot shows only two main objects: a sports ball and a baseball bat. A sports ball is on the right of a baseball bat. A baseball bat is placed in the center-left area of the frame, and a sports ball is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a sports ball stays on the right of a baseball bat throughout.", "original_prompt_en": "a sports ball on the right of a baseball bat, front view"}
+{"index": 897, "data": "A front-view medium-wide shot shows only two main objects: a baseball bat and a baseball glove. A baseball bat is on the left of a baseball glove. A baseball bat is placed in the center-left area of the frame, and a baseball glove is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a baseball bat stays on the left of a baseball glove throughout.", "original_prompt_en": "a baseball bat on the left of a baseball glove, front view"}
+{"index": 898, "data": "A front-view medium-wide shot shows only two main objects: a baseball glove and a tennis racket. A baseball glove is on the right of a tennis racket. A tennis racket is placed in the center-left area of the frame, and a baseball glove is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a baseball glove stays on the right of a tennis racket throughout.", "original_prompt_en": "a baseball glove on the right of a tennis racket, front view"}
+{"index": 899, "data": "A front-view medium-wide shot shows only two main objects: a tennis racket and a frisbee. A tennis racket is on the left of a frisbee. A tennis racket is placed in the center-left area of the frame, and a frisbee is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a tennis racket stays on the left of a frisbee throughout.", "original_prompt_en": "a tennis racket on the left of a frisbee, front view"}
+{"index": 900, "data": "A front-view medium shot shows only two main objects: a toilet and a hair drier. A toilet is on the left of a hair drier. A toilet is placed in the center-left area of the frame, and a hair drier is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a bright clean bathroom with soft natural light and a simple uncluttered background. fixed shot. Exactly these two objects remain in the scene, and a toilet stays on the left of a hair drier throughout.", "original_prompt_en": "a toilet on the left of a hair drier, front view"}
+{"index": 901, "data": "A front-view medium shot shows only two main objects: a hair drier and a toothbrush. A hair drier is on the right of a toothbrush. A toothbrush is placed in the center-left area of the frame, and a hair drier is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a bright clean bathroom with soft natural light and a simple uncluttered background. fixed shot. Exactly these two objects remain in the scene, and a hair drier stays on the right of a toothbrush throughout.", "original_prompt_en": "a hair drier on the right of a toothbrush, front view"}
+{"index": 902, "data": "A front-view medium shot shows only two main objects: a toothbrush and a sink. A toothbrush is on the left of a sink. A toothbrush is placed in the center-left area of the frame, and a sink is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a bright clean bathroom with soft natural light and a simple uncluttered background. fixed shot. Exactly these two objects remain in the scene, and a toothbrush stays on the left of a sink throughout.", "original_prompt_en": "a toothbrush on the left of a sink, front view"}
+{"index": 903, "data": "A front-view medium shot shows only two main objects: a sink and a toilet. A sink is on the right of a toilet. A toilet is placed in the center-left area of the frame, and a sink is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a bright clean bathroom with soft natural light and a simple uncluttered background. fixed shot. Exactly these two objects remain in the scene, and a sink stays on the right of a toilet throughout.", "original_prompt_en": "a sink on the right of a toilet, front view"}
+{"index": 904, "data": "A front-view medium shot shows only two main objects: a chair and a couch. A chair is on the left of a couch. A chair is placed in the center-left area of the frame, and a couch is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a chair stays on the left of a couch throughout.", "original_prompt_en": "a chair on the left of a couch, front view"}
+{"index": 905, "data": "A front-view medium shot shows only two main objects: a couch and a bed. A couch is on the right of a bed. A bed is placed in the center-left area of the frame, and a couch is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a couch stays on the right of a bed throughout.", "original_prompt_en": "a couch on the right of a bed, front view"}
+{"index": 906, "data": "A front-view medium shot shows only two main objects: a bed and a tv. A bed is on the left of a tv. A bed is placed in the center-left area of the frame, and a tv is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a bed stays on the left of a tv throughout.", "original_prompt_en": "a bed on the left of a tv, front view"}
+{"index": 907, "data": "A front-view medium shot shows only two main objects: a tv and a dining table. A tv is on the right of a dining table. A dining table is placed in the center-left area of the frame, and a tv is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a tv stays on the right of a dining table throughout.", "original_prompt_en": "a tv on the right of a dining table, front view"}
+{"index": 908, "data": "A front-view medium shot shows only two main objects: a dining table and a chair. A dining table is on the left of a chair. A dining table is placed in the center-left area of the frame, and a chair is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a dining table stays on the left of a chair throughout.", "original_prompt_en": "a dining table on the left of a chair, front view"}
+{"index": 909, "data": "A front-view wide shot shows only two main objects: an airplane and a train. An airplane is on the left of a train. An airplane is placed in the center-left area of the frame, and a train is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and an airplane stays on the left of a train throughout.", "original_prompt_en": "an airplane on the left of a train, front view"}
+{"index": 910, "data": "A front-view wide shot shows only two main objects: a train and a boat. A train is on the right of a boat. A boat is placed in the center-left area of the frame, and a train is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a train stays on the right of a boat throughout.", "original_prompt_en": "a train on the right of a boat, front view"}
+{"index": 911, "data": "A front-view wide shot shows only two main objects: a boat and an airplane. A boat is on the left of an airplane. A boat is placed in the center-left area of the frame, and an airplane is placed in the center-right area of the frame, with a wide clear horizontal gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a spacious outdoor area under clear daylight, with a simple open background. fixed shot. Exactly these two objects remain in the scene, and a boat stays on the left of an airplane throughout.", "original_prompt_en": "a boat on the left of an airplane, front view"}
+{"index": 912, "data": "A front-view medium shot shows only two main objects: an oven and a toaster. An oven is above a toaster. An oven is placed in the upper-center area of the frame, and a toaster is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and an oven stays above a toaster throughout.", "original_prompt_en": "an oven on the top of a toaster, front view"}
+{"index": 913, "data": "A front-view medium shot shows only two main objects: an oven and a toaster. An oven is below a toaster. A toaster is placed in the upper-center area of the frame, and an oven is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and an oven stays below a toaster throughout.", "original_prompt_en": "an oven on the bottom of a toaster, front view"}
+{"index": 914, "data": "A front-view medium shot shows only two main objects: a toaster and a microwave. A toaster is above a microwave. A toaster is placed in the upper-center area of the frame, and a microwave is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a toaster stays above a microwave throughout.", "original_prompt_en": "a toaster on the top of a microwave, front view"}
+{"index": 915, "data": "A front-view medium shot shows only two main objects: a toaster and a microwave. A toaster is below a microwave. A microwave is placed in the upper-center area of the frame, and a toaster is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a toaster stays below a microwave throughout.", "original_prompt_en": "a toaster on the bottom of a microwave, front view"}
+{"index": 916, "data": "A front-view medium shot shows only two main objects: a microwave and an oven. A microwave is above an oven. A microwave is placed in the upper-center area of the frame, and an oven is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a microwave stays above an oven throughout.", "original_prompt_en": "a microwave on the top of an oven, front view"}
+{"index": 917, "data": "A front-view medium shot shows only two main objects: a microwave and an oven. A microwave is below an oven. An oven is placed in the upper-center area of the frame, and a microwave is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in a clean indoor room with soft natural light and a plain background. fixed shot. Exactly these two objects remain in the scene, and a microwave stays below an oven throughout.", "original_prompt_en": "a microwave on the bottom of an oven, front view"}
+{"index": 918, "data": "A front-view close shot shows only two main objects: a banana and an apple. A banana is above an apple. A banana is placed in the upper-center area of the frame, and an apple is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a banana stays above an apple throughout.", "original_prompt_en": "a banana on the top of an apple, front view"}
+{"index": 919, "data": "A front-view close shot shows only two main objects: a banana and an apple. A banana is below an apple. An apple is placed in the upper-center area of the frame, and a banana is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a banana stays below an apple throughout.", "original_prompt_en": "a banana on the bottom of an apple, front view"}
+{"index": 920, "data": "A front-view close shot shows only two main objects: an apple and a sandwich. An apple is above a sandwich. An apple is placed in the upper-center area of the frame, and a sandwich is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and an apple stays above a sandwich throughout.", "original_prompt_en": "an apple on the top of a sandwich, front view"}
+{"index": 921, "data": "A front-view close shot shows only two main objects: an apple and a sandwich. An apple is below a sandwich. A sandwich is placed in the upper-center area of the frame, and an apple is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and an apple stays below a sandwich throughout.", "original_prompt_en": "an apple on the bottom of a sandwich, front view"}
+{"index": 922, "data": "A front-view close shot shows only two main objects: a sandwich and an orange. A sandwich is above an orange. A sandwich is placed in the upper-center area of the frame, and an orange is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a sandwich stays above an orange throughout.", "original_prompt_en": "a sandwich on the top of an orange, front view"}
+{"index": 923, "data": "A front-view close shot shows only two main objects: a sandwich and an orange. A sandwich is below an orange. An orange is placed in the upper-center area of the frame, and a sandwich is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a sandwich stays below an orange throughout.", "original_prompt_en": "a sandwich on the bottom of an orange, front view"}
+{"index": 924, "data": "A front-view close shot shows only two main objects: an orange and a carrot. An orange is above a carrot. An orange is placed in the upper-center area of the frame, and a carrot is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and an orange stays above a carrot throughout.", "original_prompt_en": "an orange on the top of a carrot, front view"}
+{"index": 925, "data": "A front-view close shot shows only two main objects: an orange and a carrot. An orange is below a carrot. A carrot is placed in the upper-center area of the frame, and an orange is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and an orange stays below a carrot throughout.", "original_prompt_en": "an orange on the bottom of a carrot, front view"}
+{"index": 926, "data": "A front-view close shot shows only two main objects: a carrot and a hot dog. A carrot is above a hot dog. A carrot is placed in the upper-center area of the frame, and a hot dog is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a carrot stays above a hot dog throughout.", "original_prompt_en": "a carrot on the top of a hot dog, front view"}
+{"index": 927, "data": "A front-view close shot shows only two main objects: a carrot and a hot dog. A carrot is below a hot dog. A hot dog is placed in the upper-center area of the frame, and a carrot is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a carrot stays below a hot dog throughout.", "original_prompt_en": "a carrot on the bottom of a hot dog, front view"}
+{"index": 928, "data": "A front-view close shot shows only two main objects: a hot dog and a pizza. A hot dog is above a pizza. A hot dog is placed in the upper-center area of the frame, and a pizza is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a hot dog stays above a pizza throughout.", "original_prompt_en": "a hot dog on the top of a pizza, front view"}
+{"index": 929, "data": "A front-view close shot shows only two main objects: a hot dog and a pizza. A hot dog is below a pizza. A pizza is placed in the upper-center area of the frame, and a hot dog is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a hot dog stays below a pizza throughout.", "original_prompt_en": "a hot dog on the bottom of a pizza, front view"}
+{"index": 930, "data": "A front-view close shot shows only two main objects: a pizza and a donut. A pizza is above a donut. A pizza is placed in the upper-center area of the frame, and a donut is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a pizza stays above a donut throughout.", "original_prompt_en": "a pizza on the top of a donut, front view"}
+{"index": 931, "data": "A front-view close shot shows only two main objects: a pizza and a donut. A pizza is below a donut. A donut is placed in the upper-center area of the frame, and a pizza is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a pizza stays below a donut throughout.", "original_prompt_en": "a pizza on the bottom of a donut, front view"}
+{"index": 932, "data": "A front-view close shot shows only two main objects: a donut and broccoli. A donut is above broccoli. A donut is placed in the upper-center area of the frame, and broccoli is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a donut stays above broccoli throughout.", "original_prompt_en": "a donut on the top of broccoli, front view"}
+{"index": 933, "data": "A front-view close shot shows only two main objects: a donut and broccoli. A donut is below broccoli. Broccoli is placed in the upper-center area of the frame, and a donut is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and a donut stays below broccoli throughout.", "original_prompt_en": "a donut on the bottom of broccoli, front view"}
+{"index": 934, "data": "A front-view close shot shows only two main objects: broccoli and a banana. Broccoli is above a banana. Broccoli is placed in the upper-center area of the frame, and a banana is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and broccoli stays above a banana throughout.", "original_prompt_en": "broccoli on the top of a banana, front view"}
+{"index": 935, "data": "A front-view close shot shows only two main objects: broccoli and a banana. Broccoli is below a banana. A banana is placed in the upper-center area of the frame, and broccoli is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. They are placed on a clean flat surface with natural light and a softly blurred background. fixed shot. Exactly these two objects remain in the scene, and broccoli stays below a banana throughout.", "original_prompt_en": "broccoli on the bottom of a banana, front view"}
+{"index": 936, "data": "A front-view medium-wide shot shows only two main objects: skis and a snowboard. Skis are above a snowboard. Skis are placed in the upper-center area of the frame, and a snowboard is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and skis stay above a snowboard throughout.", "original_prompt_en": "skis on the top of a snowboard, front view"}
+{"index": 937, "data": "A front-view medium-wide shot shows only two main objects: skis and a snowboard. Skis are below a snowboard. A snowboard is placed in the upper-center area of the frame, and skis are placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and skis stay below a snowboard throughout.", "original_prompt_en": "skis on the bottom of a snowboard, front view"}
+{"index": 938, "data": "A front-view medium-wide shot shows only two main objects: a snowboard and a kite. A snowboard is above a kite. A snowboard is placed in the upper-center area of the frame, and a kite is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a snowboard stays above a kite throughout.", "original_prompt_en": "a snowboard on the top of a kite, front view"}
+{"index": 939, "data": "A front-view medium-wide shot shows only two main objects: a snowboard and a kite. A snowboard is below a kite. A kite is placed in the upper-center area of the frame, and a snowboard is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a snowboard stays below a kite throughout.", "original_prompt_en": "a snowboard on the bottom of a kite, front view"}
+{"index": 940, "data": "A front-view medium-wide shot shows only two main objects: a kite and a skateboard. A kite is above a skateboard. A kite is placed in the upper-center area of the frame, and a skateboard is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a kite stays above a skateboard throughout.", "original_prompt_en": "a kite on the top of a skateboard, front view"}
+{"index": 941, "data": "A front-view medium-wide shot shows only two main objects: a kite and a skateboard. A kite is below a skateboard. A skateboard is placed in the upper-center area of the frame, and a kite is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a kite stays below a skateboard throughout.", "original_prompt_en": "a kite on the bottom of a skateboard, front view"}
+{"index": 942, "data": "A front-view medium-wide shot shows only two main objects: a skateboard and a surfboard. A skateboard is above a surfboard. A skateboard is placed in the upper-center area of the frame, and a surfboard is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a skateboard stays above a surfboard throughout.", "original_prompt_en": "a skateboard on the top of a surfboard, front view"}
+{"index": 943, "data": "A front-view medium-wide shot shows only two main objects: a skateboard and a surfboard. A skateboard is below a surfboard. A surfboard is placed in the upper-center area of the frame, and a skateboard is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a skateboard stays below a surfboard throughout.", "original_prompt_en": "a skateboard on the bottom of a surfboard, front view"}
+{"index": 944, "data": "A front-view medium-wide shot shows only two main objects: a surfboard and skis. A surfboard is above skis. A surfboard is placed in the upper-center area of the frame, and skis are placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a surfboard stays above skis throughout.", "original_prompt_en": "a surfboard on the top of skis, front view"}
+{"index": 945, "data": "A front-view medium-wide shot shows only two main objects: a surfboard and skis. A surfboard is below skis. Skis are placed in the upper-center area of the frame, and a surfboard is placed in the lower-center area of the frame, with a clear vertical gap between them. Both objects are fully visible from the front, complete, and entirely inside the frame, with comfortable margins from all image borders. They do not overlap or occlude each other, and no additional prominent objects appear near them. The scene is set in an open outdoor area under clear daylight, with a simple natural background. fixed shot. Exactly these two objects remain in the scene, and a surfboard stays below skis throughout.", "original_prompt_en": "a surfboard on the bottom of skis, front view"}
diff --git a/benchmarks/video_gen/Vbench/sample_vbench.py b/benchmarks/video_gen/Vbench/sample_vbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..da628261297b7403f54ac21760b7453f7fac262b
--- /dev/null
+++ b/benchmarks/video_gen/Vbench/sample_vbench.py
@@ -0,0 +1,559 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+
+import json
+import os.path as osp
+from copy import deepcopy
+from dataclasses import asdict, fields
+from pathlib import Path
+from typing import Optional, Tuple, cast
+
+import imageio
+import torch
+import torch.distributed as dist
+from safetensors.torch import load_file
+from torch.utils.data import DataLoader
+from tqdm import trange
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+
+from config.config_factory import (
+    DataArguments,
+    EvaluationArguments,
+    ModelArguments,
+    TrainingArguments,
+    get_model_path,
+)
+from common.model.hacks import hack_qwen2_5_vl_config
+from common.utils.misc import AutoEncoderParams, tuple_mul
+from common.val.utils import decode_video_tensor, make_padded_latent
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from data.datasets_custom import ValidationDataset
+from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+
+
+# Name of the JSON manifest (video file name -> caption) written into the output directory.
+PROMPT_JSON_FILENAME = "prompt.json"
+# Lower bound on the number of samples generated for prompts in the temporal-flickering set
+# (applied in get_sample_num_per_prompt when quick_debug is off).
+TEMPORAL_FLICKERING_SAMPLE_NUM = 25
+# Dataset JSONL used when no validation dataset file is supplied.
+DEFAULT_VBENCH_DATA = "benchmarks/video_gen/Vbench/Vbench_recaption.jsonl"
+# JSON file expected next to this script; its contents are loaded into a set of prompts.
+TEMPORAL_FLICKERING_PROMPT_FILE = (
+    Path(__file__).resolve().parent / "temporal_flickering_prompts.json"
+)
+
+
+def load_temporal_flickering_prompts() -> set[str]:
+    if not TEMPORAL_FLICKERING_PROMPT_FILE.exists():
+        warnings.warn(
+            f"Temporal flickering prompt file not found: {TEMPORAL_FLICKERING_PROMPT_FILE}. "
+            "Falling back to an empty prompt set.",
+            stacklevel=2,
+        )
+        return set()
+
+    with TEMPORAL_FLICKERING_PROMPT_FILE.open("r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    return set(data)
+
+
+# Loaded once at import time; membership is tested in get_sample_num_per_prompt.
+PROMPT_WITH_TEMPORAL_FLICKERING = load_temporal_flickering_prompts()
+
+
+def clean_memory(*objects):
+    for obj in objects:
+        del obj
+    import gc
+
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+def init_from_model_path_if_needed(
+    model: Qwen2ForCausalLM,
+    model_args: ModelArguments,
+):
+    path_dir = model_args.model_path
+    ema_path = osp.join(path_dir, "ema.safetensors")
+    model_path = osp.join(path_dir, "model.safetensors")
+
+    model_path_ft = None
+    if osp.exists(model_path):
+        model_path_ft = model_path
+    elif osp.exists(ema_path):
+        model_path_ft = ema_path
+
+    if model_path_ft:
+        model_state_dict = load_file(model_path_ft, device="cpu")
+    else:
+        raise FileNotFoundError(
+            f"Fine-tuning failed: No valid checkpoint ('ema.safetensors' or 'model.safetensors') found in {path_dir}"
+        )
+
+    if "latent_pos_embed.pos_embed" in model_state_dict:
+        model_state_dict.pop("latent_pos_embed.pos_embed")
+
+    model.load_state_dict(model_state_dict, strict=False)
+    clean_memory(model_state_dict)
+
+
+def resolve_vbench_paths(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+) -> None:
+    if not model_args.model_path:
+        raise ValueError("VBench requires --model_path to be provided explicitly.")
+
+    if not getattr(model_args, "llm_path", ""):
+        model_args.llm_path = model_args.model_path
+
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+
+    if not data_args.val_dataset_config_file:
+        data_args.val_dataset_config_file = DEFAULT_VBENCH_DATA
+
+
+def build_runtime_dataset_config(
+    model_args: ModelArguments,
+    training_args: TrainingArguments,
+    inference_args: EvaluationArguments,
+    vae_config: Optional[AutoEncoderParams],
+) -> DataConfig:
+    dataset_config = DataConfig()
+
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+    dataset_config.max_duration = inference_args.max_duration
+    dataset_config.system_prompt_type = inference_args.system_prompt_type
+
+    if training_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+
+    if training_args.visual_gen and vae_config:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = tuple_mul(
+            model_args.latent_patch_size,
+            (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial),
+        )
+        dataset_config.max_latent_size = model_args.max_latent_size
+        dataset_config.max_num_frames = model_args.max_num_frames
+
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+
+    return dataset_config
+
+
+def save_prompt_results(prompt_data_dict, save_path_gen: str):
+    prompt_json_path = os.path.join(save_path_gen, PROMPT_JSON_FILENAME)
+    with open(prompt_json_path, "w", encoding="utf-8") as f:
+        json.dump(prompt_data_dict, f, ensure_ascii=False, indent=2)
+
+
+def safe_instantiate(cls, cfg: dict, name: str):
+    valid_keys = {f.name for f in fields(cls)}
+    valid, invalid = {}, {}
+    for k, v in cfg.items():
+        if k in valid_keys:
+            valid[k] = v
+        else:
+            invalid[k] = v
+
+    if invalid:
+        print(f"[WARN] {name} 过滤无效参数: {invalid}")
+    return cls(**valid)
+
+
+def is_valid_value(value):
+    return value is not None
+
+
+def merge_args(original_args, override_args):
+    merged_dict = asdict(original_args)
+    override_dict = asdict(override_args)
+
+    for key, value in override_dict.items():
+        if is_valid_value(value):
+            merged_dict[key] = value
+
+    return original_args.__class__(**merged_dict)
+
+
+def apply_config_json_overrides(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+    inference_args: EvaluationArguments,
+):
+    if not inference_args.config_json_path or not inference_args.config_json_path.endswith(".json"):
+        return model_args, data_args, inference_args
+
+    model_path_original = model_args.model_path
+    val_dataset_config_file_original = data_args.val_dataset_config_file
+
+    with open(inference_args.config_json_path, "r", encoding="utf-8") as f:
+        config = json.load(f)
+
+    if "model_args" in config:
+        model_args = merge_args(
+            model_args,
+            safe_instantiate(ModelArguments, config["model_args"], "ModelArguments"),
+        )
+    if "data_args" in config:
+        data_args = merge_args(
+            data_args,
+            safe_instantiate(DataArguments, config["data_args"], "DataArguments"),
+        )
+    if "training_args" in config:
+        inference_args = merge_args(
+            inference_args,
+            safe_instantiate(EvaluationArguments, config["training_args"], "EvaluationArguments"),
+        )
+
+    model_args.model_path = model_path_original
+    if getattr(model_args, "llm_path", "") == "":
+        model_args.llm_path = model_path_original
+    data_args.val_dataset_config_file = val_dataset_config_file_original
+    return model_args, data_args, inference_args
+
+
+def get_sample_num_per_prompt(
+    inference_args: EvaluationArguments,
+    prompt: str,
+) -> int:
+    if prompt in PROMPT_WITH_TEMPORAL_FLICKERING:
+        if inference_args.quick_debug:
+            return min(inference_args.sample_num_per_prompt, 5)
+        return max(inference_args.sample_num_per_prompt, TEMPORAL_FLICKERING_SAMPLE_NUM)
+    return inference_args.sample_num_per_prompt
+
+
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    val_data_cpu: dict,
+    training_args: TrainingArguments,
+    model_args: ModelArguments,
+    inference_args: EvaluationArguments,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_path_gen: str,
+):
+    """Generate and save the videos for one validation batch.
+
+    For the batch's prompt, ``get_sample_num_per_prompt`` samples are generated.
+    Each one is written to ``{save_path_gen}/{prompt}-{sample_idx}.mp4`` and its
+    caption is recorded in ``inference_args.prompt_data_dict`` under the file's
+    base name. Samples whose output file already exists are skipped.
+
+    Args:
+        fsdp_model: Model exposing ``validation_gen`` / ``validation_gen_KVcache``.
+        vae_model: VAE used to encode padded videos and decode latents. It is
+            dereferenced unconditionally when decoding, so it must not be ``None``.
+        val_data_cpu: Collated batch; must provide ``.cuda(device).to_dict()``
+            (NOTE(review): annotated as ``dict`` but used as a batch object --
+            confirm the actual type returned by ``simple_custom_collate``).
+        training_args: Source of sampler, CFG and seed settings.
+        model_args: Source of ``cfg_text_scale``.
+        inference_args: Source of sample counts, KV-cache switch, fps and the
+            ``prompt_data_dict`` that collects results.
+        new_token_ids: Special-token id mapping forwarded to the model.
+        image_token_id: Token id forwarded to the model.
+        device: CUDA device index.
+        save_path_gen: Output directory for the ``.mp4`` files.
+
+    Raises:
+        ValueError: If the batch has neither ``original_prompt_en`` nor ``caption``.
+    """
+    is_rank0 = not dist.is_initialized() or dist.get_rank() == 0
+    val_data = val_data_cpu.cuda(device).to_dict()
+
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        if "padded_videos" in val_data:
+            val_data["padded_latent"] = make_padded_latent(
+                val_data["padded_videos"],
+                val_data["vae_data_mode"],
+                vae_model,
+            )
+
+        # The prompt doubles as the output file stem below.
+        # NOTE(review): presumably a plain string here; a list or a prompt containing
+        # a path separator would break the set lookup / file name -- confirm upstream.
+        prompt = val_data.get("original_prompt_en") or val_data.get("caption")
+        if not prompt:
+            raise ValueError("VBench sample requires `original_prompt_en` or `caption` in dataset.")
+
+        sample_num_per_prompt = get_sample_num_per_prompt(inference_args, prompt)
+        loop_iterator = trange(sample_num_per_prompt, disable=(not is_rank0), leave=False, desc="Sampling")
+
+        for sample_idx in loop_iterator:
+            save_name = f"{save_path_gen}/{prompt}-{sample_idx}.mp4"
+            # Resume support: an existing file is not regenerated. Note that skipped
+            # samples add no entry to inference_args.prompt_data_dict in this run.
+            if os.path.exists(save_name):
+                continue
+
+            # Keyword arguments for the model's validation entry point.
+            # NOTE(review): "padded_latent" is read unconditionally but only assigned
+            # above when "padded_videos" is present -- confirm the dataset always
+            # supplies one of the two.
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+                "timestep_shift": training_args.validation_timestep_shift,
+                "num_timesteps": training_args.validation_num_timesteps,
+                "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+                "val_padded_latent": val_data["padded_latent"],
+                "video_sizes": val_data["video_sizes"],
+                "cfg_text_scale": model_args.cfg_text_scale,
+                "cfg_interval": training_args.cfg_interval,
+                "cfg_renorm_min": training_args.cfg_renorm_min,
+                "cfg_renorm_type": training_args.cfg_renorm_type,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "max_samples": training_args.validation_max_samples,
+                # Offsetting the seed by the sample index gives each sample its own seed.
+                "validation_noise_seed": training_args.validation_noise_seed + sample_idx,
+                "apply_chat_template": training_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+                "image_token_id": image_token_id,
+                "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+                "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+                "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+                "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+                "video_grid_thw": val_data.get("video_grid_thw", None),
+                "caption": val_data.get("caption", None),
+                "sample_task": val_data["sample_task"],
+                "sample_modality": val_data["sample_modality"],
+                "cfg_type": training_args.cfg_type,
+                "cfg_uncond_token_id": training_args.cfg_uncond_token_id,
+                "index": val_data["index"],
+                "val_padded_videos": None,
+            }
+
+            if inference_args.use_KVcache:
+                denoise_latent, captions, _, _ = fsdp_model.validation_gen_KVcache(**params)
+            else:
+                denoise_latent, captions, _, _ = fsdp_model.validation_gen(**params)
+
+            # NOTE(review): every element of denoise_latent is written to the same
+            # save_name, so with more than one element only the last video survives.
+            # The loader in main() uses batch_size=1 -- presumably one element; confirm.
+            for i_val, latent in enumerate(denoise_latent):
+                v_list = [vae_model.vae_decode([latent_])[0] for latent_ in latent]
+                v_thwc = decode_video_tensor(v_list)
+                imageio.mimsave(
+                    save_name,
+                    v_thwc,
+                    fps=inference_args.validation_video_saving_fps,
+                    format="mp4",
+                )
+                inference_args.prompt_data_dict[os.path.basename(save_name)] = captions[i_val]
+                clean_memory(v_list, v_thwc)
+
+            clean_memory(denoise_latent, captions)
+
+
+def main():
+    """Entry point: build the Lance model, sample VBench videos, write the manifest.
+
+    Steps, in order: optional distributed init, argument parsing plus JSON
+    overrides, model / ViT / VAE construction, tokenizer extension and checkpoint
+    loading, dataset and loader creation, per-batch generation through
+    ``validate_on_fixed_batch``, and finally gathering every rank's
+    ``prompt_data_dict`` so rank 0 can write ``PROMPT_JSON_FILENAME``.
+    """
+    assert torch.cuda.is_available()
+    # Distributed mode is enabled only when the launcher exported RANK and WORLD_SIZE.
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        global_rank = dist.get_rank()
+        world_size = dist.get_world_size()
+    else:
+        global_rank = 0
+        world_size = 1
+
+    # The device index is derived from the global rank; LOCAL_RANK is not consulted.
+    # NOTE(review): assumes every node exposes the same number of GPUs -- confirm.
+    local_rank = global_rank % torch.cuda.device_count()
+    device = local_rank
+    torch.cuda.set_device(device)
+
+    parser = HfArgumentParser((ModelArguments, DataArguments, EvaluationArguments))
+    model_args, data_args, inference_args = cast(
+        Tuple[ModelArguments, DataArguments, EvaluationArguments],
+        parser.parse_args_into_dataclasses(),
+    )
+    # training_args is an alias of inference_args (same object), kept so code written
+    # against the training-argument names keeps working.
+    training_args = inference_args
+
+    model_args, data_args, inference_args = apply_config_json_overrides(
+        model_args,
+        data_args,
+        inference_args,
+    )
+    # Re-alias: the override step may have returned a new inference_args object.
+    training_args = inference_args
+    resolve_vbench_paths(model_args, data_args)
+
+    training_args.validation_noise_seed = inference_args.evaluation_seed
+    training_args.validation_data_seed = inference_args.evaluation_seed
+
+    # Per-rank seed so that different ranks do not share a random stream.
+    seed = training_args.global_seed * world_size + global_rank
+    set_seed(seed)
+    # NOTE(review): log_rank0 is defined but not used anywhere in this function.
+    log_rank0 = print if global_rank == 0 else (lambda *_: None)
+
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+
+    # Copy runtime switches from the parsed arguments onto the LLM config.
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = training_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = training_args.apply_qwen_2_5_vl_pos_emb
+
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+
+    # Vision encoder: only built when visual understanding is enabled.
+    if training_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+        clean_memory(vit_weights)
+
+    # Video VAE: only built when visual generation is enabled.
+    if training_args.visual_gen:
+        vae_model = WanVideoVAE()
+        vae_config: Optional[AutoEncoderParams] = deepcopy(vae_model.vae_config)
+    else:
+        vae_model = None
+        vae_config = None
+
+    # vit_config / vit_model exist only when visual_und is set; the conditional
+    # expressions below avoid touching the unbound names otherwise.
+    config = LanceConfig(
+        visual_gen=training_args.visual_gen,
+        visual_und=training_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if training_args.visual_und else None,
+        vae_config=vae_config if training_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=training_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if training_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=training_args,
+    )
+    model = model.to(device)
+
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+
+    if training_args.copy_init_moe:
+        language_model.init_moe()
+
+    # Checkpoint weights are loaded before the embedding table is resized below.
+    init_from_model_path_if_needed(model, model_args)
+
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+
+    # NOTE(review): this comparison lower-cases vit_type while the check above does
+    # not, and "qwen_2_5_vl_original" is not covered here -- confirm that is intended.
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        language_model = hack_qwen2_5_vl_config(language_model)
+
+    # The video token id is what gets passed around as image_token_id.
+    image_token_id = language_model.config.video_token_id
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+
+    if model_args.tie_word_embeddings:
+        # Untie the LM head from the input embeddings and record the new state.
+        model.language_model.untie_lm_head()
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else:
+        # Untied config: input and output embeddings must not share storage.
+        assert (
+            model.language_model.get_input_embeddings().weight.data.data_ptr()
+            != model.language_model.get_output_embeddings().weight.data.data_ptr()
+        ), "tie_world_embeddings 冲突"
+
+    model = model.to(device=device, dtype=torch.bfloat16)
+    model.eval()
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+
+    dataset_config = build_runtime_dataset_config(
+        model_args=model_args,
+        training_args=training_args,
+        inference_args=inference_args,
+        vae_config=vae_config,
+    )
+    # The dataset receives the global rank and world size.
+    # NOTE(review): presumably used to shard prompts across ranks -- confirm in ValidationDataset.
+    val_dataset = ValidationDataset(
+        jsonl_path=data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=training_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+        local_rank=global_rank,
+        world_size=world_size,
+    )
+
+    # One prompt per step, loaded in the main process (num_workers=0).
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=1,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=simple_custom_collate,
+        drop_last=True,
+        prefetch_factor=None,
+        persistent_workers=False,
+        multiprocessing_context=None,
+    )
+    val_loader_iter = iter(val_loader)
+
+    # prompt_data_dict is attached dynamically; it collects file name -> caption.
+    if not hasattr(inference_args, "prompt_data_dict"):
+        inference_args.prompt_data_dict = {}
+
+    os.makedirs(inference_args.save_path_gen, exist_ok=True)
+
+    for _ in trange(
+        len(val_loader),
+        desc="Validating",
+        unit="batch",
+        leave=True,
+        ncols=80,
+        disable=(global_rank != 0),
+    ):
+        val_data_cpu = next(val_loader_iter)
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            val_data_cpu=val_data_cpu,
+            training_args=training_args,
+            model_args=model_args,
+            inference_args=inference_args,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=device,
+            save_path_gen=inference_args.save_path_gen,
+        )
+
+    if dist.is_initialized():
+        # Wait for all ranks, then collect every rank's results on every rank.
+        dist.barrier()
+        gathered = [None for _ in range(dist.get_world_size())]
+        dist.all_gather_object(gathered, inference_args.prompt_data_dict)
+
+        if global_rank == 0:
+            # Later ranks overwrite earlier ones on duplicate file names.
+            merged = {}
+            for d in gathered:
+                merged.update(d)
+            inference_args.prompt_data_dict = merged
+            save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+    elif global_rank == 0:
+        save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen)
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+# Run the sampler only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/video_gen/Vbench/sample_vbench.sh b/benchmarks/video_gen/Vbench/sample_vbench.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9753a653fd772b54652bb13adf5d899e63a2b8ca
--- /dev/null
+++ b/benchmarks/video_gen/Vbench/sample_vbench.sh
@@ -0,0 +1,150 @@
+#!/bin/bash
+#
+# VBench text-to-video sampling launcher.
+#
+# Loads the shared helpers from sample_env.sh, derives a timestamped output
+# directory from the settings below and runs sample_vbench.py through
+# "accelerate launch". Configure a run by editing the "Inference parameters"
+# section. Exits non-zero when the helper file is missing or the launch fails.
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+ENV_FILE="$SCRIPT_DIR/../../sample_env.sh"
+# The lance_setup_* helpers called further down are not defined in this file;
+# they are expected to come from sample_env.sh, so stop right away without it.
+if [ ! -f "$ENV_FILE" ]; then
+    echo "error: environment helper not found: ${ENV_FILE}" >&2
+    exit 1
+fi
+source "$ENV_FILE"
+
+# ========================= Inference parameters =========================
+TASK_NAME="t2v"
+NUM_GPUS=8
+
+VALIDATION_NUM_TIMESTEPS=30 # alternatives noted: 10, 50
+VALIDATION_TIMESTEP_SHIFT=3.0 # alternative noted: 3.5
+EVALUATION_SEED=42
+CFG_TEXT_SCALE=4.0
+CFG_INTERVAL_START=0.4
+CFG_INTERVAL_END=1.0
+SAMPLE_NUM_PER_PROMPT=5
+USE_KVCACHE=true
+
+VIDEO_HEIGHT=480
+VIDEO_WIDTH=848
+NUM_FRAMES=50
+MAX_NUM_FRAMES=121
+MAX_LATENT_SIZE=64
+RESOLUTION="video_480p"
+
+MODEL_PATH="downloads/Lance_3B_Video"
+VAL_DATASET_CONFIG_FILE="benchmarks/video_gen/Vbench/Vbench_recaption.jsonl"
+
+# ========================= Derived paths =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="kvcache_"
+fi
+SAVE_PATH_GEN="results/Vbench_ts${VALIDATION_NUM_TIMESTEPS}_tss${VALIDATION_TIMESTEP_SHIFT}_seed${EVALUATION_SEED}_cfg${CFG_TEXT_SCALE}_${KVCACHE_TAG}${TIMESTAMP}"
+
+if [ -z "$MODEL_PATH" ]; then
+    echo "错误: 请在脚本顶部配置区手动设置 MODEL_PATH" >&2
+    exit 1
+fi
+
+# ===================== Environment and distributed setup =====================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+
+# ========================= Show the task configuration =========================
+echo "================================================"
+echo "VBench T2V 推理"
+echo "================================================"
+echo "GPU数量: ${NUM_GPUS}"
+echo "保存路径: ${SAVE_PATH_GEN}"
+echo "分辨率: ${VIDEO_HEIGHT}x${VIDEO_WIDTH}"
+echo "输出帧数: ${NUM_FRAMES}"
+echo "最大帧数: ${MAX_NUM_FRAMES}"
+echo "模型路径: ${MODEL_PATH}"
+if [ -n "$VAL_DATASET_CONFIG_FILE" ]; then
+    echo "数据路径: ${VAL_DATASET_CONFIG_FILE}"
+fi
+# CONFIG_JSON_PATH is not assigned in this script; presumably it comes from
+# sample_env.sh or the caller's environment (TODO confirm). Empty when unset.
+if [ -n "$CONFIG_JSON_PATH" ]; then
+    echo "配置JSON: ${CONFIG_JSON_PATH}"
+fi
+echo ""
+echo "关键参数："
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - evaluation_seed: ${EVALUATION_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - cfg_interval: [${CFG_INTERVAL_START}, ${CFG_INTERVAL_END}]"
+echo "  - num_frames: ${NUM_FRAMES}"
+echo "  - sample_num_per_prompt: ${SAMPLE_NUM_PER_PROMPT}"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+
+# ============================== Run inference ==============================
+# Note: to change a run, edit the "Inference parameters" section at the top.
+# Every expansion is quoted so a value can never split into several arguments.
+accelerate launch \
+    --num_machines                      "$NUM_MACHINES"         \
+    --num_processes                     "$TOTAL_RANK"           \
+    --machine_rank                      "$MACHINE_RANK"         \
+    --main_process_ip                   "$MAIN_PROCESS_IP"      \
+    --main_process_port                 "$MAIN_PROCESS_PORT"    \
+    --mixed_precision                   bf16                    \
+    benchmarks/video_gen/Vbench/sample_vbench.py \
+    --model_path                        "$MODEL_PATH" \
+    --val_dataset_config_file           "$VAL_DATASET_CONFIG_FILE" \
+    --config_json_path                  "$CONFIG_JSON_PATH" \
+    --vit_type                          qwen_2_5_vl_original \
+    --llm_qk_norm                       true \
+    --llm_qk_norm_und                   true \
+    --llm_qk_norm_gen                   true \
+    --tie_word_embeddings               false \
+    --validation_num_timesteps          "$VALIDATION_NUM_TIMESTEPS" \
+    --validation_timestep_shift         "$VALIDATION_TIMESTEP_SHIFT" \
+    --copy_init_moe                     true \
+    --use_flex                          true \
+    --max_num_frames                    "$MAX_NUM_FRAMES" \
+    --max_latent_size                   "$MAX_LATENT_SIZE" \
+    --latent_patch_size                 1 1 1 \
+    --num_replicate                     "$NUM_REPLICATE" \
+    --num_shard                         "$NUM_SHARD" \
+    --visual_und                        true \
+    --visual_gen                        true \
+    --vae_model_type                    wan \
+    --apply_qwen_2_5_vl_pos_emb         true \
+    --apply_chat_template               false \
+    --cfg_type                          0 \
+    --validation_video_saving_fps       12 \
+    --validation_log_type               direct \
+    --video_height                      "$VIDEO_HEIGHT" \
+    --video_width                       "$VIDEO_WIDTH" \
+    --num_frames                        "$NUM_FRAMES" \
+    --task                              "$TASK_NAME" \
+    --save_path_gen                     "$SAVE_PATH_GEN" \
+    --resolution                        "$RESOLUTION" \
+    --evaluation_seed                   "$EVALUATION_SEED" \
+    --text_template                     true \
+    --sample_num_per_prompt             "$SAMPLE_NUM_PER_PROMPT" \
+    --cfg_text_scale                    "$CFG_TEXT_SCALE" \
+    --cfg_interval                      "$CFG_INTERVAL_START" "$CFG_INTERVAL_END" \
+    --use_KVcache                       "$USE_KVCACHE"
+launch_status=$?
+# Without this check a failed launch still printed the completion banner below.
+if [ "$launch_status" -ne 0 ]; then
+    echo "error: accelerate launch exited with status ${launch_status}" >&2
+    exit "$launch_status"
+fi
+
+echo ""
+echo "================================================"
+echo "完成! 结果: ${SAVE_PATH_GEN}"
+echo "================================================"
diff --git a/benchmarks/video_gen/Vbench/temporal_flickering_prompts.json b/benchmarks/video_gen/Vbench/temporal_flickering_prompts.json
new file mode 100644
index 0000000000000000000000000000000000000000..5915ab63be18f2e9d4c97df6005afd3e13ee2bca
--- /dev/null
+++ b/benchmarks/video_gen/Vbench/temporal_flickering_prompts.json
@@ -0,0 +1,77 @@
+[
+  "In a still frame, a stop sign",
+  "a toilet, frozen in time",
+  "a laptop, frozen in time",
+  "A tranquil tableau of alley",
+  "A tranquil tableau of bar",
+  "A tranquil tableau of barn",
+  "A tranquil tableau of bathroom",
+  "A tranquil tableau of bedroom",
+  "A tranquil tableau of cliff",
+  "In a still frame, courtyard",
+  "In a still frame, gas station",
+  "A tranquil tableau of house",
+  "indoor gymnasium, frozen in time",
+  "A tranquil tableau of indoor library",
+  "A tranquil tableau of kitchen",
+  "A tranquil tableau of palace",
+  "In a still frame, parking lot",
+  "In a still frame, phone booth",
+  "A tranquil tableau of restaurant",
+  "A tranquil tableau of tower",
+  "A tranquil tableau of a bowl",
+  "A tranquil tableau of an apple",
+  "A tranquil tableau of a bench",
+  "A tranquil tableau of a bed",
+  "A tranquil tableau of a chair",
+  "A tranquil tableau of a cup",
+  "A tranquil tableau of a dining table",
+  "In a still frame, a pear",
+  "A tranquil tableau of a bunch of grapes",
+  "A tranquil tableau of a bowl on the kitchen counter",
+  "A tranquil tableau of a beautiful, handcrafted ceramic bowl",
+  "A tranquil tableau of an antique bowl",
+  "A tranquil tableau of an exquisite mahogany dining table",
+  "A tranquil tableau of a wooden bench in the park",
+  "A tranquil tableau of a beautiful wrought-iron bench surrounded by blooming flowers",
+  "In a still frame, a park bench with a view of the lake",
+  "A tranquil tableau of a vintage rocking chair was placed on the porch",
+  "A tranquil tableau of the jail cell was small and dimly lit, with cold, steel bars",
+  "A tranquil tableau of the phone booth was tucked away in a quiet alley",
+  "a dilapidated phone booth stood as a relic of a bygone era on the sidewalk, frozen in time",
+  "A tranquil tableau of the old red barn stood weathered and iconic against the backdrop of the countryside",
+  "A tranquil tableau of a picturesque barn was painted a warm shade of red and nestled in a picturesque meadow",
+  "In a still frame, within the desolate desert, an oasis unfolded, characterized by the stoic presence of palm trees and a motionless, glassy pool of water",
+  "In a still frame, the Parthenon's majestic Doric columns stand in serene solitude atop the Acropolis, framed by the tranquil Athenian landscape",
+  "In a still frame, the Temple of Hephaestus, with its timeless Doric grace, stands stoically against the backdrop of a quiet Athens",
+  "In a still frame, the ornate Victorian streetlamp stands solemnly, adorned with intricate ironwork and stained glass panels",
+  "A tranquil tableau of the Stonehenge presented itself as an enigmatic puzzle, each colossal stone meticulously placed against the backdrop of tranquility",
+  "In a still frame, in the vast desert, an oasis nestled among dunes, featuring tall palm trees and an air of serenity",
+  "static view on a desert scene with an oasis, palm trees, and a clear, calm pool of water",
+  "A tranquil tableau of an ornate Victorian streetlamp standing on a cobblestone street corner, illuminating the empty night",
+  "A tranquil tableau of a tranquil lakeside cabin nestled among tall pines, its reflection mirrored perfectly in the calm water",
+  "In a still frame, a vintage gas lantern, adorned with intricate details, gracing a historic cobblestone square",
+  "In a still frame, a tranquil Japanese tea ceremony room, with tatami mats, a delicate tea set, and a bonsai tree in the corner",
+  "A tranquil tableau of the Parthenon stands resolute in its classical elegance, a timeless symbol of Athens' cultural legacy",
+  "A tranquil tableau of in the heart of Plaka, the neoclassical architecture of the old city harmonizes with the ancient ruins",
+  "A tranquil tableau of in the desolate beauty of the American Southwest, Chaco Canyon's ancient ruins whispered tales of an enigmatic civilization that once thrived amidst the arid landscapes",
+  "A tranquil tableau of at the edge of the Arabian Desert, the ancient city of Petra beckoned with its enigmatic rock-carved façades",
+  "In a still frame, amidst the cobblestone streets, an Art Nouveau lamppost stood tall",
+  "A tranquil tableau of in the quaint village square, a traditional wrought-iron streetlamp featured delicate filigree patterns and amber-hued glass panels",
+  "A tranquil tableau of the lampposts were adorned with Art Deco motifs, their geometric shapes and frosted glass creating a sense of vintage glamour",
+  "In a still frame, in the picturesque square, a Gothic-style lamppost adorned with intricate stone carvings added a touch of medieval charm to the setting",
+  "In a still frame, in the heart of the old city, a row of ornate lantern-style streetlamps bathed the narrow alleyway in a warm, welcoming light",
+  "A tranquil tableau of in the heart of the Utah desert, a massive sandstone arch spanned the horizon",
+  "A tranquil tableau of in the Arizona desert, a massive stone bridge arched across a rugged canyon",
+  "A tranquil tableau of in the corner of the minimalist tea room, a bonsai tree added a touch of nature's beauty to the otherwise simple and elegant space",
+  "In a still frame, amidst the hushed ambiance of the traditional tea room, a meticulously arranged tea set awaited, with porcelain cups, a bamboo whisk",
+  "In a still frame, nestled in the Zen garden, a rustic teahouse featured tatami seating and a traditional charcoal brazier",
+  "A tranquil tableau of a country estate's library featured elegant wooden shelves",
+  "A tranquil tableau of beneath the shade of a solitary oak tree, an old wooden park bench sat patiently",
+  "A tranquil tableau of beside a tranquil pond, a weeping willow tree draped its branches gracefully over the water's surface, creating a serene tableau of reflection and calm",
+  "A tranquil tableau of in the Zen garden, a perfectly raked gravel path led to a serene rock garden",
+  "In a still frame, a tranquil pond was fringed by weeping cherry trees, their blossoms drifting lazily onto the glassy surface",
+  "In a still frame, within the historic library's reading room, rows of antique leather chairs and mahogany tables offered a serene haven for literary contemplation",
+  "A tranquil tableau of a peaceful orchid garden showcased a variety of delicate blooms",
+  "A tranquil tableau of in the serene courtyard, a centuries-old stone well stood as a symbol of a bygone era, its mossy stones bearing witness to the passage of time"
+]
\ No newline at end of file
diff --git a/common/__init__.py b/common/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb602d1d7d99e148c5a517a9514cfd9a56277963
--- /dev/null
+++ b/common/__init__.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .io import (
+    get_download_dir,
+    set_download_dir,
+    is_hdfs_path,
+    download,
+    download_and_extract,
+    listdir,
+    listdir_with_metafile,
+    exists,
+    mkdir,
+    copy,
+    move,
+    remove,
+)
+from .utils import (
+    get_global_rank,
+    get_local_rank,
+    get_world_size,
+    is_master,
+    get_device,
+    barrier_if_distributed,
+    get_logger,
+    AutoEncoderParams,
+    tuple_mul,
+    flatten,
+    unflatten,
+    rearrange,
+    repeat,
+    pack,
+    unpack,
+    get_local_dir,
+    set_local_dir,
+    get_local_path,
+    convert_dtype,
+    save,
+    dummy_indexes_searchsorted,
+)
+from .model import (
+    hack_qwen2_5_vl_config,
+)
+from .val import (
+    pad_video_list,
+    decode_video_tensor,
+    map_splits_to_samples,
+    make_padded_latent,
+    make_packed_vit_token_embed,
+    uncond_split_pro,
+    INSTRUCTIONS_I2T_LIST,
+)
+
+__all__ = [
+    # NOTE: the names TemplateArguments, ModelArguments, DataArguments,
+    # TrainingArguments, InferenceArguments and EvaluationArguments used to
+    # be listed here under "config", but this package never imports them, so
+    # `from common import *` raised AttributeError. They are left out until
+    # an import for them exists; import them from their defining module.
+    # Only names that are imported at the top of this file may be listed
+    # below.
+    # io
+    "get_download_dir",
+    "set_download_dir",
+    "is_hdfs_path",
+    "download",
+    "download_and_extract",
+    "listdir",
+    "listdir_with_metafile",
+    "exists",
+    "mkdir",
+    "copy",
+    "move",
+    "remove",
+    # utils
+    "get_global_rank",
+    "get_local_rank",
+    "get_world_size",
+    "is_master",
+    "get_device",
+    "barrier_if_distributed",
+    "get_logger",
+    "AutoEncoderParams",
+    "tuple_mul",
+    "flatten",
+    "unflatten",
+    "rearrange",
+    "repeat",
+    "pack",
+    "unpack",
+    "get_local_dir",
+    "set_local_dir",
+    "get_local_path",
+    "convert_dtype",
+    "save",
+    "dummy_indexes_searchsorted",
+    # model
+    "hack_qwen2_5_vl_config",
+    # val
+    "pad_video_list",
+    "decode_video_tensor",
+    "map_splits_to_samples",
+    "make_padded_latent",
+    "make_packed_vit_token_embed",
+    "uncond_split_pro",
+    "INSTRUCTIONS_I2T_LIST",
+]
diff --git a/common/io/__init__.py b/common/io/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d124be7d0ae07a024251c452343607f059622ff3
--- /dev/null
+++ b/common/io/__init__.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .filesystem import (
+    get_download_dir,
+    set_download_dir,
+    is_hdfs_path,
+    download,
+    download_and_extract,
+    listdir,
+    listdir_with_metafile,
+    exists,
+    mkdir,
+    copy,
+    move,
+    remove,
+)
+
+__all__ = [  # public names of common.io; every entry is imported from .filesystem above
+    "get_download_dir",
+    "set_download_dir",
+    "is_hdfs_path",
+    "download",
+    "download_and_extract",
+    "listdir",
+    "listdir_with_metafile",
+    "exists",
+    "mkdir",
+    "copy",
+    "move",
+    "remove",
+]
diff --git a/common/io/filesystem.py b/common/io/filesystem.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b261ddf9b2a7e37fe7c085207e3820069b99bf7
--- /dev/null
+++ b/common/io/filesystem.py
@@ -0,0 +1,308 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""
+File system operations. Currently supports local and hadoop file systems.
+"""
+
+import hashlib
+import os
+import os.path as osp
+import pickle
+import shutil
+import subprocess
+import tarfile
+import tempfile
+from typing import List, Optional
+
+from ..utils.distributed import barrier_if_distributed, get_global_rank, get_local_rank
+from ..utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+# DOWNLOAD_DIR = os.environ.get("DOWNLOAD_DIR", 'pretrained_weights')
+DOWNLOAD_DIR = os.environ.get("DOWNLOAD_DIR", 'downloads')
+
+
+def get_download_dir():
+    """
+    Return the local directory used for storing downloaded files.
+    """
+    if DOWNLOAD_DIR is not None:
+        return DOWNLOAD_DIR
+    return osp.expanduser("~/.cache/vgfm/downloads/")
+
+
+def set_download_dir(dirname):
+    """
+    Set the download directory; local rank zero clears any existing one and creates it.
+    """
+    global DOWNLOAD_DIR
+    if dirname is None:
+        return
+    if exists(dirname) and get_local_rank() == 0:
+        remove(dirname)
+    DOWNLOAD_DIR = dirname
+    if get_local_rank() == 0:
+        mkdir(DOWNLOAD_DIR)
+
+
+def is_hdfs_path(path: str) -> bool:
+    """
+    Tell whether `path` is an hdfs path, i.e. whether it begins with the
+    "hdfs://" protocol prefix (compared case-insensitively).
+    """
+    return path.lower()[:7] == "hdfs://"
+
+
+def download(
+    path: str,
+    dirname: Optional[str] = None,
+    filename: Optional[str] = None,
+    add_hash_suffix: bool = True,
+    distributed: bool = True,
+    overwrite: bool = False,
+) -> str:
+    """
+    Fetch `path` into a local location and return the local path.
+    A file that is already present locally is reused unless `overwrite` is set.
+    Under distributed context, only local rank zero will download and the rest will wait.
+    Args:
+        path: source file path.
+        dirname: destination directory, or None for the download directory.
+        filename: destination file name, or None to derive it from `path`.
+        add_hash_suffix: append an md5 of `path` to a derived file name so
+                         that same-named files from different paths differ.
+        distributed: True if this method is called by all ranks. False if called by a single rank.
+        overwrite: whether to replace an already downloaded file.
+    """
+    # A local path with no requested destination is returned untouched.
+    wants_default_target = dirname is None and filename is None
+    if wants_default_target and not is_hdfs_path(path):
+        return path
+
+    # Resolve the destination directory and file name.
+    target_dir = get_download_dir() if dirname is None else dirname
+    target_name = filename
+    if target_name is None:
+        target_name = osp.split(path)[-1]
+        if add_hash_suffix:
+            target_name = target_name + "." + hashlib.md5(path.encode("utf-8")).hexdigest()
+    pathname = osp.join(target_dir, target_name)
+
+    # The download itself is performed either by the sole caller
+    # (distributed=False) or by local rank zero.
+    is_downloader = (not distributed) or (get_local_rank() == 0)
+    if is_downloader:
+        if overwrite and osp.exists(pathname):
+            remove(pathname)
+        if osp.exists(pathname):
+            logger.info(f"File {pathname} already exists, skip download.")
+        else:
+            os.makedirs(target_dir, exist_ok=True)
+            logger.info(f"Downloading {path} to {pathname}")
+            copy(path, pathname)
+
+    # If distributed, all ranks must wait.
+    if distributed:
+        barrier_if_distributed()
+    return pathname
+
+
+def download_and_extract(path: str) -> str:
+    """
+    Fetch `path` locally (see download) and, when the result is a regular
+    file, treat it as a tarball and unpack it into the current directory.
+    Returns the extracted directory name, or the local path when it is not a file.
+    Under distributed context, only local rank zero extracts and the rest wait.
+    """
+    # Download from hdfs if needed.
+    local_path = download(path)
+    # Anything that is not a regular file is returned as is.
+    if not osp.isfile(local_path):
+        return local_path
+
+    with tarfile.open(local_path) as archive:
+        # The tarball's first entry is assumed to be the directory name.
+        extracted_dir = archive.next().name
+        # Extract only on local rank zero and only if it hasn't been done before.
+        if get_local_rank() == 0 and not osp.exists(extracted_dir):
+            # NOTE(review): extractall trusts the member paths in the archive;
+            # only feed this tarballs from a trusted source.
+            archive.extractall(".")
+        # If distributed, all ranks must wait.
+        barrier_if_distributed()
+    return extracted_dir
+
+
+def listdir(path: str) -> List[str]:
+    """
+    List the entries of a directory and return them as full paths.
+    For hdfs paths the entries are parsed from `hdfs dfs -ls` output.
+
+    Examples:
+        - listdir("hdfs://dir") -> ["hdfs://dir/file1", "hdfs://dir/file2"]
+        - listdir("/dir") -> ["/dir/file1", "/dir/file2"]
+    """
+    # Local directory: join every entry name onto the directory path.
+    if not is_hdfs_path(path):
+        return [osp.join(path, name) for name in os.listdir(path)]
+
+    entries = []
+    listing = subprocess.Popen(
+        args=["hdfs", "dfs", "-ls", path],
+        shell=False,
+        stdout=subprocess.PIPE,
+    )
+
+    for raw_line in listing.stdout:
+        fields = raw_line.strip().split()
+
+        # An entry row looks like: drwxr-xr-x   - user group  4 file
+        # so lines with fewer than five fields are skipped.
+        if len(fields) < 5:
+            continue
+
+        # Filter out warning texts when listing files on uswest cluster.
+        if "Warn" in fields[0].decode("utf8"):
+            continue
+
+        entries.append(fields[-1].decode("utf8"))
+
+    listing.stdout.close()
+    listing.wait()
+    return entries
+
+
+def listdir_with_metafile(path: str, overwrite: bool = True) -> List[str]:
+    """
+    Same result as listdir(path), but for hdfs paths the listing is cached
+    in a "metafile.pkl" stored inside the directory: global rank zero writes
+    it and every rank then reads the listing back from that file.
+    """
+    # Local directory should directly return.
+    if not is_hdfs_path(path):
+        return listdir(path)
+
+    metafile = osp.join(path, "metafile.pkl")
+
+    # Write metafile only by global rank zero.
+    should_write = get_global_rank() == 0 and (overwrite or not exists(metafile))
+    if should_write:
+        listing = listdir(path)
+        with tempfile.NamedTemporaryFile("wb", delete=True) as tmp:
+            tmp.write(pickle.dumps(listing))
+            # Push the buffered bytes out before the file is copied by name.
+            tmp.flush()
+            copy(tmp.name, metafile, blocking=True)
+        logger.info(f"Created metafile for {path}")
+
+    # All other ranks wait.
+    barrier_if_distributed()
+
+    # All ranks read from metafile.
+    # NOTE(review): unpickling trusts the metafile's content; confirm the hdfs dir is trusted.
+    local_metafile = download(metafile, overwrite=overwrite)
+    with open(local_metafile, "rb") as fh:
+        files = pickle.loads(fh.read())
+
+    # Assert to prevent directory move.
+    assert all(
+        file.startswith(path) for file in files
+    ), f"metafile for path: {path} is outdated. The directory likely has been moved."
+    return files
+
+
+def exists(path: str) -> bool:
+    """
+    Report whether `path` exists, on hdfs or on the local file system.
+    """
+    if not is_hdfs_path(path):
+        return osp.exists(path)
+    # `hdfs dfs -test -e` exits with status 0 when the path exists.
+    result = subprocess.run(["hdfs", "dfs", "-test", "-e", path], capture_output=True)
+    return result.returncode == 0
+
+
+def mkdir(path: str):
+    """
+    Create the directory `path` together with any missing parents.
+    Nothing happens when the directory is already there.
+    """
+    if not is_hdfs_path(path):
+        os.makedirs(path, exist_ok=True)
+        return
+    subprocess.run(["hdfs", "dfs", "-mkdir", "-p", path])
+
+
+def copy(src: str, tgt: str, blocking: bool = True):
+    """
+    Copy a file between local and/or hdfs paths. No-op when src == tgt.
+    With blocking=True a non-zero exit of the hdfs command is logged.
+    """
+    if src == tgt:
+        return
+    src_hdfs = is_hdfs_path(src)
+    tgt_hdfs = is_hdfs_path(tgt)
+    if not src_hdfs and not tgt_hdfs:
+        shutil.copy(src, tgt)
+        return
+    # Same hdfs commands and flags as before; only the direction picks one.
+    if src_hdfs and tgt_hdfs:
+        cmd = ["hdfs", "dfs", "-cp", "-f", src, tgt]
+    elif src_hdfs:
+        cmd = ["hdfs", "dfs", "-get", "-c", "128", "-t", "10", "--ct", "32", src, tgt]
+    else:
+        cmd = ["hdfs", "dfs", "-put", "-f", "-c", "128", "-t", "10", "--ct", "32", src, tgt]
+    process = subprocess.Popen(cmd)
+    if not blocking:
+        # Non-blocking: return without waiting, so no exit status is available.
+        return
+    # The exit status used to be discarded, which made failed transfers silent.
+    returncode = process.wait()
+    if returncode != 0:
+        logger.error(f"hdfs copy {src} -> {tgt} failed with exit code {returncode}")
+
+
+def move(src: str, tgt: str):
+    """
+    Move a file. Moving a path onto itself is a no-op.
+    """
+    if src == tgt:
+        return
+
+    on_hdfs = (is_hdfs_path(src), is_hdfs_path(tgt))
+    # Both ends on the same kind of file system use that system's own move.
+    if on_hdfs == (True, True):
+        subprocess.run(["hdfs", "dfs", "-mv", src, tgt])
+    elif on_hdfs == (False, False):
+        shutil.move(src, tgt)
+    else:
+        # Crossing between local and hdfs: copy over, then delete the source.
+        copy(src, tgt)
+        remove(src)
+
+
+def remove(path: str):
+    """
+    Remove a file, symlink or directory tree (local or hdfs).
+    """
+    if is_hdfs_path(path):
+        subprocess.run(["hdfs", "dfs", "-rm", "-r", path])
+    elif osp.isfile(path) or osp.islink(path):  # rmtree refuses symlinks, so unlink them
+        os.remove(path)
+    else:
+        shutil.rmtree(path)
diff --git a/common/model/__init__.py b/common/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c0b5f96b7c85fd85f6bc8aa78c126b5585a3cc7
--- /dev/null
+++ b/common/model/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .hacks import hack_qwen2_5_vl_config
+
+__all__ = [  # the only public name of common.model, imported from .hacks above
+    "hack_qwen2_5_vl_config",
+]
diff --git a/common/model/checks.py b/common/model/checks.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1e1156c6fe8d25b84f6e1aa321088003465472a
--- /dev/null
+++ b/common/model/checks.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
diff --git a/common/model/hacks.py b/common/model/hacks.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bc31fda7ae3f991bd683eccc2b7cb811b08f6f4
--- /dev/null
+++ b/common/model/hacks.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+def hack_qwen2_5_vl_config(language_model):
+    """
+    Overwrite fields on `language_model.config` in place with hard-coded
+    token ids, a vision config dict and an "mrope" rope_scaling dict, then
+    return the same `language_model` object.
+    """
+    cfg = language_model.config
+
+    # Hard-coded token ids.
+    cfg.image_token_id = 151655
+    cfg.video_token_id = 151656
+    cfg.vision_start_token_id = 151652
+    cfg.vision_end_token_id = 151653
+
+    # Vision settings, stored as a plain dict.
+    cfg.vision_config = dict(
+        depth=32,
+        hidden_act="silu",
+        hidden_size=1280,
+        intermediate_size=3420,
+        num_heads=16,
+        in_chans=3,
+        out_hidden_size=2048,
+        patch_size=14,
+        spatial_merge_size=2,
+        spatial_patch_size=14,
+        window_size=112,
+        fullatt_block_indexes=[7, 15, 23, 31],
+        tokens_per_second=2,
+        temporal_patch_size=2,
+    )
+
+    # rope_scaling settings, also a plain dict.
+    cfg.rope_scaling = dict(
+        type="mrope",
+        mrope_section=[16, 24, 24],
+    )
+
+    return language_model
diff --git a/common/utils/__init__.py b/common/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..728eb38c1e143fc61f12186e3bb3feab7d280ed2
--- /dev/null
+++ b/common/utils/__init__.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .distributed import (
+    get_global_rank,
+    get_local_rank,
+    get_world_size,
+    is_master,
+    get_device,
+    barrier_if_distributed,
+)
+from .logging import get_logger
+from .misc import AutoEncoderParams, tuple_mul
+from .tensor_ops import (
+    flatten,
+    unflatten,
+    rearrange,
+    repeat,
+    pack,
+    unpack,
+)
+from .save import (
+    get_local_dir,
+    set_local_dir,
+    get_local_path,
+    convert_dtype,
+    save,
+    dummy_indexes_searchsorted,
+)
+
+__all__ = [  # public names of common.utils, grouped by the submodule they are imported from
+    # distributed
+    "get_global_rank",
+    "get_local_rank",
+    "get_world_size",
+    "is_master",
+    "get_device",
+    "barrier_if_distributed",
+    # logging
+    "get_logger",
+    # misc
+    "AutoEncoderParams",
+    "tuple_mul",
+    # tensor_ops
+    "flatten",
+    "unflatten",
+    "rearrange",
+    "repeat",
+    "pack",
+    "unpack",
+    # save
+    "get_local_dir",
+    "set_local_dir",
+    "get_local_path",
+    "convert_dtype",
+    "save",
+    "dummy_indexes_searchsorted",
+]
diff --git a/common/utils/distributed.py b/common/utils/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb2bf1428cf931257c3a5b6e420e28fd9e212ed4
--- /dev/null
+++ b/common/utils/distributed.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import os
+import torch
+import torch.distributed as dist
+
+def get_global_rank() -> int:
+    """Return the global rank read from the ``RANK`` environment variable (0 when it is unset)."""
+    raw_rank = os.getenv("RANK", "0")
+    # int() raises ValueError when the variable holds a non-integer string.
+    return int(raw_rank)
+
+
+def get_local_rank() -> int:
+    """Return the local rank read from the ``LOCAL_RANK`` environment variable (0 when it is unset)."""
+    raw_rank = os.getenv("LOCAL_RANK", "0")
+    # int() raises ValueError when the variable holds a non-integer string.
+    return int(raw_rank)
+
+
+def get_world_size() -> int:
+    """Return the world size read from the ``WORLD_SIZE`` environment variable (1 when it is unset)."""
+    raw_size = os.getenv("WORLD_SIZE", "1")
+    # int() raises ValueError when the variable holds a non-integer string.
+    return int(raw_size)
+
+
+def is_master():
+    """
+    Tell whether this process is rank 0 of the default process group.
+    Without an available and initialized ``torch.distributed`` the process counts as master.
+    """
+    distributed_ready = dist.is_available() and dist.is_initialized()
+    return dist.get_rank() == 0 if distributed_ready else True
+
+
+def get_device() -> torch.device:
+    """Return the CUDA device whose index is this process's local rank."""
+    local_index = get_local_rank()
+    # Always builds a "cuda" device; CUDA availability is not checked here.
+    return torch.device("cuda", local_index)
+
+
+def barrier_if_distributed(*args, **kwargs):
+    """
+    Run ``dist.barrier(*args, **kwargs)`` when a process group is initialized; otherwise do nothing (None).
+    """
+    if dist.is_available() and dist.is_initialized():  # availability guard, same check as is_master()
+        return dist.barrier(*args, **kwargs)
diff --git a/common/utils/logging.py b/common/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe3b058c9aca93f8ea40d8911a63ef9760d747e0
--- /dev/null
+++ b/common/utils/logging.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Logging utility functions.
+"""
+
+import logging
+import sys
+from typing import Optional
+
+from .distributed import get_global_rank, get_local_rank, get_world_size
+
+
+def get_logger(name: Optional[str] = None) -> logging.Logger:
+    """
+    Return the named logger set to INFO, non-propagating, with one stdout handler attached on first use.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+    logger.propagate = False  # keep ancestor loggers from handling the same record again
+    if logger.handlers:  # look only at this logger's own handlers, not at its ancestors'
+        return logger
+    # Rank tags are baked into the format string only when the world size is greater than 1.
+    multi_process = get_world_size() > 1
+    rank_tag = f"[Rank:{get_global_rank()}][LocalRank:{get_local_rank()}]" if multi_process else ""
+    stream_handler = logging.StreamHandler(sys.stdout)
+    stream_handler.setFormatter(logging.Formatter("[%(asctime)s] " + rank_tag + "[%(pathname)s:%(lineno)d][%(threadName).12s][%(name)s][%(levelname).5s] %(message)s"))
+    logger.addHandler(stream_handler)
+    return logger
+
+
+
diff --git a/common/utils/misc.py b/common/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..23424c938516594db5c2d5670bc75e1330c55d5b
--- /dev/null
+++ b/common/utils/misc.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from dataclasses import dataclass
+
+@dataclass
+class AutoEncoderParams:  # plain dataclass: three required ints followed by two defaulted floats
+    downsample_spatial: int  # presumably the spatial downsampling factor -- TODO confirm
+    downsample_temporal: int  # presumably the temporal downsampling factor -- TODO confirm
+    z_channels: int  # presumably the number of latent channels -- TODO confirm
+    # Defaults tagged "for flux" by the author. NOTE(review): presumably latent scale/shift values -- confirm.
+    scale_factor: float = 0.3611
+    shift_factor: float = 0.1159
+
+def tuple_mul(a: tuple, b: tuple) -> tuple:
+    """
+    Multiply two equally long tuples element by element.
+
+    Args:
+        a: first tuple of numbers.
+        b: second tuple of numbers; must have the same length as ``a``.
+    Returns:
+        A tuple whose i-th item is ``a[i] * b[i]``.
+    Raises: ValueError when the two lengths differ.
+    """
+    if len(a) == len(b):
+        return tuple(map(lambda pair: pair[0] * pair[1], zip(a, b)))
+    raise ValueError("两个元组长度必须相等")
diff --git a/common/utils/save.py b/common/utils/save.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f93e53712fa27634631e02361a67419b72c76f
--- /dev/null
+++ b/common/utils/save.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import os
+import os.path as osp
+import uuid
+from typing import Any, Optional
+import torch
+from safetensors.torch import save_file as save_safetensors, save_model as save_safetensors_model
+from .logging import get_logger
+from .distributed import get_global_rank
+
+logger = get_logger(__name__)
+
+# Works around a circular import: is_hdfs_path, mkdir and copy are imported lazily, at call time.
+def _get_filesystem_funcs():
+    from ..io.filesystem import is_hdfs_path, mkdir, copy  # deferred, function-scope import
+    return is_hdfs_path, mkdir, copy  # callers unpack this tuple positionally
+
+_local_dir = None
+
+
+def get_local_dir():
+    """Return this process's scratch directory, creating ``persistence/rank_<rank>_<uuid4>`` on first use."""
+    global _local_dir
+    make_dir = _get_filesystem_funcs()[1]
+    if _local_dir is not None:  # already chosen by an earlier call or by set_local_dir()
+        return _local_dir
+    # First use: the name combines the global rank with a random UUID.
+    _local_dir = os.path.join("persistence", f"rank_{get_global_rank()}_{uuid.uuid4()}")
+    make_dir(_local_dir)
+    return _local_dir
+
+
+def set_local_dir(dirname):
+    """
+    Point this process's scratch directory at a fresh uuid4-named folder created under ``dirname``.
+    Passing ``None`` leaves the current setting untouched.
+    """
+    global _local_dir
+    make_dir = _get_filesystem_funcs()[1]
+    if dirname is not None:
+        _local_dir = os.path.join(dirname, str(uuid.uuid4()))
+        make_dir(_local_dir)
+
+
+def get_local_path(path: str) -> str:
+    """
+    Map an HDFS path to ``get_local_dir()/<basename>``; create a local path's parent dir and return it as is.
+    """
+    is_hdfs_path, mkdir, _ = _get_filesystem_funcs()
+    if is_hdfs_path(path):
+        return os.path.join(get_local_dir(), os.path.basename(path))
+    parent_dir = os.path.dirname(path)
+    if parent_dir:  # a bare filename has no parent: skip mkdir("") and rely on the working directory
+        mkdir(parent_dir)
+    return path
+
+
+def convert_dtype(states: Any, dtype: Optional[torch.dtype] = None):
+    """
+    Recursively cast every tensor found in nested dicts/lists to ``dtype``, moving it to the CPU.
+    With ``dtype=None`` the input is returned untouched; other leaves (tuples included) pass through as is.
+    """
+    if dtype is None:
+        return states
+    if isinstance(states, dict):
+        return {key: convert_dtype(item, dtype) for key, item in states.items()}
+    if isinstance(states, list):
+        return [convert_dtype(item, dtype) for item in states]
+    # Leaf value: only tensors are converted.
+    return states.to("cpu", dtype) if torch.is_tensor(states) else states
+
+
+def save(data: Any, path: str, blocking: bool = True, persistence_dir: Optional[str] = None):
+    """
+    Serialize ``data`` to ``path``, which may be a local or an HDFS location.
+
+    The writer is picked from the name of ``path``:
+      * ``*.safetensors`` and ``data`` is a ``torch.nn.Module`` -> ``save_safetensors_model``
+      * ``*.safetensors`` otherwise                            -> ``save_safetensors``
+      * any other name                                         -> ``torch.save``
+
+    Local targets are written in place. HDFS targets are first written to
+    ``<persistence_dir>/<basename of path>`` and then handed to ``copy``.
+
+    Args:
+        data: object to serialize.
+        path: destination path.
+        blocking: forwarded to ``copy`` as ``blocking=``; not used for local targets.
+        persistence_dir: staging directory for HDFS targets; defaults to ``get_local_dir()``.
+
+    Returns:
+        None.
+    """
+
+    def _write(target: str) -> None:
+        # Shared writer for the in-place (local) and the staged (HDFS) case.
+        if not path.endswith(".safetensors"):
+            torch.save(data, target)
+        elif isinstance(data, torch.nn.Module):
+            save_safetensors_model(data, target)
+        else:
+            save_safetensors(data, target)
+
+    is_hdfs_path, _, copy = _get_filesystem_funcs()
+    if not is_hdfs_path(path):
+        _write(path)
+        logger.info(f"Early saved to local path: {path}")
+        return
+
+    # HDFS target: stage the file locally, then copy the staged file to its destination.
+    staging_dir = get_local_dir() if persistence_dir is None else persistence_dir
+    local_path = osp.join(staging_dir, osp.basename(path))
+    _write(local_path)
+    logger.info(f"Saved to local path: {local_path}")
+    copy(local_path, path, blocking=blocking)
+    logger.info(f"Copy {local_path} to HDFS or Local path: {path} done.")
+
+    # NOTE: the staged file is intentionally left in place. Per the original author's note the
+    # same staging name is overwritten by repeated saves, so no cleanup is performed (an earlier
+    # cleanup step that removed the file after a blocking copy is disabled).
+
+def dummy_indexes_searchsorted(packed_text_indexes: torch.LongTensor, ce_loss_indexes: torch.LongTensor) -> torch.LongTensor:
+    """
+    For each value of ``ce_loss_indexes`` return a position in ``packed_text_indexes``, using searchsorted:
+    - sort ``packed_text_indexes``, keeping every sorted value's original position;
+    - find each query's insertion point in the sorted values;
+    - translate the insertion points back to original positions.
+    A query that is absent maps to its insertion point; one above the maximum indexes out of range.
+    """
+    ordered = torch.sort(packed_text_indexes)
+    return ordered.indices[torch.searchsorted(ordered.values, ce_loss_indexes)]
diff --git a/common/utils/tensor_ops.py b/common/utils/tensor_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a744882df45e92c4b4fe809fbec9ae08790172bd
--- /dev/null
+++ b/common/utils/tensor_ops.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from itertools import chain
+from typing import Dict, List, Tuple
+import einops
+import torch
+
+
+def rearrange(
+    hid: torch.FloatTensor,  # (L c)
+    hid_shape: torch.LongTensor,  # (b n)
+    pattern: str,
+    **kwargs: Dict[str, int],
+) -> Tuple[torch.FloatTensor, torch.LongTensor]:
+    """Apply one ``einops.rearrange`` pattern to every sample of a flattened batch, then re-flatten."""
+    per_sample = unflatten(hid, hid_shape)
+    rearranged = [einops.rearrange(sample, pattern, **kwargs) for sample in per_sample]
+    return flatten(rearranged)
+
+
+def repeat(
+    hid: torch.FloatTensor,  # (L c)
+    hid_shape: torch.LongTensor,  # (b n)
+    pattern: str,
+    **kwargs: Dict[str, torch.LongTensor],  # (b)
+) -> Tuple[torch.FloatTensor, torch.LongTensor]:
+    """Run ``einops.repeat`` per sample, giving sample ``i`` the ``i``-th entry of each keyword tensor."""
+    samples = unflatten(hid, hid_shape)
+    # Resolve every sample's keyword values first, then apply the pattern sample by sample.
+    per_sample_kwargs = [{name: values[i].item() for name, values in kwargs.items()} for i in range(len(samples))]
+    repeated = [einops.repeat(sample, pattern, **sizes) for sample, sizes in zip(samples, per_sample_kwargs)]
+    return flatten(repeated)
+
+
+def pack(
+    samples: List[torch.Tensor],  # List of (h w c).
+) -> Tuple[
+    List[torch.Tensor],  # groups [(b1 h1 w1 c1), (b2 h2 w2 c2)]
+    List[List[int]],  # reversal indices.
+]:
+    """
+    Group same-shaped samples and stack each group into one batch tensor.
+
+    Groups follow the order in which each shape first appears; the second return value lists,
+    per group, the positions its members had in ``samples``.
+    """
+    grouped = {}  # shape -> (member tensors, original positions), in first-seen order
+    for position, sample in enumerate(samples):
+        members, positions = grouped.setdefault(sample.shape, ([], []))
+        members.append(sample)
+        positions.append(position)
+    return [torch.stack(m) for m, _ in grouped.values()], [p for _, p in grouped.values()]
+
+
+def unpack(batches: List[torch.Tensor], indices: List[List[int]]) -> List[torch.Tensor]:
+    """Scatter the rows of each stacked batch back to the list positions recorded in ``indices``."""
+    # default=-1 makes empty input yield [] instead of max() raising ValueError on an empty sequence.
+    total = max(chain.from_iterable(indices), default=-1) + 1
+    samples = [None] * total
+    for batch, index in zip(batches, indices):
+        for sample, i in zip(batch.unbind(), index):
+            samples[i] = sample
+    return samples
+
+
+# Helper functions that must be kept because rearrange and repeat depend on them.
+def flatten(
+    hid: List[torch.FloatTensor],  # List of (*** c)
+) -> Tuple[torch.FloatTensor, torch.LongTensor]:  # (L c), (b n)
+    """Concatenate samples along their collapsed leading dims; also return each sample's leading shape."""
+    assert len(hid) > 0
+    target_device = hid[0].device
+    # One row of leading dims per sample; torch.stack needs every row to have the same length.
+    shape = torch.stack([torch.tensor(sample.shape[:-1], device=target_device) for sample in hid])
+    merged = torch.cat([sample.flatten(0, -2) for sample in hid])
+    return merged, shape
+
+
+def unflatten(
+    hid: torch.FloatTensor,  # (L c) or (L ... c)
+    hid_shape: torch.LongTensor,  # (b n)
+) -> List[torch.Tensor]:  # List of (*** c) or (*** ... c)
+    """Split ``hid`` into per-sample chunks and restore each chunk's leading dims from ``hid_shape``."""
+    chunk_sizes = hid_shape.prod(-1).tolist()
+    chunks = hid.split(chunk_sizes)
+    return [chunk.unflatten(0, dims.tolist()) for chunk, dims in zip(chunks, hid_shape)]
diff --git a/common/val/__init__.py b/common/val/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..96bc46d2bd5449c70c495fb037f32cdab21e52c0
--- /dev/null
+++ b/common/val/__init__.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .utils import (
+    pad_video_list,
+    decode_video_tensor,
+    map_splits_to_samples,
+    make_padded_latent,
+    make_packed_vit_token_embed,
+    uncond_split_pro,
+)
+from .instructions import INSTRUCTIONS_I2T_LIST
+
+__all__ = [
+    "pad_video_list",
+    "decode_video_tensor",
+    "map_splits_to_samples",
+    "make_padded_latent",
+    "make_packed_vit_token_embed",
+    "uncond_split_pro",
+    "INSTRUCTIONS_I2T_LIST",
+]
diff --git a/common/val/instructions.py b/common/val/instructions.py
new file mode 100644
index 0000000000000000000000000000000000000000..953d21ed2ff7baa92421845ae8f7fa3f237f2d23
--- /dev/null
+++ b/common/val/instructions.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+
+INSTRUCTIONS_I2T_LIST = [  # instruction strings; every non-empty entry ends with a newline
+    "Describe the image or video in detail.\n",
+    "Provide a detailed description of the image or video.\n",
+    "Explain the content of the image or video.\n",
+    "Depict the image or video thoroughly.\n",
+    "Summarize the content of the image or video.\n",
+    "",  # empty string: no instruction at all
+]
diff --git a/common/val/utils.py b/common/val/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cde77c1911c3f33d24c7e572c5afc74b0322a00
--- /dev/null
+++ b/common/val/utils.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import os
+EXP_HW_20250819 = os.environ.get("EXP_HW_20250819", "False").lower() == "true"
+from einops import rearrange
+import torch
+from typing import List
+import imageio
+import glob
+import numpy as np
+
+
+def _vit_denorm_uint8_thwc(video_tensor_c_first: torch.Tensor) -> np.ndarray:
+    """
+    Undo mean/std normalization on a (T, C, H, W) float tensor and return a (T, H, W, C) uint8 array.
+    The per-channel constants are fixed; the original author notes they are the Qwen2.5-VL ViT mean/std.
+    """
+    device = video_tensor_c_first.device
+    channel_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073], device=device).view(1, 3, 1, 1)
+    channel_std = torch.tensor([0.26862954, 0.26130258, 0.27577711], device=device).view(1, 3, 1, 1)
+    unit_range = torch.clamp(video_tensor_c_first * channel_std + channel_mean, 0, 1)
+    as_uint8 = (unit_range * 255).round().clamp(0, 255).to(torch.uint8)  # T C H W
+    channels_last = as_uint8.permute(0, 2, 3, 1)
+    return channels_last.cpu().numpy()
+
+
+def pad_video_list(video_tensor):  # video_tensor: List[Tensor], each Tensor of shape [C T H W]
+    """Zero-pad every video to the per-axis maximum size and stack them into one [N C T H W] tensor."""
+    per_axis_max = [max(axis_sizes) for axis_sizes in zip(*(video.shape for video in video_tensor))]
+    padded = torch.zeros(size=(len(video_tensor), *per_axis_max))  # torch.zeros defaults, whatever the inputs use
+    for n, video in enumerate(video_tensor):
+        c, t, h, w = video.shape
+        padded[n, :c, :t, :h, :w] = video
+    return padded
+
+
+def decode_video_tensor(video_tensor, video_type="vae", save_path="", save_half=False, idx="", max_save_num=100000, save_item_name=""):
+    """
+    Convert a list of [C T H W] tensors to one (T, H, W, C) uint8 array and optionally write it to disk.
+    Several videos are zero-padded and laid side by side along the width. ``video_type`` selects the
+    de-normalization: "vae" maps [-1, 1] to [0, 255], "vit" uses ``_vit_denorm_uint8_thwc``; anything else
+    raises ValueError. With a non-empty ``save_path`` the frames are saved as ``<save_item_name>.mp4``
+    (more than one frame, 12 fps) or ``.png`` (single frame); ``save_half`` keeps only the right half of
+    the width in the saved file. ``idx`` is accepted but unused. Returns the full (uncropped) array.
+    """
+    if len(video_tensor) == 1:
+        v_tc_hw = video_tensor[0].permute(1, 0, 2, 3)
+    else:  # TODO: needs rework once multiple video targets are supported
+        v_tc_hw = rearrange(pad_video_list(video_tensor), "n c t h w -> t c h (n w)")  # T C H W'
+    if video_type == "vae":
+        scaled = v_tc_hw.float().clip(-1, 1).mul_(0.5).add_(0.5).mul_(255)
+        v_thwc = scaled.round().clamp(0, 255).to(torch.uint8).permute(0, 2, 3, 1).cpu().numpy()
+    elif video_type == "vit":
+        v_thwc = _vit_denorm_uint8_thwc(v_tc_hw)
+    else:
+        raise ValueError(f"video_type {video_type} not supported")
+
+    if save_path == "":
+        return v_thwc
+    os.makedirs(save_path, exist_ok=True)
+    frames = v_thwc[:, :, v_thwc.shape[2] // 2:, :] if save_half else v_thwc
+    is_video = v_thwc.shape[0] > 1  # more than one frame -> video file, otherwise a single image
+    extension = "mp4" if is_video else "png"
+    if len(glob.glob(f"{save_path}/*.{extension}")) > max_save_num:
+        quit()  # NOTE(review): exits the whole process once the per-directory file cap is exceeded
+    save_path_i = f"{save_path}/{save_item_name}.{extension}"
+    if is_video:
+        imageio.mimsave(save_path_i, frames, fps=12, format="mp4")
+    else:
+        imageio.imwrite(save_path_i, frames[0], format="png")
+    print(f"视频或图像已保存到: {save_path_i}")
+    return v_thwc
+
+
+def map_splits_to_samples(sample_lens: List[int], split_lens: List[int]) -> List[List[int]]:
+    """
+    Assign consecutive split indices to the samples they belong to.
+
+    Splits are consumed in order: each sample keeps taking the next split until the
+    taken lengths add up to at least the sample's length, or no splits remain.
+    Example: sample_lens=[5, 3], split_lens=[2, 3, 3] -> [[0, 1], [2]].
+
+    Args:
+        sample_lens: total length of every sample.
+        split_lens: length of every split.
+
+    Returns:
+        One list per sample holding the indices of the splits assigned to it
+        (empty for non-positive lengths or once the splits are exhausted).
+    """
+    total_splits = len(split_lens)
+    assignments = []
+    cursor = 0  # index of the next split that has not been handed out yet
+
+    for sample_len in sample_lens:
+        # Remember where this sample's run of splits begins.
+        start, covered = cursor, 0
+        # Same condition as counting the remaining length down until it is no longer positive.
+        while covered < sample_len and cursor < total_splits:
+            covered += split_lens[cursor]
+            cursor += 1
+        assignments.append(list(range(start, cursor)))
+
+    return assignments
+
+
+@torch.no_grad()
+def make_padded_latent(padded_videos, data_mode, vae_model):  # handles online, offline and mixed batches
+    """
+    Produce latents for a batch whose items are raw videos ("online") or precomputed latents ("offline").
+    Typical inputs (per the original note): ``data_mode = data['vae_data_mode']`` and
+    ``padded_videos = data.pop("padded_videos")``.
+    - no "offline" entry in ``data_mode``: everything goes through ``vae_model.vae_encode``;
+    - no "online" entry: ``padded_videos`` is returned as is;
+    - mixed: offline items are kept, online items are encoded in one ``vae_encode`` call and
+      written back to their original positions (the result is a list).
+    """
+    if data_mode.count("offline") == 0:  # all online
+        padded_latent = vae_model.vae_encode(padded_videos)
+    elif data_mode.count("online") == 0:  # all offline
+        padded_latent = padded_videos
+    else:  # mixed
+        padded_latent = [None] * len(padded_videos)
+        pending = []  # (position, video) pairs that still have to be encoded
+        for position, (item, mode) in enumerate(zip(padded_videos, data_mode)):
+            if mode.lower().startswith("off"):
+                padded_latent[position] = item  # offline: the item already is a latent
+            else:
+                pending.append((position, item))
+        encoded = vae_model.vae_encode([video for _, video in pending])  # one batched call
+        for k, (position, _) in enumerate(pending):
+            padded_latent[position] = encoded[k]
+    del padded_videos
+    torch.cuda.empty_cache()
+    return padded_latent
+
+
+@torch.no_grad()
+def make_packed_vit_token_embed(packed_vit_tokens, vit_data_mode, vit_video_grid_thw, vit_model):  # supports online, offline and mixed batches
+    """
+    Build one concatenated ViT embedding tensor from the per-item token tensors.
+    Items whose mode is online are passed through ``vit_model``; offline items are used as given.
+    In the mixed case each online item is encoded separately with its own one-row ``grid_thw`` slice.
+    """
+    if vit_data_mode.count("offline") == 0:  # every item is online
+        packed_vit_tokens = torch.cat(packed_vit_tokens, dim=0)
+        packed_vit_token_embed = vit_model(
+            hidden_states=packed_vit_tokens,  # L x 1176 or 2048
+            grid_thw=vit_video_grid_thw,  # t, h, w
+        )  # L x 1176 or 2048 -> L//4 x 2048
+    elif vit_data_mode.count("online") == 0:  # every item is offline
+        packed_vit_token_embed = torch.cat(packed_vit_tokens, dim=0)  # L x 1176 or 2048
+    else:  # mixed batch
+        packed_vit_token_embed, i_online = [], 0
+        for i, (x, m) in enumerate(zip(packed_vit_tokens, vit_data_mode)):
+            if m.lower().startswith("off"):  # offline: use the given tensor directly
+                packed_vit_token_embed.append(x)
+            else:
+                if vit_video_grid_thw.shape[0] == len(packed_vit_tokens):  # one grid row per item, i.e. offline videos are listed in vit_video_grid_thw too
+                    i_online = i
+                thw = vit_video_grid_thw[i_online:i_online+1]
+                packed_vit_token_embed.append(
+                    vit_model(
+                        hidden_states=x,
+                        grid_thw=thw,
+                    )
+                )
+                i_online += 1
+        packed_vit_token_embed = torch.cat(packed_vit_token_embed, dim=0)  # L x 1176 or 2048
+
+    return packed_vit_token_embed
+
+
+def uncond_split_pro(
+    language_model,
+    current_attn_modes,
+    current_split_lens,
+    vae_video_grid_thw,
+    vit_video_grid_thw,
+    curr_vae_split_idx,
+    curr_vit_split_idx,
+    device,
+    dtype,
+    start_id,
+    image_token_id,
+    end_id,
+    BLOCK_SIZE,
+    is_text_uncond=True,
+    is_vit_uncond=False,
+):
+    """
+    Build the unconditional token sequence (start id, image-token ids, end id per kept split), embed it, and return it with its index tensors.
+    NOTE(review): ``num_visual`` is assigned only in the noise/full branches, so a kept split of any other mode reuses a stale value or raises UnboundLocalError -- confirm callers never hit that. ``uncond_pos_ids`` is filled but never returned.
+    """
+    uncond_split, uncond_pos_ids = [], []
+    (
+        curr_vae_split_idx_,
+        curr_vit_split_idx_,
+        uncond_vae_index,
+        uncond_vit_index,
+        uncond_packed_gen_token_indexes,
+        uncond_packed_und_token_indexes,
+        uncond_split_lens,
+        uncond_attn_modes,
+    ) = (
+        curr_vae_split_idx,
+        curr_vit_split_idx,
+        [],
+        [],
+        [],
+        [],
+        [],
+        [],
+    )
+    for i_visual, attn_mode_ in enumerate(current_attn_modes):
+        split_len_ = current_split_lens[i_visual]
+        if attn_mode_ == "causal" and is_text_uncond:  # causal split dropped when is_text_uncond is set
+            continue
+        elif attn_mode_ == "full" and is_vit_uncond:  # full split dropped when is_vit_uncond is set
+            continue
+        elif attn_mode_ in ["noise", "full_noise"]:  # split sized from vae_video_grid_thw
+            t, h, w = vae_video_grid_thw[curr_vae_split_idx_]
+            num_visual = int(t * h * w / 4)  # 4 is the square of merge_size 2
+            uncond_vae_index.extend(range(len(uncond_split) + 1, len(uncond_split) + 1 + num_visual))
+            uncond_packed_und_token_indexes.extend([len(uncond_split), len(uncond_split) + 1 + num_visual])
+            uncond_packed_gen_token_indexes.extend(range(len(uncond_split) + 1, len(uncond_split) + 1 + num_visual))
+            curr_vae_split_idx_ += 1
+        elif attn_mode_ == "full":  # split sized from vit_video_grid_thw
+            t, h, w = vit_video_grid_thw[curr_vit_split_idx_]
+            num_visual = int(t * h * w / 4)
+            uncond_vit_index.extend(range(len(uncond_split) + 1, len(uncond_split) + 1 + num_visual))
+            uncond_packed_und_token_indexes.extend(range(len(uncond_split), len(uncond_split) + 2 + num_visual))
+            curr_vit_split_idx_ += 1
+        uncond_split += [start_id] + [image_token_id] * num_visual + [end_id]
+        uncond_split_lens.append(split_len_)
+        uncond_attn_modes.append(attn_mode_)
+        uncond_pos_ids += [curr_vae_split_idx_ + curr_vit_split_idx_ - 1] * split_len_
+    uncond_vae_index = torch.tensor(uncond_vae_index, dtype=torch.long, device=device)
+    uncond_vit_index = torch.tensor(uncond_vit_index, dtype=torch.long, device=device)
+    uncond_packed_gen_token_indexes = torch.tensor(uncond_packed_gen_token_indexes, dtype=torch.long, device=device)
+    uncond_packed_und_token_indexes = torch.tensor(uncond_packed_und_token_indexes, dtype=torch.long, device=device)
+
+    # ---- build the unconditional inputs ----
+    uncond_text_ids = torch.tensor(uncond_split, device=device, dtype=torch.long)
+    uncond_sequence = language_model.model.embed_tokens(uncond_text_ids).to(dtype=dtype)
+    # 2) consistent with training -> the tail block is padded as well
+    uncond_seq_len = len(uncond_text_ids)
+    uncond_seq_len_pad = (uncond_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE * BLOCK_SIZE
+    uncond_pad = uncond_seq_len_pad - uncond_seq_len
+    if uncond_pad > 0:
+        uncond_split_lens.append(uncond_pad)
+        uncond_attn_modes.append("causal")
+    return (
+        uncond_sequence,
+        uncond_attn_modes,
+        uncond_split_lens,
+        uncond_vae_index,
+        uncond_vit_index,
+        uncond_packed_gen_token_indexes,
+        uncond_packed_und_token_indexes,
+        uncond_text_ids,
+        uncond_seq_len,
+        uncond_pad,
+    )
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d61d21b8c2710bb3e03527dfb5ebccc3f7e85e
--- /dev/null
+++ b/config/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .config_factory import (
+    TemplateArguments,
+    ModelArguments,
+    DataArguments,
+    TrainingArguments,
+    InferenceArguments,
+    EvaluationArguments,
+    get_model_path,
+    get_model_path_config,
+)
+
+__all__ = [
+    "TemplateArguments",
+    "ModelArguments",
+    "DataArguments",
+    "TrainingArguments",
+    "InferenceArguments",
+    "EvaluationArguments",
+    "get_model_path",
+    "get_model_path_config",
+]
diff --git a/config/config_factory.py b/config/config_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..1be985b21aee4ae91fcd1e968d6f5e58bc01f33f
--- /dev/null
+++ b/config/config_factory.py
@@ -0,0 +1,247 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""
+配置定义与轻量工厂函数。
+
+- TemplateArguments  : 对话模板
+- ModelArguments     : 模型结构/加载参数
+- DataArguments      : 验证集输入参数
+- TrainingArguments  : 推理运行期仍会使用的模型加载/采样/兼容字段
+- InferenceArguments : 推理专用参数（继承 TrainingArguments）
+- EvaluationArguments: 评估专用参数（继承 InferenceArguments）
+
+当前模块同时负责：
+- 定义训练/推理共用的 dataclass
+- 解析 `path_default.yaml`
+- 为推理链提供轻量配置工厂能力
+"""
+
+import re
+from pathlib import Path
+from dataclasses import dataclass, field
+from typing import List, Dict, Any, Optional
+
+import yaml
+
+# ==============================================
+# 模型路径配置管理
+# ==============================================
+
+# 全局缓存，避免重复加载
+_MODEL_PATH_CONFIG_CACHE: Optional[Dict[str, Any]] = None
+_DEFAULT_PATH_FILE = Path(__file__).with_name("path_default.yaml")
+_PLACEHOLDER_PATTERN = re.compile(r"\$\{([^}]+)\}")
+
+
+def _get_nested_value(config: Dict[str, Any], path_key: str) -> Any:
+    """
+    Look up a dot-separated key such as "vit.qwen2_5_vl" in a nested dict; ValueError if a step is missing.
+    """
+    node: Any = config
+    for part in path_key.split("."):
+        step_exists = isinstance(node, dict) and part in node
+        if not step_exists:
+            raise ValueError(f"Path key '{path_key}' not found in {_DEFAULT_PATH_FILE.name}")
+        node = node[part]
+    return node
+
+
+def _resolve_config_values(value: Any, config: Dict[str, Any]) -> Any:
+    """
+    Expand placeholders in every string reachable through nested dicts; other values (lists too) pass through.
+    """
+    if isinstance(value, str):
+        return _resolve_placeholders(value, config)
+    if not isinstance(value, dict):
+        return value
+    return {key: _resolve_config_values(item, config) for key, item in value.items()}
+
+
+def _resolve_placeholders(path: str, config: Dict[str, Any], _active: tuple = ()) -> str:
+    """
+    Expand every ``${dotted.key}`` placeholder in ``path`` with its value from ``config``.
+    Referenced values are expanded recursively, so a placeholder may point at another templated
+    entry. ``_active`` is internal: the chain of placeholders currently being expanded, which turns
+    a circular reference into a ValueError instead of unbounded recursion.
+
+    Raises:
+        ValueError: a placeholder is missing from ``config`` or is part of a reference cycle.
+    """
+    result = path
+    for match in dict.fromkeys(_PLACEHOLDER_PATTERN.findall(path)):  # each distinct name once, in order
+        if match in _active:
+            raise ValueError(f"Circular placeholder ${{{match}}} in {_DEFAULT_PATH_FILE.name}")
+        try:
+            value = _get_nested_value(config, match)
+        except ValueError as exc:  # the message shows the placeholder as written: ${name}
+            raise ValueError(f"Placeholder ${{{match}}} not found in {_DEFAULT_PATH_FILE.name}") from exc
+        resolved_value = _resolve_placeholders(str(value), config, _active + (match,))
+        result = result.replace(f"${{{match}}}", resolved_value)
+    return result
+
+
+def get_model_path_config(reload: bool = False) -> Dict[str, Any]:
+    """
+    Load ``path_default.yaml`` and return it with all placeholders expanded.
+
+    The parsed result is cached at module level; later calls return the cache.
+
+    :param reload: re-read the file even when a cached result exists
+    :return: the resolved configuration dictionary
+    :raises FileNotFoundError: when the YAML file does not exist
+    """
+    global _MODEL_PATH_CONFIG_CACHE
+
+    cache_usable = _MODEL_PATH_CONFIG_CACHE is not None and not reload
+    if not cache_usable:
+        if not _DEFAULT_PATH_FILE.exists():
+            raise FileNotFoundError(
+                f"Model path configuration file not found: {_DEFAULT_PATH_FILE}"
+            )
+        with _DEFAULT_PATH_FILE.open("r", encoding="utf-8") as f:
+            raw_config = yaml.safe_load(f)
+        # Placeholders are resolved against the raw document itself.
+        _MODEL_PATH_CONFIG_CACHE = _resolve_config_values(raw_config, raw_config)
+    return _MODEL_PATH_CONFIG_CACHE
+
+
+def get_model_path(path_key: str) -> str:
+    """
+    Return the resolved entry stored under ``path_key`` as a string.
+    :param path_key: dot-separated key, e.g. "vit.qwen2_5_vl" or "data.t2i"
+    :return: ``str()`` of the entry, or "" when the entry is ``None``
+    """
+    entry = _get_nested_value(get_model_path_config(), path_key)
+    if entry is None:
+        return ""
+    return str(entry)
+
+@dataclass
+class TemplateArguments:  # Chat-template pieces and pad-token markers; the List-annotated defaults are tuples
+    chat_template: List[str] = (
+        '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n',
+        'Describe this image.<|im_end|>\n<|im_start|>assistant\n',
+    )  # NOTE: the instruction needs to be adapted to different data types; VIT tokens are inserted in the middle of the template and text tokens at the end
+    chat_template_T2I: List[str] = (
+        '<|im_start|>system\nDescribe the image by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n<|quad_start|><|im_end|>\n<|im_start|>assistant\n',
+    )  # NOTE: text tokens are inserted in the middle of the template and VAE tokens at the end
+    pad_token_template_T2I: str = "<|quad_start|>"
+    pad_token_template: str = "<|quad_end|>"
+
+
+@dataclass
+class ModelArguments:  # Model settings: checkpoint paths, ViT / latent sizes, condition-dropout probabilities, CFG scale
+    model_path:                 str = ""
+    llm_path:                   str = ""
+    llm_qk_norm:                bool = True
+    llm_qk_norm_und:            bool = True
+    llm_qk_norm_gen:            bool = True
+    tie_word_embeddings:        bool = False
+    layer_module:               str = "Qwen2MoTDecoderLayer"
+    vit_path:                   str = ""
+    max_num_frames:             int = 25
+    max_latent_size:            int = 64
+    latent_patch_size:          List[int] = (1, 2, 2)  # pt ph pw (presumably temporal / height / width patch sizes - confirm); tuple default despite the List annotation
+    vit_patch_size:             int = 14
+    vit_patch_size_temporal:    int = 2
+    vit_max_num_patch_per_side: int = 70
+    connector_act:              str = "gelu_pytorch_tanh"
+    interpolate_pos:            bool = False
+    vit_select_layer:           int = -2
+    vit_rope:                   bool = False
+
+    text_cond_dropout_prob:     float = 0.1
+    vae_cond_dropout_prob:      float = 0.3
+    vit_cond_dropout_prob:      float = 0.3
+    vit_type:                   str = "qwen2_5_vl"  # options: qwen2_5_vl
+
+    val_text_cond_dropout_prob: float = 0  # the val_* dropout probabilities all default to 0
+    val_vae_cond_dropout_prob:  float = 0
+    val_vit_cond_dropout_prob:  float = 0
+
+    cfg_text_scale:             float = 4.0  # for validation
+
+
+@dataclass
+class DataArguments:  # Data-related options; only one field is declared
+    val_dataset_config_file:    Optional[str] = None  # NOTE(review): presumably a path to the validation dataset config - no consumer is visible here
+
+
+@dataclass
+class TrainingArguments:  # Runtime switches plus sampling, CFG and embedding settings; base class of InferenceArguments
+    # Inference-time switches
+    apply_chat_template:        bool = False  # whether to wrap the input text in the Qwen2.5-VL chat template
+    apply_qwen_2_5_vl_pos_emb:  bool = False  # whether to enable the Qwen2.5-VL position embedding
+
+    vae_model_type:             str = "seedance"
+    visual_gen:                 bool = True
+    visual_und:                 bool = True
+    freeze_und:                 bool = False
+    copy_init_moe:              bool = False
+    finetune_from_hf:           bool = False
+    finetune_from_vlm:          bool = False
+    use_flex:                   bool = False
+    num_replicate:              int = 1
+    num_shard:                  int = 1
+
+    global_seed:                int = 2025
+
+    # Sampling
+    timestep_shift:             float = 1.0
+    validation_data_seed:       int = 42
+    validation_num_timesteps:   int = 30
+    validation_timestep_shift:  float = 3.0
+    validation_max_samples:     int = 8
+    validation_noise_seed:      int = 2025
+    validation_video_saving_fps:int = 12
+    validation_log_type:        str = "direct"
+
+    # CFG and text-condition control
+    cfg_type:                   int = 0       # 0: drop the text condition entirely; 1: keep only the special tokens; 2: keep the special tokens and replace the text tokens in between with <NULL>
+    cfg_uncond_token_id:        int = 151643  # only takes effect when cfg_type=2
+    cfg_interval:               List[float] = field(default_factory=lambda: [0.4, 1.0])
+    cfg_renorm_min:             float = 0
+    cfg_renorm_type:            str = "global"  # global | channel | ""
+
+    # Extra embedding switches
+    use_task_embedding:         bool = False
+    use_modality_embedding:     bool = False
+
+
+@dataclass
+class InferenceArguments(TrainingArguments):  # Adds output paths, video geometry and task selection on top of TrainingArguments
+    save_path_gen:              str = "tmp/results/inference/generation"  # where generated videos / images are saved
+    save_path_gt:               str = ""    # where ground-truth videos / images are saved; not saved by default
+    video_height:               int = 480
+    video_width:                int = 480
+    num_frames:                 int = 50
+    task:                       str = "t2v"  # t2v / t2i / edit / idip ...
+    resolution:                 str = "video_360p"  # image_256res, image_512res, video_192p, video_360p, etc.
+    text_template:              bool = False  # whether to use the system_prompt text template
+    max_duration:               float = 6.0  # maximum video duration (seconds)
+
+    system_prompt_type:         str = "SP0"  # options: SP1, SP2 ...
+    use_KVcache:                bool = False
+
+
+@dataclass
+class EvaluationArguments(InferenceArguments):  # Adds evaluation options (config override, sample counts, seed, debug flag)
+    config_json_path:           str = field(default="", metadata={"help": "配置 JSON 文件路径"})  # path to a config JSON file; if a config is provided, it overrides the arguments
+    sample_num_per_prompt:      int = field(default=4, metadata={"help": "每个 case 的采样数量"})  # number of samples per case
+    max_eval_cases:             int = field(default=0, metadata={"help": "限制评测 case 数量，0 表示全量"})  # cap on the number of evaluation cases; 0 means all
+    do_sample:                  bool = False  # whether UND tasks use a sampling strategy
+    evaluation_seed:            int = 42
+    quick_debug:                bool = False  # quick debug mode
diff --git a/config/examples/image_edit_example.json b/config/examples/image_edit_example.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d95c7efd3ddb058a5db4aec43586fbfdd1944fc
--- /dev/null
+++ b/config/examples/image_edit_example.json
@@ -0,0 +1,36 @@
+{
+  "0001": {
+    "interleave_array": [
+      "Change background from marble bathroom to outdoor scene with buildings and stacked items",
+      "config/examples/image_edit_examples/index000000_cond1.jpg",
+      "config/examples/image_edit_examples/index000000_cond1.jpg"
+    ],
+    "element_dtype_array": [
+      "text",
+      "image",
+      "image"
+    ],
+    "istarget_in_interleave": [
+      0,
+      0,
+      1
+    ]
+  },
+  "0002": {
+    "interleave_array": [
+      "Change her golden hair color to dark green",
+      "config/examples/image_edit_examples/index000001_cond1.jpg",
+      "config/examples/image_edit_examples/index000001_cond1.jpg"
+    ],
+    "element_dtype_array": [
+      "text",
+      "image",
+      "image"
+    ],
+    "istarget_in_interleave": [
+      0,
+      0,
+      1
+    ]
+  }
+}
diff --git a/config/examples/image_edit_examples/index000000_cond1.jpg b/config/examples/image_edit_examples/index000000_cond1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c7d0165f80a3879bd74e8518a972f089d1f2c19e
--- /dev/null
+++ b/config/examples/image_edit_examples/index000000_cond1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae60f9f27f78946ec1bae0ab80487e808c4d92adfc3cde84f360596f578220e7
+size 1524523
diff --git a/config/examples/image_edit_examples/index000001_cond1.jpg b/config/examples/image_edit_examples/index000001_cond1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..987f0c03bd911e46f6711027882847e63aa27e8b
--- /dev/null
+++ b/config/examples/image_edit_examples/index000001_cond1.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a8ab26f75e574decf77f299cad2ed13dbc85e6a93db85c40114b14eb7d739c9
+size 276906
diff --git a/config/examples/t2i_example.json b/config/examples/t2i_example.json
new file mode 100644
index 0000000000000000000000000000000000000000..609924cf0454518cb4878e1e690fad4f290f5bc0
--- /dev/null
+++ b/config/examples/t2i_example.json
@@ -0,0 +1,11 @@
+{
+  "000000.png": "A beautiful girl, delicate and the half-body shot portrait, light, ultra detailed features, romantic atmosphere, gentle and ethereal mood, The warm light shines on the hair, a half-body shot, a cold and atmospheric scene, holding snowflakes, with some of the snowflakes falling on the head, and the sunlight shining on the upper left corner.",
+  "000001.png": "A cat holds a poster with rainbow text \"STOP\"",
+  "000002.png": "An anthropomorphic rainbow fox, its fur dotted with twinkling stardust, dragging a fluffy seven-color gradient tail, standing on a fantasy grassland full of glowing flowers, with floating colorful crystals in the background, bright and dreamy overall tone, full of details.",
+  "000003.png": "A fantasy bird with colorful feathers, its wings covered with star patterns, its beak is crystal sapphire color, standing on a branch full of colorful fluffy flowers, with a gradient pink-purple sky in the background, and glowing petals falling.",
+  "000004.png": "A fantasy dragon, its body is dark purple gradient, its scales shine with dark gold light, its wings are covered with dark patterns, spitting dark purple flames from its mouth, surrounded by ink-colored clouds and glowing stars, with a mysterious starry sky in the background.",
+  "000005.png": "This is a watercolor illustration of a young girl with short black hair and fair skin, wearing a straw hat adorned with a blue flower, a white blouse, and a blue pinafore dress, sitting on a wooden swing. She holds onto the swing's ropes, surrounded by lush green foliage and colorful flowers. The background is a soft, white wash, enhancing the vibrant colors of the plants. The style is whimsical and slightly impressionistic, with delicate brushstrokes and a serene, idyllic atmosphere.",
+  "000006.png": "This is a black-and-white digital anime-style drawing of a young, androgynous character with short, messy hair and handsome eyes. They wear a high-collared, long-sleeved garment with a pendant. The background is dark, with abstract, brushstroke-like textures. The character's expression is thoughtful and slightly melancholic, with a hand delicately touching their chin. The style is reminiscent of contemporary Japanese manga, with a focus on clean lines and expressive features.",
+  "000007.png": "This photorealistic, Fish-eye lens, low-angle shot captures a ginger tabby cat confidently balancing on a skateboard in a sun-dappled park. The cat, with bright orange fur, large round amber eyes, and a raised tail tipped with white, gazes directly at the viewer with a curious, focused expression. Its paws are firmly planted on the skateboard’s black, grip-taped deck, which features light orange wheels and natural wood trim along the edges.\nBehind the cat, lush green trees line a smooth paved path under a vivid blue sky dotted with faint clouds. Bright sunlight casts sharp, defined shadows across the ground, emphasizing the warm, sunny day. The dynamic perspective makes the cat appear to be cruising forward with playful determination, blending the whimsical subject with a realistic, vibrant setting.",
+  "000008.png": "This image is a 2×2 grid showcasing four distinct dragon heads, each embodying a different elemental or material theme:\nTop-left: A fearsome fire dragon head, seemingly forged from molten metal or lava. It blazes with bright orange and yellow flames, embers flickering around its jagged, glowing teeth and eyes, set against a dark, smoky backdrop.\nTop-right: An ethereal ice dragon head, carved entirely from clear, translucent ice. Its crystalline scales, sharp features, and pale blue glowing eyes give it a cold, frosty appearance, with a snowy, misty background enhancing its wintry theme.\nBottom-left: An ancient stone dragon head, weathered and half-buried in cracked rock and mossy dirt. Patches of green moss cling to its rugged surface, while faint embers glow in its mouth, hinting at dormant power within the aged stone.\nBottom-right: A rustic wooden dragon head, meticulously carved from light brown wood. Its geometric, blocky scales, sharp stylized horns, and defined snout give it a handcrafted, earthy look, resting on a bed of dry straw."
+}
\ No newline at end of file
diff --git a/config/examples/t2v_example.json b/config/examples/t2v_example.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ec95527585aea6761a2f924e89fc7eb09f26b1f
--- /dev/null
+++ b/config/examples/t2v_example.json
@@ -0,0 +1,5 @@
+{
+  "000000.mp4": "A detailed cinematic portrait begins from a medium view and gradually moves into a close facial framing of a beautiful young woman playing a grand piano in a luminous marble music hall with tall windows, gold sconces, flowing curtains, polished floors, and refined floral arrangements. Styled with pearl earrings, a delicate crystal hairpin, and a layered silver necklace above an elegant satin gown. Both hands stay clearly visible on the piano keys, and every finger movement is elegant, natural, and easy to read as she plays a calm melodic phrase, and her head gives a subtle natural sway in time with the music while the smile slowly grows warmer. Both hands stay visible on the keys as she plays a calm phrase; her head sways subtly, then face the camera and smile warmly.",
+  "000001.mp4": "A premium fantasy-film shot shows a pastel horse appearing in a dreamy cloud valley filled with luminous stars and floating petals. The main subject fills at least two-thirds of the frame and remains the clear visual focus. The setting stays bright and visually rich, with airy depth, polished contrast, and a refined cinematic atmosphere that supports the subject without overwhelming it. Lively eyes, soft blinking, and delicate expression changes create a warm, engaging on-camera presence. The scene is colorful, richly detailed, and highly aesthetic, with dramatic sky depth and crystalline highlights. the camera glides sideways while the starfield pulses and the unicorn steps out of the glow.",
+  "000002.mp4": "A close-to-medium cinematic shot shows a fashion-forward woman riding a horse across a meadow with wildflowers, rolling hills, and luminous bright morning light. The setting remains bright and visually rich, with airy depth, polished color contrast, and a refined cinematic atmosphere that supports the subject without overwhelming it. The facial performance is vivid and natural, with responsive eyes, soft micro-expressions, and delicate changes in the gaze that make the subject feel emotionally present. The subject is beautiful, highly detailed, and photographed with a premium cinematic aesthetic. The subject occupies at least two-thirds of the frame, with beautiful styling, refined facial detail, convincing skin texture, and anatomically correct hands. Her body posture follows the horse's movement naturally, and both hands hold the reins correctly. the camera tracks from the side as the horse moves with graceful rhythm. The horse and the character are complete, with the woman facing forward first, then turning her head to face the camera with a smile."
+}
\ No newline at end of file
diff --git a/config/examples/video_edit_example.json b/config/examples/video_edit_example.json
new file mode 100644
index 0000000000000000000000000000000000000000..c77ed6ec83321319362fd78e73136cb8bde44342
--- /dev/null
+++ b/config/examples/video_edit_example.json
@@ -0,0 +1,36 @@
+{
+  "0001": {
+    "interleave_array": [
+      "change the color of the clouds to pink",
+      "config/examples/video_edit_examples/index000000_cond1.mp4",
+      "config/examples/video_edit_examples/index000000_cond1.mp4"
+    ],
+    "element_dtype_array": [
+      "text",
+      "video",
+      "video"
+    ],
+    "istarget_in_interleave": [
+      0,
+      0,
+      1
+    ]
+  },
+  "0002": {
+    "interleave_array": [
+      "Change the gull into a majestic eagle",
+      "config/examples/video_edit_examples/index000001_cond1.mp4",
+      "config/examples/video_edit_examples/index000001_cond1.mp4"
+    ],
+    "element_dtype_array": [
+      "text",
+      "video",
+      "video"
+    ],
+    "istarget_in_interleave": [
+      0,
+      0,
+      1
+    ]
+  }
+}
diff --git a/config/examples/video_edit_examples/index000000_cond1.mp4 b/config/examples/video_edit_examples/index000000_cond1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..fa0171c7c8b88cd4d459cfbbff4515a3833d9adc
--- /dev/null
+++ b/config/examples/video_edit_examples/index000000_cond1.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef4737ed3a18bb7353953df82859b886a61ced3941b4eb33933e74d537452008
+size 1889961
diff --git a/config/examples/video_edit_examples/index000001_cond1.mp4 b/config/examples/video_edit_examples/index000001_cond1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..fc4ae0c43c8477bc7188ac88e7868dcde9c4c321
--- /dev/null
+++ b/config/examples/video_edit_examples/index000001_cond1.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa863fcb192a01b50abe9b8477046e9d3393e1f0d9169168a3a97e90c993a7c5
+size 2657083
diff --git a/config/examples/x2t_image_example.json b/config/examples/x2t_image_example.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9018c07aa90f001b5eb8dda37822f0829e161a9
--- /dev/null
+++ b/config/examples/x2t_image_example.json
@@ -0,0 +1,110 @@
+{
+  "0001": {
+    "interleave_array": [
+      "assets/image-understanding/cases/image-understanding-case-01.png",
+      [
+        "Look at the image carefully and answer the question.",
+        "Is the largest segment greater than sum of all the other segments?",
+        "Yes, the largest segment in the pie chart is the blue one, which is the most populous segment."
+      ]
+    ],
+    "element_dtype_array": [
+      "image",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0002": {
+    "interleave_array": [
+      "assets/image-understanding/cases/image-understanding-case-02.png",
+      [
+        "Look at the image carefully and answer the question.",
+        "What percentage of respondents want better border security?",
+        "29%"
+      ]
+    ],
+    "element_dtype_array": [
+      "image",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0003": {
+    "interleave_array": [
+      "assets/image-understanding/cases/image-understanding-case-03.png",
+      [
+        "Look at the image carefully and answer the question.",
+        "What is the license plate number of the car?",
+        "The license plate number of the car is BX62 BFY."
+      ]
+    ],
+    "element_dtype_array": [
+      "image",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0004": {
+    "interleave_array": [
+      "assets/image-understanding/cases/image-understanding-case-04.png",
+      [
+        "Look at the image carefully and answer the question.",
+        "According to the data from the proprietary market research, how much amount was spent on the promotional meetings and events during 1998?",
+        "According to the data from the proprietary market research, the total amount spent on the promotional meetings and events during 1998 was approximately $1.3 billion."
+      ]
+    ],
+    "element_dtype_array": [
+      "image",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0005": {
+    "interleave_array": [
+      "assets/image-understanding/cases/image-understanding-case-05.png",
+      [
+        "Look at the image carefully and answer the question.",
+        "What is the appearance of the Colosseum in Rome, Italy?",
+        "The Colosseum in Rome, Italy is a magnificent ancient amphitheater made of stone and concrete. It is characterized by its large, arched openings and its semi-circular shape. The Colosseum has been partially restored, and its original color has faded, but the architectural details, like the columns and the semi-circular arches, are still visible. The lighting in the image highlights the grandeur and historical significance of the Colosseum, making it stand out against the backdrop of the evening sky."
+      ]
+    ],
+    "element_dtype_array": [
+      "image",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0006": {
+    "interleave_array": [
+      "assets/image-understanding/cases/image-understanding-case-06.png",
+      [
+        "Look at the image carefully and answer the question.",
+        "How does a total solar eclipse look like from Earth?",
+        "A total solar eclipse looks like a dark circle with a bright, white outer edge. This is a result of the Earth's rotation and the Earth's shadow, which partially obscures the sun's light. The bright white edge of the eclipse is caused by the sun's high energy and its interaction with the Earth's atmosphere, while the dark part of the eclipse is due to the Earth's shadow and the surrounding air currents. The solar eclipse's shape, with its bright white edge and dark center, is similar to the shape of a full moon or a dark disk. It is a natural phenomenon that occurs in the atmosphere of the Earth and is an important part of the solar system."
+      ]
+    ],
+    "element_dtype_array": [
+      "image",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  }
+}
diff --git a/config/examples/x2t_video_example.json b/config/examples/x2t_video_example.json
new file mode 100644
index 0000000000000000000000000000000000000000..e3d2b1c81558ab0a9b9405c8331fa3f7c65316f3
--- /dev/null
+++ b/config/examples/x2t_video_example.json
@@ -0,0 +1,110 @@
+{
+  "0001": {
+    "interleave_array": [
+      "assets/video-understanding/videos/video-understanding-vqa-01.mp4",
+      [
+        "Watch the video carefully and answer the question.",
+        "How many times did the person launch objects on the table? Options: (A) 3 (B) 2 (C) 4",
+        "(A) 3"
+      ]
+    ],
+    "element_dtype_array": [
+      "video",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0002": {
+    "interleave_array": [
+      "assets/video-understanding/videos/video-understanding-vqa-02.mp4",
+      [
+        "Watch the video carefully and answer the question.",
+        "The person makes sets of repeated actions. How many distinct repeated actions did the person do? Options: (A) 2 (B) 3 (C) 4",
+        "(A) 2"
+      ]
+    ],
+    "element_dtype_array": [
+      "video",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0003": {
+    "interleave_array": [
+      "assets/video-understanding/videos/video-understanding-vqa-03.mp4",
+      [
+        "Watch the video carefully and answer the question.",
+        "In which direction does the purple sphere move in the video? Options: (A) Down and to the right. (B) Up and to the left. (C) Up and to the right. (D) The object is stationary.",
+        "(A) Down and to the right."
+      ]
+    ],
+    "element_dtype_array": [
+      "video",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0004": {
+    "interleave_array": [
+      "assets/video-understanding/videos/video-understanding-vqa-04.mp4",
+      [
+        "Watch the video carefully and answer the question.",
+        "What is the unrealistic phenomenon displayed in the video? Options: (A) The man can manipulate time via phone. (B) Man grabs an object through a phone screen. (C) Chocolate transforms into different objects. (D) Visible means of propulsion enables flight.",
+        "(B) Man grabs an object through a phone screen."
+      ]
+    ],
+    "element_dtype_array": [
+      "video",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0005": {
+    "interleave_array": [
+      "assets/video-understanding/videos/video-understanding-caption-short-01.mp4",
+      [
+        "Watch the video carefully and answer the question.",
+        "Offer a succinct account of the culinary process shown in this video.",
+        "Add tomato puree and mix it well with chicken pieces."
+      ]
+    ],
+    "element_dtype_array": [
+      "video",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  },
+  "0006": {
+    "interleave_array": [
+      "assets/video-understanding/videos/video-understanding-caption-long-01.mp4",
+      [
+        "Watch the video carefully and answer the question.",
+        "Provide a detailed description of the given video, capturing its key moments.",
+        "In a sunlit meadow, a small tortoiseshell butterfly rests on a purple flower. A bee, with black and yellow stripes, lands on the same flower. The butterfly flaps its wings gently, while the bee busies itself, collecting nectar. The flower sways slightly in the breeze. The butterfly then takes off, and the bee follows, both heading to the next flower. The scene is a vivid display of insect interaction in a natural setting, with the colors of the butterfly and the bee contrasting against the green background of the meadow. The video captures this peaceful moment in a short 6-second duration."
+      ]
+    ],
+    "element_dtype_array": [
+      "video",
+      "text"
+    ],
+    "istarget_in_interleave": [
+      0,
+      1
+    ]
+  }
+}
diff --git a/config/path_default.yaml b/config/path_default.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f7856fe70ee516115157205e26e47de6d8d4e3ee
--- /dev/null
+++ b/config/path_default.yaml
@@ -0,0 +1,27 @@
+# Root directory of the pretrained models
+base_dir: "downloads"
+
+# Lance model configuration
+lance:
+  image: "${base_dir}/Lance_3B"
+  video: "${base_dir}/Lance_3B_Video"
+
+# Vision Transformer configuration
+vit:
+  qwen2_5_vl: "${base_dir}/Qwen2.5-VL-ViT"
+
+# VAE configuration
+vae:
+  wan: "${base_dir}/Wan2.2_VAE.pth"
+
+# GenEVAL configuration
+geneval:
+  data: "benchmarks/image_gen/GenEVAL/GenEVAL.jsonl"
+
+# DPG configuration
+dpg:
+  data: "benchmarks/image_gen/DPG/DPG.jsonl"
+
+# GEdit configuration
+gedit:
+  data: "benchmarks/image_gen/GEdit/GEdit_en.json"
diff --git a/data/__init__.py b/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5539384e8f9613e1ddef01d1be09819a16e0330f
--- /dev/null
+++ b/data/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
\ No newline at end of file
diff --git a/data/common.py b/data/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..c75592ed76653a1c40e5e37b555b3b3f110f38b8
--- /dev/null
+++ b/data/common.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import random
+from typing import List
+
+import torch
+
+
+def generate_system_prompt(system_prompt_type="caption", vision_type="video"):
+    if system_prompt_type == "caption":
+        str_list = [
+            f"Generate a detailed and accurate description of the {vision_type}, including all the key moments and visual details.",
+            f"Write an in-depth depiction of the {vision_type}, covering all its aspects.",
+            f"Write an exhaustive depiction of the given {vision_type}, capturing its essence and key moments.",
+            f"Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background.",
+        ]
+    elif system_prompt_type == "t2v" or system_prompt_type == "ff2v":
+        str_list = [f"Describe the {vision_type} by detailing the color, quantity, visible text, shape, size, texture, spatial relationships and motion/camera movements of the objects and background:"]
+    elif system_prompt_type == "t2i":
+        str_list = [f"Describe the {vision_type} by detailing the color, quantity, text, shape, size, texture, spatial relationships of the objects and background:"]
+    elif "edit" in system_prompt_type:
+        str_list = [f"Describe the key features of the input {vision_type} (color, shape, size, texture, objects, background), then explain how the user’s text instruction should alter or modify the {vision_type}. Generate a new {vision_type} that meets the user’s requirements while maintaining consistency with the original input where appropriate."]
+    elif "idip" in system_prompt_type:
+        str_list = [f"Describe the key features of the input image (color, shape, size, texture, objects, background, style), then incorporate the user’s text description to generate a new {vision_type} that satisfies the user’s requirements while preserving the essential identity and object or style information from the reference input."]
+    elif 'maze' in system_prompt_type:
+        str_list = [
+            "Describe the key elements of the input maze image (layout, white path, black walls, blue star, red flag, and overall background), then generate a 2D animation. The blue star should slide smoothly along the white path, stop exactly on the red flag, and then acquire a trophy. Ensure the blue star never crosses or enters the black maze walls. Keep the camera as a static top-down view showing the entire maze."
+        ]
+
+    return random.choice(str_list)
+
+
+def shift_position_ids(
+    position_ids: torch.Tensor,
+    pos_shift: any,  # not read in this function
+    attn_modes: List[str],
+    split_lens: int,  # indexed once per attn mode below, so effectively a sequence despite the int annotation
+    shift_attn_mode=["full_noise", "full"],  # attn modes that trigger the shift; the default list is never mutated here
+    pro_type=None,
+    i_sample_task=None,  # not read in this function
+    i_sample_modality=None,  # assumes a tensor, so `== k` yields a boolean mask over the last axis - TODO confirm
+) -> torch.Tensor:
+    """Re-base `position_ids` in place by modality (only when pro_type == 10 and an attn mode is in shift_attn_mode); returns the same tensor."""
+    curr_split = 0  # running sum of split_lens; accumulated but never read
+    for i, attn_mode in enumerate(attn_modes):
+        if attn_mode in shift_attn_mode:
+            if pro_type == 10:  # depends on i_sample_modality
+                if position_ids[:, :, i_sample_modality == 4].sum() != 0:
+                    pos_shift_type4 = 1000 - position_ids[:, :, i_sample_modality == 4][0, 0, 0]  # offset that moves the first modality-4 id to 1000
+                    position_ids[0, :, i_sample_modality == 4] += pos_shift_type4  # only index 0 of the first axis is shifted
+                if position_ids[:, :, i_sample_modality == 3].sum() != 0:
+                    pos_shift_type3 = 2000 - position_ids[:, :, i_sample_modality == 3][0, 0, 0]  # offset that moves the first modality-3 id to 2000
+                    position_ids[0, :, i_sample_modality == 3] += pos_shift_type3  # only index 0 of the first axis is shifted
+                # Copy the modality-2 ids onto the modality-1 slots when both masks select the same number of positions.
+                if position_ids[:, :, i_sample_modality == 2].sum() != 0 and sum(i_sample_modality == 2) == sum(i_sample_modality == 1):
+                    position_ids[:, :, i_sample_modality == 1] = position_ids[:, :, i_sample_modality == 2]
+        curr_split += split_lens[i]
+    return position_ids
diff --git a/data/data_utils.py b/data/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3822fea5de7cd9ae5cbe143478a46a1de107305f
--- /dev/null
+++ b/data/data_utils.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""
+Data helpers used by inference (`inference_lance.py`, `ValidationDataset`) and the
+Lance model core (`modeling/lance/lance.py`).
+
+Exported utilities:
+    - Position id helpers (image / video, interpolate / extrapolate)
+    - Patchify helpers (image + video-with-merge)
+    - create_sparse_mask       : flex-attention sparse mask builder
+    - add_special_tokens       : register chat / vision tokens on a tokenizer
+    - len2weight               : CE loss reweighting factor
+"""
+
+from einops import rearrange
+
+import torch
+from torch.nn.attention.flex_attention import or_masks, and_masks
+
+
+# ------------------------------------------------------------------
+# Position id helpers
+# ------------------------------------------------------------------
+
+def get_flattened_position_ids_interpolate_video(num_frames, img_h, img_w, patch_size, max_num_frames, max_num_patches_per_side):
+    """Return flat (t, h, w) position ids after bucketizing each axis' fractional coordinates.
+
+    Time uses ``max_num_frames`` bins and height/width use ``max_num_patches_per_side`` bins;
+    the id is ``t_bin * side * side + h_bin * side + w_bin`` with ``side = max_num_patches_per_side``.
+    """
+    side = max_num_patches_per_side
+
+    def _bucket_ids(count, num_bins):
+        # Fractional coordinates with step 1 / count are mapped onto num_bins equal-width bins.
+        edges = torch.arange(1 / num_bins, 1.0, 1 / num_bins)
+        return torch.bucketize(torch.arange(0, 1 - 1e-6, 1 / count), edges, right=True)
+
+    t_ids = _bucket_ids(num_frames, max_num_frames)
+    h_ids = _bucket_ids(img_h // patch_size, side)
+    w_ids = _bucket_ids(img_w // patch_size, side)
+    grid = t_ids[:, None, None] * side * side + h_ids[None, :, None] * side + w_ids[None, None, :]
+    return grid.flatten()
+
+
+def get_flattened_position_ids_extrapolate_video(t, h, w, max_latent_size):
+    """
+    Return flat position ids for a (t, h, w) grid, computed as
+    ``(frame * max_latent_size) * max_latent_size + row * max_latent_size + col``.
+
+    Defaults noted by the original author:
+        num_frames = 7 (corresponds to 25 frames)
+        max_num_patches_per_side = 64
+    """
+    stride = max_latent_size
+    frame_ids = torch.arange(0, t)[:, None, None]
+    row_ids = torch.arange(0, h)[None, :, None]
+    col_ids = torch.arange(0, w)[None, None, :]
+    # Broadcasting the three axes yields a (t, h, w) grid, flattened in row-major order.
+    return (frame_ids * stride * stride + row_ids * stride + col_ids).flatten()
+
+
+def get_flattened_position_ids_extrapolate(img_h, img_w, patch_size, max_num_patches_per_side):
+    """Return row-major patch ids ``row * max_num_patches_per_side + col`` as a flat tensor."""
+    # Integer division: trailing pixels that do not fill a whole patch are ignored.
+    rows = torch.arange(0, img_h // patch_size)
+    cols = torch.arange(0, img_w // patch_size)
+    return (rows[:, None] * max_num_patches_per_side + cols).flatten()
+
+
+def get_flattened_position_ids_interpolate(img_h, img_w, patch_size, max_num_patches_per_side):
+    """Return flat patch ids after bucketizing fractional (h, w) coordinates into the max grid."""
+    side = max_num_patches_per_side
+    edges = torch.arange(1 / side, 1.0, 1 / side)
+    # Fractional coordinates start at 0 with step 1 / num_patches; the range stops before 1 - 1e-6.
+    frac_h = torch.arange(0, 1 - 1e-6, 1 / (img_h // patch_size))
+    frac_w = torch.arange(0, 1 - 1e-6, 1 / (img_w // patch_size))
+    h_ids, w_ids = (torch.bucketize(frac, edges, right=True) for frac in (frac_h, frac_w))
+    return (h_ids[:, None] * side + w_ids).flatten()
+
+
+# ------------------------------------------------------------------
+# Patchify helpers
+# ------------------------------------------------------------------
+
+def patchify(image, patch_size):
+    """Split a (c, h, w) image into flattened patches of shape (num_patches, patch_size**2 * c)."""
+    channels, height, width = image.shape
+    assert height % patch_size == 0 and width % patch_size == 0
+    rows, cols = height // patch_size, width // patch_size
+    tiles = image.reshape(channels, rows, patch_size, cols, patch_size)
+    # (c, row, p, col, q) -> (row, col, p, q, c): patches in row-major order, channel last.
+    return tiles.permute(1, 3, 2, 4, 0).reshape(-1, patch_size**2 * channels)
+
+
+def patchify_video_with_merge(video, spatial_patch_size, temporal_patch_size, merge_size=2):
+    """
+    Cut a video into flattened spatio-temporal patches, ordered so each merge window is contiguous.
+
+    Args:
+        video: Tensor of shape [C, T, H, W]
+        spatial_patch_size: patch size for H/W
+        temporal_patch_size: patch size for T
+        merge_size: merging factor for the spatial grid (fixed to 2)
+
+    Returns:
+        patches: Tensor of shape [num_patches, patch_dim]
+    """
+    frames = video.permute(1, 0, 2, 3)  # C T H W -> T C H W
+    num_frames, channels, height, width = frames.shape
+    grid_t = num_frames // temporal_patch_size
+    grid_h, grid_w = height // spatial_patch_size, width // spatial_patch_size
+    blocks = frames.reshape(grid_t, temporal_patch_size, channels, grid_h // merge_size, merge_size, spatial_patch_size, grid_w // merge_size, merge_size, spatial_patch_size)
+    # -> (grid_t, merged_row, merged_col, row_in_merge, col_in_merge, C, tp, p, p)
+    return blocks.permute(0, 3, 6, 4, 7, 2, 1, 5, 8).reshape(grid_t * grid_h * grid_w, channels * temporal_patch_size * spatial_patch_size * spatial_patch_size)
+
+
+# ------------------------------------------------------------------
+# Sparse attention mask (flex-attention)
+# ------------------------------------------------------------------
+
+def create_sparse_mask(document_lens, split_lens, attn_modes, device):
+    """Build the flex-attention mask function: (causal OR same full/noise split) AND noise rule AND same document."""
+    def causal_mask(b, h, q_idx, kv_idx):
+        return q_idx >= kv_idx
+    # True when query and key share a split whose mode is "full" or "noise" (seq id >= 0).
+    def full_and_noise_mask(b, h, q_idx, kv_idx):
+        return (full_and_noise_seq_id[q_idx] == full_and_noise_seq_id[kv_idx]) & (full_and_noise_seq_id[q_idx] >= 0)
+    # False only when the key lies in a "noise" split and the query is not in that same split.
+    def remove_noise_mask(b, h, q_idx, kv_idx):
+        return ~((noise_seq_id[kv_idx] >= 0) & (noise_seq_id[q_idx] != noise_seq_id[kv_idx]))
+    # True when query and key carry the same document id.
+    def sample_mask(b, h, q_idx, kv_idx):
+        return document_id[q_idx] == document_id[kv_idx]
+
+    full_and_noise_tmp = []
+    noise_tmp = []
+    # Per-token split ids: the split index i for the tagged modes, -1 for every other mode.
+    for i, (length, mode) in enumerate(zip(split_lens, attn_modes)):
+        value = i if mode in ["full", "noise"] else -1
+        full_and_noise_tmp.extend([value] * length)
+        value_noise = i if mode == "noise" else -1
+        noise_tmp.extend([value_noise] * length)
+
+    full_and_noise_seq_id = torch.Tensor(full_and_noise_tmp).to(device)
+    noise_seq_id = torch.Tensor(noise_tmp).to(device)
+    # Documents are numbered from 1; each id is repeated for that document's length.
+    document_id = torch.cat([torch.full((l,), i) for i, l in enumerate(document_lens, start=1)]).to(device)
+    return and_masks(or_masks(causal_mask, full_and_noise_mask), remove_noise_mask, sample_mask)
+
+
+# ------------------------------------------------------------------
+# Tokenizer / loss helpers
+# ------------------------------------------------------------------
+
+def add_special_tokens(tokenizer):
+    """Add whichever of the four chat/vision marker tokens the tokenizer lacks and look up their ids.
+
+    Returns ``(tokenizer, new_token_ids, num_new_tokens)``; ``num_new_tokens`` is the ``add_tokens`` result.
+    """
+    known_tokens = []
+    for value in tokenizer.special_tokens_map.values():
+        if isinstance(value, str):
+            known_tokens.append(value)
+        elif isinstance(value, list):
+            known_tokens += value
+    markers = {
+        "bos_token_id": "<|im_start|>",
+        "eos_token_id": "<|im_end|>",
+        "start_of_image": "<|vision_start|>",
+        "end_of_image": "<|vision_end|>",
+    }
+    missing = [tok for tok in markers.values() if tok not in known_tokens]
+    num_new_tokens = tokenizer.add_tokens(missing)
+    new_token_ids = {name: tokenizer.convert_tokens_to_ids(tok) for name, tok in markers.items()}
+    return tokenizer, new_token_ids, num_new_tokens
+
+
+def len2weight(x, loss_reduction="square"):
+    """Return the CE-loss weight for a span of length ``x`` under ``loss_reduction``."""
+    if x == 0:
+        return x  # zero length: hand the input back instead of dividing by it
+    if loss_reduction == "square":
+        return 1 / (x**0.5)
+    if loss_reduction in ("token", "sample"):
+        return 1 if loss_reduction == "token" else 1 / x
+    # Any other mode name is unsupported.
+    raise NotImplementedError(loss_reduction)
diff --git a/data/dataset_base.py b/data/dataset_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..45a37b56206090e8101a19411252890e5c994099
--- /dev/null
+++ b/data/dataset_base.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, Tuple
+
+import torch
+import yaml
+
+
+@dataclass
+class DataConfig:
+    """
+    DataConfig variant in which vae_downsample is a 3-tuple.
+    """
+    grouped_datasets: Dict[str, Any] = field(default_factory=dict)
+    text_cond_dropout_prob: float = 0.1
+    vit_cond_dropout_prob: float = 0.4
+    vae_cond_dropout_prob: float = 0.1
+
+    # vae_downsample is a 3-tuple: the downsampling factors for (time, height, width)
+    vae_downsample: Tuple[int, int, int] = (4, 16, 16)
+
+    max_latent_size: int = 64             # by ModelArguments
+    vit_patch_size: int = 14              # by ModelArguments
+    vit_patch_size_temporal: int = 2      # by ModelArguments
+    vit_max_num_patch_per_side: int = 70  # by ModelArguments
+    max_num_frames: int = 25              # by ModelArguments
+
+    latent_patch_size: int = None         # by ModelArguments
+
+    @classmethod
+    def from_yaml(cls, file_path: str) -> 'DataConfig':
+        """Create a DataConfig from a YAML/JSON file; the parsed content becomes ``grouped_datasets``."""
+        with open(file_path, "r") as stream:
+            data = yaml.safe_load(stream)
+        return cls(grouped_datasets=data)
+
+
+class SimpleCustomBatch:
+    """Expose the keys of the first batch element as attributes, with tensor-moving helpers."""
+
+    def __init__(self, batch):
+        for name, item in batch[0].items():
+            setattr(self, name, item)
+
+    def _apply_to_tensors(self, fn):
+        # Apply fn to tensor attributes and to non-empty lists made only of tensors.
+        for name, item in self.__dict__.items():
+            if isinstance(item, torch.Tensor):
+                setattr(self, name, fn(item))
+            elif isinstance(item, list) and item and all(isinstance(el, torch.Tensor) for el in item):
+                setattr(self, name, [fn(el) for el in item])
+        return self
+
+    def pin_memory(self):
+        return self._apply_to_tensors(lambda tensor: tensor.pin_memory())
+
+    def cuda(self, device):
+        return self._apply_to_tensors(lambda tensor: tensor.to(device))
+
+    def to_dict(self):
+        return self.__dict__.copy()
+
+
+# Top-level function (so that it can be pickled)
+def simple_custom_collate(batch):
+    return SimpleCustomBatch(batch)
diff --git a/data/datasets_custom/__init__.py b/data/datasets_custom/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a2ba5f3c2498528206de1846b17f4a5f8afdcac
--- /dev/null
+++ b/data/datasets_custom/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""Entry point for the custom datasets."""
+
+from .validation_dataset import ValidationDataset
+
+__all__ = ["ValidationDataset"]
diff --git a/data/datasets_custom/validation_dataset.py b/data/datasets_custom/validation_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e69ca42f15d3106db077817a5a42690d192a0de
--- /dev/null
+++ b/data/datasets_custom/validation_dataset.py
@@ -0,0 +1,1174 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import json
+import os
+from typing import Any, Dict, List
+
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+import decord
+from decord import VideoReader
+from PIL import Image
+
+from data.video.sampler.utils import FRAME_SAMPLER_TYPES
+from data.video.sampler.frames import FrameSamplerOutput
+from data.transforms import VideoTransform
+from data.data_utils import (
+    get_flattened_position_ids_extrapolate_video,
+    len2weight,
+    patchify_video_with_merge,
+)
+from data.system_prompt_render import render_qwenvl_prompt, expand_and_index_by_token_ids_new
+from data.common import generate_system_prompt
+from modeling.qwen2 import Qwen2Tokenizer
+from config.config_factory import ModelArguments, DataArguments, TrainingArguments
+
+sample_task_map = {  # sample task name -> integer id
+    't2v': 0,
+    'idip': 1,
+    'edit': 2,
+    'refedit': 3,
+}
+modality_map = {  # segment modality name -> integer id
+    'system_prompt': -1,
+    'text': 0,
+    'noise': 1,
+    'ref_source': 2, # for vae
+    'ref_image': 3, # for vae
+    'ref_vit': 4 # for ref vit
+}
+
+
+class ValidationDataset(Dataset):
+    def __init__(
+        self,
+        jsonl_path: str,
+        tokenizer: Qwen2Tokenizer,
+        data_args: DataArguments,
+        model_args: ModelArguments,
+        training_args: TrainingArguments,
+        new_token_ids: Dict[str, int],
+        dataset_config: None,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        """
+        Initialise the validation dataset.
+
+        Args:
+            jsonl_path: Path to a JSONL file; a single JSON document is accepted as a fallback.
+            tokenizer: Tokenizer stored on the instance.
+            data_args, model_args, training_args: Accepted but not read by this constructor.
+            new_token_ids: Must contain bos_token_id, eos_token_id, start_of_image, end_of_image, image_token_id.
+            dataset_config: Stored as ``self.data_config``; its ``resolution`` selects the transform sizes.
+            local_rank, world_size: With world_size > 1 this rank keeps ``full_data[local_rank::world_size]``.
+
+        Raises ValueError when ``dataset_config.resolution`` is not one of the known presets.
+        """
+        self.jsonl_path = jsonl_path
+        self.tokenizer = tokenizer
+        self.new_token_ids = new_token_ids
+
+        # Read the JSONL file; if it does not parse line by line, load it as one JSON document.
+        try:
+            full_data = self._read_jsonl()
+        except ValueError:  # json.JSONDecodeError / UnicodeDecodeError; a bare except also hid Ctrl-C
+            with open(jsonl_path, 'r', encoding='utf-8') as f:
+                full_data = json.load(f)
+            if isinstance(full_data, dict):
+                # Convert to a list in which every element is an independent dict
+                full_data = [{"index": self.pro_index(index), "data": prompt} for index, prompt in full_data.items()]
+
+        if world_size > 1:
+            self.data = full_data[local_rank::world_size]
+            print(f"Rank {local_rank}/{world_size} will process {len(self.data)} samples")
+        else:
+            self.data = full_data
+
+        self.data_config = dataset_config
+
+        self.bos_token_id = self.new_token_ids["bos_token_id"]
+        self.eos_token_id = self.new_token_ids["eos_token_id"]
+        self.start_of_image = self.new_token_ids["start_of_image"]
+        self.end_of_image = self.new_token_ids["end_of_image"]
+        self.image_token_id = self.new_token_ids["image_token_id"]
+
+        # Video frame sampling: max_duration falls back to 6.0 when the config does not define it
+        max_duration = getattr(self.data_config, "max_duration", 6.0)
+
+        video_frame_sampler_params = {"temporal": 4, "sample_fps": 12, "max_duration": max_duration, "assert_seconds": False, "truncate": False}
+
+        self.frame_sampler = FRAME_SAMPLER_TYPES["multi_clips"](**video_frame_sampler_params)
+        self.cpu_count = os.cpu_count() or 1
+
+        # VideoTransform for vae: only takes effect when a raw video is present
+        if self.data_config.resolution in ["video_192p", "image_256res"]:
+            resolution_vae = 256
+            resolution_vit = 224
+        elif self.data_config.resolution == "image_512res":
+            resolution_vae = 512
+            resolution_vit = 448
+        elif self.data_config.resolution == "image_768res":
+            resolution_vae = 768
+            resolution_vit = 672
+        elif self.data_config.resolution == "video_360p":
+            resolution_vae = 480  # 480 for 360fps # 256 for 192p
+            resolution_vit = 476  # 476 for 360fps , 224 for 192p
+        elif self.data_config.resolution == "video_480p":
+            resolution_vae = 640  # 480 for 360fps # 256 for 192p
+            resolution_vit = 616  # 476 for 360fps , 224 for 192p
+        else:
+            raise ValueError(f"Unknown resolution: {self.data_config.resolution}")
+
+        video_transform_args = {
+            "resolution": resolution_vae,
+            "mode": "bucket",
+            "divisible_crop_size": 16,  # 32 # 16 | 32: value the video resolution must be divisible by
+            "stride_spatial": 16,  # spatial downsampling factor
+            "stride_temporal": 4,  # temporal downsampling factor
+            "aspect_ratios": ["21:9", "16:9", "4:3", "1:1", "3:4", "9:16"],  # only takes effect when mode="bucket"
+            "mean": 0.5,
+            "std": 0.5,
+        }
+        self.transform = VideoTransform(**video_transform_args)
+
+        # VideoTransform for vit
+        vit_video_transform_args = {
+            "resolution": resolution_vit,
+            "mode": "bucket",
+            "divisible_crop_size": 28,  # value the video resolution must be divisible by; qwen2.5vl requires divisibility by 14
+            "aspect_ratios": ["21:9", "16:9", "4:3", "1:1", "3:4", "9:16"],  # only takes effect when mode="bucket"
+            "mean": [0.48145466, 0.4578275, 0.40821073],  # mean used by the Qwen2.5-VL vit
+            "std": [0.26862954, 0.26130258, 0.27577711],
+        }
+        self.vit_transform = VideoTransform(**vit_video_transform_args)
+
+        self.sample = self.set_sequence_status()
+
+        self.frame_condition_idx = []
+
+        self.system_prompt_type = getattr(self.data_config, 'system_prompt_type', 'SP0')
+
+    def pro_index(self, index: int):
+        """Return ``index`` as an int; for strings every '.mp4'/'.jpg'/'.png'/'.jpeg' is removed first."""
+        for suffix in (['.mp4', '.jpg', '.png', '.jpeg'] if isinstance(index, str) else []):
+            index = index.replace(suffix, "")
+        return int(index)
+
+    def set_sequence_status(self):
+        """Return a fresh packing-state dict: ``curr`` is 0 and every other field is an empty list."""
+        return {
+            "curr": 0,  # pointer
+            "sample_lens": [],
+            "sample_type": [],
+            "sample_N_target": [],
+            "packed_position_ids": [],
+            "nested_attention_masks": [],
+            "split_lens": [],
+            "attn_modes": [],
+            "packed_text_ids": [],
+            "packed_text_indexes": [],
+            "packed_label_ids": [],
+            "ce_loss_indexes": [],
+            "ce_loss_weights": [],
+            "vae_image_tensors": [],  # image
+            "vae_video_tensors": [],  # video
+            "packed_latent_position_ids": [],
+            "vae_latent_shapes": [],
+            "packed_vae_token_indexes": [],
+            "packed_timesteps": [],
+            "mse_loss_indexes": [],
+            "packed_vit_tokens": [],
+            "vit_token_seqlens": [],
+            "packed_vit_position_ids": [],
+            "packed_vit_token_indexes": [],
+            "vit_video_grid_thw": [],  # for vit video
+            "vae_video_grid_thw": [],  # for vae video
+            "video_grid_thw": [],  # for all video tensor
+            "vit_video_tensors": [],  # for vit original video tensor
+            # offline parameters
+            "vae_video_latent": [],  # for vae video latent offline
+            "vae_data_mode": [],  # offline or online
+            "vit_data_mode": [],  # offline or online
+            "key_frame_mask": [],  # for key frame mask
+            # sample_task for joint training
+            "sample_task": [],
+            "sample_modality": [],
+        }
+
+    def _read_jsonl(self) -> List[Dict[str, Any]]:
+        """Read the JSONL file: one JSON value per non-blank line, in file order."""
+        with open(self.jsonl_path, "r", encoding="utf-8") as f:
+            # Blank lines (e.g. a trailing newline at end of file) are skipped; json.loads("")
+            # would otherwise raise and abort the whole JSONL read.
+            stripped = (line.strip() for line in f)
+            return [json.loads(line) for line in stripped if line]
+
+    def __len__(self) -> int:
+        """Number of samples held in ``self.data``."""
+        return len(self.data)
+
+    @staticmethod
+    def _read_decord(video: VideoReader, frame_idx: List[int]) -> List[Image.Image]:
+        # Use get_batch() instead of reading frames one at a time in a loop; this greatly improves performance
+        frames_np = video.get_batch(frame_idx).asnumpy()
+        return [Image.fromarray(frame) for frame in frames_np]
+
+    def get_video_tensor_online(self, media_url, vision_stream, worker_id=0, element_dtype="image") -> torch.Tensor:
+        """Load an image or video from ``media_url`` and return the tensor produced by the VAE or ViT transform."""
+        self.vision_stream = vision_stream
+        video_stream = media_url  # BytesIO(self.tos_cli.get_obj_by_url(media_url))
+        if element_dtype == "image":
+            image = Image.open(video_stream)
+            if image.mode == "P":
+                image = image.convert("RGBA")
+            if image.mode == "RGBA":
+                # Composite onto a white background to remove transparency
+                bg = Image.new("RGB", image.size, (255, 255, 255))
+                bg.paste(image, mask=image.split()[3])  # use the alpha channel as the mask
+                image = bg
+            else:
+                image = image.convert("RGB")
+            video_frames = [image]
+        else:  # for video
+            video_reader = VideoReader(video_stream, ctx=decord.cpu(worker_id % self.cpu_count))
+            total_frames = len(video_reader)
+
+            sampler_name = self.frame_sampler.__class__.__name__
+            if sampler_name == "MultiClipsFrameSampler":
+                frames_info = {
+                    "clip_indices": [(0, total_frames)],  # half-open [start, end); a single clip by default
+                    "fps": 24,  # defaults to 24
+                }
+            elif sampler_name == "FixedFrameSampler":
+                frames_info = {
+                    "start_frame": 0,
+                    "end_frame": total_frames,
+                    "total_frames": total_frames,
+                }
+            else:
+                raise ValueError(f"Not verified frame sampler type: {sampler_name}")
+
+            frames_sampler_output: FrameSamplerOutput = self.frame_sampler(frames_info)
+            video_frames = self._read_decord(video_reader, frames_sampler_output.indices)
+
+        if vision_stream == "vae_video":
+            video_tensor = self.transform(video_frames)  # fix: use List input
+        elif vision_stream == "vit_video":
+            video_tensor = self.vit_transform(video_frames)  # fix: use List input
+            if element_dtype == "image":
+                video_tensor = video_tensor.repeat(1, 2, 1, 1)  # NOTE: a single image must be duplicated because the encoder's temporal patch size = 2
+            # NOTE: the video length must be even
+            if video_tensor.shape[1] % 2 == 1:
+                last_frame = video_tensor[:, -1:, :, :]
+                video_tensor = torch.cat([video_tensor, last_frame], dim=1)
+
+        else:
+            raise ValueError(f"Unknown vision_stream: {vision_stream}")
+        return video_tensor  # , self.vision_token_count(video_tensor)
+
+    def process_vit_video(self, video_tensor, curr: int, curr_rope_id: int, curr_split_len: int, curr_video_grid_thw: None, item_loss=0):
+        """Append one ViT video/image segment to ``self.sample`` and return the advanced packing cursors."""
+        if not self.data_config.text_template:
+            self.sample["packed_text_ids"].append(self.start_of_image)  # 151652, <|vision_start|>
+            self.sample["packed_text_indexes"].append(curr)
+            curr += 1
+            curr_split_len += 1
+        # Online mode: video_tensor is a tensor; offline mode: video_tensor is a list [latent]
+        if isinstance(video_tensor, torch.Tensor):  # online
+            self.sample["vit_video_tensors"].append(video_tensor)  # CTHW raw video, not latent; only used for visualisation during validation
+
+            # preprocess video
+            vit_tokens = patchify_video_with_merge(
+                video_tensor, self.data_config.vit_patch_size, self.data_config.vit_patch_size_temporal
+            )  # C T H W -> (T//2 * H//p * W//p) (p*p*2*C)
+            num_video_tokens = vit_tokens.shape[0] // 4  # qwen2.5-vl additionally merges each 2x2 group into one token; hardcoded for now
+            t, h, w = video_tensor.size(1), video_tensor.size(2), video_tensor.size(3)
+
+            self.sample["packed_vit_tokens"].append(vit_tokens)
+            self.sample["vit_data_mode"].append("online")
+        # NOTE(review): t, h, w and num_video_tokens are only bound in the tensor branch above, so a non-tensor input fails at the next line.
+        if t is not None:
+            vit_video_grid_thw = [
+                t // self.data_config.vit_patch_size_temporal,
+                h // self.data_config.vit_patch_size,
+                w // self.data_config.vit_patch_size,
+            ]  # [1, 16, 16]
+        self.sample["vit_video_grid_thw"].append(vit_video_grid_thw)
+        curr_video_grid_thw.append(vit_video_grid_thw)
+
+        self.sample["vit_token_seqlens"].append(num_video_tokens)
+        self.sample["packed_vit_position_ids"].append(
+            torch.zeros(num_video_tokens)
+        )  # TODO: not necessarily 0? This would be a problem with multiple vit sequences
+
+        if not self.data_config.text_template:
+            self.sample["packed_vit_token_indexes"].extend(range(curr, curr + num_video_tokens))
+            curr += num_video_tokens
+            curr_split_len += num_video_tokens
+
+            # NOTE dummy position_ids
+            self.sample["packed_text_ids"].extend([self.image_token_id] * num_video_tokens)
+
+            # add a <|endofimage|> token
+            self.sample["packed_text_ids"].append(self.end_of_image)  # 151653, <|vision_end|>
+            self.sample["packed_text_indexes"].append(curr)
+            curr += 1
+            curr_split_len += 1
+            self.sample["packed_position_ids"].extend([curr_rope_id] * curr_split_len)
+            curr_rope_id += 1
+
+            # update sequence status
+            self.sample["attn_modes"].append("full")
+            self.sample["split_lens"].append(curr_split_len)
+
+        return self.sample, curr, curr_rope_id, curr_split_len, curr_video_grid_thw, num_video_tokens
+
+    def process_text(self, caption: str, curr: int, curr_rope_id: int, curr_split_len: int, item_loss=0):
+        """Tokenize ``caption``, wrap it in the bos/eos special tokens and append it to ``self.sample`` as a causal split."""
+        text_ids = self.tokenizer.encode(caption)
+        shifted_text_ids = [self.bos_token_id] + text_ids  # NOTE: self.bos_token_id=151644 <|im_start|>
+
+        self.sample["packed_text_ids"].extend(shifted_text_ids)
+        self.sample["packed_text_indexes"].extend(range(curr, curr + len(shifted_text_ids)))
+
+        # NOTE: whether this is generation or understanding can be determined via item_loss == 1
+        if item_loss == 1:
+            loss_token_shift = 0  # HACK
+            self.sample["ce_loss_indexes"].extend(range(curr - loss_token_shift, curr + len(shifted_text_ids)))
+            self.sample["ce_loss_weights"].extend([len2weight(len(shifted_text_ids) + loss_token_shift)] * (len(shifted_text_ids) + loss_token_shift))
+            self.sample["packed_label_ids"].extend(text_ids + [self.eos_token_id])  # NOTE: self.eos_token_id=151645 <|im_end|>
+        curr += len(shifted_text_ids)
+        curr_split_len += len(shifted_text_ids)
+
+        # add a <|im_end|> token
+        self.sample["packed_text_ids"].append(self.eos_token_id)
+        self.sample["packed_text_indexes"].append(curr)
+        curr += 1
+        curr_split_len += 1
+
+        # update sequence status
+        self.sample["attn_modes"].append("causal")
+        # if self.apply_chat_template:
+        self.sample["packed_position_ids"].extend(range(curr_rope_id, curr_rope_id + curr_split_len))
+        curr_rope_id += curr_split_len
+
+        # self.sample['sample_modality'].extend([modality_map[item['type']]] * curr_split_len)
+
+        self.sample["split_lens"].append(curr_split_len)
+
+        return self.sample, curr, curr_rope_id, curr_split_len
+
+
+    def process_vae_video(self, video_tensor, curr: int, curr_rope_id: int, curr_split_len: int, curr_video_grid_thw: None, video_sizes: list, item_loss=0):
+        """Append one VAE segment to ``self.sample``: attn mode "noise" with a random timestep when item_loss == 1, else "full_noise" with timestep -inf."""
+        import sys  # function-scope import: sys.float_info is used below but sys is not imported at module level
+        if not self.data_config.text_template:
+            num_special_tokens = 0
+            # Add the <|startofimage|> token (shared by video and image). TODO: should image and video special tokens be split?
+            self.sample["packed_text_ids"].append(self.start_of_image)  # self.start_of_image=151652, <|vision_start|>
+            self.sample["packed_text_indexes"].append(curr)
+            curr += 1
+            curr_split_len += 1
+            num_special_tokens += 1
+        # Online mode: video_tensor is a tensor; offline mode: video_tensor is a list [latent]
+        if isinstance(video_tensor, torch.Tensor):  # online
+            # Preprocess the video
+            self.sample["vae_video_tensors"].append(video_tensor)  # CTHW raw video, not latent
+            # Assumes video_tensor has shape (C, T, H, W)
+            _, T, H, W = video_tensor.shape
+            _T, _H, _W = self.data_config.vae_downsample  # NOTE: absolute-scale downsample factors, patchify included!
+            t = (T - 1) // _T + 1  # k*N+1; the t axis is normally not patchified!! Update this if t ever needs patchify
+            h = H // _H
+            w = W // _W
+            self.sample["vae_data_mode"].append("online")
+
+            spatial_merge_size = 2  # TODO: is spatial_merge_size always 2?
+            vae_video_grid_thw = [
+                t,
+                h * spatial_merge_size,
+                w * spatial_merge_size,
+            ]  # Qwen-VL's rope handling divides by spatial_merge_size by default (matching the ViT path), so the VAE grid is multiplied by spatial_merge_size
+
+            self.sample["vae_video_grid_thw"].append(vae_video_grid_thw)
+            curr_video_grid_thw.append(vae_video_grid_thw)
+
+            # Use the native (t, h, w) latent shape
+            self.sample["vae_latent_shapes"].append((t, h, w))
+
+            # Use the 3D-aware position-id function
+            # Extrapolation
+            packed_latent_position_ids = get_flattened_position_ids_extrapolate_video(t, h, w, max_latent_size=self.data_config.max_latent_size)
+            self.sample["packed_latent_position_ids"].append(packed_latent_position_ids)
+
+            num_vid_tokens = t * h * w
+            if not self.data_config.text_template:
+                self.sample["packed_vae_token_indexes"].extend(range(curr, curr + num_vid_tokens))
+
+            if item_loss == 1:
+                timestep = np.random.randn()  # NOTE: a sigmoid is applied outside
+
+                frame_condition_idx = self.frame_condition_idx
+                packed_timesteps = [timestep] * num_vid_tokens
+
+                mse_loss_indexes = list(range(curr, curr + num_vid_tokens))
+                frame_condition_indexes = []
+                for idx in frame_condition_idx:
+                    if idx == -1:
+                        idx = t - 1
+                        if idx == 1:
+                            continue  # Skip when there are only two frames, so that not every frame becomes a condition frame
+                    frame_condition_indexes.extend(mse_loss_indexes[idx * h * w : (idx + 1) * h * w])
+                    packed_timesteps[idx * h * w : (idx + 1) * h * w] = [-sys.float_info.max] * (h * w)
+                if frame_condition_idx:
+                    mse_loss_indexes = sorted(list(set(mse_loss_indexes) - set(frame_condition_indexes)))
+
+                if not self.data_config.text_template:
+                    self.sample["mse_loss_indexes"].extend(mse_loss_indexes)  # range(curr, curr + num_vid_tokens))
+            else:
+                timestep = float("-inf")
+                packed_timesteps = [timestep] * num_vid_tokens
+
+            self.sample["packed_timesteps"].extend(packed_timesteps)
+
+            if not self.data_config.text_template:
+                curr += num_vid_tokens
+                curr_split_len += num_vid_tokens
+
+                self.sample["packed_text_ids"].extend([self.image_token_id] * num_vid_tokens)
+
+                # Add the <|endofimage|> token
+                self.sample["packed_text_ids"].append(self.end_of_image)  # self.end_of_image=151653, <|vision_end|>
+                self.sample["packed_text_indexes"].append(curr)
+                curr += 1
+                curr_split_len += 1
+                num_special_tokens += 1
+
+                # Update the sequence status
+                if item_loss == 1:
+                    self.sample["attn_modes"].append("noise")
+                else:
+                    self.sample["attn_modes"].append("full_noise")
+
+                self.sample["packed_position_ids"].extend([curr_rope_id] * (num_vid_tokens + num_special_tokens))  # NOTE: why is rope fixed?
+                curr_rope_id += 1
+
+                # update sample sequence modality
+                # if item_loss == 1:
+                #     self.sample['sample_modality'].extend([modality_map['noise']] * curr_split_len)
+                # elif item_loss == 0 and sample_task == 'edit':
+                #     self.sample['sample_modality'].extend([modality_map['ref_source']] * curr_split_len)
+                # elif item_loss == 0 and sample_task == 'idip':
+                #     self.sample['sample_modality'].extend([modality_map['ref_image']] * curr_split_len)
+
+                self.sample["split_lens"].append(curr_split_len)
+
+            video_sizes.append([T, H, W])
+        # NOTE(review): num_vid_tokens is only bound in the tensor branch above; a non-tensor input fails at this return.
+        return self.sample, curr, curr_rope_id, curr_split_len, curr_video_grid_thw, video_sizes, num_vid_tokens
+
+    def process_text_template(
+        self,
+        text_ids,
+        spans_index,
+        tgt_index,
+        caption_index,
+        video_types: list[str],
+        curr: int,
+        curr_rope_id: int,
+        curr_split_len: int,
+        item_loss=0,
+    ):
+        """Pack an already-rendered chat-template token sequence into ``self.sample``.
+
+        ``text_ids`` is appended to ``packed_text_ids`` unchanged. The method then walks
+        the vision spans in order and records, for every split (text before a span, the
+        span itself, trailing text), its text indexes, position ids, attention mode,
+        split length and per-token modality.
+
+        Args:
+            text_ids: Token ids of the whole rendered template.
+            spans_index: One entry per vision span. Only ``entry[0]``, ``entry[-1]`` and
+                ``len(entry)`` are used; per the inline comment these are the indexes of
+                the span's first and last ``<|video_pad|>`` tokens inside ``text_ids``.
+            tgt_index: Indexes of the label tokens inside ``text_ids``; only read when
+                ``item_loss == 1``.
+            caption_index: Indexes of the caption tokens (the "search index"); may be ``[]``.
+            video_types: One tag per span, matched by equality with ``"vit_video"`` or by
+                the substrings ``"vae_video"``, ``"cond"`` and ``"target"``.
+            curr: Packed index at which this sequence starts.
+            curr_rope_id: Next position id to hand out.
+            curr_split_len: Never read before being reassigned; returned unchanged when
+                ``spans_index`` is empty.
+            item_loss: ``1`` to also register cross-entropy label positions.
+
+        Returns:
+            ``(self.sample, curr, curr_rope_id, curr_split_len)`` where ``curr`` is the
+            start index plus ``len(text_ids)``.
+
+        Raises:
+            ValueError: For a ``vae_video`` tag that contains neither ``cond`` nor ``target``.
+        """
+        # video_types carries tags such as ['vit_video','vae_video_target','vae_video_cond']; caption_index corresponds to search_index
+
+        self.sample["packed_text_ids"].extend(text_ids)
+        self.sample["sample_lens"] = len(text_ids)
+        # Offset of this sequence inside the pack; span and label indexes are relative to it.
+        curr_split_idx = curr
+
+        for video_id, span_index in enumerate(spans_index):
+            vision_start, vision_end = curr_split_idx + span_index[0], curr_split_idx + span_index[-1]  # packed indexes of the first and last '<|video_pad|>'
+            # Everything from curr up to (but excluding) the first pad token is a text token,
+            # including the token at vision_start - 1 that opens the vision split.
+            self.sample["packed_text_indexes"].extend(range(curr, vision_start))
+            if (vision_start - 1) - curr != 0:  # there is a text split in front of this vision span  ## HACK: modified relative to the llava version
+                curr_split_len = (vision_start - 1) - curr
+                self.sample["packed_position_ids"].extend(
+                    range(curr_rope_id, curr_rope_id + curr_split_len)
+                )  # Note: vision_start - 1 rather than vision_start, because the video split's opening token belongs to the vision split
+                curr_rope_id += curr_split_len
+                # Labelled as system prompt first; the caption range is relabelled at the end of the method.
+                self.sample["sample_modality"].extend([modality_map["system_prompt"]] * curr_split_len)
+
+                # NOTE(review): caption_index is compared against packed indexes without adding
+                # curr_split_idx; the callers visible in this file pass curr=0 — confirm for other callers.
+                if caption_index != [] and caption_index[0] in range(curr, curr + curr_split_len): # NOTE: interleaved text is not supported, i.e. the caption must be contiguous
+                    split_len_1 = caption_index[0] - curr  # length of the system prompt before the caption
+                    split_len_2 = len(caption_index) # length of the caption
+                    split_len_3 = curr_split_len - split_len_1 - split_len_2 # length of the system prompt after the caption
+
+                    # Emit up to three causal splits, dropping the zero-length ones.
+                    split_len_text = [split_len_1, split_len_2, split_len_3]
+                    split_len_text = [x for x in split_len_text if x != 0]
+                    self.sample["attn_modes"].extend(["causal"] * len(split_len_text))
+                    self.sample["split_lens"].extend(split_len_text)
+                else:
+                    self.sample["attn_modes"].append("causal")
+                    self.sample["split_lens"].append(curr_split_len)
+
+            # Vision split = the pad tokens plus the two tokens that bracket them.
+            curr_split_len = len(span_index) + 2
+            # NOTE(review): a tag that is neither "vit_video" nor contains "vae_video" skips both
+            # branches below (no attn mode, no modality) but still gets position ids and a split length.
+            if video_types[video_id] == "vit_video":
+                self.sample["packed_vit_token_indexes"].extend(range(vision_start, vision_end + 1))
+                self.sample["attn_modes"].append("full")  # TODO: add a check here if the gen branch also uses the template
+                self.sample["sample_modality"].extend([modality_map["ref_vit"]] * curr_split_len)
+            elif "vae_video" in video_types[video_id]:
+                self.sample["packed_vae_token_indexes"].extend(range(vision_start, vision_end + 1))
+                if "cond" in video_types[video_id]:
+                    self.sample["attn_modes"].append("full_noise")  # TODO: add a check here if the gen branch also uses the template
+                    # NOTE(review): when self.sample_task is neither "edit" nor "idip" no modality
+                    # entries are appended for this span, leaving sample_modality shorter than the sequence.
+                    if self.sample_task == "edit":
+                        self.sample["sample_modality"].extend([modality_map["ref_source"]] * curr_split_len)
+                    elif self.sample_task == "idip":
+                        self.sample["sample_modality"].extend([modality_map["ref_image"]] * curr_split_len)
+                elif "target" in video_types[video_id]:
+                    self.sample["mse_loss_indexes"].extend(range(vision_start, vision_end + 1))  # f2v is not supported yet
+                    self.sample["attn_modes"].append("noise")  # TODO: add a check here if the gen branch also uses the template
+                    self.sample["sample_modality"].extend([modality_map["noise"]] * curr_split_len)
+                else:
+                    raise ValueError(f"video_types {video_types[video_id]} not supported")
+
+            # Every token of the vision split shares a single position id.
+            self.sample["packed_position_ids"].extend([curr_rope_id] * curr_split_len)
+            # attn_modes.append("full")  # TODO: add a check here if the gen branch also uses the template
+            self.sample["split_lens"].append(len(span_index) + 2)
+            curr = vision_end + 1  # index of the '<|vision_end|>' token
+            curr_rope_id += 1
+            self.sample["packed_text_indexes"].append(curr)
+            curr += 1  # first token of the next split
+
+        # Tokens left after the last vision span; the whole sequence when there is no span at all.
+        len_split_last = self.sample["sample_lens"] - (curr - curr_split_idx) if spans_index != [] else len(text_ids)
+        if len_split_last != 0:  # there is trailing text
+            self.sample["split_lens"].append(len_split_last)
+            self.sample["packed_text_indexes"].extend(range(curr, curr + len_split_last))
+            self.sample["packed_position_ids"].extend(range(curr_rope_id, curr_rope_id + len_split_last))
+            self.sample["attn_modes"].append("causal")
+            self.sample["sample_modality"].extend([modality_map["system_prompt"]] * len_split_last)
+
+        if item_loss == 1:  # understanding task: the cross-entropy loss has to be computed
+            packed_label_index = tgt_index
+            # Labels are every token from the first target index to the end of the sequence.
+            self.sample["packed_label_ids"].extend(text_ids[packed_label_index[0] :])
+            packed_label_index = np.asarray(packed_label_index, dtype=np.int64) + curr_split_idx
+            # Each label is attached to the position immediately before it.
+            ce_loss_indexes = (packed_label_index - 1).tolist()
+            self.sample["ce_loss_indexes"].extend(ce_loss_indexes)
+            self.sample["ce_loss_weights"].extend([len2weight(len(packed_label_index))] * (len(packed_label_index)))
+
+            # Look up the caption's indexes inside the text and relabel their sample_modality
+        # caption_index = item.get("cap_index", [])
+        if caption_index != []:
+            # NOTE(review): the slice indexes self.sample["sample_modality"] directly, with no
+            # curr_split_idx offset — correct only when this sequence starts the list.
+            self.sample["sample_modality"][caption_index[0] : caption_index[-1] + 1] = [modality_map["text"]] * (caption_index[-1] - caption_index[0] + 1)
+
+        curr_split_idx += len(text_ids)
+        curr = curr_split_idx
+        return self.sample, curr, curr_rope_id, curr_split_len
+    def process_und_template(self, system_prompt, user_prompt, answer, vit_video_tensor):
+        """Pack one understanding sample (vision input -> text answer) into ``self.sample``.
+
+        Layout of the packed sequence:
+        <|im_start|>system
+        {system_prompt}<|im_end|>
+        <|im_start|>user
+        <|vision_start|><|video_pad|><|vision_end|>{instruction_prompt}<|im_end|>
+        <|im_start|>assistant
+        {answer}<|im_end|>
+
+        Four splits are appended in order: the text prefix (causal), the vision block
+        (full attention, every token sharing one position id), the text postfix
+        (causal) and the answer (causal, with cross-entropy labels).
+
+        Args:
+            system_prompt: System prompt text, without chat special tokens.
+            user_prompt: Instruction text placed right after the vision block.
+            answer: Ground-truth answer text; a leading newline and the EOS id are added here.
+            vit_video_tensor: Raw ``torch.Tensor`` indexed as (C, T, H, W).
+
+        Returns:
+            ``(sample_lens, curr_video_grid_thw)``: the length of
+            ``self.sample["packed_text_ids"]`` after packing, and a one-element list
+            holding the ViT patch grid ``[t, h, w]``.
+
+        Raises:
+            NotImplementedError: If ``vit_video_tensor`` is not a ``torch.Tensor``. Only the
+                online path is implemented; other inputs used to fail later with an
+                ``UnboundLocalError`` after ``self.sample`` had already been modified.
+        """
+        if not isinstance(vit_video_tensor, torch.Tensor):
+            # Fail before touching self.sample so a rejected input leaves no partial state behind.
+            raise NotImplementedError(
+                "process_und_template only supports a torch.Tensor (online mode), "
+                f"got {type(vit_video_tensor).__name__}"
+            )
+
+        curr = 0
+        curr_rope_id = 0
+        curr_video_grid_thw = []
+
+        # 1. First text part:
+        # <|im_start|>system
+        # {system_prompt}<|im_end|>
+        # <|im_start|>user
+        prompt_prefix = "<|im_start|>" + "system\n" + system_prompt + "<|im_end|>" + "\n" + "<|im_start|>" + "user\n"
+        text_ids_prompt_prefix = self.tokenizer.encode(prompt_prefix)
+        self.sample["packed_text_ids"].extend(text_ids_prompt_prefix)
+        self.sample["packed_text_indexes"].extend(range(curr, curr + len(text_ids_prompt_prefix)))
+        curr += len(text_ids_prompt_prefix)
+        split_len_prefix = len(text_ids_prompt_prefix)
+
+        # update sequence status
+        self.sample["attn_modes"].append("causal")
+        self.sample["packed_position_ids"].extend(range(curr_rope_id, curr_rope_id + split_len_prefix))
+        self.sample["split_lens"].append(split_len_prefix)
+        curr_rope_id += split_len_prefix
+
+        # 2. Vision part: opening token, vision placeholders, closing token.
+        self.sample["packed_text_ids"].append(self.start_of_image)  # 151652, <|vision_start|>
+        self.sample["packed_text_indexes"].append(curr)
+        curr += 1
+        split_len_vision_token = 1
+
+        # Raw CTHW video (not a latent); kept only for visualisation during validation.
+        self.sample["vit_video_tensors"].append(vit_video_tensor)
+
+        # preprocess video
+        vit_tokens = patchify_video_with_merge(
+            vit_video_tensor, self.data_config.vit_patch_size, self.data_config.vit_patch_size_temporal
+        )  # C T H W -> (T//2 * H//p * W//p) (p*p*2*C)
+        # qwen2.5-vl additionally merges every 2x2 patches into one token; hard-coded for now.
+        num_video_tokens = vit_tokens.shape[0] // 4
+        t, h, w = vit_video_tensor.size(1), vit_video_tensor.size(2), vit_video_tensor.size(3)
+
+        self.sample["packed_vit_tokens"].append(vit_tokens)
+        self.sample["vit_data_mode"].append("online")
+
+        vit_video_grid_thw = [
+            t // self.data_config.vit_patch_size_temporal,
+            h // self.data_config.vit_patch_size,
+            w // self.data_config.vit_patch_size,
+        ]  # [1, 16, 16]
+        self.sample["vit_video_grid_thw"].append(vit_video_grid_thw)
+        curr_video_grid_thw.append(vit_video_grid_thw)
+
+        self.sample["vit_token_seqlens"].append(num_video_tokens)
+        self.sample["packed_vit_position_ids"].append(
+            torch.zeros(num_video_tokens)
+        )  # TODO: not necessarily 0? This will be wrong for multiple vit sequences
+
+        self.sample["packed_vit_token_indexes"].extend(range(curr, curr + num_video_tokens))
+        curr += num_video_tokens
+        split_len_vision_token += num_video_tokens
+
+        # Placeholder text ids occupying the vision token slots.
+        self.sample["packed_text_ids"].extend([self.image_token_id] * num_video_tokens)
+
+        # add a <|endofimage|> token
+        self.sample["packed_text_ids"].append(self.end_of_image)  # 151653, <|vision_end|>
+        self.sample["packed_text_indexes"].append(curr)
+        curr += 1
+        split_len_vision_token += 1
+
+        # update sequence status: the whole vision split shares one position id
+        self.sample["attn_modes"].append("full")
+        self.sample["packed_position_ids"].extend([curr_rope_id] * split_len_vision_token)
+        self.sample["split_lens"].append(split_len_vision_token)
+        curr_rope_id += 1
+
+        # 3. Second text part:
+        # {instruction_prompt}<|im_end|>
+        # <|im_start|>assistant
+        prompt_postfix = user_prompt + "<|im_end|>" + "\n" + "<|im_start|>" + "assistant"
+        text_ids_prompt_postfix = self.tokenizer.encode(prompt_postfix)
+        self.sample["packed_text_ids"].extend(text_ids_prompt_postfix)
+        self.sample["packed_text_indexes"].extend(range(curr, curr + len(text_ids_prompt_postfix)))
+        curr += len(text_ids_prompt_postfix)
+        split_len_postfix = len(text_ids_prompt_postfix)
+
+        # update sequence status
+        self.sample["attn_modes"].append("causal")
+        self.sample["packed_position_ids"].extend(range(curr_rope_id, curr_rope_id + split_len_postfix))
+        self.sample["split_lens"].append(split_len_postfix)
+        curr_rope_id += split_len_postfix
+
+        # 4. Append the answer
+        answer = "\n" + answer
+        answer_ids = self.tokenizer.encode(answer)
+        shifted_text_ids_answer = answer_ids + [self.eos_token_id]
+        self.sample["packed_text_ids"].extend(shifted_text_ids_answer)
+        self.sample["packed_text_indexes"].extend(range(curr, curr + len(shifted_text_ids_answer)))
+
+        # item_loss == 1:
+        # NOTE(review): the CE positions here are the answer tokens' own positions, whereas
+        # process_text_template registers (label index - 1); confirm which alignment the loss expects.
+        self.sample["ce_loss_indexes"].extend(range(curr, curr + len(shifted_text_ids_answer)))
+        self.sample["ce_loss_weights"].extend([len2weight(len(shifted_text_ids_answer))] * (len(shifted_text_ids_answer)))
+        self.sample["packed_label_ids"].extend(shifted_text_ids_answer)  # NOTE: self.eos_token_id=151645 <|im_end|>
+
+        curr += len(shifted_text_ids_answer)
+        split_len_answer = len(shifted_text_ids_answer)
+
+        # update sequence status
+        self.sample["attn_modes"].append("causal")
+        self.sample["packed_position_ids"].extend(range(curr_rope_id, curr_rope_id + split_len_answer))
+        self.sample["split_lens"].append(split_len_answer)
+        curr_rope_id += split_len_answer
+
+        sample_lens = len(self.sample["packed_text_ids"])
+
+        return sample_lens, curr_video_grid_thw
+
+    def _finalize_sample(self, sample_lens, curr_video_grid_thw, sample_type, sample=None, additional_fields=None, video_sizes=None):
+        """Shared tail of the ``*_sample`` builders: tensorize packed lists and attach metadata.
+
+        Args:
+            sample_lens: Total packed length; stored as a one-element list.
+            curr_video_grid_thw: Grid entries; stored as a tensor with an extra leading axis.
+            sample_type: Stored as a one-element list under ``"sample_type"``.
+            sample: Optional source record; its ``index``/``category``/``question``/``gt``
+                entries are copied over when present.
+            additional_fields: Optional mapping copied into the result last, so it wins on clashes.
+            video_sizes: Optional sizes; when ``None`` an existing ``"video_sizes"`` entry is used.
+
+        Returns:
+            ``self.sample`` itself, updated in place.
+        """
+        packed = self.sample
+
+        packed["sample_lens"] = [sample_lens]
+        packed["video_grid_thw"] = torch.tensor([curr_video_grid_thw])
+
+        # These entries are converted unconditionally, even when they are empty lists.
+        always_tensor = (
+            "packed_text_ids",
+            "packed_text_indexes",
+            "packed_vae_token_indexes",
+            "packed_position_ids",
+            "vae_video_grid_thw",
+            "vit_video_grid_thw",
+            "packed_vit_token_indexes",
+        )
+        for field in always_tensor:
+            packed[field] = torch.tensor(packed[field])
+
+        packed["sample_N_target"] = torch.tensor([[1]])
+        packed["sample_type"] = [sample_type]
+        packed["padded_videos"] = packed["vae_video_tensors"]
+
+        # CE indexes are converted only when present and non-empty ...
+        if "ce_loss_indexes" in packed and len(packed["ce_loss_indexes"]) != 0:
+            packed["ce_loss_indexes"] = torch.tensor(packed["ce_loss_indexes"])
+        # ... whereas MSE indexes are always converted, as in the original code path.
+        packed["mse_loss_indexes"] = torch.tensor(packed["mse_loss_indexes"])
+
+        # An explicit argument takes precedence over a value already stored in the sample.
+        if video_sizes is None:
+            if "video_sizes" in packed:
+                packed["video_sizes"] = torch.tensor(packed["video_sizes"])
+        else:
+            packed["video_sizes"] = torch.tensor(video_sizes)
+
+        if "sample_modality" in packed and len(packed["sample_modality"]) != 0:
+            packed["sample_modality"] = torch.tensor(packed["sample_modality"])
+
+        if sample is not None:
+            for meta_key in ("index", "category", "question", "gt"):
+                if meta_key in sample:
+                    packed[meta_key] = sample[meta_key]
+
+        if additional_fields is not None:
+            for extra_key, extra_value in additional_fields.items():
+                packed[extra_key] = extra_value
+
+        return packed
+
+    def ti2t_sample(self, idx: int) -> Dict[str, Any]:
+        """Build a single image-understanding sample (text + image -> text).
+
+        Neither ``system_prompt`` nor ``user_prompt`` is expected to carry sos/eos tokens.
+        Packed layout:
+        <|im_start|>system
+        {system_prompt}<|im_end|>
+        <|im_start|>user
+        <|vision_start|><|video_pad|><|vision_end|>{instruction_prompt}<|im_end|>
+        <|im_start|>assistant
+        {answer}<|im_end|>
+
+        Args:
+            idx: Position of the record inside ``self.data``.
+
+        Returns:
+            The finalized sample produced by ``_finalize_sample`` with ``sample_type="und"``.
+        """
+        self.sample = self.set_sequence_status()
+        record = self.data[idx]
+
+        system_prompt, user_prompt = record["system_prompt"], record["user_prompt"]
+        answer = record["gt"]
+        image_path = record["image_path"]
+
+        # The image is loaded through the ViT stream; the original comment gives [C=3, T=2, H, W].
+        sample_lens, curr_video_grid_thw = self.process_und_template(
+            system_prompt=system_prompt,
+            user_prompt=user_prompt,
+            answer=answer,
+            vit_video_tensor=self.get_video_tensor_online(image_path, vision_stream="vit_video", element_dtype="image"),
+        )
+
+        # Keep the raw inputs next to the packed tensors; "instruction" duplicates the user prompt.
+        for field, value in (
+            ("system_prompt", system_prompt),
+            ("user_prompt", user_prompt),
+            ("image_path", image_path),
+            ("instruction", user_prompt),
+        ):
+            self.sample[field] = value
+
+        return self._finalize_sample(sample_lens, curr_video_grid_thw, sample_type="und", sample=record)
+
+    def t2v_sample(self, idx: int) -> Dict[str, Any]:
+        """Build a single text-to-video (or text-to-image) generation sample.
+
+        Two packing modes are supported, selected by ``self.data_config.text_template``:
+        with the chat template the sequence is rendered by ``render_template`` and packed
+        by ``process_text_template``; without it the prompt is wrapped in bos/eos and
+        followed by one vision split.
+
+        Args:
+            idx: Position of the record inside ``self.data``.
+
+        Returns:
+            A dict of packed tensors and metadata for one generation sample.
+        """
+        cfg = self.data_config
+        use_template = cfg.text_template
+        down_t, down_h, down_w = cfg.vae_downsample
+
+        if cfg.task == "t2i":
+            latent_frames, pixel_frames, element_dtype = 1, 1, 'image'
+        else:
+            # k*N+1 frames; the t dimension is normally not patchified — if it ever is, this formula must change.
+            latent_frames = (cfg.num_frames - 1) // down_t + 1
+            pixel_frames = cfg.num_frames
+            element_dtype = 'video'
+
+        self.sample = self.set_sequence_status()
+        record = self.data[idx]
+        # "prompt_en" holds caption pieces that are concatenated; otherwise the prompt sits under "data".
+        user_prompt = "".join(record["prompt_en"][0]) if "prompt_en" in record.keys() else record["data"]
+
+        if use_template:
+            caption_instruction = generate_system_prompt(system_prompt_type=cfg.task, vision_type=element_dtype)
+            prompt_style = self.system_prompt_type
+            if prompt_style == 'SP2':
+                # SP2: the task instruction moves into the user turn (user_prompt plays the caption_q role).
+                user_prompt = caption_instruction + " " + user_prompt
+                caption_instruction = "You are a helpful assistant. "
+            elif prompt_style == 'SP1':
+                # SP1: the assistant preamble is prepended to the task instruction.
+                caption_instruction = "You are a helpful assistant. " + caption_instruction
+            user_content = [{"type": "text", "text": user_prompt}]
+        else:
+            # Encode the prompt and wrap it in bos/eos.
+            encoded_prompt = self.tokenizer.encode(user_prompt)
+            text_ids = [self.new_token_ids["bos_token_id"]] + encoded_prompt + [self.new_token_ids["eos_token_id"]]
+            text_split_len = len(text_ids)
+            packed_text_indexes = list(range(0, text_split_len))  # curr = 0
+            packed_position_ids = list(range(0, text_split_len))
+            sample_modality = [modality_map['text']] * text_split_len
+
+        # Video parameters
+        h = cfg.H // down_h
+        w = cfg.W // down_w
+        spatial_merge_size = 2  # TODO: is spatial_merge_size always 2?
+        num_vid_tokens = latent_frames * h * w
+
+        if use_template:
+            assistant_content = [{"type": element_dtype}]
+            all_token_id, spans_index, tgt_index, search_index = self.render_template(
+                caption_instruction, assistant_content, user_content, [num_vid_tokens], search_text=user_prompt
+            )
+            self.sample, _, _, _ = self.process_text_template(
+                all_token_id,
+                spans_index,
+                tgt_index,
+                search_index,
+                video_types=['target_vae_video'],
+                curr=0,
+                curr_rope_id=0,
+                curr_split_len=0,
+                item_loss=0,
+            )
+            packed = self.sample
+            total_len = packed["sample_lens"]
+            out_text_ids = torch.tensor(packed["packed_text_ids"])
+            out_text_indexes = torch.tensor(packed["packed_text_indexes"])
+            out_vae_indexes = torch.tensor(packed["packed_vae_token_indexes"])
+            out_mse_indexes = torch.tensor(packed["mse_loss_indexes"])
+            out_position_ids = torch.tensor(packed["packed_position_ids"])
+            out_modality = torch.tensor(packed["sample_modality"])
+            split_lens, attn_modes = packed["split_lens"], packed["attn_modes"]
+        else:
+            # Vision split: start token, num_vid_tokens placeholders, end token.
+            text_ids.append(self.new_token_ids["start_of_image"])
+            packed_text_indexes.append(text_split_len)
+            first_vid_index = len(text_ids)
+            out_vae_indexes = torch.tensor(range(first_vid_index, first_vid_index + num_vid_tokens))
+            text_ids.extend([self.image_token_id] * num_vid_tokens)
+            text_ids.append(self.new_token_ids["end_of_image"])
+            packed_text_indexes.append(len(text_ids) - 1)
+            video_split_len = num_vid_tokens + 2
+            # The whole vision split shares the position id that follows the text.
+            packed_position_ids.extend([text_split_len] * video_split_len)
+            sample_modality.extend([modality_map['noise']] * video_split_len)
+
+            total_len = text_split_len + video_split_len
+            out_text_ids = torch.tensor(text_ids)
+            out_text_indexes = torch.tensor(packed_text_indexes)
+            out_mse_indexes = out_vae_indexes  # deliberately the same tensor object as the vae indexes
+            out_position_ids = torch.tensor(packed_position_ids)
+            out_modality = torch.tensor(sample_modality)
+            split_lens = [text_split_len, video_split_len]
+            attn_modes = ["causal", "noise"]
+
+        grid_thw = [latent_frames, h * spatial_merge_size, w * spatial_merge_size]
+
+        # Assemble the returned dict
+        return {
+            "packed_text_ids": out_text_ids,
+            "packed_text_indexes": out_text_indexes,
+            "packed_vae_token_indexes": out_vae_indexes,
+            "vae_video_grid_thw": torch.tensor([grid_thw]),
+            "video_grid_thw": torch.tensor([[grid_thw]]),
+            "sample_N_target": torch.tensor([[1]]),  # one video is generated
+            "split_lens": split_lens,
+            "attn_modes": attn_modes,
+            "sample_lens": [total_len],
+            "val_sample_type": ["gen"],  # generation task
+            "padded_latent": None,
+            "mse_loss_indexes": out_mse_indexes,
+            "video_sizes": torch.tensor([[pixel_frames, cfg.H, cfg.W]]),
+            "packed_position_ids": out_position_ids,
+            "caption": user_prompt,  # used for visualisation
+            "sample_type": ["gen"],  # generation task
+            "index": record["index"],
+            "caption_cn": user_prompt,
+            "original_prompt_en": record.get("original_prompt_en", user_prompt),  # extra field used when naming saved outputs
+            "sample_task": torch.zeros(total_len),
+            "sample_modality": out_modality,
+            "additional_info": record.get("additional_info"),
+        }
+
+    def tv2v_sample(self, idx: int) -> Dict[str, Any]:
+        """Build a single sample by routing it through ``tiv2v_sample``'s interleave format.
+
+        The record at ``idx`` gets its ``"data"`` entry overwritten with an interleave
+        description (fixed maze prompt, the image twice, then the target video), and
+        ``self.sample_task`` is set to ``'edit'`` before delegating.
+
+        Args:
+            idx: Position of the record inside ``self.data``.
+
+        Returns:
+            The dict returned by ``tiv2v_sample`` with ``caption``/``caption_cn`` set to the prompt.
+        """
+        record = self.data[idx]
+        maze_prompt = "Create a 2D animation based on the provided image of a maze. The blue star slides smoothly along the white path, stopping perfectly on the red flag and then acquiring a trophy. The blue star never slides or crosses into the black segments of the maze. The camera is a static, top-down view showing the entire maze."
+
+        # Re-express the record in the tiv2v interleave format.
+        elements = [maze_prompt, record["image_path"], record["image_path"], record["video_path"]]
+        record["data"] = {
+            "interleave_array": elements,
+            "element_dtype_array": ["text", "image", "image", "video"],
+            "istarget_in_interleave": [0, 0, 0, 1],
+        }
+
+        self.sample_task = 'edit'
+        packed = self.tiv2v_sample(idx)
+
+        # Fields specific to tv2v.
+        for caption_key in ("caption", "caption_cn"):
+            packed[caption_key] = maze_prompt
+
+        return packed
+
+    def tiv2v_sample(self, idx: int) -> Dict[str, Any]:  # unified handler for interleaved data
+        """Build a single generation sample from an interleaved text / image / video record.
+
+        The record must provide ``sample["index"]`` and ``sample["data"]`` with the
+        parallel lists ``interleave_array``, ``element_dtype_array`` and
+        ``istarget_in_interleave`` (plus an optional ``additional_info``). Every
+        non-target image/video is processed through both the ViT and the VAE stream;
+        targets only through the VAE stream. With ``data_config.text_template`` the
+        collected pieces are rendered through the chat template afterwards.
+
+        Args:
+            idx: Position of the record inside ``self.data``.
+
+        Returns:
+            The finalized sample produced by ``_finalize_sample`` with ``sample_type="gen"``.
+        """
+        sample_modality, text_template_user, text_template_assistant, vit_num_tokens, video_types = [], [], [], [], []
+        self.sample = self.set_sequence_status()
+        sample_lens = 0
+        sample = self.data[idx]
+
+        index = sample["index"]
+        data_sample = sample["data"]  # {'interleave_array': [...], 'element_dtype_array': [...], 'istarget_in_interleave': [...]}
+        additional_info = data_sample["additional_info"] if "additional_info" in data_sample else []
+
+        interleave_array, element_dtype_array, istarget_in_interleave = data_sample["interleave_array"], data_sample["element_dtype_array"], data_sample["istarget_in_interleave"]
+
+        curr, curr_rope_id, curr_split_len, curr_video_grid_thw, video_sizes, caption_all = 0, 0, 0, [], [], ''
+        # Records without any text element used to hit an UnboundLocalError at the
+        # render_template call; "" is render_template's own default for search_text.
+        search_text = ""
+        for element, element_dtype, is_target in zip(interleave_array, element_dtype_array, istarget_in_interleave):
+            if element_dtype == "text":
+                # Text element
+                caption_all += element
+                if self.data_config.text_template:
+                    text_template_user.append({"type": "text", "text": element})
+                    # With several text elements the last one is the text searched for.
+                    search_text = element
+                else:
+                    self.sample, curr, curr_rope_id, curr_split_len = self.process_text(element, curr=curr, curr_rope_id=curr_rope_id, curr_split_len=0, item_loss=is_target)
+                    sample_lens += curr_split_len
+                    sample_modality.extend([modality_map['text']] * curr_split_len)
+            elif element_dtype in ["image", "video"]:
+                if is_target == 0:  # a condition additionally goes through the ViT stream
+                    vit_image_tensor = self.get_video_tensor_online(element, vision_stream="vit_video", element_dtype=element_dtype)  # [C=3, T, H, W]
+                    self.sample, curr, curr_rope_id, curr_split_len, curr_video_grid_thw, num_tokens_ = self.process_vit_video(
+                        vit_image_tensor, curr=curr, curr_rope_id=curr_rope_id, curr_split_len=0, curr_video_grid_thw=curr_video_grid_thw, item_loss=0
+                        )
+                    if self.data_config.text_template:
+                        text_template_user.append({"type": element_dtype})
+                        vit_num_tokens.append(num_tokens_)
+                        video_types.append("vit_video")
+                    else:
+                        sample_lens += curr_split_len
+                        sample_modality.extend([modality_map['ref_vit']] * curr_split_len)
+
+                # VAE stream: used for conditions and targets alike
+                vae_image_tensor = self.get_video_tensor_online(element, vision_stream="vae_video", element_dtype=element_dtype)  # [C=3, T=1, H, W]
+                self.sample, curr, curr_rope_id, curr_split_len, curr_video_grid_thw, video_sizes, num_tokens_ = self.process_vae_video(
+                    vae_image_tensor, curr=curr, curr_rope_id=curr_rope_id, curr_split_len=0, curr_video_grid_thw=curr_video_grid_thw, video_sizes=video_sizes, item_loss=is_target
+                )
+                if self.data_config.text_template:
+                    vit_num_tokens.append(num_tokens_)
+                    if is_target == 0:
+                        text_template_user.append({"type": element_dtype})
+                        video_types.append("cond_vae_video")
+                    else:
+                        text_template_assistant.append({"type": element_dtype})
+                        video_types.append("target_vae_video")
+                else:
+                    sample_lens += curr_split_len
+                    if is_target == 0:
+                        sample_modality.extend([modality_map[f'ref_{element_dtype}']] * curr_split_len)
+                    else:
+                        sample_modality.extend([modality_map['noise']] * curr_split_len)
+
+        if self.data_config.text_template:
+            # HACK: when the user content starts with text, rotate it to the end so the
+            # image/video entries come first. Guarded so an empty user turn no longer raises IndexError.
+            if text_template_user and text_template_user[0]['type'] == 'text':
+                text_template_user = text_template_user[1:] + text_template_user[:1]
+            # vision_type is the dtype of the last interleave element (the loop variable is reused on purpose).
+            caption_instruction = generate_system_prompt(system_prompt_type=self.data_config.task, vision_type=element_dtype)
+            all_token_id, spans_index, tgt_index, search_index = self.render_template(caption_instruction, text_template_assistant, text_template_user, vit_num_tokens, search_text=search_text)
+            # Pack the rendered template
+            self.sample, curr, curr_rope_id, curr_split_len = self.process_text_template(
+                all_token_id,
+                spans_index,
+                tgt_index,
+                search_index,
+                video_types=video_types,
+                curr=0,
+                curr_rope_id=0,
+                curr_split_len=0,
+                item_loss=0,
+                )
+            sample_lens = len(all_token_id)
+            sample_modality = self.sample["sample_modality"]
+
+        # NOTE(review): without the text template the locally built `sample_modality` list is
+        # never written back to self.sample — confirm whether that is intended.
+
+        additional_fields = {
+            "caption": caption_all,
+            "caption_cn": caption_all,
+            "index": index,
+            "additional_info": additional_info
+        }
+
+        if self.sample_task == 'edit':
+            self.sample["sample_task"] = torch.ones(sample_lens) * sample_task_map['edit']
+        elif self.sample_task == 'idip':
+            self.sample["sample_task"] = torch.ones(sample_lens) * sample_task_map['idip']
+
+        return self._finalize_sample(
+            sample_lens, curr_video_grid_thw,
+            sample_type="gen",
+            sample=sample,
+            additional_fields=additional_fields,
+            video_sizes=video_sizes
+        )
+
+    def render_template(self, instruction, text_template_assistant, text_template_user, vit_num_tokens, search_text=""):
+        """Render a user/assistant exchange with the Qwen-VL prompt template and index it.
+
+        Args:
+            instruction: Passed to the renderer as the default system prompt.
+            text_template_assistant: Content list of the assistant turn.
+            text_template_user: Content list of the user turn.
+            vit_num_tokens: Per-vision-entry token counts, forwarded as ``tokens``.
+            search_text: Text whose token indexes are looked up; ``""`` by default.
+
+        Returns:
+            ``(all_token_id, spans_index, tgt_index, search_index)`` as produced by
+            ``expand_and_index_by_token_ids_new``.
+        """
+        # NOTE: samples without target text use an empty assistant caption (caption_a = "").
+        user_turn = {"role": "user", "content": text_template_user}
+        assistant_turn = {"role": "assistant", "content": text_template_assistant}
+
+        # NOTE: open question from the original — whether to add "You are a helpful assistant."
+        rendered = render_qwenvl_prompt([user_turn, assistant_turn], default_system=instruction, include_assistant_content=True)
+
+        indexed = expand_and_index_by_token_ids_new(
+            rendered_text=rendered.strip(),
+            tokens=vit_num_tokens,
+            target_text="assistant\n",
+            tokenizer=self.tokenizer,
+            search_text=search_text,
+        )
+        all_token_id, spans_index, tgt_index, search_index = indexed
+
+        # The target indexes must cover exactly the tail of the sequence starting at the first target.
+        label_tail = all_token_id[tgt_index[0] :]
+        assert len(label_tail) == len(tgt_index)
+        return all_token_id, spans_index, tgt_index, search_index
+
+    def x2t_sample(self, idx: int) -> Dict[str, Any]:  # unified builder for interleaved (x2t) samples
+        """Build one understanding ("x2t") sample from an interleaved record.
+
+        Image/video elements go through ``process_vit_video`` with ``item_loss=0``;
+        target text is packed via the chat template when ``data_config.text_template``
+        is set, otherwise via ``process_text``. Non-target text elements are skipped.
+        """
+        sample_modality = []
+        self.sample = self.set_sequence_status()
+        sample_lens = 0
+        sample = self.data[idx]
+        index = sample["index"]
+        data_sample = sample["data"]  # {'interleave_array': [...], 'element_dtype_array': [...], 'istarget_in_interleave': [...]}}
+
+        interleave_array, element_dtype_array, istarget_in_interleave = data_sample["interleave_array"], data_sample["element_dtype_array"], data_sample["istarget_in_interleave"]
+
+        curr, curr_rope_id, curr_split_len, curr_video_grid_thw, video_sizes, caption_all = 0, 0, 0, [], [], ""
+        if self.data_config.text_template:
+            text_template_user, text_template_assistant, vit_num_tokens, video_types = [], [], [], []
+        for element, element_dtype, is_target in zip(interleave_array, element_dtype_array, istarget_in_interleave):
+            if element_dtype == "text":
+                # text element
+                if is_target == 1:  # target text
+                    if self.data_config.text_template:  # chat-template path (uses a system prompt)
+                        if isinstance(element, str):  # a single bare caption string
+                            caption_a = element
+                            caption_i = generate_system_prompt(system_prompt_type="caption", vision_type=element_dtype_array[0])
+                            caption_q = ""
+                            element = [caption_i, caption_q, caption_a]
+
+                        # ---- SP1 / SP2 system-prompt handling ----
+                        caption_i, caption_q, caption_a = element[0], element[1], element[2]
+                        if self.system_prompt_type == 'SP2':  # SP2: the task prompt moves into the user turn
+                            caption_q = caption_i + " " + caption_q
+                            caption_i = "You are a helpful assistant. "
+                        elif self.system_prompt_type == 'SP1':
+                            # SP1: the task prompt stays in the system turn, after the generic assistant line
+                            caption_i = "You are a helpful assistant. " + caption_i
+                        element = [caption_i, caption_q, caption_a]
+
+                        text_template_assistant.append({"type": "text", "text": caption_a}) # caption
+                        if caption_q != "":
+                            text_template_user.append({"type": "text", "text": caption_q})
+
+                        all_token_id, spans_index, tgt_index, search_index = self.render_template(caption_i, text_template_assistant, text_template_user, vit_num_tokens)
+                        self.sample, curr, curr_rope_id, curr_split_len = self.process_text_template(
+                            all_token_id,
+                            spans_index,
+                            tgt_index,
+                            search_index,
+                            video_types,
+                            curr=curr,
+                            curr_rope_id=curr_rope_id,
+                            curr_split_len=0,
+                            item_loss=is_target,
+                        )
+                        sample_lens += curr_split_len
+
+                        caption_all += "\n".join(element)
+                        caption_answer = element[-1]  # answer text handed back in additional_fields
+                    else:
+                        if isinstance(element, list):
+                            element = element[-1]  # per the original author, element = "" would behave the same here
+                        self.sample, curr, curr_rope_id, curr_split_len = self.process_text(
+                            element, curr=curr, curr_rope_id=curr_rope_id, curr_split_len=0, item_loss=is_target
+                        )
+                        sample_lens += curr_split_len
+                        sample_modality.extend([modality_map["text"]] * curr_split_len)
+                        caption_all += element
+                        caption_answer = element  # NOTE unsure
+
+            elif element_dtype in ["image", "video"]:
+
+                vit_image_tensor = self.get_video_tensor_online(element, vision_stream="vit_video", element_dtype=element_dtype)  # [C=3, T, H, W]
+                self.sample, curr, curr_rope_id, curr_split_len, curr_video_grid_thw, num_tokens_ = self.process_vit_video(
+                    vit_image_tensor, curr=curr, curr_rope_id=curr_rope_id, curr_split_len=0, curr_video_grid_thw=curr_video_grid_thw, item_loss=0
+                )
+                sample_lens += curr_split_len
+                sample_modality.extend([modality_map["ref_vit"]] * curr_split_len)
+                index_video_path_name = element.split("/")[-1]
+
+                if self.data_config.text_template:
+                    text_template_user.append({"type": element_dtype})
+                    vit_num_tokens.append(num_tokens_)
+                    video_types.append("vit_video")
+
+        if self.sample["sample_lens"] != []:
+            sample_lens = self.sample["sample_lens"]
+
+        if self.sample["sample_modality"] != []:
+            sample_modality = self.sample["sample_modality"]
+        self.sample["sample_modality"] = sample_modality
+        self.sample["sample_task"] = torch.ones(self.sample["sample_lens"]) * sample_task_map["t2v"]
+
+        additional_fields = {
+            "caption": caption_all,
+            "caption_cn": caption_all,
+            "caption_answer": caption_answer,  # NOTE(review): unbound if the record has no target text element
+            "index_item": index,
+            "index": index_video_path_name,  # NOTE(review): unbound if the record has no image/video element
+            "additional_information": data_sample["additional_information"] if "additional_information" in data_sample.keys() else {},
+            "visual_path": data_sample["interleave_array"][0],
+            "question": data_sample["interleave_array"][1][1] if isinstance(data_sample["interleave_array"][1], list) and len(data_sample["interleave_array"][1]) > 1 else None,
+            "answer": data_sample["interleave_array"][1][2] if isinstance(data_sample["interleave_array"][1], list) and len(data_sample["interleave_array"][1]) > 2 else None
+        }
+
+        return self._finalize_sample(
+            sample_lens, curr_video_grid_thw,
+            sample_type="und",
+            additional_fields=additional_fields
+        )
+
+    def __getitem__(self, idx: int) -> Dict[str, Any]:
+        """Build the sample at ``idx`` with the builder matching ``data_config.task``.
+
+        The 'edit' / 'idip' task variants additionally record ``self.sample_task``
+        before delegating to ``tiv2v_sample``. Unrecognized tasks raise ``ValueError``.
+        """
+        task = self.data_config.task
+        if task == "tv2v":
+            return self.tv2v_sample(idx)
+        if task in ("t2i", "t2v"):
+            return self.t2v_sample(idx)
+        if task == "ti2t":
+            return self.ti2t_sample(idx)
+        if "tiv2v" in task:
+            # Composite task names: the first tag found in the name decides the sub-task.
+            for tag in ("edit", "idip"):
+                if tag in task:
+                    self.sample_task = tag
+                    break
+            return self.tiv2v_sample(idx)
+        if task in ("video_edit", "image_edit"):
+            self.sample_task = 'edit'
+            return self.tiv2v_sample(idx)
+        if task in ("video_idip", "video_idip_multiref", "image_idip"):
+            self.sample_task = 'idip'
+            return self.tiv2v_sample(idx)
+        if task in ("x2t", "x2t_image", "x2t_video"):
+            return self.x2t_sample(idx)
+        raise ValueError(f"Unknown task: {task}")
diff --git a/data/system_prompt_render.py b/data/system_prompt_render.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbb25c62e71a0ebc1879df71267687d5ab527da1
--- /dev/null
+++ b/data/system_prompt_render.py
@@ -0,0 +1,194 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+from typing import Any, Dict, List, Tuple
+from jinja2 import Environment, BaseLoader
+
+
+# Template: line breaks are written as explicit \n, and -%} / {%- trim the surrounding whitespace.
+JINJA_PROMPT_TMPL = (
+    "<|im_start|>system\n"
+    "{{ system_prompt }}<|im_end|>\n"
+    "{% for m in msgs -%}"
+    "<|im_start|>{{ m.role }}\n"
+    "{% if not (m.role == 'assistant' and not include_assistant_content) -%}"
+    "{{ m.content | render_mm_list }}"
+    "{% endif -%}"
+    "{% if (not (loop.last and m.role == 'assistant')) or include_assistant_content -%}"
+    "<|im_end|>\n"
+    "{% endif -%}"
+    "{% endfor -%}"
+)
+
+VS, VE = "<|vision_start|>", "<|vision_end|>"
+VP, IP = "<|video_pad|>", "<|image_pad|>"
+
+def expand_and_index_by_token_ids_new(
+    rendered_text: str,
+    tokens: List[int],  # pad count K per vision block, consumed in order of appearance
+    tokenizer,  # HF-style tokenizer: tokenizer(text, add_special_tokens=False)["input_ids"]
+    target_text: str = "",  # e.g. "assistant\n"; "" disables the lookup
+    search_text: str = "",  # optional marker to locate; "" disables the lookup
+) -> Tuple[List[int], List[List[int]], List[int], List[int]]:
+    """Tokenize ``rendered_text`` and expand every vision placeholder to ``K`` pad tokens.
+
+    Each ``VS <pad> VE`` triple in the encoded text is rewritten as
+    ``VS + <pad> * K + VE``, with ``K`` read from ``tokens`` in order of appearance.
+    The code assumes VS, the pad and VE each encode to exactly one token id.
+
+    Returns:
+        all_token_id: token ids after expansion.
+        spans_index:  per vision block, the indices of its pad tokens in ``all_token_id``,
+                      e.g. ``[[100..199], [350..549], ...]``.
+        tgt_index:    indices running from just after the first occurrence of
+                      ``target_text``'s token ids to the end of the sequence;
+                      ``[]`` when ``target_text`` is empty or not found.
+        search_index: indices of the first occurrence of ``search_text``'s token ids;
+                      ``[]`` when ``search_text`` is empty or not found.
+
+    Raises:
+        IndexError: if the text holds more vision blocks than ``tokens`` has entries.
+    """
+
+    def _encode(text: str) -> List[int]:
+        return tokenizer(text, add_special_tokens=False)["input_ids"]
+
+    def _find(haystack: List[int], needle: List[int]) -> int:
+        # Index of the first contiguous occurrence of `needle` in `haystack`, or -1.
+        # The whole needle is compared and candidates that would run past the end are
+        # never probed, so a missing marker yields -1 instead of raising.
+        if not needle:
+            return -1
+        pos, last_start = 0, len(haystack) - len(needle)
+        while pos <= last_start:
+            try:
+                pos = haystack.index(needle[0], pos, last_start + 1)
+            except ValueError:
+                return -1
+            if haystack[pos:pos + len(needle)] == needle:
+                return pos
+            pos += 1
+        return -1
+
+    vs_ids, ve_ids = _encode(VS), _encode(VE)
+    base_ids = _encode(rendered_text)
+
+    # ---------- 1) copy base_ids, expanding each vision block's pad token K times ----------
+    all_ids: List[int] = []
+    spans_index: List[List[int]] = []
+    cursor = 0  # read position in base_ids
+    block_no = 0  # index of the next K in `tokens`
+    while True:
+        try:
+            vs_at = base_ids.index(vs_ids[0], cursor)
+        except ValueError:  # no further vision block: copy the tail and stop
+            all_ids.extend(base_ids[cursor:])
+            break
+        all_ids.extend(base_ids[cursor:vs_at])
+        cursor = vs_at + 3  # step over the VS, pad, VE triple
+        pad_ids = base_ids[vs_at + 1:vs_at + 2]  # whichever pad token was rendered
+        K = int(tokens[block_no])
+        block_no += 1
+        first_pad = len(all_ids) + 1  # +1: the VS token is written just before the pads
+        spans_index.append(list(range(first_pad, first_pad + K)))
+        all_ids.extend(vs_ids + pad_ids * K + ve_ids)
+
+    # ---------- 2) locate the target / search markers in the expanded sequence ----------
+    tgt_ids = _encode(target_text) if target_text else []
+    hit = _find(all_ids, tgt_ids)
+    tgt_index: List[int] = list(range(hit + len(tgt_ids), len(all_ids))) if hit >= 0 else []
+
+    search_ids = _encode(search_text) if search_text else []
+    hit = _find(all_ids, search_ids)
+    search_index: List[int] = list(range(hit, hit + len(search_ids))) if hit >= 0 else []
+    return all_ids, spans_index, tgt_index, search_index
+
+def _extract_system_prompt(messages: List[Dict[str, Any]], default_system: str) -> str:
+    """Return the text of the first usable system message, else ``default_system``."""
+    for msg in (m for m in messages if m.get("role") == "system"):
+        body = msg.get("content", "")
+        if isinstance(body, str):
+            return body
+        items = body if isinstance(body, list) else []  # non-list payloads contribute no text
+        texts = [p.get("text", "") for p in items if isinstance(p, dict) and p.get("type") == "text"]
+        if texts:
+            return "".join(texts)
+    return default_system
+
+
+def _normalize_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Drop system messages and coerce each remaining ``content`` into a list of items."""
+
+    def _as_items(content: Any) -> List[Any]:
+        # A plain string becomes one text item; a list is passed through as the same
+        # object (not copied); anything else becomes an empty list.
+        if isinstance(content, str):
+            return [{"type": "text", "text": content}]
+        return content if isinstance(content, list) else []
+
+    return [
+        {"role": m.get("role"), "content": _as_items(m.get("content", ""))}
+        for m in messages
+        if m.get("role") != "system"
+    ]
+
+
+def render_qwenvl_prompt(
+    messages: List[Dict[str, Any]],
+    default_system: str = "You are a helpful assistant.",
+    include_assistant_content: bool = False,  # key switch: whether the assistant text is rendered
+    force_video_pad: bool = False,
+) -> str:
+    """Render ``messages`` into a prompt string with ``JINJA_PROMPT_TMPL``.
+
+    The system prompt comes from ``messages`` when a usable one exists, else ``default_system``.
+
+    NOTE(review): for image items, ``force_video_pad=True`` emits ``<|image_pad|>`` while
+    the default emits ``<|video_pad|>`` -- the flag name reads inverted; confirm intent.
+    """
+    image_block = "<|vision_start|>" + ("<|image_pad|>" if force_video_pad else "<|video_pad|>") + "<|vision_end|>"
+    video_block = "<|vision_start|><|video_pad|><|vision_end|>"
+
+    def _render_mm_list(items: Any) -> str:
+        # Jinja filter: flatten one message's content items into a single string.
+        if isinstance(items, str):
+            return items
+        if not isinstance(items, list):
+            return ""
+        chunks: List[str] = []
+        for entry in items:
+            kind = entry.get("type") if isinstance(entry, dict) else None
+            if kind == "text":
+                chunks.append(entry.get("text", ""))
+            elif kind == "image":
+                chunks.append(image_block)
+            elif kind == "video":
+                chunks.append(video_block)
+        return "".join(chunks)
+
+    jinja_env = Environment(
+        loader=BaseLoader(),
+        autoescape=False,
+        trim_blocks=True,  # drop the first newline after a block tag
+        lstrip_blocks=True,  # strip whitespace in front of a block tag
+        newline_sequence="\n",
+        keep_trailing_newline=False,
+    )
+    jinja_env.filters["render_mm_list"] = _render_mm_list
+    return jinja_env.from_string(JINJA_PROMPT_TMPL).render(
+        system_prompt=_extract_system_prompt(messages, default_system),
+        msgs=_normalize_messages(messages),
+        include_assistant_content=include_assistant_content,
+    )
diff --git a/data/transforms.py b/data/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc22c9b298b3f79565fb3b3e50ba5f78c873fd8
--- /dev/null
+++ b/data/transforms.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from PIL import Image
+
+import numpy as np
+import torch
+from torchvision import transforms
+from torchvision.transforms import functional as F
+from torchvision.transforms import InterpolationMode, Compose, Normalize
+
+from .video.transforms.na_resize import NaResize
+from .video.transforms.divisible_crop import DivisibleCrop
+from .video.transforms.rearrange import Rearrange
+
+
+class MaxLongEdgeMinShortEdgeResize(torch.nn.Module):
+    """Resize so the long edge stays within ``max_size`` and the short edge reaches ``min_size``,
+    with both output sides snapped to multiples of ``stride``.
+
+    Args:
+        max_size (int): Upper bound for the longest edge of the image.
+        min_size (int): Lower bound for the shortest edge of the image.
+        stride (int): Both output sides are multiples of this value.
+        max_pixels (int): Pixel budget for the full image (divided by ``img_num`` in ``forward``).
+        interpolation (InterpolationMode): Mode handed to ``F.resize``; the default is
+            ``InterpolationMode.BICUBIC``. Per the torchvision docs, tensor input supports only
+            ``NEAREST``, ``NEAREST_EXACT``, ``BILINEAR`` and ``BICUBIC``; the corresponding
+            Pillow integer constants, e.g. ``PIL.Image.BILINEAR``, are also accepted.
+        antialias (bool, optional): Forwarded to ``F.resize`` (default is True).
+    """
+
+    def __init__(
+        self,
+        max_size: int,
+        min_size: int,
+        stride: int,
+        max_pixels: int,
+        interpolation=InterpolationMode.BICUBIC,
+        antialias=True
+    ):
+        super().__init__()
+        # Size constraints.
+        self.max_size, self.min_size = max_size, min_size
+        self.stride, self.max_pixels = stride, max_pixels
+        # Options forwarded to F.resize.
+        self.interpolation, self.antialias = interpolation, antialias
+
+    def _make_divisible(self, value, stride):
+        """Round ``value`` to a multiple of ``stride``, never going below ``stride``."""
+        snapped = int(round(value / stride) * stride)
+        return snapped if snapped > stride else stride
+
+    def _apply_scale(self, width, height, scale):
+        """Scale both sides by ``scale`` and snap each to a multiple of ``self.stride``."""
+        new_w = self._make_divisible(round(width * scale), self.stride)
+        new_h = self._make_divisible(round(height * scale), self.stride)
+        return new_w, new_h
+
+    def forward(self, img, img_num=1):
+        """Resize ``img`` according to the configured edge, stride and pixel limits.
+
+        Args:
+            img (PIL Image or Tensor): Input; for a tensor the last two dims are read as (H, W).
+            img_num (int): Number of images sharing the ``max_pixels`` budget.
+        Returns:
+            PIL Image or Tensor: Rescaled image with divisible dimensions.
+        """
+        if isinstance(img, torch.Tensor):
+            src_h, src_w = img.shape[-2:]
+        else:
+            src_w, src_h = img.size
+
+        # Shrink (never enlarge) to fit max_size; the min_size ratio then takes precedence.
+        factor = max(min(self.max_size / max(src_w, src_h), 1.0), self.min_size / min(src_w, src_h))
+        out_w, out_h = self._apply_scale(src_w, src_h, factor)
+
+        # Keep the pixel count within this image's share of max_pixels.
+        pixel_budget = self.max_pixels / img_num
+        if out_w * out_h > pixel_budget:
+            # The area ratio is applied directly as the per-side factor.
+            out_w, out_h = self._apply_scale(out_w, out_h, pixel_budget / (out_w * out_h))
+
+        # The earlier steps can leave the long edge above max_size, so cap it again.
+        longest = max(out_w, out_h)
+        if longest > self.max_size:
+            out_w, out_h = self._apply_scale(out_w, out_h, self.max_size / longest)
+
+        return F.resize(img, (out_h, out_w), self.interpolation, antialias=self.antialias)
+
+
+class ImageTransform:
+    """Single-image pipeline: size-constrained resize, ``ToTensor``, then ``Normalize``."""
+
+    def __init__(
+        self,
+        max_image_size,
+        min_image_size,
+        image_stride,
+        max_pixels=14*14*9*1024,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5]
+    ):
+        self.stride = image_stride
+        self.resize_transform = MaxLongEdgeMinShortEdgeResize(
+            max_image_size, min_image_size, image_stride, max_pixels
+        )
+        self.to_tensor_transform = transforms.ToTensor()
+        # inplace=True: normalization overwrites the tensor it is given.
+        self.normalize_transform = transforms.Normalize(image_mean, image_std, inplace=True)
+
+    def __call__(self, img, img_num=1):
+        """Apply the pipeline to ``img``; ``img_num`` is forwarded to the resize step."""
+        # Stage 1: resize.
+        resized = self.resize_transform(img, img_num=img_num)
+        # Stages 2 and 3: tensor conversion and normalization.
+        return self.normalize_transform(self.to_tensor_transform(resized))
+
+
+class VideoTransform:
+    """Video pipeline: ``NaResize`` -> ``DivisibleCrop`` -> ``Normalize`` -> rearrange to "c t h w"."""
+
+    def __init__(
+        self,
+        resolution=640,
+        mode="area",
+        divisible_crop_size=16,
+        aspect_ratios=("21:9", "16:9", "4:3", "1:1", "3:4", "9:16"),
+        stride_spatial=16,
+        stride_temporal=4,
+        mean=0.5,
+        std=0.5,
+        **kwargs
+    ):
+        # Extra keyword arguments are accepted and ignored.
+        resize = NaResize(
+            resolution=resolution,
+            mode=mode,
+            downsample_only=True,
+            stride=stride_spatial,
+            aspect_ratios=aspect_ratios,  # NOTE: aspect_ratios are only for `bucket` resize.
+        )
+        crop = DivisibleCrop(divisible_crop_size)
+        normalize = Normalize(mean, std)
+        to_channel_first = Rearrange("t c h w -> c t h w")
+        self.transform = Compose([resize, crop, normalize, to_channel_first])
+        # Strides are stored for callers; the pipeline above does not read stride_temporal.
+        self.stride_spatial, self.stride_temporal = stride_spatial, stride_temporal
+
+    def __call__(self, video):
+        """Run the composed transform on ``video`` and return its result."""
+        pipeline = self.transform
+        return pipeline(video)
+
+
+class VisualTransform:
+    """Image/video preprocessing: per-frame resize, ``ToTensor`` and ``Normalize``.
+
+    A single image yields the per-frame result; a frame sequence yields the stacked
+    frames permuted from [T, C, H, W] to [C, T, H, W].
+    """
+
+    def __init__(
+        self,
+        max_frame_size,
+        min_frame_size,
+        image_stride,
+        max_pixels=14*14*9*1024,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5]
+    ):
+        self.stride = image_stride
+        self.resize_transform = MaxLongEdgeMinShortEdgeResize(
+            max_frame_size, min_frame_size, image_stride, max_pixels
+        )
+        self.to_tensor_transform = transforms.ToTensor()
+        self.normalize_transform = transforms.Normalize(image_mean, image_std, inplace=True)
+
+    def _process_single(self, img, img_num=1):
+        """Resize, tensorize and normalize one frame."""
+        resized = self.resize_transform(img, img_num=img_num)
+        return self.normalize_transform(self.to_tensor_transform(resized))
+
+    def __call__(self, img, img_num=1):
+        """Transform ``img``, which may be one image or a sequence of frames.
+
+        Lists/tuples, 4-D numpy arrays and 4-D tensors are handled frame by frame along
+        their first axis; anything else is treated as a single frame.
+        NOTE(review): ``ToTensor`` is documented for PIL images / ndarrays, so the 4-D
+        tensor path (and ndarray frames not wrapped in PIL) may fail -- confirm usage.
+        """
+        if isinstance(img, (list, tuple)):
+            # video given as a sequence of PIL images or tensors
+            frames = img
+        elif isinstance(img, np.ndarray) and img.ndim == 4:
+            # Frames whose last axis has 3 or 4 entries are wrapped in a PIL image first.
+            frames = (Image.fromarray(f) if f.shape[-1] in [3, 4] else f for f in img)
+        elif isinstance(img, torch.Tensor) and img.ndim == 4:
+            frames = (img[t] for t in range(img.shape[0]))
+        else:
+            # single frame
+            return self._process_single(img, img_num=img_num)
+
+        # Stack the per-frame results, then move channels in front of time.
+        stacked = torch.stack([self._process_single(f, img_num=img_num) for f in frames])  # [T, C, H, W]
+        return stacked.permute(1, 0, 2, 3)  # [C, T, H, W]
diff --git a/data/video/sampler/frames.py b/data/video/sampler/frames.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7a525d8d491647dc224645e6a19d1444344180b
--- /dev/null
+++ b/data/video/sampler/frames.py
@@ -0,0 +1,624 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""
+Frame samplers.
+
+TODO: may need to add a frame sampler that supports custom sampling requirements
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal, NamedTuple, Optional, Tuple, Union
+import numpy as np
+
+
+class FrameSamplerOutput(NamedTuple):
+    """
+    Result of a frame sampler: the frame indices to decode,
+    plus optional additional information to hand back to the user.
+    """
+
+    indices: List[int]  # frame indices to decode
+    additional_info: Dict[str, Any] = {}  # NOTE(review): this default dict is a single object shared by every instance that omits the field
+
+
+class FrameSampler(ABC):
+    """
+    Abstract base class for frame samplers.
+
+    Subclasses implement __call__ to return the indices of the frames to decode,
+    or raise when the video cannot be sampled (e.g. it is too short).
+    """
+
+    @abstractmethod
+    def __call__(self, num_frames: int) -> FrameSamplerOutput:
+        raise NotImplementedError
+
+
+class AllFrameSampler(FrameSampler):
+    """
+    Sampler that keeps every frame: indices 0 .. num_frames - 1.
+    """
+
+    def __call__(self, num_frames: int) -> FrameSamplerOutput:
+        return FrameSamplerOutput(indices=[*range(num_frames)])
+
+class OnlyFirstFrameSampler:
+    """
+    Sampler that always selects frame 0; ``frames_info`` and extra keyword arguments are ignored.
+    """
+
+    def __call__(self, frames_info: Dict[str, int], **kwargs) -> FrameSamplerOutput:
+        return FrameSamplerOutput([0])
+
+class FixedFrameSampler:
+    """
+    Fixed-count frame sampler (one formula for both up- and down-sampling).
+
+    - Takes a ``frames_info`` dict with ``start_frame``, ``end_frame`` and ``total_frames``.
+    - For any ``total_frames >= 1`` it returns exactly ``num_frames`` indices, spaced
+      evenly over ``total_frames`` relative positions, offset by ``start_frame`` and
+      clamped to ``end_frame - 1``.
+    - When ``total_frames < num_frames`` indices repeat, e.g. 3 frames sampled to 6
+      gives relative indices [0, 0, 1, 1, 2, 2].
+    """
+    def __init__(self, num_frames: int):
+        if num_frames < 1:
+            raise ValueError("num_frames must be ≥ 1")
+        self.num_frames = num_frames
+
+    def __call__(self, frames_info: Dict[str, int]) -> "FrameSamplerOutput":
+        """
+        Args:
+            frames_info: dict holding 'start_frame', 'end_frame' and 'total_frames'.
+        Returns:
+            FrameSamplerOutput: ``indices`` always has ``num_frames`` entries;
+            ``additional_info`` echoes the three ``frames_info`` values.
+        Raises:
+            ValueError: if a key is missing (or None) or ``total_frames < 1``.
+        """
+        start, end, total = (frames_info.get(k) for k in ('start_frame', 'end_frame', 'total_frames'))
+        if start is None or total is None or end is None:
+            raise ValueError("frames_info must contain 'start_frame', 'end_frame', and 'total_frames'")
+        if total < 1:
+            raise ValueError("total_frames must be ≥ 1")
+        indices = [min(start + idx, end - 1) for idx in self._get_indices(total)]
+
+        return FrameSamplerOutput(
+            indices=indices,
+            additional_info={
+                "start_frame": start,
+                "end_frame": end,
+                "total_frames": total,
+            },
+        )
+
+    def _get_indices(self, total: int) -> List[int]:
+        """Return ``num_frames`` relative indices spread evenly over ``0 .. total - 1``."""
+        if self.num_frames == 1:  # a single frame: avoid dividing by zero below
+            return [0]
+        return [
+            int(round(i * (total - 1) / (self.num_frames - 1)))
+            for i in range(self.num_frames)
+        ]
+
+
+class ConsecutiveFrameSampler(FrameSampler):
+    """
+    Samples one run of evenly strided frames from [start_frame, end_frame); the run
+    length is the largest k * temporal + 1 that fits after striding.
+
+    Arguments:
+        strides:  frame skip; 1 keeps every frame, 2 every other frame, and so on.
+                  A list is sampled uniformly per call, so [1,1,2] picks 1 with 66%
+                  probability and 2 with 33% probability.
+        temporal: the run length is constrained to the form k * temporal + 1.
+        clip:     where the run is placed inside the range:
+                    "head" / "tail": at the start / end of the range.
+                    "center":        in the middle, shifted by jitter.
+                    "uniform":       uniformly at random.
+        jitter:   stdev of the normal shift applied to the start index;
+                  only has an effect when clip is "center".
+    """
+
+    def __init__(
+        self,
+        strides: Union[int, List[int]] = 1,
+        temporal: int = 4,
+        clip: Literal["head", "tail", "center", "uniform"] = "uniform",
+        jitter: float = 0.0,
+    ):
+        strides = [strides] if isinstance(strides, int) else strides
+        assert len(strides) > 0
+        self.strides = np.array(strides)
+        self.temporal = temporal
+        self.clip = clip
+        self.jitter = jitter
+
+    def __call__(self, frames_info: Dict[str, int]) -> FrameSamplerOutput:
+        """Sample frame indices for the range described by ``frames_info``."""
+        start_frame = frames_info["start_frame"]
+        end_frame = frames_info["end_frame"]
+        num_frames = frames_info["total_frames"]  # only echoed back in additional_info
+
+        stride = np.random.choice(self.strides)
+
+        frames = end_frame - start_frame
+        length = frames // stride
+
+        # Calculate the maximum integer of the form kn + 1 that does not exceed the given length.
+        def _max_kn_plus_1(length, k):
+            if length < 1:
+                raise ValueError("Length must be at least 1.")
+            n = (length - 1) // k
+            return k * n + 1
+
+        length = _max_kn_plus_1(length, self.temporal)
+
+        # Choose start index.
+        min_start_index = start_frame
+        max_start_index = end_frame - 1 - stride * (length - 1)
+
+        mid_start_index = round((min_start_index + max_start_index) / 2)
+        jitter = round(np.random.normal(loc=0, scale=self.jitter))  # drawn on every call, used only by "center"
+
+        if self.clip == "head":
+            start_index = min_start_index
+        elif self.clip == "tail":
+            start_index = max_start_index
+        elif self.clip == "center":
+            start_index = mid_start_index + jitter
+        elif self.clip == "uniform":
+            start_index = np.random.randint(min_start_index, max_start_index + 1)
+        else:
+            raise NotImplementedError
+
+        start_index = np.clip(start_index, min_start_index, max_start_index)
+
+        # Compute indices
+        indices = np.arange(start_index, start_index + length * stride, stride)
+
+        # Return indices and additional information to return to user.
+        return FrameSamplerOutput(
+            indices=indices.tolist(),
+            additional_info={
+                "stride": stride,
+                "start_frame": start_index,
+                "end_frame": start_index + length * stride,  # one stride past the last sampled index
+                "total_frames": num_frames,
+            },
+        )
+
+
+class AdaptiveFrameSampler(FrameSampler):
+    """
+    Adaptive frame sampler.
+
+    Arguments:
+        lengths: candidate frame lengths to return (an int or a list of ints).
+                For example, [5,10] denotes to always return 5 frames or 10 frames.
+                It will choose the longest length that fits the original video.
+                For example, if the video is 9 frames total, it will clip to 5 frames.
+        strides: frame skip (an int or a list of ints).
+                For example, 1 denotes no skip. 2 denotes select every other frame. 3
+                denotes select every third frame. When a list is given, stride is randomly
+                chosen with even probability. However, user may set it to [1,1,2] to
+                denote 1 with 66% probability and 2 with 33% probability.
+        clip:   clip location.
+                    "center":   clip video at the center.
+                    "uniform":  clip video uniformly at random.
+        jitter: jitter to the location.
+                Only applicable when clip is "center".
+                The value is the stdev of the normal distribution to shift the index.
+    """
+
+    def __init__(
+        self,
+        lengths: Union[int, List[int]],
+        strides: Union[int, List[int]] = 1,
+        clip: Literal["center", "uniform"] = "uniform",
+        jitter: float = 0.0,
+    ):
+        lengths = [lengths] if isinstance(lengths, int) else lengths
+        strides = [strides] if isinstance(strides, int) else strides
+        assert len(lengths) > 0
+        assert len(strides) > 0
+        assert clip in ["center", "uniform"]
+        assert jitter >= 0
+        self.lengths = np.array(lengths)
+        self.strides = np.array(strides)
+        self.clip = clip
+        self.jitter = jitter
+
+    def __call__(
+        self,
+        num_frames: int,
+    ) -> FrameSamplerOutput:
+        # Choose stride.
+        # A stride is valid if some configured length satisfies num_frames // stride >= length.
+        # Then randomly choose a valid stride; raise ValueError when no stride is valid.
+        valid_strides = np.any(num_frames // self.strides >= self.lengths.reshape(-1, 1), axis=0)
+        valid_strides = self.strides[valid_strides]
+        if valid_strides.size <= 0:
+            raise ValueError(f"Video is too short ({num_frames} frames).")
+        stride = np.random.choice(valid_strides)
+
+        # Choose length.
+        # Pick the max length that can fit the video under the current stride.
+        valid_lengths = self.lengths[num_frames // stride >= self.lengths]
+        length = np.max(valid_lengths)
+
+        # Choose start index.
+        min_start_index = 0
+        max_start_index = num_frames - 1 - stride * (length - 1)  # last start that still fits
+        mid_start_index = round((min_start_index + max_start_index) / 2)
+        jitter = round(np.random.normal(loc=0, scale=self.jitter))  # always drawn, used only for "center"
+
+        if self.clip == "center":
+            start_index = mid_start_index + jitter
+        elif self.clip == "uniform":
+            start_index = np.random.randint(min_start_index, max_start_index + 1)
+        else:
+            raise NotImplementedError
+
+        start_index = np.clip(start_index, min_start_index, max_start_index)  # keep the clip inside the video
+
+        # Compute indices
+        indices = np.arange(start_index, start_index + length * stride, stride)
+
+        # Return indices and additional information to return to user.
+        return FrameSamplerOutput(
+            indices=indices.tolist(),
+            additional_info={
+                "stride": stride,
+                "start_frame": start_index,
+                "end_frame": start_index + length * stride,  # one stride past the last sampled index
+                "total_frames": num_frames,
+            },
+        )
+
+
+@dataclass
+class AdaptiveAdvancedFrameSamplerStrategy:  # one stride option consumed by AdaptiveAdvancedFrameSampler
+    stride: int  # frame step; the sampler asserts a positive int that is unique among its strategies
+    stride_prob: float  # non-negative weight of this stride; the sampler normalizes the weights
+    frame_lengths: List[int]  # non-empty list of positive frame lengths available at this stride
+    frame_lengths_prob: Union[Literal["uniform", "harmonic"], List[float]]  # preset name or one weight per length
+
+
+class AdaptiveAdvancedFrameSampler(FrameSampler):
+    """
+    Advanced adaptive frame sampler: every stride has its own candidate frame lengths, and
+    both the stride and the frame length are sampled from probability tables.
+
+    strategies: non-empty list of strategies with unique strides; weights may be ints or floats.
+    clip:   clip location.
+            "center":   clip video at the center.
+            "uniform":  clip video uniformly at random.
+    jitter: jitter to the location.
+            Only applicable when clip is "center".
+            The value is the stdev of the normal distribution to shift the index.
+    """
+
+    def __init__(
+        self,
+        strategies: List[AdaptiveAdvancedFrameSamplerStrategy],
+        clip: Literal["center", "uniform"] = "uniform",
+        jitter: float = 0.0,
+    ):
+        assert len(strategies) > 0, "Strategies must not be empty"
+        assert len({s.stride for s in strategies}) == len(strategies), "Strides cannot duplicate."
+        assert clip in ["center", "uniform"]
+        assert jitter >= 0
+        self.clip = clip
+        self.jitter = jitter
+        self.strides = []
+        self.strides_prob = []
+        self.frame_lengths = []
+        self.frame_lengths_prob = []
+
+        for strategy in sorted(strategies, key=lambda s: s.stride):
+            # Validate strides.
+            assert isinstance(strategy.stride, int), "Stride must be an integer."
+            assert strategy.stride > 0, "Stride must be a positive integer."
+            self.strides.append(strategy.stride)
+
+            # Assign strides_prob.
+            assert isinstance(strategy.stride_prob, (int, float)), "Stride prob is not int/float."
+            assert strategy.stride_prob >= 0, "Stride prob must be non-negative."
+            self.strides_prob.append(strategy.stride_prob)
+
+            # Assign frame lengths, sort by value.
+            assert len(strategy.frame_lengths) > 0, "Frame lengths must not be empty."
+            frame_lengths = np.array(strategy.frame_lengths)
+            assert frame_lengths.dtype == int, "Frame lengths must be integers."
+            assert np.all(frame_lengths > 0), "Frame lengths must be positive integers."
+            frame_lengths_sorted_idx = np.argsort(frame_lengths)
+            frame_lengths = frame_lengths[frame_lengths_sorted_idx]
+            self.frame_lengths.append(frame_lengths)
+
+            # Assign frame lengths prob (as floats, so `/=` below works) and apply the same sorting.
+            if strategy.frame_lengths_prob == "uniform":
+                # e.g. [0.2, 0.2, 0.2, 0.2, 0.2]
+                frame_lengths_prob = np.full(len(frame_lengths), 1.0 / len(frame_lengths))
+            elif strategy.frame_lengths_prob == "harmonic":
+                # e.g. [0.2, 0.25, 0.33, 0.5, 1]
+                frame_lengths_prob = np.flip(1 / np.arange(1, len(frame_lengths) + 1))
+            elif isinstance(strategy.frame_lengths_prob, list):
+                frame_lengths_prob = np.array(strategy.frame_lengths_prob, dtype=float)
+                # Check the length before reordering: indexing would silently drop extra entries.
+                assert len(frame_lengths_prob) == len(frame_lengths), "Frame lengths prob mismatch."
+                frame_lengths_prob = frame_lengths_prob[frame_lengths_sorted_idx]
+            else:
+                raise NotImplementedError
+            assert np.all(frame_lengths_prob >= 0), "Frame lengths prob must not be negative."
+            assert frame_lengths_prob.sum() > 0, "Frame lengths prob must not be all zeros."
+            frame_lengths_prob /= frame_lengths_prob.sum()
+            self.frame_lengths_prob.append(frame_lengths_prob)
+
+        self.strides = np.array(self.strides)
+        self.strides_prob = np.array(self.strides_prob, dtype=float)  # float: int weights cannot be divided in place
+        assert self.strides_prob.sum() > 0, "Strides prob must not be all zeros."
+        self.strides_prob /= self.strides_prob.sum()
+
+    def __call__(self, num_frames: int) -> FrameSamplerOutput:
+        sample_result = adptive_sample_framelen_and_stride(
+            num_frames=num_frames,
+            strides=self.strides,
+            strides_prob=self.strides_prob,
+            frame_lengths=self.frame_lengths,
+            frame_lengths_prob=self.frame_lengths_prob,
+        )
+
+        stride = sample_result["stride"]
+        length = sample_result["frame_length"]
+
+        # Choose start index.
+        min_start_index = 0
+        max_start_index = num_frames - 1 - stride * (length - 1)  # last start that still fits
+        mid_start_index = round((min_start_index + max_start_index) / 2)
+        jitter = round(np.random.normal(loc=0, scale=self.jitter))  # always drawn, used only for "center"
+
+        if self.clip == "center":
+            start_index = mid_start_index + jitter
+        elif self.clip == "uniform":
+            start_index = np.random.randint(min_start_index, max_start_index + 1)
+        else:
+            raise NotImplementedError
+
+        start_index = np.clip(start_index, min_start_index, max_start_index)
+
+        # Compute indices
+        indices = np.arange(start_index, start_index + length * stride, stride)
+
+        # Return indices and additional information to return to user.
+        return FrameSamplerOutput(
+            indices=indices.tolist(),
+            additional_info={
+                "stride": stride,
+                "start_frame": start_index,
+                "end_frame": start_index + length * stride,  # one stride past the last sampled index
+                "total_frames": num_frames,
+            },
+        )
+
+
+class MultiClipsFrameSampler(FrameSampler):
+    """Multi-clip frame sampler: distributes the sampled frame count across the given clips.
+    Arguments:
+        temporal: downsample factor on temporal
+        sample_fps: fps of sampled frames
+        truncate: whether to truncate by max duration of the video (default = false, already truncate in clip_indices)
+        max_duration: truncate by max duration of the video
+        length_type: "kn" yields k * temporal frames; "kn+1" adds one extra frame to the first clip
+        assert_seconds: if true, the total duration is rounded to whole seconds before computing the frame count
+    """
+
+    def __init__(
+        self,
+        temporal: int = 4,
+        sample_fps: int = 12,
+        truncate: bool = False,
+        max_duration: int = 12,
+        length_type: Literal["kn", "kn+1"] = "kn+1",
+        assert_seconds: bool = True,
+    ):
+        self.temporal = temporal
+        self.sample_fps = sample_fps
+        self.truncate = truncate
+        self.max_duration = max_duration
+        self.length_type = length_type
+        self.assert_seconds = assert_seconds
+
+    def __call__(self, frames_info: Dict[str, Any]) -> FrameSamplerOutput:
+        """Pick frame indices for the (start, end) clips in frames_info["clip_indices"]; frames_info["fps"] is the source fps."""
+        clip_indices = frames_info["clip_indices"]
+        origin_fps = frames_info["fps"]
+
+        if self.truncate:
+            clip_indices = self.truncate_to_bucket(clip_indices, origin_fps)
+
+        if self.assert_seconds:
+            duration_sec = int(round(sum([(end - start) / origin_fps for start, end in clip_indices])))
+            if not self.truncate:                         # added: cap the total duration even when clips are not truncated
+                duration_sec = min(duration_sec, self.max_duration)
+            duration = int(round(duration_sec))
+
+            n_frames = duration * self.sample_fps
+            if self.length_type == "kn+1":
+                n_frames += 1
+        else:
+            duration = sum([(end - start) / origin_fps for start, end in clip_indices])
+            if not self.truncate:                         # added: same cap for the fractional duration
+                duration = min(duration, self.max_duration)
+            n_frames = int(round(duration * self.sample_fps))
+            if self.length_type == "kn+1":
+                if n_frames % self.temporal != 0:
+                    n_frames = n_frames // self.temporal * self.temporal + 1
+                else:
+                    n_frames = n_frames // self.temporal * self.temporal + 1 - self.temporal
+        clip_n_frames = self.split_n_frames_by_clip(n_frames, clip_indices)
+        sample_indices = self.sample_frame_indices(clip_indices, clip_n_frames)
+
+        clip_n_latent_frames = [(n + self.temporal - 1) // self.temporal for n in clip_n_frames]  # ceil(n / temporal)
+
+        return FrameSamplerOutput(
+            indices=sample_indices,
+            additional_info={
+                "clip_n_frames": clip_n_frames,
+                "clip_n_latent_frames": clip_n_latent_frames,
+            },
+        )
+
+    def truncate_to_bucket(self, clip_indices, fps):  # shorten one clip so the total is min(int(duration), max_duration)
+        clip_indices = [tuple(index) for index in clip_indices]
+        durations = []
+        for start, end in clip_indices:
+            durations.append((end - start) / fps)
+        duration = sum(durations)
+        max_duration = min(int(duration), self.max_duration)
+        cutoff = duration - max_duration  # seconds that have to be removed
+        if cutoff > 0:
+            if durations[-1] - cutoff > durations[0] - cutoff:  # trim the tail of the last clip
+                start, end = clip_indices[-1]
+                end = min(round((durations[-1] - cutoff) * fps), end) + start
+                clip_indices[-1] = (start, end)
+            else:  # otherwise trim the head of the first clip
+                start, end = clip_indices[0]
+                start = max(end - round((durations[0] - cutoff) * fps), start)
+                clip_indices[0] = (start, end)
+        return clip_indices
+
+    def split_n_frames_by_clip(self, n_frames, clip_indices):  # per-clip frame counts, proportional to clip length
+        n_latent_frames = n_frames // self.temporal
+        clip_lengths = [(end - start) for start, end in clip_indices]
+        clip_n_latent_frames = [int(l / sum(clip_lengths) * n_latent_frames) for l in clip_lengths]
+        n_remains = n_latent_frames - sum(clip_n_latent_frames)
+        for i in range(n_remains):  # hand the latent frames lost to flooring to the earliest clips
+            clip_n_latent_frames[i] += 1
+        clip_n_frames = [n * self.temporal for n in clip_n_latent_frames]
+        if self.length_type == "kn+1":
+            clip_n_frames[0] += 1
+        return clip_n_frames
+
+    def sample_frame_indices(self, clip_indices, clip_n_frames):  # evenly spaced source indices inside each clip
+        shift_clip_indices = []
+        accum_n_frames = 0
+        for start, end in clip_indices:  # re-base every clip so that the clips lie back to back from 0
+            start, end = accum_n_frames, accum_n_frames + (end - start)
+            shift_clip_indices.append((start, end))
+            accum_n_frames += end - start
+
+        all_sample_indices = []
+        for i, ((start, end), (shift_start, shift_end), n_frames) in enumerate(
+            zip(clip_indices, shift_clip_indices, clip_n_frames)
+        ):
+            indices = np.arange(start, end)
+            next_shift_start = (
+                shift_clip_indices[i + 1][0] if i < len(clip_indices) - 1 else shift_end
+            )
+            shift_sample_indices = (
+                np.linspace(shift_start, next_shift_start - 1, n_frames, dtype=int) - shift_start
+            )
+            sample_indices = indices[shift_sample_indices].tolist()
+            all_sample_indices.extend(sample_indices)
+
+        return all_sample_indices
+
+
+def normalize_probabilities(
+    items: np.ndarray,
+    probs: np.ndarray,
+    masks: np.ndarray,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Keep the feasible items and move the probability of the dropped ones onto the last kept item.
+
+    `masks` must be a boolean array whose True entries all precede its False entries.
+    Returns the kept items together with their adjusted probabilities.
+    """
+    assert len(items), "Items must not be empty."
+    assert len(items) == len(masks) == len(probs), "Lengths must match."
+    assert isinstance(probs, np.ndarray), "Probs must be an np.ndarray."
+    assert isinstance(masks, np.ndarray), "Masks must be an np.ndarray."
+    assert masks.dtype == bool, "Masks must be boolean."
+    assert masks.any(), "Masks must not be all False."
+    assert (np.diff(masks.astype("int")) <= 0).all(), "Masks must not break monotonicity."
+    kept_items, kept_probs = items[masks], probs[masks]
+    dropped_mass = probs[np.logical_not(masks)].sum()
+    kept_probs[-1] = kept_probs[-1] + dropped_mass
+    return kept_items, kept_probs
+
+
+def adptive_sample_framelen_and_stride(
+    num_frames: int,
+    strides: np.ndarray,
+    strides_prob: np.ndarray,
+    frame_lengths: List[np.ndarray],
+    frame_lengths_prob: List[Optional[np.ndarray]],
+) -> Dict[str, Any]:
+    """Adaptively sample frame length and stride for a video.
+
+    Args:
+        num_frames: Number of frames in the current video.
+        strides: A list of strides.
+        strides_prob: The probability for each stride.
+        frame_lengths: The number of frames (sorted) to sample from at the current stride.
+            For example, `frame_length=10` at `stride=2` means that we need to have 20 frames.
+            When the number of frames to sample is infeasible, it will select the feasible frame
+            lengths and move the probability of the infeasible ones to the last feasible length.
+            For example, if `num_frames=10`, `frame_lengths[stride2]=[1, 2, 5]`,
+            `frame_lengths[stride3]=[1, 3, 5]`, we can sample frame lengths 1, 2, and 5 at
+            `stride=2` (2, 4, and 10 frames) but only frame lengths 1, 3 at `stride=3`. In this
+            case, we will add the probability of `frame_length=5` at `stride=3` to `frame_length=3`
+            at `stride=3`, making it more likely to be selected.
+        frame_lengths_prob: The frame probabilities to sample from the corresponding frame lengths.
+            An entry of None means uniform sampling over that stride's frame lengths.
+    Returns:
+        dictionary: A dictionary with the keys `stride` and `frame_length`. If no stride is
+        feasible, an AssertionError is raised.
+    """
+    assert len(strides) == len(strides_prob) == len(frame_lengths) == len(frame_lengths_prob)
+
+    # Prepare frame_lengths_mask for each stride (True where num_frames // stride >= length).
+    frame_lengths_mask = [num_frames // s >= l for s, l in zip(strides, frame_lengths)]
+
+    # Prepare stride mask and prob. A stride is feasible if any of its lengths is feasible.
+    strides_idxs = np.arange(len(strides))
+    strides_mask = np.array([np.any(mask) for mask in frame_lengths_mask])
+    assert np.any(strides_mask), (
+        f"Cannot sample frames={num_frames} "
+        + f"from strides={strides} and lengths={frame_lengths}"
+    )
+    # NOTE(review): normalize_probabilities requires feasible entries to form a prefix of the mask; confirm for strides.
+    # Drop infeasible strides and normalize probability.
+    strides_idxs, strides_prob = normalize_probabilities(strides_idxs, strides_prob, strides_mask)
+
+    # Choose stride.
+    stride_idx = np.random.choice(strides_idxs, p=strides_prob)
+    stride = strides[stride_idx]
+
+    # Prepare frame_lengths mask and prob for the current stride.
+    lengths = frame_lengths[stride_idx]
+    lengths_mask = frame_lengths_mask[stride_idx]
+    lengths_prob = frame_lengths_prob[stride_idx]
+    if lengths_prob is None:
+        lengths_prob = np.full(len(lengths), 1.0 / len(lengths))
+
+    # Drop infeasible lengths and normalize probability.
+    lengths, lengths_prob = normalize_probabilities(lengths, lengths_prob, lengths_mask)
+
+    # Choose frame length.
+    length = np.random.choice(lengths, p=lengths_prob)
+    return dict(stride=stride, frame_length=length)
diff --git a/data/video/sampler/utils.py b/data/video/sampler/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..52371617ac46de394cae6c9bf6aa7622847c5ad4
--- /dev/null
+++ b/data/video/sampler/utils.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .frames import (
+    AdaptiveAdvancedFrameSampler,
+    AdaptiveFrameSampler,
+    AllFrameSampler,
+    ConsecutiveFrameSampler,
+    MultiClipsFrameSampler,
+    FixedFrameSampler,
+    OnlyFirstFrameSampler,
+)
+
+FRAME_SAMPLER_TYPES = {  # registry: sampler type name -> frame sampler class
+    "only_first_frame": OnlyFirstFrameSampler,
+    "fixed": FixedFrameSampler,
+    "all": AllFrameSampler,
+    "adaptive": AdaptiveFrameSampler,
+    "adaptive_advanced": AdaptiveAdvancedFrameSampler,
+    "range": ConsecutiveFrameSampler,
+    "multi_clips": MultiClipsFrameSampler,
+}
diff --git a/data/video/transforms/area_resize.py b/data/video/transforms/area_resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..5da0a9df5a541e785e9700d5b4481cdb00c20d92
--- /dev/null
+++ b/data/video/transforms/area_resize.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List, Union
+import torch
+from PIL import Image
+from torchvision.transforms import functional as TVF
+from torchvision.transforms.functional import InterpolationMode, to_tensor
+
+
+class AreaResize:
+    """Rescale images so their pixel area is about `max_area`, keeping the aspect ratio.
+
+    Arguments:
+        max_area: target area (height * width) in pixels.
+        downsample_only: when true, the scale factor is capped at 1 so images are not enlarged.
+        interpolation: interpolation mode handed to the resize call.
+    """
+
+    def __init__(
+        self,
+        max_area: float,
+        downsample_only: bool = False,
+        interpolation: InterpolationMode = InterpolationMode.BICUBIC,
+    ):
+        self.max_area = max_area
+        self.downsample_only = downsample_only
+        self.interpolation = interpolation
+
+    def __call__(self, image: Union[torch.Tensor, Image.Image, List[Image.Image]]):
+        """Resize a tensor, a PIL image, or a list of PIL images.
+
+        PIL results are converted with `to_tensor`; a list is stacked into one tensor.
+        """
+        is_pil_list = False
+        if isinstance(image, torch.Tensor):
+            height, width = image.shape[-2:]
+        elif isinstance(image, Image.Image):
+            width, height = image.size
+        elif isinstance(image, list) and isinstance(image[0], Image.Image):
+            is_pil_list = True
+            width, height = image[0].size
+        else:
+            raise NotImplementedError
+
+        # Scale factor that maps the current area onto max_area.
+        scale = math.sqrt(self.max_area / (height * width))
+        if scale >= 1 and self.downsample_only:
+            # Keep the original height and width for small pictures.
+            scale = 1
+        target_size = (round(height * scale), round(width * scale))
+
+        def _resize(img):
+            return TVF.resize(
+                img, size=target_size, interpolation=self.interpolation
+            )
+
+        if is_pil_list:
+            return torch.stack([to_tensor(_resize(frame)) for frame in image])
+        resized = _resize(image)
+        return to_tensor(resized) if isinstance(resized, Image.Image) else resized
diff --git a/data/video/transforms/bucket_resize.py b/data/video/transforms/bucket_resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b1b977cd8573d7edd6424cf7db7e2c8a8452a97
--- /dev/null
+++ b/data/video/transforms/bucket_resize.py
@@ -0,0 +1,181 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import math
+from typing import List, Tuple, Union
+import numpy as np
+import torch
+from PIL import Image
+from torchvision.transforms import RandomResizedCrop
+from torchvision.transforms.functional import InterpolationMode, to_tensor
+
+
+class BucketResize:  # resize/crop images to the predefined bucket with the nearest aspect ratio
+    def __init__(
+        self,
+        max_area: float,
+        interpolation: InterpolationMode = InterpolationMode.LANCZOS,
+        aspect_ratios: List[str] = None,
+        stride: Union[int, Tuple[int]] = None,
+    ):
+        self.max_area = max_area
+        self.interpolation = interpolation
+        # Both `aspect_ratios` ("W:H" strings) and `stride` are required despite the None defaults.
+        assert aspect_ratios and stride, "`aspect_ratios` or `stride` not given!"
+        self.buckets, self.bucket_ratios = self.init_buckets(aspect_ratios, max_area, stride)
+        self.bucket_resize = {
+            # NOTICE: despite the "random" in its name, with this setting it is a center crop without randomness
+            # bucket: (h,w)
+            bucket: RandomResizedCrop(
+                size=(bucket[0], bucket[1]),
+                scale=(1, 1),
+                ratio=(bucket_ratio, bucket_ratio),
+                interpolation=self.interpolation,
+            )
+            for bucket, bucket_ratio in zip(self.buckets, self.bucket_ratios)
+        }
+
+    def __call__(self, image: Union[torch.Tensor, Image.Image, List[Image.Image]]):
+        """Resize `image` (tensor, PIL image, or list of PIL images) to its nearest bucket; PIL results become tensors."""
+        if isinstance(image, torch.Tensor):
+            height, width = image.shape[-2:]
+        elif isinstance(image, Image.Image):
+            width, height = image.size
+        elif isinstance(image, list) and isinstance(image[0], Image.Image):
+            width, height = image[0].size
+        else:
+            raise NotImplementedError
+
+        bucket = self.find_nearest_bucket(width, height)
+        resizer = self.bucket_resize[bucket]
+
+        if isinstance(image, list) and isinstance(image[0], Image.Image):
+            return torch.stack([to_tensor(resizer(_image)) for _image in image])
+        else:
+            image = resizer(image)
+            if isinstance(image, Image.Image):
+                image = to_tensor(image)
+            return image
+
+    def find_nearest_bucket(self, width, height):
+        """
+        Find the bucket (height, width) whose aspect ratio is closest to the given image's.
+        """
+        image_ratio = width / height
+        diff = np.abs(image_ratio - self.bucket_ratios)
+        index = diff.argmin()
+        return self.buckets[index]
+
+    @staticmethod
+    def init_buckets(aspect_ratio_names, max_area, stride):
+        """
+        Build the (height, width) sizes closest to the given aspect ratios and area whose sides are multiples of the stride (VAE downsampling times patch size).
+        """
+        if not isinstance(stride, (tuple, list)):
+            stride = (stride, stride)
+        height_factor, width_factor = stride
+
+        buckets, bucket_ratios = [], []
+        for name in aspect_ratio_names:
+            w, h = (int(v) for v in name.split(":"))
+            aspect_ratio = w / h
+            # Candidate 1: round the width to its factor first, then derive the height from it.
+            resize_width1 = math.sqrt(max_area * aspect_ratio)
+            bucket_width1 = round(resize_width1 / width_factor) * width_factor
+            resize_height1 = bucket_width1 / aspect_ratio
+            bucket_height1 = round(resize_height1 / height_factor) * height_factor
+            bucket_ratio1 = bucket_width1 / bucket_height1
+            bucket_area1 = bucket_width1 * bucket_height1
+            # Candidate 2: round the height to its factor first, then derive the width from it.
+            resize_height2 = math.sqrt(max_area / aspect_ratio)
+            bucket_height2 = round(resize_height2 / height_factor) * height_factor
+            resize_width2 = bucket_height2 * aspect_ratio
+            bucket_width2 = round(resize_width2 / width_factor) * width_factor
+            bucket_ratio2 = bucket_width2 / bucket_height2
+            bucket_area2 = bucket_width2 * bucket_height2
+            # Prefer the candidate closer to the target ratio; break ties by closeness to max_area.
+            if abs(bucket_ratio1 - aspect_ratio) < abs(bucket_ratio2 - aspect_ratio):
+                bucket_width, bucket_height = bucket_width1, bucket_height1
+            elif abs(bucket_ratio1 - aspect_ratio) > abs(bucket_ratio2 - aspect_ratio):
+                bucket_width, bucket_height = bucket_width2, bucket_height2
+            else:
+                if abs(bucket_area1 - max_area) <= abs(bucket_area2 - max_area):
+                    bucket_width, bucket_height = bucket_width1, bucket_height1
+                else:
+                    bucket_width, bucket_height = bucket_width2, bucket_height2
+
+            bucket_ratio = bucket_width / bucket_height
+
+            buckets.append((bucket_height, bucket_width))
+            bucket_ratios.append(bucket_ratio)
+
+        bucket_ratios = np.array(bucket_ratios)
+
+        return buckets, bucket_ratios
+
+
+# ================================================================= #
+# <<< Helper for inspecting the buckets produced by BucketResize >>>
+# ================================================================= #
+
+def check_buckets(max_area: int, aspect_ratios: List[str], stride: int):
+    """
+    Print the buckets that BucketResize.init_buckets builds for a configuration.
+
+    Args:
+        max_area (int): target total pixel area.
+        aspect_ratios (List[str]): target aspect ratios (e.g. ["1:1", "4:3"]).
+        stride (int): step that bucket heights and widths are rounded to multiples of.
+    """
+    header = "--- Checking Configuration ---"
+    summary = f"Max Area: {max_area} | Aspect Ratios: {aspect_ratios} | Stride: {stride}"
+    print(header, summary, "-" * 35, sep="\n")
+
+    sizes, ratios = BucketResize.init_buckets(aspect_ratios, max_area, stride)
+
+    print("Generated Buckets (Height, Width) and Ratios:")
+    for (height, width), ratio in zip(sizes, ratios):
+        # One line per bucket: its size, aspect ratio and total area.
+        print(f"  - Bucket: ({height:4d}, {width:4d})  |  Ratio: {ratio:.4f}  |  Area: {height*width}")
+    print("\n")
+
+
+if __name__ == '__main__':
+    # Example 1: originally the 256x256 case; currently run with 224*224 and stride 28.
+    # Note: max_area is the total pixel count, hence a product such as 256*256.
+    check_buckets(
+        # max_area=256*256,
+        max_area=224*224,
+        aspect_ratios=["21:9", '1:1', '4:3', '3:4', '9:16', '16:9'],
+        stride=28 #16
+    )
+
+    # check_buckets(
+    #     max_area=640*640,
+    #     aspect_ratios=['1:1', '4:3', '3:4', '9:16', '16:9'],
+    #     stride=16
+    # )
+
+    # check_buckets(
+    #     max_area=512*512,
+    #     aspect_ratios=['1:1', '4:3', '3:4', '9:16', '16:9'],
+    #     stride=16
+    # )
+
+    # check_buckets(
+    #     max_area=1024*1024,
+    #     aspect_ratios=['1:1', '4:3', '3:4', '16:9', '9:16'],
+    #     stride=16
+    # )
diff --git a/data/video/transforms/divisible_crop.py b/data/video/transforms/divisible_crop.py
new file mode 100644
index 0000000000000000000000000000000000000000..66f7fbba02e742d2cb2ec8578e54b2f8836b0254
--- /dev/null
+++ b/data/video/transforms/divisible_crop.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Union
+import torch
+from PIL import Image
+from torchvision.transforms import functional as TVF
+
+
+class DivisibleCrop:
+    """Center-crop an image so that its height and width are divisible by `factor`."""
+
+    def __init__(self, factor):
+        # `factor` is a single value for both axes, or a (height_factor, width_factor) tuple/list.
+        if not isinstance(factor, (tuple, list)):
+            factor = (factor, factor)
+        self.height_factor, self.width_factor = factor[0], factor[1]
+
+    def __call__(self, image: Union[torch.Tensor, Image.Image]):
+        """Crop a tensor (height, width read from its last two dims) or a PIL image."""
+        if isinstance(image, torch.Tensor):
+            height, width = image.shape[-2:]
+        elif isinstance(image, Image.Image):
+            width, height = image.size
+        else:
+            raise NotImplementedError
+        # Largest size not exceeding the input whose sides are multiples of the factors.
+        crop_size = (height - height % self.height_factor, width - width % self.width_factor)
+        return TVF.center_crop(img=image, output_size=crop_size)
diff --git a/data/video/transforms/na_resize.py b/data/video/transforms/na_resize.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4d7968bc22831e1e868c2929a92c139b0972cd6
--- /dev/null
+++ b/data/video/transforms/na_resize.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Literal
+from torchvision.transforms import CenterCrop, Compose, InterpolationMode, Resize
+
+from .area_resize import AreaResize
+from .bucket_resize import BucketResize
+
+def NaResize(
+    resolution: int,
+    mode: Literal["area", "square", "bucket"],
+    downsample_only: bool,
+    interpolation: InterpolationMode = InterpolationMode.BICUBIC,
+    **kwargs,
+):
+    """Build the resize transform selected by `mode`.
+
+    "area":   AreaResize with max_area = resolution**2 (the only mode that uses `downsample_only`).
+    "square": Resize to `resolution` followed by CenterCrop(resolution).
+    "bucket": BucketResize with max_area = resolution**2; the optional kwargs `aspect_ratios`
+              and `stride` override the defaults.
+    Any other mode raises ValueError.
+    """
+    if mode == "area":
+        area_resize = AreaResize(
+            max_area=resolution**2,
+            downsample_only=downsample_only,
+            interpolation=interpolation,
+        )
+        return area_resize
+    if mode == "square":
+        steps = [Resize(size=resolution, interpolation=interpolation), CenterCrop(resolution)]
+        return Compose(steps)
+    if mode != "bucket":
+        raise ValueError(f"Unknown resize mode: {mode}")
+    # Bucket mode: optional overrides are read from kwargs.
+    aspect_ratios = kwargs.get("aspect_ratios", ["21:9", "16:9", "4:3", "1:1", "3:4", "9:16"])
+    stride = kwargs.get("stride", 16)
+    bucket_resize = BucketResize(
+        max_area=resolution**2,
+        interpolation=interpolation,
+        aspect_ratios=aspect_ratios,
+        stride=stride,
+    )
+    return Compose([bucket_resize])
diff --git a/data/video/transforms/rearrange.py b/data/video/transforms/rearrange.py
new file mode 100644
index 0000000000000000000000000000000000000000..16fa66188a921deade2181a3551a317d3d5c3562
--- /dev/null
+++ b/data/video/transforms/rearrange.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from einops import rearrange
+
+
+class Rearrange:
+    """Callable transform that applies a fixed einops ``rearrange`` pattern."""
+
+    def __init__(self, pattern: str, **rearrange_kwargs):
+        # Keep the pattern and the extra keyword arguments for every later call.
+        self.kwargs = rearrange_kwargs
+        self.pattern = pattern
+
+    def __call__(self, x):
+        """Return ``rearrange(x, self.pattern, **self.kwargs)``."""
+        stored_pattern, stored_kwargs = self.pattern, self.kwargs
+        return rearrange(x, stored_pattern, **stored_kwargs)
diff --git a/inference_lance.py b/inference_lance.py
new file mode 100644
index 0000000000000000000000000000000000000000..17de3c7238cbef4f36c4e2c32fc2b68ee2a9166a
--- /dev/null
+++ b/inference_lance.py
@@ -0,0 +1,671 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import warnings
+warnings.filterwarnings("ignore", message=".*pkg_resources is deprecated.*", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning, module="diffusers.models.transformers.transformer_2d")
+import os
+import time
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"
+
+import os.path as osp
+from copy import deepcopy
+import json
+from typing import Tuple, cast, Optional
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader
+from transformers import HfArgumentParser, set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from safetensors.torch import load_file
+
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.data_utils import add_special_tokens
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.lance import LanceConfig, Lance, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from common.utils.misc import tuple_mul, AutoEncoderParams
+from common.utils.distributed import get_global_rank
+from common.utils.logging import get_logger
+from common.val.utils import make_padded_latent, decode_video_tensor
+from data.datasets_custom import ValidationDataset
+from config.config_factory import (
+    ModelArguments,
+    DataArguments,
+    InferenceArguments,
+    get_model_path,
+)
+
+from tqdm import trange
+
+
+# Constants
+MAX_GENERATION_LENGTH = 256  # max_length passed to text generation
+PROMPT_JSON_FILENAME = "prompt.json"
+RESULT_JSON_FILENAME = "result.json"
+INTERNAL_VALIDATION_MAX_SAMPLES = 100000
+
+# Task identifiers accepted by the inference entry point.
+TASK_T2V = "t2v"
+TASK_T2I = "t2i"
+TASK_X2T_IMAGE = "x2t_image"
+TASK_X2T_VIDEO = "x2t_video"
+TASK_IMAGE_EDIT = "image_edit"
+TASK_VIDEO_EDIT = "video_edit"
+
+# Tasks routed to the generation branch vs. the understanding (x2t) branch.
+GENERATION_TASKS = {TASK_T2V, TASK_T2I, TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
+UNDERSTANDING_TASKS = {TASK_X2T_IMAGE, TASK_X2T_VIDEO}
+
+# Per-task defaults: model family, example dataset JSON and output prefix.
+TASK_DEFAULT_CONFIGS = {
+    TASK_T2I: dict(
+        model_family="image",
+        example_json="config/examples/t2i_example.json",
+        save_path_prefix="results/t2i_sample",
+    ),
+    TASK_T2V: dict(
+        model_family="video",
+        example_json="config/examples/t2v_example.json",
+        save_path_prefix="results/t2v_sample",
+    ),
+    TASK_IMAGE_EDIT: dict(
+        model_family="image",
+        example_json="config/examples/image_edit_example.json",
+        save_path_prefix="results/image_edit_sample",
+    ),
+    TASK_VIDEO_EDIT: dict(
+        model_family="video",
+        example_json="config/examples/video_edit_example.json",
+        save_path_prefix="results/video_edit_sample",
+    ),
+    TASK_X2T_IMAGE: dict(
+        model_family="image",
+        example_json="config/examples/x2t_image_example.json",
+        save_path_prefix="results/x2t_image_sample",
+    ),
+    TASK_X2T_VIDEO: dict(
+        model_family="video",
+        example_json="config/examples/x2t_video_example.json",
+        save_path_prefix="results/x2t_video_sample",
+    ),
+}
+
+def init_from_model_path_if_needed(model: Qwen2ForCausalLM, model_args: ModelArguments):
+    """Load the trained Lance checkpoint found under ``model_args.model_path``.
+
+    ``model.safetensors`` is preferred; ``ema.safetensors`` is used only when
+    the former does not exist. The weights are read onto the CPU and applied
+    with ``strict=False``.
+
+    Args:
+        model: Module that receives the checkpoint weights.
+        model_args: Provides ``model_path``, the checkpoint directory.
+
+    Returns:
+        The value returned by ``model.load_state_dict(..., strict=False)``.
+
+    Raises:
+        FileNotFoundError: If neither checkpoint file exists in the directory.
+    """
+    path_dir = model_args.model_path
+    # Candidates in priority order: regular weights first, EMA weights second.
+    candidates = (
+        osp.join(path_dir, "model.safetensors"),
+        osp.join(path_dir, "ema.safetensors"),
+    )
+    model_path_ft = next((path for path in candidates if osp.exists(path)), None)
+    if model_path_ft is None:
+        raise FileNotFoundError(
+            f"Checkpoint loading failed: No valid checkpoint ('model.safetensors' or 'ema.safetensors') found in {path_dir}"
+        )
+
+    model_state_dict = load_file(model_path_ft, device="cpu")
+
+    # NOTE: position embeds are fixed sinusoidal embeddings, so we can just pop it off,
+    # which makes it easier to adapt to different resolutions.
+    model_state_dict.pop('latent_pos_embed.pos_embed', None)
+
+    msg = model.load_state_dict(model_state_dict, strict=False)
+    # Drop the CPU copy of the weights once they have been applied to the model.
+    clean_memory(model_state_dict)
+
+    return msg
+
+
+def clean_memory(*objects):
+    """Empty the given containers, run the GC and release cached CUDA memory.
+
+    Only ``dict``, ``list`` and ``set`` arguments are cleared (in place); any
+    other argument is ignored.
+    """
+    import gc
+
+    for candidate in objects:
+        if isinstance(candidate, (dict, list, set)):
+            candidate.clear()
+
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+def apply_inference_defaults(
+    model_args: ModelArguments,
+    data_args: DataArguments,
+    inference_args: InferenceArguments,
+) -> None:
+    """Fill unset argument fields in place from the per-task default table.
+
+    Model paths and the dataset config file are filled only when empty. The
+    remaining inference fields are replaced only when they still equal the
+    value of a freshly constructed ``InferenceArguments()``.
+
+    Raises:
+        ValueError: If ``inference_args.task`` has no entry in
+            ``TASK_DEFAULT_CONFIGS``.
+    """
+    task_name = inference_args.task
+    if task_name not in TASK_DEFAULT_CONFIGS:
+        raise ValueError(f"Unsupported inference task: {task_name}")
+
+    task_config = TASK_DEFAULT_CONFIGS[task_name]
+    pristine = InferenceArguments()
+
+    # ---- model paths ----
+    family = task_config.get("model_family", "")
+    if not model_args.model_path and family:
+        model_args.model_path = get_model_path(f"lance.{family}")
+    if not getattr(model_args, "llm_path", ""):
+        model_args.llm_path = model_args.model_path
+    if not model_args.vit_path:
+        model_args.vit_path = get_model_path("vit.qwen2_5_vl")
+
+    # ---- dataset config ----
+    example_json = task_config.get("example_json")
+    if not data_args.val_dataset_config_file and example_json:
+        data_args.val_dataset_config_file = example_json
+
+    # ---- output location and sample cap ----
+    save_prefix = task_config.get("save_path_prefix")
+    if inference_args.save_path_gen == pristine.save_path_gen and save_prefix:
+        inference_args.save_path_gen = save_prefix
+    if inference_args.validation_max_samples == pristine.validation_max_samples:
+        inference_args.validation_max_samples = INTERNAL_VALIDATION_MAX_SAMPLES
+
+    # ---- fields that take an optional per-task override plus a type coercion ----
+    coerced_fields = (
+        ("video_height", int),
+        ("video_width", int),
+        ("resolution", None),
+        ("text_template", bool),
+    )
+    for field_name, coerce in coerced_fields:
+        fallback = getattr(pristine, field_name)
+        if getattr(inference_args, field_name) != fallback:
+            continue
+        chosen = task_config.get(field_name, fallback)
+        setattr(inference_args, field_name, chosen if coerce is None else coerce(chosen))
+
+
+def save_prompt_results(prompt_data_dict, save_path_gen, logger):
+    """Write ``prompt_data_dict`` as JSON into ``save_path_gen``.
+
+    The file is named by ``PROMPT_JSON_FILENAME``. ``logger`` is accepted but
+    not used.
+    """
+    target_file = os.path.join(save_path_gen, PROMPT_JSON_FILENAME)
+    with open(target_file, 'w', encoding='utf-8') as json_handle:
+        json.dump(prompt_data_dict, json_handle, ensure_ascii=False, indent=2)
+
+
+def normalize_understanding_answer(text: Optional[str]) -> str:
+    """Strip ``<|im_end|>`` markers and surrounding whitespace; ``None`` becomes ""."""
+    return "" if text is None else text.replace("<|im_end|>", "").strip()
+
+
+def save_understanding_results(
+    prompt_data_dict: dict,
+    dataset_config_file: str,
+    save_path_gen: str,
+) -> None:
+    """Join generated answers with the dataset samples and write result.json.
+
+    Each sample of the dataset JSON contributes one entry holding the visual
+    path (keyed by the sample's first element dtype), the question and the
+    normalized answer. Samples with fewer than two interleave elements or
+    without element dtypes are skipped. The answer is looked up in
+    ``prompt_data_dict`` first by the visual file's basename, then by the
+    sample key; it is "" when neither is present.
+    """
+    with open(dataset_config_file, "r", encoding="utf-8") as config_handle:
+        dataset_samples = json.load(config_handle)
+
+    result_entries = []
+    for sample_key, sample in dataset_samples.items():
+        interleave_array = sample.get("interleave_array", [])
+        element_dtype_array = sample.get("element_dtype_array", [])
+        if not (len(interleave_array) >= 2 and element_dtype_array):
+            continue
+
+        visual_path = interleave_array[0]
+        text_payload = interleave_array[1]
+        if isinstance(text_payload, list) and len(text_payload) > 1:
+            question = text_payload[1]
+        else:
+            question = ""
+        modality = element_dtype_array[0]
+
+        # First matching key wins: file basename first, then the sample key.
+        candidate_keys = (os.path.basename(visual_path), sample_key)
+        generated_answer = next(
+            (prompt_data_dict[key] for key in candidate_keys if key in prompt_data_dict),
+            "",
+        )
+
+        entry = {
+            modality: visual_path,
+            "question": question,
+            "answer": normalize_understanding_answer(generated_answer),
+        }
+        result_entries.append(entry)
+
+    result_json_path = os.path.join(save_path_gen, RESULT_JSON_FILENAME)
+    with open(result_json_path, "w", encoding="utf-8") as result_handle:
+        json.dump(result_entries, result_handle, ensure_ascii=False, indent=2)
+
+
+def validate_on_fixed_batch(
+    fsdp_model: Lance,
+    vae_model: Optional[WanVideoVAE],
+    tokenizer: Qwen2Tokenizer,
+    val_data_cpu: dict,
+    training_args: InferenceArguments,
+    model_args: ModelArguments,
+    inference_args: InferenceArguments,
+    new_token_ids,
+    image_token_id: int,
+    device: int,
+    save_source_video: bool = False,
+    save_path_gen: str = "",
+    save_path_gt: str = "",
+):
+    """Run inference on one validation batch and record its output.
+
+    For tasks in ``GENERATION_TASKS`` the model produces latents that are
+    decoded with ``vae_model`` and written under ``save_path_gen``; the
+    caption is stored in ``inference_args.prompt_data_dict`` keyed by the
+    output file name. For tasks in ``UNDERSTANDING_TASKS`` the generated token
+    sequence is decoded with ``tokenizer`` and stored under the sample index.
+    Any other task value does nothing besides moving data and freeing memory.
+
+    Args:
+        fsdp_model: Lance model; moved to ``device`` in bfloat16.
+        vae_model: VAE used to build ``padded_latent`` and to decode latents.
+        tokenizer: Tokenizer used by the understanding branch.
+        val_data_cpu: One collated batch. It must provide ``.cuda(device)``
+            returning an object with ``.to_dict()`` (so it is not a plain
+            ``dict`` despite the annotation).
+        training_args: Source of the sampling/CFG settings. ``main`` passes
+            the same object as ``inference_args``.
+        model_args: Provides ``cfg_text_scale``.
+        inference_args: Provides ``task``, ``use_KVcache`` and the
+            ``prompt_data_dict`` that collects the results.
+        new_token_ids: Special-token ids forwarded to the model.
+        image_token_id: Token id forwarded to the model.
+        device: CUDA device index.
+        save_source_video: When true, also decode and save the source video
+            under ``save_path_gt``.
+        save_path_gen: Output directory for generated media.
+        save_path_gt: Output directory for source media.
+    """
+    val_data = val_data_cpu.cuda(device).to_dict()
+    fsdp_model = fsdp_model.to(device=device, dtype=torch.bfloat16)
+
+    with torch.no_grad(), torch.amp.autocast("cuda", enabled=True, dtype=torch.bfloat16):
+        # Compute padded_latent.
+        if "padded_videos" in val_data.keys():
+            val_data["padded_latent"] = make_padded_latent(val_data["padded_videos"], val_data["vae_data_mode"], vae_model)
+
+        # -------------------- Generation branch --------------------
+        if inference_args.task in GENERATION_TASKS:
+            # Keys read with [] are required in the batch; keys read with .get() are optional.
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_packed_vae_token_indexes": val_data["packed_vae_token_indexes"],
+                "timestep_shift": training_args.validation_timestep_shift,
+                "num_timesteps": training_args.validation_num_timesteps,
+                "val_mse_loss_indexes": val_data.get("mse_loss_indexes", None),
+                "val_padded_latent": val_data["padded_latent"],
+                "video_sizes": val_data["video_sizes"],
+                "cfg_text_scale": model_args.cfg_text_scale,
+                "cfg_interval": training_args.cfg_interval,
+                "cfg_renorm_min": training_args.cfg_renorm_min,
+                "cfg_renorm_type": training_args.cfg_renorm_type,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "max_samples": training_args.validation_max_samples,
+                "validation_noise_seed": training_args.validation_noise_seed,
+                "apply_chat_template": training_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+                "image_token_id": image_token_id,
+                "val_packed_vit_token_indexes": val_data.get("packed_vit_token_indexes", None),
+                "val_packed_vit_tokens": val_data.get("packed_vit_tokens", None),
+                "vit_video_grid_thw": val_data.get("vit_video_grid_thw", None),
+                "vae_video_grid_thw": val_data["vae_video_grid_thw"],
+                "video_grid_thw": val_data.get("video_grid_thw", None),
+                "caption": val_data.get("caption", None),  # The dataset uses "caption" as the default caption field.
+                "sample_task": val_data["sample_task"],
+                "sample_modality": val_data["sample_modality"],
+                "cfg_type": training_args.cfg_type,
+                "cfg_uncond_token_id": training_args.cfg_uncond_token_id,
+                "index": val_data["index"],
+                "val_padded_videos": val_data["padded_videos"] if save_source_video else None,
+            }
+            # Both entry points take the same keyword arguments and return the same 4-tuple.
+            if inference_args.use_KVcache:
+                denoise_latent, captions, padded_videos, index = fsdp_model.validation_gen_KVcache(**params)
+            else:
+                denoise_latent, captions, padded_videos, index = fsdp_model.validation_gen(**params)
+
+            # Decode.
+            for i_val, latent in enumerate(denoise_latent):
+                # Editing tasks decode only the last latent of the sample.
+                if inference_args.task in {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}:
+                    target_latents = [latent[-1]]
+                else:
+                    target_latents = latent
+
+                v_list = []
+                for latent_ in target_latents:
+                    v_list.append(vae_model.vae_decode([latent_])[0])
+
+                # Integer indexes are zero-padded to six digits; other index values are used as-is.
+                save_item_name = f"{index:06d}" if isinstance(index, int) else index
+                v_thwc = decode_video_tensor(v_list, save_path=save_path_gen, save_half=False, save_item_name=save_item_name)
+
+                # More than one entry along the first axis is recorded as .mp4, otherwise as .png.
+                # NOTE(review): assumes decode_video_tensor saves with the same extension - confirm.
+                if v_thwc.shape[0] > 1:
+                    prompt_data_path = f"{save_item_name}.mp4"
+                else:
+                    prompt_data_path = f"{save_item_name}.png"
+                inference_args.prompt_data_dict[prompt_data_path] = captions[i_val]
+
+                if save_source_video:
+                    # Takes two entries per sample and saves only the last of them.
+                    curr_padded_videos = padded_videos[i_val * 2 : (i_val + 1) * 2]
+                    v_thwc_gt = decode_video_tensor(curr_padded_videos[-1:], save_path=save_path_gt, save_item_name=save_item_name)
+                    del curr_padded_videos, v_thwc_gt
+
+                # Drop per-sample references before asking the allocator to free memory.
+                del v_list, v_thwc, latent, target_latents
+                clean_memory()
+
+            del denoise_latent, captions, padded_videos, params
+            clean_memory()
+
+        # -------------------- Understanding branch --------------------
+        elif inference_args.task in UNDERSTANDING_TASKS:
+            generated_sequence_all, captions, index = fsdp_model.validation_video_to_text(
+                val_packed_text_ids=val_data["packed_text_ids"],
+                val_packed_text_indexes=val_data["packed_text_indexes"],
+                val_packed_position_ids=val_data["packed_position_ids"],
+                val_sample_N_target=val_data["sample_N_target"],
+                val_split_lens=val_data["split_lens"],
+                val_attn_modes=val_data["attn_modes"],
+                val_sample_lens=val_data["sample_lens"],
+                val_sample_type=val_data["sample_type"],
+                val_packed_vit_tokens=val_data["packed_vit_tokens"],
+                val_vit_video_grid_thw=val_data["vit_video_grid_thw"],
+                val_ce_loss_indexes=val_data["ce_loss_indexes"],
+                max_samples=training_args.validation_max_samples,
+                max_length=MAX_GENERATION_LENGTH,
+                device=device,
+                dtype=torch.bfloat16,
+                new_token_ids=new_token_ids,
+                pad_token_id=tokenizer.pad_token_id,
+                vocab_size=len(tokenizer),
+                caption=val_data.get("caption_cn", None),
+                tokenizer=tokenizer,
+                apply_chat_template=training_args.apply_chat_template,
+                apply_qwen_2_5_vl_pos_emb=training_args.apply_qwen_2_5_vl_pos_emb,
+                do_sample=False,
+                image_token_id=image_token_id,
+                index=val_data["index"],
+            )
+
+            # Every sequence is stored under the same `index` key, so a later one overwrites an earlier one.
+            for i_val, generated_sequence in enumerate(generated_sequence_all):
+                cap = tokenizer.decode(generated_sequence[:, 0])
+                # inference_args.prompt_data_dict[index] = f"target_caption: {captions} /// generated_caption: {cap} "
+                inference_args.prompt_data_dict[index] = f"{cap}"
+                del generated_sequence
+
+            del generated_sequence_all, captions
+            clean_memory()
+
+    del val_data
+    clean_memory()
+
+
+def main():
+    """Command-line entry point for Lance inference.
+
+    Steps, in order: set up the CUDA device (and the NCCL process group when
+    ``RANK``/``WORLD_SIZE`` are set), parse the model/data/inference
+    arguments, apply per-task defaults, build the LLM, optional ViT and
+    optional VAE, load the checkpoint, build the validation dataset, run
+    every batch through ``validate_on_fixed_batch`` and finally write the
+    collected results on rank 0.
+    """
+    # ========================= Env setup ==============================
+    assert torch.cuda.is_available()
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+        dist.init_process_group("nccl")
+        GLOBAL_RANK = dist.get_rank()
+        WORLD_SIZE = dist.get_world_size()
+    else:
+        GLOBAL_RANK = 0
+        WORLD_SIZE = 1
+
+    LOCAL_RANK = GLOBAL_RANK % torch.cuda.device_count()
+    DEVICE = LOCAL_RANK
+    torch.cuda.set_device(DEVICE)
+
+    # ========================= Args and logger setup ==============================
+    parser = HfArgumentParser((ModelArguments, DataArguments, InferenceArguments))
+    model_args, data_args, inference_args = cast(
+        Tuple[ModelArguments, DataArguments, InferenceArguments],
+        parser.parse_args_into_dataclasses(),
+    )
+    # The inference arguments double as "training_args" for the code below.
+    training_args = inference_args
+
+    # ========================= Load task paths and example JSONs from defaults ==============================
+    apply_inference_defaults(model_args, data_args, inference_args)
+    training_args.validation_noise_seed = training_args.validation_data_seed
+
+    logger = get_logger()
+    log_rank0 = print if GLOBAL_RANK == 0 else (lambda *_: None)  # Only print on rank 0.
+
+    def log_stage(stage_name: str, start_time: float, extra: str = ""):
+        """Print how long a startup stage took (rank 0 only)."""
+        elapsed = time.perf_counter() - start_time
+        suffix = f" | {extra}" if extra else ""
+        log_rank0(f"[startup] {stage_name} done in {elapsed:.2f}s{suffix}")
+
+    # Set seed:
+    seed = training_args.global_seed * WORLD_SIZE + GLOBAL_RANK
+    set_seed(seed)
+
+    # ========================= LLM model setup ==============================
+    stage_start = time.perf_counter()
+    log_rank0(f"[startup] Loading LLM config: {osp.join(model_args.model_path, 'llm_config.json')}")
+    llm_config: Qwen2Config = Qwen2Config.from_json_file(osp.join(model_args.model_path, "llm_config.json"))
+    log_stage("LLM config load", stage_start)
+
+    llm_config.layer_module = model_args.layer_module
+    llm_config.qk_norm = model_args.llm_qk_norm
+    llm_config.qk_norm_und = model_args.llm_qk_norm_und
+    llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+
+    llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+    llm_config.freeze_und = training_args.freeze_und
+    llm_config.apply_qwen_2_5_vl_pos_emb = training_args.apply_qwen_2_5_vl_pos_emb
+
+    stage_start = time.perf_counter()
+    log_rank0(f"[startup] Initializing LLM weights: {model_args.model_path}")
+    language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+    log_stage("LLM weight init", stage_start)
+
+    if training_args.visual_und:
+        if model_args.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+            stage_start = time.perf_counter()
+            log_rank0(f"[startup] Loading VIT config: {model_args.vit_path}")
+            vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+            log_stage("VIT config load", stage_start)
+
+            stage_start = time.perf_counter()
+            log_rank0(f"[startup] Loading VIT weights: {osp.join(model_args.vit_path, 'vit.safetensors')}")
+            vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+            vit_weights = load_file(osp.join(model_args.vit_path, "vit.safetensors"))
+            vit_model.load_state_dict(vit_weights, strict=True)
+            log_stage("VIT weight load", stage_start)
+        else:
+            raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+
+        clean_memory(vit_weights)
+
+    if training_args.visual_gen:
+        stage_start = time.perf_counter()
+        log_rank0("[startup] Initializing VAE")
+        vae_model = WanVideoVAE()
+        vae_config: AutoEncoderParams = deepcopy(vae_model.vae_config)
+        log_stage("VAE init", stage_start)
+    else:
+        vae_model = None
+        vae_config = None
+
+    # Lance configuration
+    # vit_config / vit_model exist only when visual_und is set, hence the conditionals.
+    config = LanceConfig(
+        visual_gen=training_args.visual_gen,
+        visual_und=training_args.visual_und,
+        llm_config=llm_config,
+        vit_config=vit_config if training_args.visual_und else None,
+        vae_config=vae_config if training_args.visual_gen else None,
+        latent_patch_size=model_args.latent_patch_size,
+        max_num_frames=model_args.max_num_frames,
+        max_latent_size=model_args.max_latent_size,
+        vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+        connector_act=model_args.connector_act,
+        interpolate_pos=model_args.interpolate_pos,
+        timestep_shift=training_args.timestep_shift,
+    )
+    model: Lance = Lance(
+        language_model=language_model,
+        vit_model=vit_model if training_args.visual_und else None,
+        vit_type=model_args.vit_type,
+        config=config,
+        training_args=training_args,
+    )
+    stage_start = time.perf_counter()
+    log_rank0(f"[startup] Moving Lance model to GPU {DEVICE}")
+    model = model.to(DEVICE)
+    log_stage("Lance model move to GPU", stage_start)
+
+    # Setup tokenizer for model:
+    stage_start = time.perf_counter()
+    log_rank0(f"[startup] Loading tokenizer: {model_args.model_path}")
+    tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+
+    tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+    log_stage("tokenizer load and special token init", stage_start, extra=f"num_new_tokens={num_new_tokens}")
+
+    # Initialize MoE before loading the checkpoint.
+    if training_args.copy_init_moe:
+        language_model.init_moe()
+
+    init_from_model_path_if_needed(model, model_args)
+
+    # Resize afterward to avoid checkpoint shape mismatches or overwritten weights.
+    if num_new_tokens > 0:
+        model.language_model.resize_token_embeddings(len(tokenizer))
+        model.config.llm_config.vocab_size = len(tokenizer)
+        model.language_model.config.vocab_size = len(tokenizer)
+
+    if model_args.vit_type.lower() == "qwen2_5_vl":
+        from common.model.hacks import hack_qwen2_5_vl_config
+        language_model = hack_qwen2_5_vl_config(language_model)
+
+    image_token_id = language_model.config.video_token_id # image_token_id # <|image_pad|>
+    new_token_ids.update({"image_token_id": image_token_id})
+    model.update_tokenizer(tokenizer=tokenizer)
+
+    if model_args.tie_word_embeddings: # and training_args.finetune_from_hf is False:
+        # HACK: Handle the tying logic manually.
+        model.language_model.untie_lm_head() # NOTE: untied lm head weights
+        model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens) # NOTE: copy the new token rows into lm_head
+
+        # Make sure this stays False.
+        model_args.tie_word_embeddings = False
+        llm_config.tie_word_embeddings = False
+    else: # HACK!!!
+        assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_word_embeddings conflict'
+
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
+    model.eval()
+    if vae_model is not None and hasattr(vae_model, "eval"):
+        vae_model.eval()
+
+    # Setup packed dataloader
+    stage_start = time.perf_counter()
+    log_rank0(f"[startup] Loading dataset config and validation set: {data_args.val_dataset_config_file}")
+    dataset_config = DataConfig.from_yaml(data_args.val_dataset_config_file)
+
+    # NOTE: This block performs in-place assignments. ⚠️
+    if training_args.visual_und:
+        dataset_config.vit_patch_size = model_args.vit_patch_size
+        dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal # TODO: fix
+        dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
+        # dataset_config.vit_downsample = vit_downsample # NOTE: need to update !
+    if training_args.visual_gen:
+        assert len(model_args.latent_patch_size) == 3, "len(latent_patch_size) must be 3"
+        vae_downsample = tuple_mul(
+            model_args.latent_patch_size, (vae_config.downsample_temporal, vae_config.downsample_spatial, vae_config.downsample_spatial)
+        )  # NOTE: This already includes patch_size.
+        dataset_config.latent_patch_size = model_args.latent_patch_size
+        dataset_config.vae_downsample = vae_downsample  # NOTE: update !
+        dataset_config.max_latent_size = model_args.max_latent_size  # NOTE: update!
+        dataset_config.max_num_frames = model_args.max_num_frames  # NOTE: update!
+
+    # Fix: share dropout settings.
+    dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
+    dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
+    dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob
+
+    # Load inference parameters.
+    dataset_config.num_frames = inference_args.num_frames
+    dataset_config.H = inference_args.video_height
+    dataset_config.W = inference_args.video_width
+    dataset_config.task = inference_args.task
+    dataset_config.resolution = inference_args.resolution
+    dataset_config.text_template = inference_args.text_template
+    val_dataset = ValidationDataset(
+        jsonl_path= data_args.val_dataset_config_file,
+        tokenizer=tokenizer,
+        data_args=data_args,
+        model_args=model_args,
+        training_args=training_args,
+        new_token_ids=new_token_ids,
+        dataset_config=dataset_config,
+        local_rank=GLOBAL_RANK,  # global rank, not local rank
+        world_size=WORLD_SIZE,
+    )
+
+    val_loader = DataLoader(
+        val_dataset,
+        batch_size=1,
+        num_workers=0,
+        pin_memory=True,
+        collate_fn=simple_custom_collate,     # Top-level function
+        drop_last=True,
+        prefetch_factor=None,
+        persistent_workers=False,
+        multiprocessing_context=None,
+    )
+    log_stage("validation set and DataLoader init", stage_start, extra=f"dataset_size={len(val_dataset)}")
+
+    # Prepare the validation data loader iterator.
+    val_loader_iter = iter(val_loader)
+
+    # Initialize a local dictionary to avoid accumulating stale data.
+    if not hasattr(inference_args, "prompt_data_dict"):
+        inference_args.prompt_data_dict = {}
+
+    # exist_ok avoids a check-then-create race when several ranks start together.
+    os.makedirs(inference_args.save_path_gen, exist_ok=True)
+
+    for epoch in trange(len(val_loader), desc="Validating", unit="batch", leave=True, ncols=80, disable=(GLOBAL_RANK != 0)):
+        try:
+            val_data_cpu = next(val_loader_iter)
+        except StopIteration:
+            break
+
+        validate_on_fixed_batch(
+            fsdp_model=model,
+            vae_model=vae_model,
+            tokenizer=tokenizer,
+            val_data_cpu=val_data_cpu,
+            training_args=training_args,
+            model_args=model_args,
+            inference_args=inference_args,
+            new_token_ids=new_token_ids,
+            image_token_id=image_token_id,
+            device=DEVICE,
+            save_source_video=False, # Whether to save the GT video
+            save_path_gen=inference_args.save_path_gen, # Generated video path
+            save_path_gt="", # GT video path
+        )
+        del val_data_cpu
+        clean_memory()
+
+    # Final gather after all generation loops
+    if dist.is_initialized():
+        dist.barrier()
+        gathered = [None for _ in range(dist.get_world_size())]
+        dist.all_gather_object(gathered, inference_args.prompt_data_dict)
+
+        if GLOBAL_RANK == 0:
+            # Merge the per-rank dictionaries; later ranks win on duplicate keys.
+            merged = {}
+            for d in gathered:
+                merged.update(d)
+            inference_args.prompt_data_dict = merged
+
+    # Only rank 0 writes the result files (single-process runs are rank 0).
+    if GLOBAL_RANK == 0:
+        save_prompt_results(inference_args.prompt_data_dict, inference_args.save_path_gen, logger)
+        if inference_args.task in UNDERSTANDING_TASKS:
+            save_understanding_results(
+                prompt_data_dict=inference_args.prompt_data_dict,
+                dataset_config_file=data_args.val_dataset_config_file,
+                save_path_gen=inference_args.save_path_gen,
+            )
+
+    if dist.is_initialized():
+        dist.destroy_process_group()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/inference_lance.sh b/inference_lance.sh
new file mode 100644
index 0000000000000000000000000000000000000000..db1d4bae04b90ff7794567342ffad04bdbd184df
--- /dev/null
+++ b/inference_lance.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# Launcher for Lance inference (inference_lance.py) via `accelerate launch`.
+#
+# Usage: edit the "Inference Parameters" section below, then run this script.
+# Requires benchmarks/sample_env.sh next to this script; that file is expected
+# to define the lance_setup_* helpers called below.
+# Exits non-zero when setup or inference fails.
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) || exit 1
+# Fail fast rather than launching from an unexpected working directory.
+cd "$SCRIPT_DIR" || { echo "Error: cannot cd to ${SCRIPT_DIR}" >&2; exit 1; }
+
+ENV_FILE="$SCRIPT_DIR/benchmarks/sample_env.sh"
+if [ ! -r "$ENV_FILE" ]; then
+    echo "Error: environment helper not found or unreadable: ${ENV_FILE}" >&2
+    exit 1
+fi
+# shellcheck source=/dev/null
+source "$ENV_FILE"
+
+# ========================= Inference Parameters =========================
+NUM_GPUS=1
+
+TASK_NAME=x2t_image # t2i | image_edit | t2v | video_edit | x2t_image | x2t_video
+
+VALIDATION_NUM_TIMESTEPS=30 # 50
+VALIDATION_TIMESTEP_SHIFT=3.5
+VALIDATION_DATA_SEED=42
+CFG_TEXT_SCALE=4.0
+USE_KVCACHE=true
+
+NUM_FRAMES=50             # max: 121 frames, unused for image tasks
+VIDEO_HEIGHT=768          # unused for editing
+VIDEO_WIDTH=768           # unused for editing
+RESOLUTION="video_480p"   # image_768res | video_480p
+TEXT_TEMPLATE=true
+
+MODEL_PATH="downloads/Lance_3B_Video"
+
+# ========================= Auto-generated Paths =========================
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+KVCACHE_TAG=""
+if [ "$USE_KVCACHE" = "true" ]; then
+    KVCACHE_TAG="_kvcache"
+fi
+SAVE_PATH_GEN="results/${TASK_NAME}_sample_ts${VALIDATION_NUM_TIMESTEPS}_tts${VALIDATION_TIMESTEP_SHIFT}_seed${VALIDATION_DATA_SEED}_cfg${CFG_TEXT_SCALE}${KVCACHE_TAG}_${TIMESTAMP}"
+
+if [ -z "$MODEL_PATH" ]; then
+    echo "Error: please set MODEL_PATH manually in the configuration section at the top of this script." >&2
+    exit 1
+fi
+
+# ============================== Environment and Distributed Setup ==============================
+lance_setup_common_env
+lance_setup_distributed_env "$NUM_GPUS"
+lance_setup_shard_env 1
+
+# The launch command below passes these as flag values; an empty one would
+# shift the following flag into its place, so stop with a clear message.
+for required_var in NUM_MACHINES TOTAL_RANK MACHINE_RANK MAIN_PROCESS_IP MAIN_PROCESS_PORT; do
+    if [ -z "${!required_var}" ]; then
+        echo "Error: ${required_var} is empty after the distributed environment setup." >&2
+        exit 1
+    fi
+done
+
+# ========================= Show Task Configuration =========================
+echo "================================================"
+echo "Lance Inference"
+echo "================================================"
+echo "Task: ${TASK_NAME}"
+echo "Number of GPUs: ${NUM_GPUS}"
+echo "Save path: ${SAVE_PATH_GEN}"
+echo "Resolution: ${VIDEO_HEIGHT}x${VIDEO_WIDTH}"
+echo "Output frames: ${NUM_FRAMES}"
+echo "Model path: ${MODEL_PATH}"
+echo ""
+echo "Key parameters:"
+echo "  - validation_num_timesteps: ${VALIDATION_NUM_TIMESTEPS}"
+echo "  - validation_timestep_shift: ${VALIDATION_TIMESTEP_SHIFT}"
+echo "  - validation_data_seed: ${VALIDATION_DATA_SEED}"
+echo "  - cfg_text_scale: ${CFG_TEXT_SCALE}"
+echo "  - num_frames: ${NUM_FRAMES}"
+echo "  - use_KVcache: ${USE_KVCACHE}"
+echo "================================================"
+echo ""
+
+# ============================== Run Inference ==============================
+accelerate launch \
+    --num_machines          "$NUM_MACHINES" \
+    --num_processes         "$TOTAL_RANK" \
+    --machine_rank          "$MACHINE_RANK" \
+    --main_process_ip       "$MAIN_PROCESS_IP" \
+    --main_process_port     "$MAIN_PROCESS_PORT" \
+    --mixed_precision       bf16 \
+    inference_lance.py \
+    --model_path            "$MODEL_PATH" \
+    --vit_type              qwen_2_5_vl_original \
+    --llm_qk_norm           true \
+    --llm_qk_norm_und       true \
+    --llm_qk_norm_gen       true \
+    --tie_word_embeddings   false \
+    --validation_num_timesteps "$VALIDATION_NUM_TIMESTEPS" \
+    --validation_timestep_shift "$VALIDATION_TIMESTEP_SHIFT" \
+    --copy_init_moe         true \
+    --max_num_frames        121 \
+    --max_latent_size       64 \
+    --latent_patch_size     1 1 1 \
+    --visual_und            true \
+    --visual_gen            true \
+    --vae_model_type        wan \
+    --apply_qwen_2_5_vl_pos_emb true \
+    --apply_chat_template   false \
+    --cfg_type              0 \
+    --validation_data_seed  "$VALIDATION_DATA_SEED" \
+    --video_height          "$VIDEO_HEIGHT" \
+    --video_width           "$VIDEO_WIDTH" \
+    --num_frames            "$NUM_FRAMES" \
+    --task                  "$TASK_NAME" \
+    --save_path_gen         "$SAVE_PATH_GEN" \
+    --resolution            "$RESOLUTION" \
+    --text_template         "$TEXT_TEMPLATE" \
+    --cfg_text_scale        "$CFG_TEXT_SCALE" \
+    --use_KVcache           "$USE_KVCACHE"
+# Must be captured immediately: any later command overwrites $?.
+LAUNCH_STATUS=$?
+
+if [ "$LAUNCH_STATUS" -ne 0 ]; then
+    echo "" >&2
+    echo "Error: inference failed with exit code ${LAUNCH_STATUS}." >&2
+    exit "$LAUNCH_STATUS"
+fi
+
+echo ""
+echo "================================================"
+echo "Done! Results: ${SAVE_PATH_GEN}"
+echo "================================================"
diff --git a/lance_gradio_t2v_v2t.py b/lance_gradio_t2v_v2t.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6eeb7a46e98973982eac8c04dad2fdaee2d7fec
--- /dev/null
+++ b/lance_gradio_t2v_v2t.py
@@ -0,0 +1,954 @@
+from __future__ import annotations
+
+import argparse
+import concurrent.futures
+import json
+import random
+import threading
+import time
+import traceback
+from collections import deque
+from copy import deepcopy
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+import gradio as gr
+import torch
+from safetensors.torch import load_file
+from transformers import set_seed
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+
+from common.utils.logging import get_logger
+from common.utils.misc import AutoEncoderParams, tuple_mul
+from config.config_factory import DataArguments, InferenceArguments, ModelArguments
+from data.data_utils import add_special_tokens
+from data.dataset_base import DataConfig, simple_custom_collate
+from data.datasets_custom import ValidationDataset
+from inference_lance import (
+    PROMPT_JSON_FILENAME,
+    apply_inference_defaults,
+    clean_memory,
+    init_from_model_path_if_needed,
+    save_prompt_results,
+    validate_on_fixed_batch,
+)
+from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
+from modeling.qwen2 import Qwen2Tokenizer
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+from modeling.vae.wan.model import WanVideoVAE
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+
+
+# ---- Filesystem layout: everything is rooted at the directory holding this file ----
+REPO_ROOT = Path(__file__).resolve().parent
+GRADIO_TMP_ROOT = REPO_ROOT / "tmps" / "gradio_t2v_v2t"
+# Per-request prompt JSON files (written by create_request_json).
+TMP_INPUT_DIR = GRADIO_TMP_ROOT / "inputs"
+# Parent of the per-request output directories (named by build_save_dir).
+RESULTS_ROOT = GRADIO_TMP_ROOT / "results"
+# JSONL file that save_generation_record appends one line to per request.
+GLOBAL_RECORDS_FILE = GRADIO_TMP_ROOT / "generation_records.jsonl"
+# Name of the per-run record file written inside each output directory.
+RUN_RECORD_FILENAME = "generation_record.json"
+
+# ---- Default model location and inference settings ----
+DEFAULT_MODEL_PATH = REPO_ROOT / "downloads" / "Lance_3B_Video"
+DEFAULT_VIT_TYPE = "qwen_2_5_vl_original"
+DEFAULT_TASK = "t2v"
+DEFAULT_TIMESTEPS = 30
+DEFAULT_TIMESTEP_SHIFT = 3.5
+DEFAULT_CFG_TEXT_SCALE = 4.0
+DEFAULT_RESOLUTION = "video_480p"
+# -1 is the value normalize_seed() replaces with a random seed.
+DEFAULT_BASIC_SEED = -1
+DEFAULT_HEIGHT = 480
+DEFAULT_WIDTH = 848
+DEFAULT_NUM_FRAMES = 50
+DEFAULT_GPUS = "0"
+DEFAULT_QUEUE_SIZE = 32
+USE_KVCACHE = True
+TEXT_TEMPLATE = True
+# Held by save_generation_record while appending to GLOBAL_RECORDS_FILE.
+RECORD_WRITE_LOCK = threading.Lock()
+
+# ---- Task identifiers ----
+# normalize_task() maps TASK_V2T and TASK_X2T to TASK_X2T_VIDEO; only
+# TASK_T2V and TASK_V2T are listed in TASK_CHOICES.
+TASK_T2V = "t2v"
+TASK_V2T = "v2t"
+TASK_X2T = "x2t"
+TASK_X2T_VIDEO = "x2t_video"
+TASK_CHOICES = [TASK_T2V, TASK_V2T]
+VIDEO_RESOLUTION_CHOICES = ["video_192p", "video_360p", "video_480p"]
+# Fixed instruction that create_request_json places before the user's question.
+V2T_SYSTEM_PROMPT = "Watch the video carefully and answer the question."
+
+
+def ensure_dirs() -> None:
+    """Create the temporary input and results directories (with parents) if missing."""
+    for directory in (TMP_INPUT_DIR, RESULTS_ROOT):
+        directory.mkdir(parents=True, exist_ok=True)
+
+
+def save_generation_record(record: dict, save_dir: Path) -> None:
+    """Persist one request record in two places.
+
+    The record is written as indented JSON to ``save_dir / RUN_RECORD_FILENAME``
+    (overwriting any existing file) and appended as a single JSON line to
+    ``GLOBAL_RECORDS_FILE``.
+
+    Args:
+        record: JSON-serializable description of the request and its outcome.
+        save_dir: Existing directory that receives the per-run record file.
+    """
+    ensure_dirs()
+
+    # Per-run copy: human-readable, one file per output directory.
+    with (save_dir / RUN_RECORD_FILENAME).open("w", encoding="utf-8") as run_file:
+        json.dump(record, run_file, ensure_ascii=False, indent=2)
+
+    # Global log: the append happens while holding RECORD_WRITE_LOCK.
+    jsonl_line = json.dumps(record, ensure_ascii=False) + "\n"
+    with RECORD_WRITE_LOCK:
+        with GLOBAL_RECORDS_FILE.open("a", encoding="utf-8") as log_file:
+            log_file.write(jsonl_line)
+
+
+def normalize_seed(seed: int) -> int:
+    """Return ``seed`` unchanged, or a random seed when ``seed`` is -1.
+
+    The random seed is drawn uniformly from ``[0, 2**31 - 1]`` (inclusive).
+    """
+    if seed != -1:
+        return seed
+    return random.randint(0, 2**31 - 1)
+
+
+def normalize_task(task: str) -> str:
+    """Map a user-facing task name to the internal task identifier.
+
+    The name is stripped and lower-cased first; an empty or ``None`` value
+    falls back to ``DEFAULT_TASK``. ``v2t``, ``x2t`` and ``x2t_video`` all map
+    to ``TASK_X2T_VIDEO``; ``t2v`` is returned as is.
+
+    Raises:
+        ValueError: If the normalized name is not one of the supported tasks.
+    """
+    normalized = (task or DEFAULT_TASK).strip().lower()
+    if normalized in (TASK_V2T, TASK_X2T, TASK_X2T_VIDEO):
+        return TASK_X2T_VIDEO
+    if normalized == TASK_T2V:
+        return normalized
+    raise ValueError(f"Unsupported task type: {normalized}")
+
+
+def create_request_json(task: str, prompt: str, input_video: Optional[str], question: str) -> Path:
+    """Write the single-sample request file for one request and return its path.
+
+    The file is created in ``TMP_INPUT_DIR`` and named ``<task>_<timestamp>.json``
+    with a microsecond-resolution timestamp.
+
+    Args:
+        task: Internal task identifier (``TASK_T2V`` or ``TASK_X2T_VIDEO``).
+        prompt: Text prompt; used only for ``TASK_T2V``.
+        input_video: Path of the video to analyse; required for ``TASK_X2T_VIDEO``.
+        question: Question about the video; used only for ``TASK_X2T_VIDEO``.
+
+    Raises:
+        ValueError: If ``task`` is unsupported, or a video task has no input video.
+    """
+    ensure_dirs()
+    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+    prompt_file = TMP_INPUT_DIR / f"{task}_{stamp}.json"
+
+    if task not in (TASK_T2V, TASK_X2T_VIDEO):
+        raise ValueError(f"Unsupported task type: {task}")
+
+    if task == TASK_T2V:
+        # Text-to-video: map the output file name to its prompt.
+        payload = {"000000.mp4": prompt}
+    else:
+        if not input_video:
+            raise ValueError("The v2t task requires an input video.")
+        # Video-to-text: one interleaved sample holding the video followed by
+        # [system prompt, question, empty third entry].
+        sample = {
+            "interleave_array": [input_video, [V2T_SYSTEM_PROMPT, question, ""]],
+            "element_dtype_array": ["video", "text"],
+            "istarget_in_interleave": [0, 1],
+        }
+        payload = {"000000": sample}
+
+    with prompt_file.open("w", encoding="utf-8") as out:
+        json.dump(payload, out, ensure_ascii=False, indent=2)
+    return prompt_file
+
+
+def build_save_dir(task: str) -> Path:
+    """Return the output directory path for a new request.
+
+    The path is ``RESULTS_ROOT / "<task>_<YYYYmmdd_HHMMSS>_<mmm>"`` where
+    ``mmm`` is the zero-padded millisecond part of the same instant. Only
+    ``RESULTS_ROOT`` is ensured to exist (via ``ensure_dirs``); the returned
+    directory itself is not created here.
+
+    Args:
+        task: Task identifier used as the directory-name prefix.
+    """
+    ensure_dirs()
+    # Read the clock once: taking the seconds from datetime.now() and the
+    # milliseconds from a later time.time() call could pair a millisecond
+    # suffix with the wrong second when the two reads straddle a boundary.
+    now = datetime.now()
+    millis = now.microsecond // 1000
+    return RESULTS_ROOT / f"{task}_{now.strftime('%Y%m%d_%H%M%S')}_{millis:03d}"
+
+
+def find_generated_video(save_dir: Path) -> Optional[Path]:
+    """Return the most recently modified ``*.mp4`` directly inside ``save_dir``.
+
+    Returns ``None`` when the directory contains no ``.mp4`` file.
+    """
+    return max(save_dir.glob("*.mp4"), key=lambda candidate: candidate.stat().st_mtime, default=None)
+
+
+def extract_text_result(save_dir: Path) -> str:
+    """Read the first value from the prompt-results JSON file in ``save_dir``.
+
+    Returns an empty string when ``save_dir / PROMPT_JSON_FILENAME`` does not
+    exist or holds an empty object. A string value is returned as is; any
+    other value is returned re-serialized as JSON text.
+    """
+    result_file = save_dir / PROMPT_JSON_FILENAME
+    if not result_file.exists():
+        return ""
+
+    results = json.loads(result_file.read_text(encoding="utf-8"))
+    if not results:
+        return ""
+
+    # Only the first entry is used.
+    first_value = next(iter(results.values()))
+    if isinstance(first_value, str):
+        return first_value
+    return json.dumps(first_value, ensure_ascii=False)
+
+
+class LanceT2VV2TPipeline:
+    def __init__(self, device_id: int) -> None:
+        """Set up an unloaded pipeline bound to one GPU index.
+
+        No model is loaded here; every model-related attribute starts as
+        ``None`` and is filled in by ``initialize()``.
+
+        Args:
+            device_id: CUDA device index this pipeline runs on.
+        """
+        self.device = device_id
+        self.logger = get_logger(f"lance_t2v_v2t_gpu{device_id}")
+
+        # One lock guards initialize(), the other is held during generate().
+        self.initialized = False
+        self._init_lock = threading.Lock()
+        self._generate_lock = threading.Lock()
+
+        # Model components, populated by initialize().
+        self.model: Optional[Lance] = None
+        self.vae_model: Optional[WanVideoVAE] = None
+        self.vae_config: Optional[AutoEncoderParams] = None
+
+        # Tokenizer state, populated by initialize().
+        self.tokenizer: Optional[Qwen2Tokenizer] = None
+        self.new_token_ids: Optional[dict] = None
+        self.image_token_id: Optional[int] = None
+
+        # Baseline argument objects that generate() deep-copies per request.
+        self.base_model_args: Optional[ModelArguments] = None
+        self.base_data_args: Optional[DataArguments] = None
+        self.base_inference_args: Optional[InferenceArguments] = None
+
+    def _log_stage(self, stage_name: str, start_time: float, extra: str = "") -> None:
+        """Print how long a startup stage took.
+
+        Args:
+            stage_name: Label of the finished stage.
+            start_time: ``time.perf_counter()`` reading taken when the stage began.
+            extra: Optional detail appended after `` | `` when non-empty.
+        """
+        duration = time.perf_counter() - start_time
+        message = f"[startup][gpu:{self.device}] {stage_name} done in {duration:.2f}s"
+        if extra:
+            message += f" | {extra}"
+        print(message, flush=True)
+
+    def _build_base_model_args(self) -> ModelArguments:
+        """Build the baseline ``ModelArguments`` used by this pipeline.
+
+        ``model_path`` is ``DEFAULT_MODEL_PATH`` when that path exists on disk
+        and an empty string otherwise.
+        """
+        if DEFAULT_MODEL_PATH.exists():
+            resolved_model_path = str(DEFAULT_MODEL_PATH)
+        else:
+            resolved_model_path = ""
+
+        settings = dict(
+            model_path=resolved_model_path,
+            vit_type=DEFAULT_VIT_TYPE,
+            llm_qk_norm=True,
+            llm_qk_norm_und=True,
+            llm_qk_norm_gen=True,
+            tie_word_embeddings=False,
+            max_num_frames=121,
+            max_latent_size=64,
+            latent_patch_size=[1, 1, 1],
+        )
+        return ModelArguments(**settings)
+
+    def _build_base_inference_args(self) -> InferenceArguments:
+        """Build the baseline ``InferenceArguments`` from the module defaults."""
+        # Sampling settings.
+        sampling = dict(
+            validation_num_timesteps=DEFAULT_TIMESTEPS,
+            validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
+            validation_data_seed=42,
+            cfg_type=0,
+        )
+        # Model-construction switches.
+        model_switches = dict(
+            copy_init_moe=True,
+            visual_und=True,
+            visual_gen=True,
+            vae_model_type="wan",
+            apply_qwen_2_5_vl_pos_emb=True,
+            apply_chat_template=False,
+            use_KVcache=USE_KVCACHE,
+        )
+        # Values that generate() overrides per request (except text_template,
+        # which it resets to the same constant).
+        request_defaults = dict(
+            video_height=DEFAULT_HEIGHT,
+            video_width=DEFAULT_WIDTH,
+            num_frames=DEFAULT_NUM_FRAMES,
+            task=DEFAULT_TASK,
+            save_path_gen=str(RESULTS_ROOT),
+            resolution=DEFAULT_RESOLUTION,
+            text_template=TEXT_TEMPLATE,
+        )
+        return InferenceArguments(**sampling, **model_switches, **request_defaults)
+
+    def initialize(self) -> None:
+        """Build the Lance model, VAE and tokenizer on this pipeline's GPU (once).
+
+        The whole method runs under ``self._init_lock`` and returns immediately
+        when ``self.initialized`` is already set, so repeated calls are cheap.
+        On success the model components, tokenizer state and baseline argument
+        objects are stored on ``self`` and ``self.initialized`` becomes True.
+
+        Raises:
+            RuntimeError: If CUDA is unavailable or ``self.device`` is not a
+                valid device index.
+            ValueError: If ``model_args.vit_type`` is not a supported value.
+        """
+        with self._init_lock:
+            if self.initialized:
+                return
+
+            ensure_dirs()
+            if not torch.cuda.is_available():
+                raise RuntimeError("CUDA is unavailable. Lance T2V/V2T Gradio requires a GPU environment.")
+            if self.device >= torch.cuda.device_count():
+                raise RuntimeError(
+                    f"GPU {self.device} is unavailable. Detected {torch.cuda.device_count()} GPU(s)."
+                )
+            torch.cuda.set_device(self.device)
+
+            # Baseline arguments; generate() deep-copies these for each request.
+            model_args = self._build_base_model_args()
+            data_args = DataArguments()
+            inference_args = self._build_base_inference_args()
+            # NOTE(review): presumably fills derived fields read below (e.g.
+            # vit_path) — confirm in inference_lance.apply_inference_defaults.
+            apply_inference_defaults(model_args, data_args, inference_args)
+            inference_args.validation_noise_seed = inference_args.validation_data_seed
+
+            self.base_model_args = model_args
+            self.base_data_args = data_args
+            self.base_inference_args = inference_args
+
+            set_seed(inference_args.global_seed)
+
+            # ---- Language model: config from <model_path>/llm_config.json ----
+            stage_start = time.perf_counter()
+            print(
+                f"[startup][gpu:{self.device}] Loading LLM config: {Path(model_args.model_path) / 'llm_config.json'}",
+                flush=True,
+            )
+            llm_config: Qwen2Config = Qwen2Config.from_json_file(str(Path(model_args.model_path) / "llm_config.json"))
+            self._log_stage("LLM config load", stage_start)
+
+            # Copy the argument-controlled switches onto the LLM config.
+            llm_config.layer_module = model_args.layer_module
+            llm_config.qk_norm = model_args.llm_qk_norm
+            llm_config.qk_norm_und = model_args.llm_qk_norm_und
+            llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
+            llm_config.tie_word_embeddings = model_args.tie_word_embeddings
+            llm_config.freeze_und = inference_args.freeze_und
+            llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb
+
+            stage_start = time.perf_counter()
+            print(f"[startup][gpu:{self.device}] Initializing LLM weights: {model_args.model_path}", flush=True)
+            # The model is constructed from the config only at this point.
+            language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
+            self._log_stage("LLM weight init", stage_start)
+
+            # ---- Vision encoder (only when visual understanding is enabled) ----
+            vit_model = None
+            vit_config = None
+            if inference_args.visual_und:
+                if model_args.vit_type not in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+                    raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
+                stage_start = time.perf_counter()
+                print(f"[startup][gpu:{self.device}] Loading VIT config: {model_args.vit_path}", flush=True)
+                vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
+                self._log_stage("VIT config load", stage_start)
+
+                stage_start = time.perf_counter()
+                print(
+                    f"[startup][gpu:{self.device}] Loading VIT weights: {Path(model_args.vit_path) / 'vit.safetensors'}",
+                    flush=True,
+                )
+                vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
+                vit_weights = load_file(str(Path(model_args.vit_path) / "vit.safetensors"))
+                # strict=True: the checkpoint keys must match the module exactly.
+                vit_model.load_state_dict(vit_weights, strict=True)
+                self._log_stage("VIT weight load", stage_start)
+                clean_memory(vit_weights)
+
+            # ---- VAE (only when visual generation is enabled) ----
+            if inference_args.visual_gen:
+                stage_start = time.perf_counter()
+                print(f"[startup][gpu:{self.device}] Initializing VAE", flush=True)
+                vae_model = WanVideoVAE()
+                # Keep an independent copy of the VAE's config.
+                vae_config = deepcopy(vae_model.vae_config)
+                self._log_stage("VAE init", stage_start)
+            else:
+                vae_model = None
+                vae_config = None
+
+            # ---- Assemble the combined Lance model ----
+            config = LanceConfig(
+                visual_gen=inference_args.visual_gen,
+                visual_und=inference_args.visual_und,
+                llm_config=llm_config,
+                vit_config=vit_config if inference_args.visual_und else None,
+                vae_config=vae_config if inference_args.visual_gen else None,
+                latent_patch_size=model_args.latent_patch_size,
+                max_num_frames=model_args.max_num_frames,
+                max_latent_size=model_args.max_latent_size,
+                vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
+                connector_act=model_args.connector_act,
+                interpolate_pos=model_args.interpolate_pos,
+                timestep_shift=inference_args.timestep_shift,
+            )
+            model: Lance = Lance(
+                language_model=language_model,
+                vit_model=vit_model if inference_args.visual_und else None,
+                vit_type=model_args.vit_type,
+                config=config,
+                training_args=inference_args,
+            )
+
+            stage_start = time.perf_counter()
+            print(f"[startup][gpu:{self.device}] Moving Lance model to GPU {self.device}", flush=True)
+            model = model.to(self.device)
+            self._log_stage("Lance model move to GPU", stage_start)
+
+            # ---- Tokenizer and special tokens ----
+            stage_start = time.perf_counter()
+            print(f"[startup][gpu:{self.device}] Loading tokenizer: {model_args.model_path}", flush=True)
+            tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
+            tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
+            self._log_stage("tokenizer load and special token init", stage_start, extra=f"num_new_tokens={num_new_tokens}")
+
+            if inference_args.copy_init_moe:
+                language_model.init_moe()
+
+            # NOTE(review): presumably loads the checkpoint weights from
+            # model_args.model_path — confirm in inference_lance.
+            init_from_model_path_if_needed(model, model_args)
+
+            # Grow the embedding table when special tokens were added, and keep
+            # both config copies of vocab_size in sync with the tokenizer.
+            if num_new_tokens > 0:
+                model.language_model.resize_token_embeddings(len(tokenizer))
+                model.config.llm_config.vocab_size = len(tokenizer)
+                model.language_model.config.vocab_size = len(tokenizer)
+
+            if model_args.vit_type.lower() == "qwen2_5_vl":
+                from common.model.hacks import hack_qwen2_5_vl_config
+
+                language_model = hack_qwen2_5_vl_config(language_model)
+
+            # The "image" token id used downstream is the config's video token id.
+            image_token_id = language_model.config.video_token_id
+            new_token_ids.update({"image_token_id": image_token_id})
+            model.update_tokenizer(tokenizer=tokenizer)
+
+            if model_args.tie_word_embeddings:
+                # Untie the LM head and record the untied state in both configs.
+                model.language_model.untie_lm_head()
+                model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
+                model_args.tie_word_embeddings = False
+                llm_config.tie_word_embeddings = False
+            else:
+                # Sanity check: input and output embeddings must not share storage.
+                assert (
+                    model.language_model.get_input_embeddings().weight.data.data_ptr()
+                    != model.language_model.get_output_embeddings().weight.data.data_ptr()
+                ), "tie_word_embeddings conflict"
+
+            # Final placement: bfloat16 on the target device, inference mode.
+            model = model.to(device=self.device, dtype=torch.bfloat16)
+            model.eval()
+            if vae_model is not None and hasattr(vae_model, "eval"):
+                vae_model.eval()
+
+            # Publish everything; `initialized` is set last so later calls
+            # return early only after all attributes are in place.
+            self.model = model
+            self.vae_model = vae_model
+            self.vae_config = vae_config
+            self.tokenizer = tokenizer
+            self.new_token_ids = new_token_ids
+            self.image_token_id = image_token_id
+            self.initialized = True
+            print(f"[startup][gpu:{self.device}] Lance T2V/V2T Gradio model loaded and ready for reuse.", flush=True)
+
+    def _build_request_batch(
+        self,
+        prompt_file: Path,
+        model_args: ModelArguments,
+        data_args: DataArguments,
+        inference_args: InferenceArguments,
+    ):
+        """Turn one request file into a collated single-sample batch.
+
+        Builds a ``DataConfig`` from ``prompt_file``, copies the relevant model
+        and request settings onto it, wraps the file in a ``ValidationDataset``
+        and returns ``simple_custom_collate`` applied to its first sample.
+
+        Args:
+            prompt_file: Request file produced for this request.
+            model_args: Per-request model arguments.
+            data_args: Per-request data arguments.
+            inference_args: Per-request inference arguments.
+        """
+        assert self.tokenizer is not None
+        assert self.new_token_ids is not None
+        assert self.vae_config is not None
+
+        prompt_path = str(prompt_file)
+        dataset_config = DataConfig.from_yaml(prompt_path)
+
+        # Vision-encoder patch settings (understanding path).
+        if inference_args.visual_und:
+            for field in ("vit_patch_size", "vit_patch_size_temporal", "vit_max_num_patch_per_side"):
+                setattr(dataset_config, field, getattr(model_args, field))
+
+        # Latent geometry (generation path): per-axis product of the latent
+        # patch size and the VAE's (temporal, spatial, spatial) downsampling.
+        if inference_args.visual_gen:
+            vae = self.vae_config
+            vae_strides = (vae.downsample_temporal, vae.downsample_spatial, vae.downsample_spatial)
+            dataset_config.vae_downsample = tuple_mul(tuple(model_args.latent_patch_size), vae_strides)
+            for field in ("latent_patch_size", "max_latent_size", "max_num_frames"):
+                setattr(dataset_config, field, getattr(model_args, field))
+
+        # Condition-dropout probabilities are copied straight from model_args.
+        for field in ("text_cond_dropout_prob", "vae_cond_dropout_prob", "vit_cond_dropout_prob"):
+            setattr(dataset_config, field, getattr(model_args, field))
+
+        # Request-specific settings; H/W are the config's names for the size.
+        dataset_config.num_frames = inference_args.num_frames
+        dataset_config.H = inference_args.video_height
+        dataset_config.W = inference_args.video_width
+        for field in ("task", "resolution", "text_template"):
+            setattr(dataset_config, field, getattr(inference_args, field))
+
+        single_request_dataset = ValidationDataset(
+            jsonl_path=prompt_path,
+            tokenizer=self.tokenizer,
+            data_args=data_args,
+            model_args=model_args,
+            training_args=inference_args,
+            new_token_ids=self.new_token_ids,
+            dataset_config=dataset_config,
+            local_rank=0,
+            world_size=1,
+        )
+        first_sample = single_request_dataset[0]
+        return simple_custom_collate([first_sample])
+
+    def generate(
+        self,
+        task: str,
+        prompt: str,
+        input_video: Optional[str],
+        question: str,
+        height: int,
+        width: int,
+        num_frames: int,
+        seed: int,
+        resolution: str,
+        validation_num_timesteps: int,
+        validation_timestep_shift: float,
+        cfg_text_scale: float,
+    ):
+        """Run one text-to-video or video-to-text request on this pipeline's GPU.
+
+        The model is initialized on first use. Inputs are validated before the
+        generation lock is taken; generation itself runs while holding
+        ``self._generate_lock``. Every request that reaches generation writes a
+        record via ``save_generation_record``, whether it succeeds or fails.
+
+        Args:
+            task: User-facing task name; normalized with ``normalize_task``.
+            prompt: Text prompt (required for ``t2v``).
+            input_video: Path of the input video (required for video-to-text).
+            question: Question about the video (required for video-to-text).
+            height: Output height; must be > 0.
+            width: Output width; must be > 0.
+            num_frames: Number of frames; must be > 0.
+            seed: Random seed; -1 picks a random one (see ``normalize_seed``).
+            resolution: Resolution preset name passed through to the dataset config.
+            validation_num_timesteps: Number of sampling timesteps.
+            validation_timestep_shift: Timestep shift.
+            cfg_text_scale: Text classifier-free-guidance scale.
+
+        Returns:
+            A 4-tuple ``(video_path, text_result, status, logs)``. ``video_path``
+            is a string for a successful ``t2v`` run and ``None`` otherwise;
+            ``text_result`` is the answer for video-to-text and ``""`` otherwise;
+            ``status`` is a Markdown summary; ``logs`` is a plain-text parameter
+            dump, or the traceback when generation failed.
+
+        Raises:
+            ValueError: If ``task`` is not a supported task name.
+        """
+        self.initialize()
+        internal_task = normalize_task(task)
+        prompt = (prompt or "").strip()
+        question = (question or "").strip()
+        input_video = str(input_video).strip() if input_video else ""
+
+        # Input validation: reported through the status slot, not raised.
+        if internal_task == TASK_T2V and not prompt:
+            return None, "", "Please enter a prompt.", ""
+        if internal_task == TASK_X2T_VIDEO and not question:
+            return None, "", "Please enter a question.", ""
+        if internal_task == TASK_X2T_VIDEO and not input_video:
+            return None, "", "Please upload an input video.", ""
+        if height <= 0 or width <= 0:
+            return None, "", "Height and width must be greater than 0.", ""
+        if num_frames <= 0:
+            return None, "", "The number of frames must be greater than 0.", ""
+
+        assert self.model is not None
+        assert self.tokenizer is not None
+        assert self.new_token_ids is not None
+        assert self.image_token_id is not None
+        assert self.base_model_args is not None
+        assert self.base_data_args is not None
+        assert self.base_inference_args is not None
+
+        with self._generate_lock:
+            torch.cuda.set_device(self.device)
+            actual_seed = normalize_seed(int(seed))
+            prompt_file = create_request_json(
+                task=internal_task,
+                prompt=prompt,
+                input_video=input_video,
+                question=question,
+            )
+            save_dir = build_save_dir(internal_task)
+            save_dir.mkdir(parents=True, exist_ok=True)
+            request_started_at = datetime.now().isoformat(timespec="seconds")
+
+            # Per-request copies so the shared baseline arguments stay untouched.
+            request_model_args = deepcopy(self.base_model_args)
+            request_model_args.cfg_text_scale = float(cfg_text_scale)
+
+            request_data_args = deepcopy(self.base_data_args)
+            request_data_args.val_dataset_config_file = str(prompt_file)
+
+            request_inference_args = deepcopy(self.base_inference_args)
+            request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
+            request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
+            request_inference_args.validation_data_seed = actual_seed
+            request_inference_args.validation_noise_seed = actual_seed
+            request_inference_args.video_height = int(height)
+            request_inference_args.video_width = int(width)
+            request_inference_args.num_frames = int(num_frames)
+            request_inference_args.resolution = resolution
+            request_inference_args.save_path_gen = str(save_dir)
+            request_inference_args.task = internal_task
+            request_inference_args.text_template = TEXT_TEMPLATE
+            request_inference_args.prompt_data_dict = {}
+
+            def build_record(status, elapsed=None, video_path=None, text_result="", error=None) -> dict:
+                """Assemble the request record; key order matches the stored format."""
+                record = {
+                    "request_started_at": request_started_at,
+                    "request_finished_at": datetime.now().isoformat(timespec="seconds"),
+                    "status": status,
+                    "task": internal_task,
+                    "gpu": self.device,
+                    "prompt": prompt,
+                    "question": question,
+                    "input_video": input_video,
+                    "seed": actual_seed,
+                    "height": int(height),
+                    "width": int(width),
+                    "num_frames": int(num_frames),
+                    "resolution": resolution,
+                    "validation_num_timesteps": int(validation_num_timesteps),
+                    "validation_timestep_shift": float(validation_timestep_shift),
+                    "cfg_text_scale": float(cfg_text_scale),
+                }
+                # Only successful runs carry a timing entry.
+                if elapsed is not None:
+                    record["elapsed_seconds"] = round(elapsed, 3)
+                record["prompt_file"] = str(prompt_file)
+                record["output_dir"] = str(save_dir)
+                record["video_path"] = str(video_path) if video_path is not None else ""
+                record["text_result"] = text_result
+                # Only failed runs carry the traceback.
+                if error is not None:
+                    record["error"] = error
+                return record
+
+            generation_failed = False
+            try:
+                print(
+                    "[lance_gradio_t2v_v2t] Start generation "
+                    f"| task={internal_task} | gpu={self.device} | seed={actual_seed} | "
+                    f"size={height}x{width} | frames={num_frames} | resolution={resolution}",
+                    flush=True,
+                )
+                val_data_cpu = self._build_request_batch(
+                    prompt_file=prompt_file,
+                    model_args=request_model_args,
+                    data_args=request_data_args,
+                    inference_args=request_inference_args,
+                )
+                generate_start = time.perf_counter()
+                validate_on_fixed_batch(
+                    fsdp_model=self.model,
+                    vae_model=self.vae_model,
+                    tokenizer=self.tokenizer,
+                    val_data_cpu=val_data_cpu,
+                    training_args=request_inference_args,
+                    model_args=request_model_args,
+                    inference_args=request_inference_args,
+                    new_token_ids=self.new_token_ids,
+                    image_token_id=self.image_token_id,
+                    device=self.device,
+                    save_source_video=False,
+                    save_path_gen=request_inference_args.save_path_gen,
+                    save_path_gt="",
+                )
+                elapsed = time.perf_counter() - generate_start
+                save_prompt_results(request_inference_args.prompt_data_dict, request_inference_args.save_path_gen, self.logger)
+                clean_memory()
+
+                video_path = find_generated_video(save_dir) if internal_task == TASK_T2V else None
+                text_result = extract_text_result(save_dir) if internal_task == TASK_X2T_VIDEO else ""
+
+                # A run that finished without its expected artifact is recorded
+                # with a distinct status rather than "success".
+                if internal_task == TASK_T2V and video_path is None:
+                    record_status = "completed_without_video"
+                elif internal_task == TASK_X2T_VIDEO and not text_result:
+                    record_status = "completed_without_text"
+                else:
+                    record_status = "success"
+                save_generation_record(
+                    build_record(record_status, elapsed=elapsed, video_path=video_path, text_result=text_result),
+                    save_dir,
+                )
+
+                logs = "\n".join(
+                    [
+                        "[lance_gradio_t2v_v2t] Generation finished in-process.",
+                        f"task={internal_task}",
+                        f"gpu={self.device}",
+                        f"seed={actual_seed}",
+                        f"height={height}",
+                        f"width={width}",
+                        f"num_frames={num_frames}",
+                        f"resolution={resolution}",
+                        f"validation_num_timesteps={validation_num_timesteps}",
+                        f"validation_timestep_shift={validation_timestep_shift}",
+                        f"cfg_text_scale={cfg_text_scale}",
+                        f"elapsed={elapsed:.2f}s",
+                        f"output_dir={save_dir}",
+                    ]
+                )
+
+                if internal_task == TASK_T2V:
+                    if video_path is None:
+                        status = (
+                            "Inference completed, but no generated video was found.\n\n"
+                            f"- Task: `{internal_task}`\n"
+                            f"- GPU: `{self.device}`\n"
+                            f"- Actual seed: `{actual_seed}`\n"
+                            f"- Output directory: `{save_dir}`"
+                        )
+                        return None, "", status, logs
+                    status = (
+                        "Inference completed.\n\n"
+                        f"- Task: `{internal_task}`\n"
+                        f"- GPU: `{self.device}`\n"
+                        f"- Actual seed: `{actual_seed}`\n"
+                        f"- Output directory: `{save_dir}`\n"
+                        f"- Result file: `{video_path}`"
+                    )
+                    return str(video_path), "", status, logs
+
+                # Video-to-text: this summary shows the task name as submitted.
+                status = (
+                    "Understanding completed.\n\n"
+                    f"- Task: `{task}`\n"
+                    f"- GPU: `{self.device}`\n"
+                    f"- Actual seed: `{actual_seed}`\n"
+                    f"- Output directory: `{save_dir}`"
+                )
+                return None, text_result, status, logs
+            except Exception:
+                generation_failed = True
+                error_trace = traceback.format_exc()
+                print(error_trace, flush=True)
+                save_generation_record(build_record("failed", error=error_trace), save_dir)
+                status = (
+                    "Inference failed.\n\n"
+                    f"- Task: `{internal_task}`\n"
+                    f"- GPU: `{self.device}`\n"
+                    f"- Actual seed: `{actual_seed}`\n"
+                    f"- Output directory: `{save_dir}`"
+                )
+                return None, "", status, error_trace
+            finally:
+                # The success path already called clean_memory(); a failed run
+                # used to skip it. Run it here, after the handler has finished,
+                # and keep it best-effort so a cleanup error cannot replace the
+                # failure status returned above.
+                if generation_failed:
+                    try:
+                        clean_memory()
+                    except Exception:
+                        print(traceback.format_exc(), flush=True)
+
+
+class PipelinePool:
+    def __init__(self, gpu_ids: list[int]) -> None:
+        if not gpu_ids:
+            raise ValueError("At least one GPU must be configured.")
+        self.gpu_ids = gpu_ids
+        self.pipelines = [LanceT2VV2TPipeline(device_id=gpu_id) for gpu_id in gpu_ids]
+        self._available = deque(self.pipelines)
+        self._condition = threading.Condition()
+
+    @property
+    def size(self) -> int:
+        return len(self.pipelines)
+
+    @property
+    def gpu_summary(self) -> str:
+        return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
+
+    def initialize_all(self) -> None:
+        print(f"[startup] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
+        exceptions: list[Exception] = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
+            futures = {
+                executor.submit(pipeline.initialize): pipeline.device for pipeline in self.pipelines
+            }
+            for future in concurrent.futures.as_completed(futures):
+                gpu_id = futures[future]
+                try:
+                    future.result()
+                except Exception as exc:
+                    print(f"[startup][gpu:{gpu_id}] Preload failed: {exc}", flush=True)
+                    exceptions.append(exc)
+        if exceptions:
+            raise RuntimeError(f"Preload failed on {len(exceptions)} GPU(s). Please check the terminal logs.") from exceptions[0]
+        print(f"[startup] GPU preload finished. Ready to handle {self.size} concurrent request(s).", flush=True)
+
+    def acquire(self) -> LanceT2VV2TPipeline:
+        with self._condition:
+            while not self._available:
+                self._condition.wait()
+            return self._available.popleft()
+
+    def release(self, pipeline: LanceT2VV2TPipeline) -> None:
+        with self._condition:
+            self._available.append(pipeline)
+            self._condition.notify()
+
+    def generate(
+        self,
+        task: str,
+        prompt: str,
+        input_video: Optional[str],
+        question: str,
+        height: int,
+        width: int,
+        num_frames: int,
+        seed: int,
+        resolution: str,
+        validation_num_timesteps: int,
+        validation_timestep_shift: float,
+        cfg_text_scale: float,
+    ):
+        pipeline = self.acquire()
+        try:
+            return pipeline.generate(
+                task=task,
+                prompt=prompt,
+                input_video=input_video,
+                question=question,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                seed=seed,
+                resolution=resolution,
+                validation_num_timesteps=validation_num_timesteps,
+                validation_timestep_shift=validation_timestep_shift,
+                cfg_text_scale=cfg_text_scale,
+            )
+        finally:
+            self.release(pipeline)
+
+
+PIPELINE_POOL: Optional[PipelinePool] = None  # assigned in the __main__ block before the UI is built
+QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE  # overwritten with --queue-size in the __main__ block
+
+
+def run_task(
+    task: str,
+    prompt: str,
+    input_video: Optional[str],
+    question: str,
+    height: int,
+    width: int,
+    num_frames: int,
+    seed: int,
+    resolution: str,
+    validation_num_timesteps: int,
+    validation_timestep_shift: float,
+    cfg_text_scale: float,
+):
+    assert PIPELINE_POOL is not None
+    return PIPELINE_POOL.generate(
+        task=task,
+        prompt=prompt,
+        input_video=input_video,
+        question=question,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        seed=seed,
+        resolution=resolution,
+        validation_num_timesteps=validation_num_timesteps,
+        validation_timestep_shift=validation_timestep_shift,
+        cfg_text_scale=cfg_text_scale,
+    )
+
+
+def build_status_markdown() -> str:
+    gpu_text = "unknown"
+    concurrency = 1
+    if PIPELINE_POOL is not None:
+        gpu_text = PIPELINE_POOL.gpu_summary
+        concurrency = PIPELINE_POOL.size
+    return (
+        f"**Status**  GPU: `{gpu_text}`  |  Max concurrency: `{concurrency}`  |  "
+        f"Queue limit: `{QUEUE_MAX_SIZE}`  |  Preload mode: `parallel`"
+    )
+
+
+def update_task_ui(task: str):
+    task = (task or DEFAULT_TASK).strip().lower()
+    if task == TASK_T2V:
+        return (
+            gr.update(label="Prompt", placeholder="Describe the video you want to generate...", visible=True),
+            gr.update(label="Input Video", visible=False, value=None),
+            gr.update(label="Question", placeholder="Please enter a question", visible=False, value=""),
+            gr.update(visible=True),
+            gr.update(visible=True),
+            gr.update(visible=True),
+            gr.update(value=""),
+        )
+    return (
+        gr.update(label="Prompt", placeholder="This task does not require a prompt", visible=False, value=""),
+        gr.update(label="Input Video", visible=True),
+        gr.update(label="Question", placeholder="Describe the question you want the model to answer", visible=True),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(visible=False),
+        gr.update(value=""),
+    )
+
+
+def build_demo() -> gr.Blocks:  # Builds the Blocks UI, wires its two event handlers, returns it unlaunched.
+    with gr.Blocks(title="Lance T2V/V2T Gradio") as demo:
+        gr.Markdown(
+            """
+            # Lance T2V/V2T
+
+            Supports two tasks: `t2v` and `v2t`.
+            `v2t` is mapped to the internal `x2t_video` task in the backend.
+            The service preloads one model per GPU at startup, and requests are automatically dispatched to idle GPUs.
+            """
+        )
+        gr.Markdown(build_status_markdown())  # evaluated once, when the UI is built
+
+        with gr.Row():
+            with gr.Column(scale=1):  # left column: task selection and inputs
+                task = gr.Dropdown(label="Task", choices=TASK_CHOICES, value=DEFAULT_TASK)
+                prompt = gr.Textbox(
+                    label="Prompt",
+                    lines=6,
+                    placeholder="Describe the video you want to generate...",
+                )
+                input_video = gr.Video(label="Input Video", visible=False)  # shown by update_task_ui for non-t2v tasks
+                question = gr.Textbox(
+                    label="Question",
+                    lines=3,
+                    placeholder="Describe the question you want the model to answer",
+                    visible=False,
+                )
+                with gr.Row():
+                    height = gr.Slider(
+                        minimum=192,
+                        maximum=1024,
+                        step=16,
+                        value=DEFAULT_HEIGHT,
+                        label="Height",
+                    )
+                    width = gr.Slider(
+                        minimum=192,
+                        maximum=1024,
+                        step=16,
+                        value=DEFAULT_WIDTH,
+                        label="Width",
+                    )
+                num_frames = gr.Slider(
+                    minimum=1,
+                    maximum=121,
+                    step=1,
+                    value=DEFAULT_NUM_FRAMES,
+                    label="Output Frames",
+                )
+                seed = gr.Number(
+                    label="Seed",
+                    value=DEFAULT_BASIC_SEED,
+                    precision=0,
+                    info="-1 means using a random seed each time",
+                )
+                resolution = gr.Dropdown(
+                    label="RESOLUTION",
+                    choices=VIDEO_RESOLUTION_CHOICES,
+                    value=DEFAULT_RESOLUTION,
+                )
+
+                with gr.Accordion("Advanced Parameters", open=False):
+                    validation_num_timesteps = gr.Slider(
+                        minimum=1,
+                        maximum=100,
+                        step=1,
+                        value=DEFAULT_TIMESTEPS,
+                        label="VALIDATION_NUM_TIMESTEPS",
+                    )
+                    validation_timestep_shift = gr.Number(
+                        label="VALIDATION_TIMESTEP_SHIFT",
+                        value=DEFAULT_TIMESTEP_SHIFT,
+                    )
+                    cfg_text_scale = gr.Number(
+                        label="CFG_TEXT_SCALE",
+                        value=DEFAULT_CFG_TEXT_SCALE,
+                    )
+
+                run_button = gr.Button("Run", variant="primary")
+
+            with gr.Column(scale=1):  # right column: results, status and logs
+                output_video = gr.Video(label="Video Result")
+                output_text = gr.Textbox(label="Text Result", lines=8)
+                status = gr.Markdown("Waiting to run.")
+                logs = gr.Textbox(label="Run Logs", lines=22, max_lines=30)
+
+        task.change(  # switching the task re-lays out the inputs and clears the text result
+            fn=update_task_ui,
+            inputs=[task],
+            outputs=[prompt, input_video, question, height, width, num_frames, output_text],
+        )
+
+        run_button.click(  # this list follows the parameter order of run_task
+            fn=run_task,
+            inputs=[
+                task,
+                prompt,
+                input_video,
+                question,
+                height,
+                width,
+                num_frames,
+                seed,
+                resolution,
+                validation_num_timesteps,
+                validation_timestep_shift,
+                cfg_text_scale,
+            ],
+            outputs=[output_video, output_text, status, logs],
+        )
+
+    return demo
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Lance T2V/V2T Gradio")
+    parser.add_argument("--server-name", default="0.0.0.0")
+    parser.add_argument("--server-port", type=int, default=7860)
+    parser.add_argument("--share", action="store_true")
+    parser.add_argument(
+        "--gpus",
+        default=DEFAULT_GPUS,
+        help="Comma-separated GPU list, for example: 0,1,2,3,4,5,6",
+    )
+    parser.add_argument(
+        "--queue-size",
+        type=int,
+        default=DEFAULT_QUEUE_SIZE,
+        help="Maximum number of queued Gradio requests.",
+    )
+    return parser.parse_args()
+
+
+def parse_gpu_ids(gpu_string: str) -> list[int]:
+    gpu_ids: list[int] = []
+    for item in gpu_string.split(","):
+        item = item.strip()
+        if not item:
+            continue
+        gpu_ids.append(int(item))
+    if not gpu_ids:
+        raise ValueError("No valid GPU IDs were parsed.")
+    return gpu_ids
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    QUEUE_MAX_SIZE = args.queue_size
+    gpu_ids = parse_gpu_ids(args.gpus)
+    PIPELINE_POOL = PipelinePool(gpu_ids)
+    PIPELINE_POOL.initialize_all()
+    demo = build_demo()
+    demo.queue(
+        max_size=args.queue_size,
+        default_concurrency_limit=PIPELINE_POOL.size,
+    ).launch(
+        server_name=args.server_name,
+        server_port=args.server_port,
+        share=args.share,
+    )
diff --git a/modeling/__init__.py b/modeling/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6bf43810187497ae6adb1a151081b38136e4682
--- /dev/null
+++ b/modeling/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2025 Bytedance Ltd. and/or its affiliates.
+# SPDX-License-Identifier: Apache-2.0
+
+# from . import bagel, qwen2, siglip, autoencoder
diff --git a/modeling/lance/__init__.py b/modeling/lance/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..31b3fc9949d087757b91368aed051633d9d214f6
--- /dev/null
+++ b/modeling/lance/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+from .lance import LanceConfig, Lance
+from .qwen2_navit import Qwen2Model, Qwen2ForCausalLM
+from modeling.qwen2.modeling_qwen2 import Qwen2Config
+
+
+__all__ = [
+    'LanceConfig',
+    'Lance',
+    'Qwen2Config',
+    'Qwen2Model',
+    'Qwen2ForCausalLM',
+]
diff --git a/modeling/lance/lance.py b/modeling/lance/lance.py
new file mode 100644
index 0000000000000000000000000000000000000000..706bad6dd39f8d3453267d84d4ad7ca63fdbfebd
--- /dev/null
+++ b/modeling/lance/lance.py
@@ -0,0 +1,1918 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import random
+from typing import List, Tuple, Optional, Dict
+from einops import rearrange
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn.attention.flex_attention import create_block_mask
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from data.data_utils import (
+    create_sparse_mask,
+    get_flattened_position_ids_extrapolate,
+    get_flattened_position_ids_interpolate,
+    get_flattened_position_ids_interpolate_video,
+    get_flattened_position_ids_extrapolate_video,
+)
+from .qwen2_navit import NaiveCache, Qwen2ForCausalLM
+from .modeling_utils import MLPconnector, TimestepEmbedder, PositionEmbedding3D
+
+from config.config_factory import TrainingArguments
+from common.utils.misc import AutoEncoderParams
+from common.utils.distributed import get_global_rank
+from common.utils.logging import get_logger
+from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
+from modeling.qwen2 import Qwen2Tokenizer
+from common.val.utils import map_splits_to_samples, make_packed_vit_token_embed, uncond_split_pro
+from data.common import shift_position_ids
+from copy import deepcopy
+
+class LanceConfig(PretrainedConfig):
+    def __init__(
+        self,
+        visual_gen=True,
+        visual_und=True,
+        llm_config=None,
+        vit_config=None,
+        vae_config: AutoEncoderParams = None,
+        latent_patch_size=(1, 2, 2),  # pt ph pw
+        max_latent_size=32,
+        vit_max_num_patch_per_side=70,
+        connector_act="gelu_pytorch_tanh",
+        interpolate_pos=False,
+        timestep_shift=1.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.visual_gen = visual_gen
+        self.visual_und = visual_und
+        self.llm_config = llm_config
+        self.vit_config = vit_config
+        self.vae_config = vae_config
+        self.latent_patch_size = latent_patch_size
+        self.max_num_frames = kwargs.get("max_num_frames", 25)  # TODO: 完善下
+        self.max_latent_size = max_latent_size
+        self.vit_max_num_patch_per_side = vit_max_num_patch_per_side
+        self.connector_act = connector_act
+        self.interpolate_pos = interpolate_pos
+        self.timestep_shift = timestep_shift
+
+
+class Lance(PreTrainedModel):
+    config_class = LanceConfig
+    base_model_prefix = "lance"
+
+    def __init__(
+        self,
+        language_model: Qwen2ForCausalLM,
+        vit_model: Qwen2_5_VisionTransformerPretrainedModel,
+        vit_type: str = "qwen2_5_vl",
+        config: LanceConfig = None,
+        **kwargs
+    ):
+        super().__init__(config)
+        self.language_model: Qwen2ForCausalLM = language_model
+        self.hidden_size = config.llm_config.hidden_size
+        self.use_moe = "Mo" in config.llm_config.layer_module
+        self.num_heads = config.llm_config.num_attention_heads
+        self.logger = get_logger()
+        self.log_rank0 = self.logger.info if get_global_rank() == 0 else lambda x: None
+        if config.visual_gen:
+            self.latent_patch_size = config.latent_patch_size
+            self.timestep_shift = config.timestep_shift
+            self.latent_downsample_spatial = config.vae_config.downsample_spatial * config.latent_patch_size[-1]
+            self.latent_downsample_temporal = config.vae_config.downsample_temporal
+            self.max_num_latent_frames = config.max_num_frames // self.latent_downsample_temporal + 1
+            self.latent_channel = config.vae_config.z_channels
+            self.max_latent_size = config.max_latent_size
+            self.patch_latent_dim = self.latent_patch_size[0] * self.latent_patch_size[1] * self.latent_patch_size[2] * self.latent_channel
+            self.time_embedder = TimestepEmbedder(self.hidden_size)
+
+            self.vae2llm = nn.Linear(self.patch_latent_dim, self.hidden_size)  # vision input
+            self.llm2vae = nn.Linear(self.hidden_size, self.patch_latent_dim)  # vision ouput
+
+            self.latent_pos_embed = PositionEmbedding3D(self.max_num_latent_frames, self.max_latent_size, self.hidden_size)
+
+            safety = 1024 # 由于video前有文本，因此video的起始坐标不再是[0,0,0], 而是[n,n,n], 预留出安全位移
+            self.pos_shift = self.max_latent_size * self.max_latent_size * self.max_num_latent_frames + safety
+
+        if config.visual_und:
+            self.vit_model: Qwen2_5_VisionTransformerPretrainedModel = vit_model
+            self.vit_patch_size = config.vit_config.patch_size
+            self.vit_max_num_patch_per_side = config.vit_max_num_patch_per_side
+            self.vit_type = vit_type
+            if self.vit_type == "qwen2_5_vl":
+                self.vit_hidden_size: int = config.vit_config.out_hidden_size
+                self.connector: MLPconnector = MLPconnector(self.vit_hidden_size, self.hidden_size, config.connector_act)
+
+            elif self.vit_type == "qwen_2_5_vl_original":
+                pass # 注意⚠️ ViT encoder中的merger就是MLP connector了，已经包含MLP connector了
+            else:
+                raise ValueError(f"vit_model_type {self.vit_type} not supported")
+
+            self.vit_model.eval()
+
+        if config.interpolate_pos:
+            self.get_flattened_position_ids = get_flattened_position_ids_interpolate
+        else:
+            self.get_flattened_position_ids = get_flattened_position_ids_extrapolate
+
+        self.config = config
+        self.training_args: TrainingArguments = kwargs.get("training_args") # for 消融实验
+
+        # for task embedding
+        if self.training_args.use_task_embedding:
+            self.num_tasks = 10 # 设置为一个大于实际task/modality 数量的值
+            self.task_embedding = nn.Embedding(self.num_tasks, config.vit_config.out_hidden_size)
+
+        if self.training_args.use_modality_embedding:
+            self.num_modalities = 10 # 设置为一个大于实际task/modality 数量的值
+            self.modality_embedding = nn.Embedding(self.num_modalities, config.vit_config.out_hidden_size)
+
+    def update_tokenizer(self, tokenizer):
+        self.tokenizer: Qwen2Tokenizer = tokenizer  # kept for debugging, checks and loss computation
+        self.vocab_size_efficient = len(tokenizer)  # effective vocabulary size, used when computing the loss
+
+    def process_attention_mask(self, current_attn_modes, current_split_lens, current_seq_len, device, BLOCK_SIZE=128):
+        current_attn_modes_ = ["full" if mode_ in ["full_noise", "full_noise_target"] else mode_ for mode_ in current_attn_modes]
+        sparse_mask = create_sparse_mask(current_seq_len, current_split_lens, current_attn_modes_, device)
+        current_seq_len_sum = sum(current_seq_len)
+        attention_mask = create_block_mask(
+                sparse_mask, B=1, H=self.num_heads, Q_LEN=current_seq_len_sum, KV_LEN=current_seq_len_sum, device=device, BLOCK_SIZE=BLOCK_SIZE, _compile=False
+            )
+        return attention_mask
+
+    def forward(
+        self,
+        sequence_length: int,
+        packed_text_ids: torch.LongTensor,
+        packed_text_indexes: torch.LongTensor,
+        sample_lens: List[int],
+        sample_type: List[str],
+        sample_N_target: List[int],
+        packed_position_ids: torch.LongTensor,
+        nested_attention_masks: List[torch.Tensor] = None,
+        split_lens: List[int] = None,
+        attn_modes: List[str] = None,
+        # for visual understanding
+        ce_loss_indexes: Optional[torch.BoolTensor] = None,
+        packed_label_ids: Optional[torch.LongTensor] = None,
+        packed_vit_tokens: Optional[torch.Tensor] = None,
+        packed_vit_token_indexes: Optional[torch.LongTensor] = None,
+        packed_vit_position_ids: Optional[torch.LongTensor] = None,
+        vit_token_seqlens: Optional[torch.IntTensor] = None,
+        vit_video_grid_thw: Optional[torch.IntTensor] = None,
+        vae_video_grid_thw: Optional[torch.IntTensor] = None,
+        video_grid_thw: Optional[torch.IntTensor] = None,
+        # for visual generation
+        padded_latent: Optional[torch.Tensor] = None,
+        patchified_vae_latent_shapes: Optional[List[Tuple[int, int, int]]] = None,
+        packed_latent_position_ids: Optional[torch.LongTensor] = None,
+        packed_vae_token_indexes: Optional[torch.LongTensor] = None,
+        packed_timesteps: Optional[torch.LongTensor] = None,
+        mse_loss_indexes: Optional[torch.BoolTensor] = None,
+        vit_data_mode: Optional[List[str]] = None, # indicates whether each vit split is online or offline
+        key_frame_mask: Optional[torch.BoolTensor] = None,
+        sample_task: Optional[torch.LongTensor] = None,
+        sample_modality: Optional[torch.LongTensor] = None,
+        BLOCK_SIZE: int = 128,
+    ) -> dict:
+        """
+        Args:
+            sequence_length: length of sequence.
+            packed_text_ids: 1-D int tensor, packed text token ids.
+            packed_text_indexes: 1-D int tensor, packed text token indexes in sequence.
+            sample_lens: A list of N ints, length of each sample in packed_sequence.
+            nested_attention_masks: A list of N 2-D float tensor,  where 0.0 means attention and
+                -inf means ignore.
+            packed_position_ids: packed 1-D positions, an image has only one global position shared
+                by all latent tokens.
+
+            packed_vit_tokens: packed patchified image tokens for vit model.
+            packed_vit_position_ids: 1-D int tensor, the position of each token for vit model.
+            packed_vit_token_indexes: 1-D int tensor, packed vit token indexes in sequence.
+            vit_token_seqlens: 1-D int tensor, the length of each image tokens for vit model.
+            packed_label_ids: 1-D int tensor, packed label token ids.
+            ce_loss_indexes: 1-D bool tensor, where to compute ce loss.
+
+            padded_latent: padded latent from VAE encoder.
+            patchified_vae_latent_shapes: A list of (t, h, w) tuples, patchified latent shapes of each sample.
+            packed_latent_position_ids: 1-D int tensor, the position of each token for latent.
+            packed_vae_token_indexes: 1-D int tensor, padded image token indexes in sequence.
+            packed_timesteps: 1-D float tensor, flow timesteps. 0 indicates use clean image.
+            mse_loss_indexes: 1-D bool tensor, where to compute mse loss.
+        """
+        # Gather auxiliary values
+        N_vit_split = attn_modes.count("full")  # NOTE(review): attn_modes defaults to None but is dereferenced here
+        device = packed_text_ids.device
+        apply_qwen_2_5_vl_pos_emb = getattr(self.training_args, "apply_qwen_2_5_vl_pos_emb", False)
+        choose_frame_loss = random.random() > 0.5 # coin flip (p=0.5) selecting which key-frame loss branch runs below
+        sample_splits = map_splits_to_samples(sample_lens, split_lens)
+
+        if apply_qwen_2_5_vl_pos_emb:  # TODO :
+
+            packed_position_ids = []
+            sample_lens_tensor = torch.tensor(sample_lens, device=device, dtype=torch.long)  # sample_lens is already padded; a pad entry is present even when it is 0
+            cu_sample_lens = torch.cat([torch.zeros(1, device=device, dtype=torch.long), sample_lens_tensor.cumsum(0)[:-1]])
+            # i_num_grid_thw = 0
+            for i_sample in range(len(sample_lens) - 1):
+                text_ids = packed_text_ids[cu_sample_lens[i_sample] : cu_sample_lens[i_sample + 1]]
+                left, right = sample_splits[i_sample][0], sample_splits[i_sample][-1] + 1
+                grid_thw_rope  = video_grid_thw[i_sample]
+
+                i_sample_task = sample_task[cu_sample_lens[i_sample] : cu_sample_lens[i_sample + 1]]
+                i_sample_modality = sample_modality[cu_sample_lens[i_sample] : cu_sample_lens[i_sample + 1]]
+
+                current_packed_position_ids, rope_deltas = self.language_model.get_rope_index(
+                    input_ids=text_ids.unsqueeze(0),
+                    image_grid_thw=grid_thw_rope,  # [[1,16,16]],
+                    video_grid_thw=grid_thw_rope,  # video_grid_thw,
+                    second_per_grid_ts=[1.0]*len(grid_thw_rope),  # second_per_grid_ts: should be 1/6 in theory, 1 is used in practice
+                    attention_mask=torch.ones([1, len(text_ids)], dtype=torch.long, device=device),  # attention_mask: all-ones mask?
+                )
+                # distinguish ref-image VAE features from video VAE features in mRoPE
+                current_packed_position_ids = shift_position_ids(current_packed_position_ids, pos_shift = 1000, attn_modes = attn_modes[left:right], split_lens = split_lens[left:right], shift_attn_mode=['full_noise',"full"], pro_type = 10, i_sample_task=i_sample_task, i_sample_modality=i_sample_modality)
+                packed_position_ids.append(current_packed_position_ids)
+            packed_position_ids = torch.cat(packed_position_ids, dim=-1)  # [3,1,sequence_length]
+
+        packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
+        packed_sequence = packed_text_embedding.new_zeros(size=(sequence_length, self.hidden_size))  # build the model input, L x C
+        packed_sequence[packed_text_indexes] = packed_text_embedding[packed_text_indexes]  # NOTE(review): indexing the embedding by sequence position presumably requires packed_text_ids to span the whole sequence — confirm
+
+        if nested_attention_masks is None: # this is the branch taken in practice (per the original author)
+            attn_modes_ = ["full" if mode=="full_noise" else mode for mode in attn_modes]
+            sparse_mask = create_sparse_mask(sample_lens, split_lens, attn_modes_, packed_text_embedding.device)
+            seqlen = sum(sample_lens)  # always equals max_num_tokens (per the original author)
+            attention_mask = create_block_mask(sparse_mask, B=1, H=self.num_heads, Q_LEN=seqlen, KV_LEN=seqlen, device=packed_text_embedding.device, BLOCK_SIZE=BLOCK_SIZE, _compile=True)
+        else:
+            attention_mask = nested_attention_masks
+
+        if N_vit_split > 0: # non-understanding branches may also carry ViT conditions, e.g. TI2I
+            if self.vit_type in ("qwen2_5_vl", "qwen_2_5_vl_original"):
+                # NOTE: video understanding part
+                with torch.no_grad():
+                    packed_vit_token_embed = make_packed_vit_token_embed(packed_vit_tokens, vit_data_mode, vit_video_grid_thw, self.vit_model)
+                if self.vit_type == "qwen2_5_vl":
+                    packed_vit_token_embed = self.connector(packed_vit_token_embed) # L//4 x 2048 -> L//4 x 1536
+                packed_sequence[packed_vit_token_indexes] = packed_vit_token_embed  # NOTE: the real ViT token embeddings are written in here
+
+        # Compute the flow-matching loss
+        if self.config.visual_gen:
+            pt, ph, pw = self.latent_patch_size  # e.g., 1, 1, 1
+            packed_latent = []
+            # NOTE: patchified_vae_latent_shapes already accounts for patchify
+            for latent, (t, h, w) in zip(padded_latent, patchified_vae_latent_shapes):
+                # NOTE: patchify here is a plain reshape (space2channel); self.vae2llm(packed_latent) projects it later
+                # NOTE: padded_latent is padded up to the largest shape
+                # TODO: native shape is not supported yet; it needs to be supported later
+
+                # Assumes latent has shape [t*pt, h*ph, w*pw, c]; t, h, w, pt, ph, pw come from the enclosing scope
+                patches = rearrange(latent, "(t pt) (h ph) (w pw) c -> (t h w) (pt ph pw c)", t=t, pt=pt, h=h, ph=ph, w=w, pw=pw)
+                # patches has shape [t*h*w, pt*ph*pw*c]
+                packed_latent.append(patches)
+            packed_latent_clean = torch.cat(packed_latent, dim=0)  # (B*t*h*w, pt*ph*pw*c) -> e.g., (B*256, 48) for seedance vae with 256x256
+
+            noise = torch.randn_like(packed_latent_clean)
+            if getattr(self.training_args, "incre_time_pro", 0) <=0:
+                packed_timesteps = torch.sigmoid(packed_timesteps)  # NOTE: normalized to [0, 1] here. Gaussian + Sigmoid = Logit-Normal Distribution
+            packed_timesteps = self.timestep_shift * packed_timesteps / (1 + (self.timestep_shift - 1) * packed_timesteps)
+            if key_frame_mask != []:  # NOTE(review): the default None also passes this check — confirm callers pass a tensor or []
+                if not choose_frame_loss:
+                    time_idx = torch.arange(len(packed_timesteps), device=packed_timesteps.device)[packed_timesteps > 0][key_frame_mask==1]
+                    packed_timesteps[time_idx] = 0 # mark key frames as clean
+                    mse_loss_indexes = mse_loss_indexes[key_frame_mask==0] # compute the loss only on non-key frames
+                else:
+                    mse_loss_indexes = mse_loss_indexes[key_frame_mask==1]
+
+            packed_latent = (1 - packed_timesteps[:, None]) * packed_latent_clean + packed_timesteps[:, None] * noise
+            packed_timestep_embeds = self.time_embedder(packed_timesteps) # [L, C]
+            latent_token_pos_emb = self.latent_pos_embed(packed_latent_position_ids)
+            packed_latent = self.vae2llm(packed_latent) + packed_timestep_embeds + latent_token_pos_emb
+
+            packed_sequence[packed_vae_token_indexes] = packed_latent.to(packed_sequence.dtype) # NOTE: the real VAE token embeddings are written in here
+        extra_inputs = {}
+        if self.use_moe:
+            packed_und_token_indexes = packed_text_indexes
+            if packed_vit_token_indexes is not None:
+                packed_und_token_indexes = torch.cat([packed_text_indexes, packed_vit_token_indexes], dim=0)
+            extra_inputs.update(
+                packed_und_token_indexes=packed_und_token_indexes,
+                packed_gen_token_indexes=packed_vae_token_indexes,
+            )
+
+        # Add the task embedding
+        if hasattr(self, 'task_embedding') and sample_task is not None:
+            # Only the 1-D (per-token) form of sample_task is handled
+            if sample_task.dim() == 1:
+                # Look up the task embedding of every token
+                task_embeddings = self.task_embedding(sample_task)
+                # sample_task: task_id 0 (the t2v task) gets no embedding
+                mask = (sample_task == 0)
+                task_embeddings[mask] = 0
+                # Add the task embedding onto packed_sequence
+                packed_sequence += task_embeddings
+
+        if hasattr(self, 'modality_embedding') and sample_modality is not None:
+            # Only the 1-D (per-token) form of sample_modality is handled
+            if sample_modality.dim() == 1:
+                # Look up the modality embedding of every token
+                modality_embeddings = self.modality_embedding(sample_modality)
+                # sample_modality: modality_id 0 or 1 (text / noise modality) gets no embedding
+                mask = (sample_modality == 0) | (sample_modality == 1)
+                modality_embeddings[mask] = 0
+                # Add the modality embedding onto packed_sequence
+                packed_sequence += modality_embeddings
+
+        last_hidden_state = self.language_model(
+            packed_sequence=packed_sequence,
+            sample_lens=sample_lens,
+            attention_mask=attention_mask,
+            packed_position_ids=packed_position_ids,
+            **extra_inputs,
+        )
+
+        mse, frame_mse, total_mse_tokens = None, None, None
+        if self.config.visual_gen:
+            packed_mse_preds = self.llm2vae(last_hidden_state[mse_loss_indexes])
+            total_mse_tokens = packed_mse_preds.shape[0]
+            target = noise - packed_latent_clean  # NOTE: v_t=dx_t/dt=x_1-x_0, pointing from data to noise
+            has_mse = packed_timesteps > 0
+            if key_frame_mask != [] and choose_frame_loss:
+                mse = (packed_mse_preds - target[has_mse][key_frame_mask==1]) ** 2  # loss on key frames only
+            else:
+                mse = (packed_mse_preds - target[has_mse]) ** 2
+
+        # More robust implementation: out-of-range token ids are ignored
+        ce = None
+        if ce_loss_indexes is not None:
+            V_eff = self.vocab_size_efficient          # recommended to equal len(self.tokenizer)
+            ignore_index = -100
+
+            h = last_hidden_state[ce_loss_indexes]      # [L, H]
+            logits = self.language_model.lm_head(h)[..., :V_eff]   # [L, V_eff]
+
+            targets = packed_label_ids.to(dtype=torch.long)
+            invalid = (targets >= V_eff) | (targets < 0)
+            targets = torch.where(invalid, torch.full_like(targets, ignore_index), targets)
+            ce = F.cross_entropy(logits, targets, reduction="none", ignore_index=ignore_index)
+
+        return dict(mse=mse, ce=ce, frame_mse=frame_mse, total_mse_tokens=total_mse_tokens)  # frame_mse is never assigned above, so it is always None
+
+    @torch.no_grad()
+    def validation_gen(
+        self,
+        val_packed_text_ids: torch.LongTensor,
+        val_packed_text_indexes: torch.LongTensor,
+        val_packed_vit_tokens: torch.LongTensor,
+        val_packed_vit_token_indexes: torch.LongTensor,
+        val_sample_lens: List[int],
+        val_packed_position_ids: torch.LongTensor,
+        val_split_lens: List[int] = None,
+        val_attn_modes: List[str] = None,
+        val_sample_N_target: List[int] = None,
+        vit_video_grid_thw: Optional[torch.IntTensor] = None,  # NOTE: only used for TI2I
+        vae_video_grid_thw: Optional[torch.IntTensor] = None,
+        video_grid_thw: Optional[torch.IntTensor] = None,
+        val_mse_loss_indexes: Optional[torch.BoolTensor] = None,
+        # for visual generation
+        val_packed_vae_token_indexes: Optional[torch.LongTensor] = None,
+        val_padded_latent: Optional[torch.Tensor] = None,
+        # val_key_frame_mask: Optional[torch.BoolTensor] = None,
+        sample_task: Optional[torch.LongTensor] = None,
+        sample_modality: Optional[torch.LongTensor] = None,
+        video_sizes: List[Tuple[int, int, int]] = [[1, 256, 256]],
+        val_padded_videos: torch.Tensor = None,
+        timestep_shift: float = 4.0,
+        num_timesteps: int = 24,
+        # cfg_text
+        cfg_interval: Optional[Tuple[float, float]] = [0, 1],
+        cfg_renorm_min: float = 0.0,
+        cfg_renorm_type: str = "global",
+        cfg_text_scale: float = 1.0,
+        cfg_vit_scale: float = 1.0, # HACK
+        device=None,
+        dtype=None,
+        new_token_ids=None,
+        BLOCK_SIZE: int = 128,
+        apply_chat_template: bool = False,
+        apply_qwen_2_5_vl_pos_emb: bool = False,
+        image_token_id: int = 151655,
+        # sample_index: Optional[torch.LongTensor] = None,
+        caption: Optional[List[str]] = None,
+        index: str = "",
+        **kwargs,
+    ):
+        # Purpose: for each generation sample in the packed validation batch, start from random noise and integrate the predicted velocity over shifted timesteps (with optional CFG); returns x_t_all, or (x_t_all, [caption], padded_videos, index) when caption is given.
+        # Special token ids
+        start_id = new_token_ids["start_of_image"]
+        end_id = new_token_ids["end_of_image"]
+
+        pt, ph, pw = self.latent_patch_size
+
+        index_dtype = val_packed_text_ids.dtype
+
+        # --- Cumulative sample lengths, used to slice each sample inside the loop ---
+        cu_sample_lens = torch.nn.functional.pad(torch.cumsum(torch.tensor(val_sample_lens, device=device), dim=0), (1, 0))
+
+        sample_splits = map_splits_to_samples(val_sample_lens, val_split_lens)
+
+        # Length of each ViT token sequence (t * h * w per grid row)
+        if val_packed_vit_tokens is not None and vit_video_grid_thw is not None:
+            vit_sample_len = vit_video_grid_thw[:, 0] * vit_video_grid_thw[:, 1] * vit_video_grid_thw[:, 2]  # shape: (N,) , N = 1 * 16 * 16,
+            cu_vit_sample_lens = torch.cat([torch.zeros(1, device=vit_video_grid_thw.device, dtype=vit_sample_len.dtype), vit_sample_len.cumsum(0)])
+            self.vit_model = self.vit_model.to(device=device, dtype=dtype)
+
+            val_packed_vit_tokens = torch.cat(val_packed_vit_tokens, dim=0)
+
+        x_t_all = []
+        max_samples = kwargs.get("max_samples", 16)
+        num_samples = len(val_sample_lens)
+        max_samples = min(num_samples, max_samples)
+
+        gen_idx = 0
+        curr_vae_split_idx, curr_vit_split_idx = 0, 0  # curr_vae_split_idx indexes the generated images (equals gen_idx when every sample has N_target=1); curr_vit_split_idx indexes the ViT splits
+
+        padded_videos = []
+        # self.logger.info(f"Validation start... (timesteps = {num_timesteps})")
+        for i_sample in range(num_samples):
+            left, right = sample_splits[i_sample][0], sample_splits[i_sample][-1] + 1
+            # --- for interleave ---
+            current_split_lens = val_split_lens[left:right]
+            current_attn_modes = val_attn_modes[left:right]
+            N_noise_element = current_attn_modes.count("noise") + current_attn_modes.count("full_noise") + current_attn_modes.count("full_noise_target")
+            N_vit_split = current_attn_modes.count("full")
+
+            if right > len(val_attn_modes):  # NOTE(review): this bounds check runs after the slices above were already taken
+                break
+
+            # Skip non-GEN samples (detected here by the absence of any noise-type split)
+            if N_noise_element<=0:
+                curr_vit_split_idx += N_vit_split  # advance the ViT split pointer
+                continue
+
+            # Cap the number of generations using gen_idx rather than i_sample
+            if gen_idx >= max_samples:
+                break
+
+            # 1. Locate the current sample's slice within the packed batch
+            sample_start_idx = cu_sample_lens[i_sample]
+            sample_end_idx = cu_sample_lens[i_sample + 1]
+            current_seq_len = val_sample_lens[i_sample]
+            current_pos_ids = val_packed_position_ids[sample_start_idx:sample_end_idx]
+            i_sample_task = sample_task[sample_start_idx:sample_end_idx]  # NOTE(review): sample_task defaults to None, which cannot be indexed here
+            i_sample_modality = sample_modality[sample_start_idx:sample_end_idx]  # NOTE(review): same for sample_modality
+
+            # --- Visual feature embedding ---
+            vae_mask = (val_packed_vae_token_indexes >= sample_start_idx) & (val_packed_vae_token_indexes < sample_end_idx)
+            current_vae_token_indexes_local = val_packed_vae_token_indexes[vae_mask] - sample_start_idx
+
+            # --- VAE MSE tokens: indexes of the part of x_t that must be updated ---
+            vae_mse_mask = (val_mse_loss_indexes >= sample_start_idx) & (val_mse_loss_indexes < sample_end_idx)
+            current_vae_mse_indexes_local = val_mse_loss_indexes[vae_mse_mask] - sample_start_idx  # indexes of the part of x_t that must be updated
+            current_vae_mse_indexes_local_in_vae = (
+                current_vae_mse_indexes_local - current_vae_mse_indexes_local[0] + torch.where(current_vae_token_indexes_local == current_vae_mse_indexes_local[0])[0]
+            )  # TODO: may need changes when there are multiple target images with text in between
+
+            num_vid_tokens_list, vid_shape_list, vae_position_ids, curr_padded_latent = [], [], [], []
+
+            # 2. Next, build the ViT-unconditional features (optional)
+            cfg_vit_pro = False
+            if cfg_vit_scale > 1.0 and "full" in current_attn_modes:
+                vit_uncond_sequence, vit_uncond_attn_modes, vit_uncond_split_lens, vit_uncond_vae_index, _, vit_uncond_packed_gen_token_indexes, vit_uncond_packed_und_token_indexes, vit_uncond_text_ids, vit_uncond_seq_len, vit_uncond_pad = uncond_split_pro(self.language_model, current_attn_modes, current_split_lens, vae_video_grid_thw, vit_video_grid_thw, curr_vae_split_idx, curr_vit_split_idx, device, dtype, start_id, image_token_id, end_id, BLOCK_SIZE, is_text_uncond = True, is_vit_uncond = True)
+                cfg_vit_pro = True
+
+            for i_target in range(N_noise_element):
+                T, H, W = video_sizes[curr_vae_split_idx]  # NOTE(review): original note said sizes are indexed by gen_idx to follow GEN-sample order, but the code indexes by curr_vae_split_idx
+                t = (T - 1) // self.latent_downsample_temporal + 1
+                h = H // self.latent_downsample_spatial
+                w = W // self.latent_downsample_spatial
+
+                vid_shape_list.append([t, h, w])
+                num_vid_tokens_list.append(t * h * w)
+
+                # prepare packed_vae_position_ids
+                # use the 3D-aware position-id functions
+                if self.config.interpolate_pos:  # False
+                    # interpolation
+                    vae_position_ids.append(
+                        get_flattened_position_ids_interpolate_video(
+                            t, h, w, 1, max_num_frames=self.max_num_latent_frames, max_num_patches_per_side=self.max_latent_size  # patch size is 1 in latent space
+                        )
+                    )
+                else:  # Adopt !!!!
+                    # extrapolation
+                    vae_position_ids.append(
+                        get_flattened_position_ids_extrapolate_video(t, h, w, max_latent_size=self.max_latent_size)  # patch size is 1 in latent space  # NOT USED in extrapolation
+                    )
+
+                # When conditioning VAE tokens exist, compute the current padded_latent
+                if len(current_vae_mse_indexes_local) != len(current_vae_token_indexes_local):
+                    padded_latent_ = val_padded_latent[curr_vae_split_idx]  # (T,H,W,C)
+
+                    patches = rearrange(padded_latent_, "(t pt) (h ph) (w pw) c -> (t h w) (pt ph pw c)", t=t, pt=pt, h=h, ph=ph, w=w, pw=pw)
+                    curr_padded_latent.append(patches)
+
+                if val_padded_videos is not None:
+                    padded_videos.append(val_padded_videos[curr_vae_split_idx])
+
+                curr_vae_split_idx += 1
+
+            num_vid_tokens = sum(num_vid_tokens_list)
+            vae_position_ids = torch.cat(vae_position_ids, 0)
+            if curr_padded_latent != []:
+                curr_padded_latent = torch.cat(curr_padded_latent, dim=0).to(dtype)
+
+            # 2. Rebuild the input sequence and attention mask for the current sample
+            current_sequence = torch.zeros((current_seq_len, self.hidden_size), device=device, dtype=dtype)
+
+            # --- Text part ---
+            text_mask = (val_packed_text_indexes >= sample_start_idx) & (val_packed_text_indexes < sample_end_idx)
+            current_text_indexes_local = val_packed_text_indexes[text_mask] - sample_start_idx
+
+            current_text_ids = val_packed_text_ids[sample_start_idx:sample_end_idx]
+
+            # ++ not needed once val_data is aligned with train_data
+            current_text_embedding = self.language_model.model.embed_tokens(current_text_ids).to(dtype=dtype)
+
+            current_sequence[current_text_indexes_local] = current_text_embedding[current_text_indexes_local]
+
+            if cfg_text_scale > 1.0:
+                if cfg_vit_pro:
+                    vit_uncond_attn_modes_, vit_uncond_split_lens_ = vit_uncond_attn_modes, vit_uncond_split_lens
+                    vit_uncond_attn_mask = self.process_attention_mask(vit_uncond_attn_modes_, vit_uncond_split_lens_, [vit_uncond_seq_len, vit_uncond_pad], device = device, BLOCK_SIZE = BLOCK_SIZE)
+
+            # --- ViT part: supports ti2i ---
+            if N_vit_split != 0:
+                vit_sample_start_idx = cu_vit_sample_lens[curr_vit_split_idx]
+                vit_sample_end_idx = cu_vit_sample_lens[curr_vit_split_idx + N_vit_split]
+                current_val_packed_vit_tokens = val_packed_vit_tokens[vit_sample_start_idx:vit_sample_end_idx].to(dtype)
+                current_val_vit_video_grid_thw = vit_video_grid_thw[curr_vit_split_idx : curr_vit_split_idx + N_vit_split]
+                curr_vit_split_idx += N_vit_split  # advance the ViT split pointer
+
+                if self.vit_type in ["qwen2_5_vl", "qwen_2_5_vl_original"]:
+                    packed_vit_token_embed = self.vit_model(hidden_states=current_val_packed_vit_tokens, grid_thw=current_val_vit_video_grid_thw)
+                    if self.vit_type in ["qwen2_5_vl"]:
+                        packed_vit_token_embed = self.connector(packed_vit_token_embed).to(dtype)
+                else:
+                    raise NotImplementedError(f"{self.vit_type} is not supported")
+
+                vit_mask = (val_packed_vit_token_indexes >= sample_start_idx) & (val_packed_vit_token_indexes < sample_end_idx)
+                current_vit_indexes_local = val_packed_vit_token_indexes[vit_mask] - sample_start_idx
+                current_sequence[current_vit_indexes_local] = packed_vit_token_embed
+
+            # --- Key: match training -> pad to a multiple of BLOCK_SIZE and keep input/mask/length consistent ---
+            current_seq_len_pad = (current_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE * BLOCK_SIZE
+            current_pad = current_seq_len_pad - current_seq_len
+            if current_pad > 0:
+                current_split_lens = current_split_lens + [current_pad]
+                current_attn_modes = current_attn_modes + ["causal"]
+            current_split_lens_, current_attn_modes_ = current_split_lens, current_attn_modes
+
+            attention_mask = self.process_attention_mask(current_attn_modes_, current_split_lens_,  [current_seq_len, current_pad], device = device, BLOCK_SIZE = BLOCK_SIZE)
+            # NOTE: fixed seed (only applied when validation_noise_seed > 0)
+            validation_noise_seed = kwargs.get("validation_noise_seed", -1)
+            if validation_noise_seed > 0:
+                generator = torch.Generator(device=device).manual_seed(validation_noise_seed + get_global_rank() * max_samples + i_sample)  # build the seed
+            else:
+                generator = None
+            x_t = torch.randn(num_vid_tokens, self.patch_latent_dim, generator=generator, device=device, dtype=dtype)  # [1*t*h*w, pt*ph*pw*C]
+
+            if curr_padded_latent != []:  # a VAE condition exists
+                curr_padded_latent[current_vae_mse_indexes_local_in_vae] = x_t[current_vae_mse_indexes_local_in_vae]
+                x_t = curr_padded_latent
+
+            timesteps = torch.linspace(1, 0, num_timesteps + 1, device=x_t.device)  # fix: add 1
+            timesteps = timestep_shift * timesteps / (1 + (timestep_shift - 1) * timesteps)
+            dts = timesteps[:-1] - timesteps[1:]
+            timesteps = timesteps[:-1]
+
+            if apply_qwen_2_5_vl_pos_emb:
+                grid_thw_rope = video_grid_thw[i_sample]
+
+                # Computed the Qwen-VL way. TODO: does rope_deltas matter when position_ids for all tokens are computed at once?
+                current_pos_ids, _ = self.language_model.get_rope_index(
+                    input_ids=current_text_ids.unsqueeze(0),
+                    image_grid_thw=grid_thw_rope, #vae_video_grid_thw[gen_idx : gen_idx + 1],  # NOTE: this is a global index
+                    video_grid_thw=grid_thw_rope, #vae_video_grid_thw[gen_idx : gen_idx + 1],  # video_grid_thw,
+                    second_per_grid_ts=[1.0]*len(grid_thw_rope),  # second_per_grid_ts,
+                    attention_mask=torch.ones([1, len(current_text_ids)], dtype=torch.long, device=device),  # attention_mask, all-ones mask?
+                )  # current_packed_position_ids: [3, 1, L]
+                # distinguish ref-image VAE features from video VAE features in mrope
+                current_pos_ids = shift_position_ids(
+                    current_pos_ids,
+                    pos_shift=1000,
+                    attn_modes=current_attn_modes,
+                    split_lens=current_split_lens,
+                    shift_attn_mode=["full_noise", "full"],
+                    pro_type=10,
+                    i_sample_task=i_sample_task,
+                    i_sample_modality=i_sample_modality,
+                )
+
+            if cfg_text_scale > 1.0:
+                uncond_mask = i_sample_modality!=0
+                _, uncond_pos_ids, uncond_attn_mask, _, _, uncond_extra_inputs, uncond_seq_len = self.uncond_split_pro_new(
+                    uncond_mask,
+                    current_text_ids,
+                    current_attn_modes,
+                    current_split_lens,
+                    device,
+                    dtype,
+                    BLOCK_SIZE,
+                    grid_thw_rope,  # NOTE(review): only assigned above when apply_qwen_2_5_vl_pos_emb is True; unbound otherwise
+                    apply_qwen_2_5_vl_pos_emb,
+                    i_sample_task=i_sample_task,
+                    i_sample_modality=i_sample_modality,
+                )
+
+            for _ in range(1):
+                timestep = torch.zeros(x_t.shape[0], device=x_t.device)
+                # for group-by-group generation
+
+                for i, timestep_ in enumerate(timesteps):
+                    timestep[current_vae_mse_indexes_local_in_vae] = torch.tensor([timestep_] * current_vae_mse_indexes_local_in_vae.shape[0], device=x_t.device)
+                    if timestep_ > cfg_interval[0] and timestep_ <= cfg_interval[1]:
+                        cfg_text_scale_ = cfg_text_scale
+                        cfg_vit_scale_ = cfg_vit_scale # by default vit_uncond and text_uncond share the same cfg_interval
+                    else:
+                        cfg_text_scale_ = 1.0
+                        cfg_vit_scale_ = 1.0
+
+                    # --- Visual feature encoding ---
+                    timestep_embed = self.time_embedder(timestep)
+                    latent_pos_embed = self.latent_pos_embed(vae_position_ids)
+                    vae_embed = self.vae2llm(x_t) + timestep_embed + latent_pos_embed
+                    vae_embed = vae_embed.to(current_sequence.dtype)
+
+                    # assign only once everything above succeeded
+                    current_sequence[current_vae_token_indexes_local] = vae_embed
+
+                    extra_inputs = {}  # {'mode': "und"}
+                    if self.use_moe: # NOTE: the computation of packed_und_token_indexes may be wrong for x2v tasks
+                        if N_vit_split != 0:
+                            packed_und_token_indexes = torch.cat([current_text_indexes_local, current_vit_indexes_local], dim=0)
+                        else:
+                            packed_und_token_indexes = current_text_indexes_local
+                        extra_inputs.update(
+                        packed_und_token_indexes=packed_und_token_indexes.to(dtype=index_dtype),
+                        packed_gen_token_indexes=current_vae_token_indexes_local.to(dtype=index_dtype),
+                    )
+
+                    self.language_model.to(current_sequence.dtype)
+                    cond_hidden_state = self.language_model(
+                    packed_sequence=current_sequence[:current_seq_len],  # current_sequence,
+                    sample_lens=[current_seq_len],  # [current_seq_len_pad]
+                    attention_mask=attention_mask,
+                    packed_position_ids=current_pos_ids.to(dtype=index_dtype),
+                    mode_forward="validation",
+                    **extra_inputs,
+                )
+                    v_t = self.llm2vae(cond_hidden_state[current_vae_mse_indexes_local])
+
+                    # --- Apply cfg ---
+                    if cfg_text_scale_ > 1.0:
+                        uncond_sequence = current_sequence[uncond_mask] # equivalent to uncond_sequence
+                        cfg_text_v_t = self.uncond_forward(uncond_sequence, uncond_pos_ids, uncond_seq_len, uncond_attn_mask, uncond_extra_inputs, current_vae_mse_indexes_local, current_seq_len)
+
+                        if cfg_vit_pro:
+                            if i_sample_task is not None:
+                                i_sample_task_text_uncond = i_sample_task[i_sample_modality!=0]
+                                i_sample_modality_text_uncond = i_sample_modality[i_sample_modality!=0]
+                            else:
+                                i_sample_task_text_uncond, i_sample_modality_text_uncond = None, None
+
+                            if i_sample_task is not None:
+                                i_sample_task_text_vit_uncond = i_sample_task_text_uncond[i_sample_modality_text_uncond!=4]
+                                i_sample_modality_text_vit_uncond = i_sample_modality_text_uncond[i_sample_modality_text_uncond!=4]
+                            else:
+                                i_sample_task_text_vit_uncond, i_sample_modality_text_vit_uncond = None, None
+
+                            cfg_text_vit_v_t = self.uncond_forward(vae_embed, vit_uncond_sequence, vit_uncond_text_ids, vit_uncond_seq_len, vit_uncond_packed_und_token_indexes, vit_uncond_packed_gen_token_indexes, vit_uncond_attn_mask, vit_uncond_vae_index, grid_thw_rope, current_vae_mse_indexes_local, current_seq_len, apply_qwen_2_5_vl_pos_emb, device,i_sample_task_text_vit_uncond,i_sample_modality_text_vit_uncond)  # NOTE(review): 15 positional args, but uncond_forward in this file accepts 7 - confirm this branch is reachable/intended
+
+                            v_t_ = cfg_text_vit_v_t + cfg_text_scale_ * (v_t - cfg_text_v_t) + cfg_vit_scale_  * (cfg_text_v_t - cfg_text_vit_v_t)
+                        else:
+                            v_t_ = cfg_text_v_t + cfg_text_scale_ * (v_t - cfg_text_v_t)
+
+                        # NOTE norm is computed over all dimensions, thus currently only supports batch_size = 1 with navit
+                        if cfg_renorm_type == "global":
+                            norm_v_t = torch.norm(v_t)
+                            norm_v_t_ = torch.norm(v_t_)
+                            scale = (norm_v_t / (norm_v_t_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
+                        elif cfg_renorm_type == "channel":
+                            norm_v_t = torch.norm(v_t, dim=-1, keepdim=True)
+                            norm_v_t_ = torch.norm(v_t_, dim=-1, keepdim=True)
+                            scale = (norm_v_t / (norm_v_t_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
+                        elif cfg_renorm_type.lower() in ("", "none", "null"):
+                            scale = 1
+                        else:
+                            raise NotImplementedError(f"{cfg_renorm_type} is not suppoprted")
+                        v_t = v_t_ * scale
+
+                    x_t[current_vae_mse_indexes_local_in_vae] = x_t[current_vae_mse_indexes_local_in_vae] - v_t.to(x_t.device) * dts[i]  # velocity pointing from data to noise
+
+            # ---- Rearrange each sample on its own to [T,H,W,C], rather than using the last sample's t/h/w for the whole batch ----
+            curr_seq_target, patch = 0, []
+            for i_target in range(N_noise_element):
+
+                pt, ph, pw = self.latent_patch_size
+                t, h, w = vid_shape_list[i_target]
+                len_target = t * h * w
+
+                x_t_ =  rearrange(x_t[curr_seq_target : curr_seq_target + len_target], "(t h w) (pt ph pw c) -> (t pt) (h ph) (w pw) c", t=t, h=h, w=w, pt=pt, ph=ph, pw=pw)
+
+                patch.append(x_t_)
+                curr_seq_target += len_target
+            # patch = torch.cat(patch, dim=0)
+            x_t_all.append(patch)
+            # one GEN sample processed successfully; advance gen_idx
+            gen_idx += 1
+
+        # self.logger.info(f"Validation step done with {len(x_t_all)} samples.")
+
+        if caption != None:
+            return x_t_all, [caption], padded_videos, index
+
+        # if padded_videos != []:
+        #     return x_t_all, padded_videos
+
+        return x_t_all
+
+    def uncond_split_pro_new(
+        self,
+        uncond_mask,
+        current_text_ids,
+        current_attn_modes,
+        current_split_lens,
+        device,
+        dtype,
+        BLOCK_SIZE,
+        grid_thw_rope=None,
+        apply_qwen_2_5_vl_pos_emb=False,
+        i_sample_task=None,
+        i_sample_modality=None,
+        uncond_pos_ids=None,
+    ):
+        """Build the inputs of the unconditional branch from the tokens kept by uncond_mask.
+        uncond_attn_modes: list of attn_modes for the uncond sequence
+        uncond_split_lens: list of split_lens for the uncond sequence
+        uncond_packed_gen_token_indexes: indexes of gen tokens in the uncond sequence (for extra_inputs)
+        uncond_packed_und_token_indexes: indexes of und tokens in the uncond sequence (for extra_inputs)
+        uncond_seq_len: length of the uncond sequence
+        uncond_pad: pad length of the uncond sequence
+        """
+        start = 0
+        uncond_split_lens, uncond_attn_modes, uncond_packed_gen_token_indexes = [], [], []
+        for i_visual, attn_mode_ in enumerate(current_attn_modes):
+            split_len_ = current_split_lens[i_visual]
+            end = start + split_len_
+            split_in_uncond = int(uncond_mask[start:end].sum())
+            start += split_len_
+            if split_in_uncond == 0:  # i.e. this split is not kept in the uncond sequence
+                continue
+            else:
+                if attn_mode_ in ["noise", "full_noise"]:
+                    start_gen, end_gen = sum(uncond_split_lens) + 1, sum(uncond_split_lens) + 1 + split_len_ - 2  # gen-token range skips the first and last position of the split; uses split_len_, not split_in_uncond
+                    uncond_packed_gen_token_indexes.extend(range(start_gen, end_gen))
+                uncond_split_lens.append(split_in_uncond)  # keep the retained length; normally split_in_uncond == split_len_ unless a system_prompt is used
+                uncond_attn_modes.append(attn_mode_)
+
+        # Match training -> also pad the tail block up to a multiple of BLOCK_SIZE
+        uncond_seq_len = sum(uncond_split_lens)
+        uncond_seq_len_pad = (uncond_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE * BLOCK_SIZE
+        uncond_pad = uncond_seq_len_pad - uncond_seq_len
+        if uncond_pad > 0:
+            uncond_split_lens.append(uncond_pad)
+            uncond_attn_modes.append("causal")
+
+        uncond_packed_gen_token_indexes = torch.tensor(uncond_packed_gen_token_indexes, dtype=torch.long, device=device)
+        all_indexes = torch.arange(0, uncond_seq_len).to(device)
+        und_token_mask = ~torch.isin(all_indexes, uncond_packed_gen_token_indexes)  # True where an index of all_indexes is not a gen-token index
+        uncond_packed_und_token_indexes = all_indexes[und_token_mask]
+
+        uncond_extra_inputs = {}  # {'mode': "validation"}
+        if self.use_moe:
+            uncond_extra_inputs.update(
+                packed_und_token_indexes=uncond_packed_und_token_indexes,
+                packed_gen_token_indexes=uncond_packed_gen_token_indexes,
+            )
+
+        # Build the uncond attention mask
+        uncond_attn_mask = self.process_attention_mask(uncond_attn_modes, uncond_split_lens, [uncond_seq_len, uncond_pad], device=device, BLOCK_SIZE=BLOCK_SIZE)
+
+        # Extract the text_ids (and task/modality ids, when given) of the uncond sequence
+        uncond_text_ids = current_text_ids[uncond_mask]
+        uncond_sample_task = i_sample_task[uncond_mask] if i_sample_task is not None else None
+        uncond_sample_modality = i_sample_modality[uncond_mask] if i_sample_modality is not None else None
+
+        if apply_qwen_2_5_vl_pos_emb:
+            uncond_pos_ids, uncond_rope_deltas = self.language_model.get_rope_index(
+                input_ids=uncond_text_ids.unsqueeze(0),
+                image_grid_thw=grid_thw_rope,  # vae_video_grid_thw[gen_idx : gen_idx + 1],
+                video_grid_thw=grid_thw_rope,  # vae_video_grid_thw[gen_idx : gen_idx + 1],  # video_grid_thw,
+                second_per_grid_ts=[1.0] * len(grid_thw_rope),  # second_per_grid_ts,
+                attention_mask=torch.ones([1, len(uncond_text_ids)], dtype=torch.long, device=device),  # attention_mask, all-ones mask?
+            )
+            # distinguish ref-image VAE features from video VAE features in mrope
+            uncond_pos_ids = shift_position_ids(
+                uncond_pos_ids,
+                pos_shift=1000,
+                attn_modes=uncond_attn_modes,
+                split_lens=uncond_split_lens,
+                shift_attn_mode=["full_noise", "full"],
+                pro_type=10,
+                i_sample_task=uncond_sample_task,
+                i_sample_modality=uncond_sample_modality,
+            )
+        else:
+            uncond_pos_ids = torch.tensor(uncond_pos_ids, dtype=torch.long, device=device)[:uncond_seq_len]  # NOTE(review): uncond_pos_ids defaults to None; callers must supply it on this path - confirm
+
+        return (
+            uncond_text_ids,
+            uncond_pos_ids,
+            uncond_attn_mask,
+            uncond_attn_modes,
+            uncond_split_lens,
+            uncond_extra_inputs,
+            uncond_seq_len,
+        )
+
+    def uncond_forward(
+        self,
+        uncond_sequence,
+        uncond_pos_ids,
+        uncond_seq_len,
+        uncond_attn_mask,
+        uncond_extra_inputs,
+        current_vae_mse_indexes_local,
+        current_seq_len,
+    ):
+        # Run the unconditional forward pass on the first uncond_seq_len tokens and return llm2vae applied to the hidden states at the shifted MSE positions
+        uncond_hidden_state = self.language_model(
+            packed_sequence=uncond_sequence[:uncond_seq_len],
+            sample_lens=[uncond_seq_len],
+            attention_mask=uncond_attn_mask,
+            packed_position_ids=uncond_pos_ids,
+            mode_forward="validation",  # NOTE
+            **uncond_extra_inputs,
+        )
+        uncond_current_vae_mse_indexes_local = current_vae_mse_indexes_local - (current_seq_len - uncond_seq_len)  # shift indexes left by the number of dropped tokens. TODO: may need changes when there are multiple target images with text in between
+        cfg_text_v_t = self.llm2vae(uncond_hidden_state[uncond_current_vae_mse_indexes_local])
+
+        return cfg_text_v_t
+
+    @torch.no_grad()
+    def validation_video_to_text(
+        self,
+        val_packed_text_ids: torch.LongTensor,
+        val_packed_text_indexes: torch.LongTensor,
+        val_packed_position_ids: torch.LongTensor,
+        val_ce_loss_indexes: torch.LongTensor,
+        val_sample_N_target: List[int],  # 理论上 und 分支的 N_target 均为 1
+        val_split_lens: List[int],
+        val_attn_modes: List[str],
+        val_sample_lens: List[int],
+        val_sample_type: List[str],
+        # val_split_lens: List[int] = None,
+        # val_attn_modes: List[str] = None,
+        val_packed_vit_tokens: Optional[torch.Tensor] = None,
+        # val_packed_vit_token_indexes: Optional[torch.LongTensor] = None,
+        # val_packed_vit_position_ids: Optional[torch.LongTensor] = None,
+        # val_vit_token_seqlens: Optional[torch.IntTensor] = None,
+        val_vit_video_grid_thw: Optional[torch.IntTensor] = None,  # for video understanding
+        max_samples: int = 1,
+        max_length: int = 256,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
+        new_token_ids: Dict[str, int] = None,
+        pad_token_id: int = None,
+        vocab_size: int = None,
+        do_sample: bool = False,
+        temperature: float = 1.0,
+        caption: any = "",
+        tokenizer: any = None,  # 适应有instruction的处理
+        apply_chat_template: bool = False,
+        apply_qwen_2_5_vl_pos_emb: bool = False,
+        image_token_id: int = 151655,
+        BLOCK_SIZE: int = 128,
+        visualize_generation_progress: bool = False,
+        index: str = "",
+    ):
+        # 特殊 token
+        start_id = new_token_ids["start_of_image"]
+        end_id = new_token_ids["end_of_image"]
+        bos_id = new_token_ids["bos_token_id"]
+        eos_id = new_token_ids["eos_token_id"]
+
+        # 每个样本长度
+        cu_sample_lens = torch.nn.functional.pad(torch.cumsum(torch.tensor(val_sample_lens, device=device), dim=0), (1, 0))
+        sample_splits = map_splits_to_samples(val_sample_lens, val_split_lens)
+
+        # 每个样本中的 每个 vit token 序列长度
+        vit_sample_len = val_vit_video_grid_thw[:, 0] * val_vit_video_grid_thw[:, 1] * val_vit_video_grid_thw[:, 2]  # shape: (N,) , N = 1 * 16 * 16,
+        cu_vit_sample_lens = torch.cat([torch.zeros(1, device=val_vit_video_grid_thw.device, dtype=vit_sample_len.dtype), vit_sample_len.cumsum(0)])
+
+        if val_packed_vit_tokens is not None:
+            val_packed_vit_tokens = torch.cat(val_packed_vit_tokens, dim=0)
+
+        # max_samples = min(len(val_sample_lens) - 1, max_samples)
+        max_samples = min(len(val_sample_lens), max_samples)  #  NOTE 与测试的时候兼容，不再-1
+        cnt_samples = 0
+        generated_sequence_all = []
+
+        # L = len(val_sample_lens) - 1
+        L = len(val_sample_lens)  # 与测试的时候兼容，不再-1
+        curr_vit_split_idx = 0
+        for i_sample in range(L):
+            left, right = sample_splits[i_sample][0], sample_splits[i_sample][-1] + 1
+            # --- for interleave ---
+            current_split_lens = val_split_lens[left:right]
+            current_attn_modes = val_attn_modes[left:right]
+            N_target = val_sample_N_target[i_sample]  # 理论上 und 分支的 N_target 均为 1
+            N_vit_split = current_attn_modes.count("full")
+
+            if val_sample_type[i_sample] != "und":
+                curr_vit_split_idx += N_vit_split  # 推进 vit 样本指针
+                continue
+            cnt_samples += 1
+            if cnt_samples > max_samples:
+                break
+
+            assert N_target == 1
+
+            # 获取当前video vit样本在整个批次中的切片信息
+            vit_sample_start_idx = cu_vit_sample_lens[curr_vit_split_idx]
+            vit_sample_end_idx = cu_vit_sample_lens[curr_vit_split_idx + N_vit_split]
+            current_val_packed_vit_tokens = val_packed_vit_tokens[vit_sample_start_idx:vit_sample_end_idx]
+            current_val_vit_video_grid_thw = val_vit_video_grid_thw[curr_vit_split_idx : curr_vit_split_idx + N_vit_split]
+            curr_vit_split_idx += N_vit_split  # 推进 vit 样本指针
+
+            if N_vit_split > 0 :
+                if self.vit_type in ["qwen2_5_vl", "qwen_2_5_vl_original"]:
+                    packed_vit_token_embed = self.vit_model(hidden_states=current_val_packed_vit_tokens, grid_thw=current_val_vit_video_grid_thw)
+                    if self.vit_type in ["qwen2_5_vl"]:
+                        packed_vit_token_embed = self.connector(packed_vit_token_embed).to(dtype)
+                else:
+                    raise NotImplementedError(f"{self.vit_type} is not supported")
+
+            # 获取当前 文本条件 （包括特殊token） 在整个批次中的切片信息
+            sample_start_idx = cu_sample_lens[i_sample]
+            sample_end_idx = cu_sample_lens[i_sample + 1]
+            current_pos_ids = val_packed_position_ids[sample_start_idx:sample_end_idx]
+
+            # text_mask = (val_packed_text_indexes >= sample_start_idx) & (val_packed_text_indexes < sample_end_idx)
+            # current_text_indexes_local = val_packed_text_indexes[text_mask] - sample_start_idx
+
+            text_mask_ce = (val_ce_loss_indexes >= sample_start_idx) & (val_ce_loss_indexes < sample_end_idx)
+            current_ce_loss_indexes_local = val_ce_loss_indexes[text_mask_ce] - sample_start_idx
+            if text_mask_ce.numel() != 0:
+                current_text_ids = val_packed_text_ids[sample_start_idx:sample_end_idx][: current_ce_loss_indexes_local[0] + 1]
+            else:
+                current_text_ids = val_packed_text_ids[sample_start_idx:sample_end_idx]
+
+            num_text_ids = current_text_ids.shape[0]
+            num_last_split = num_text_ids - sum(current_split_lens[:-N_target])
+
+            current_split_lens = current_split_lens[:-N_target]  # 去除目标序列的split 长度
+
+            if num_last_split > 1:
+                current_split_lens.extend([num_last_split - 1])  # num_last_split 包含了 起始token ， 所以需要减1
+
+            # NOTE 填充pad，让总的seqlen能够被BLOCK_SIZE整除！
+            max_seq_len = (max_length + num_text_ids + BLOCK_SIZE - 1) // BLOCK_SIZE * BLOCK_SIZE
+            num_pad = max_seq_len - num_text_ids
+
+            current_text_ids = torch.cat(
+                [current_text_ids, torch.full((num_pad,), pad_token_id, dtype=torch.long, device=device)], dim=0
+            )  # [151652, 151653, 151654] + [151643] * 61,  151643 = <|endoftext|>
+            packed_text_embedding = self.language_model.model.embed_tokens(current_text_ids).to(dtype)  # [64, 1536]
+
+            # 把packed_vit_token_embed插入image_pad对应位置
+            if N_vit_split > 0 :
+                mask = current_text_ids == image_token_id
+                mask_unsqueezed = mask.unsqueeze(-1)
+                mask_expanded = mask_unsqueezed.expand_as(packed_text_embedding)
+                image_mask = mask_expanded.to(packed_text_embedding.device)
+                curr_packed_sequence = packed_text_embedding.masked_scatter(image_mask, packed_vit_token_embed)
+            else:
+                curr_packed_sequence = packed_text_embedding
+
+            # begin text generation
+            step = num_text_ids - 1  # NOTE step会指示当前生成到第几个token，一开始的时候，应该定位到BOS / 起始token (比如"assistant\n"之后)
+            generated_sequence = []
+
+            if apply_qwen_2_5_vl_pos_emb:
+                # Qwen-VL 中求解方法， TODO: rope_deltas在一次性求完所有 token 的 position_ids 时有作用吗
+                current_packed_position_ids, rope_deltas = self.language_model.get_rope_index(
+                    input_ids=current_text_ids.unsqueeze(0),
+                    image_grid_thw=current_val_vit_video_grid_thw,
+                    video_grid_thw=current_val_vit_video_grid_thw,  # video_grid_thw,
+                    second_per_grid_ts=[1.0],  # second_per_grid_ts,
+                    attention_mask=torch.ones([1, max_seq_len], dtype=torch.long, device=device),  # attention_mask, 全1掩码？
+                )  # current_packed_position_ids: [3, 1, L]
+            else:
+                current_pos_ids = current_pos_ids[:num_text_ids]
+                pos_pad_start = int(current_pos_ids[-1] + 1)
+                current_pad = torch.arange(pos_pad_start, pos_pad_start + num_pad, device=device)
+                current_packed_position_ids = torch.cat([current_pos_ids, current_pad], dim=0)
+
+            current_sample_lens = [max_seq_len]
+            seqlen = sum(current_sample_lens)  # 始终是max_length
+            current_attn_modes_ = current_attn_modes[: len(current_split_lens)] + ["causal", "causal"]
+            current_attn_modes_ = ["full" if mode_=="full_noise" else mode_ for mode_ in current_attn_modes_]
+            while step < (max_seq_len - 1):
+                # current_sample_lens = [curr_packed_sequence.shape[0]]
+                # attention 的求解存在问题
+
+                current_text_len = (step + 1) - (num_text_ids - 1)
+                current_split_lens_ = current_split_lens + [current_text_len, num_pad + 1 - current_text_len]
+
+                sparse_mask = create_sparse_mask(current_sample_lens, current_split_lens_, current_attn_modes_, device)
+                attention_mask = create_block_mask(sparse_mask, B=1, H=self.num_heads, Q_LEN=seqlen, KV_LEN=seqlen, device=device, BLOCK_SIZE=BLOCK_SIZE, _compile=False)
+
+                # 构建MoE输入
+                extra_inputs = {"mode": "und"}
+                if self.use_moe:
+                    packed_und_token_indexes = torch.arange(0, max_seq_len, device=device)  # all tokens are UND
+                    extra_inputs.update(
+                        packed_und_token_indexes=packed_und_token_indexes,
+                        packed_gen_token_indexes=None,
+                    )
+
+                last_hidden_state = self.language_model(
+                    packed_sequence=curr_packed_sequence.to(dtype=dtype),
+                    sample_lens=current_sample_lens,
+                    attention_mask=attention_mask,
+                    packed_position_ids=current_packed_position_ids,  # 这里正确嘛？
+                    mode_forward="validation",
+                    **extra_inputs,
+                )
+
+                # sample
+                pred_logits = self.language_model.lm_head(last_hidden_state[step : step + 1, :])
+                # 将大于vocab_size的pred_logits设置成负无穷
+                pred_logits[:, vocab_size:] = float("-inf")
+                if do_sample:
+                    probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
+                    curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+                else:
+                    curr_tokens = torch.argmax(pred_logits, dim=-1)
+
+                generated_sequence.append(curr_tokens)
+                if visualize_generation_progress:
+                    print(f"curr_tokens: {curr_tokens}", curr_tokens.item(), ", eos_id:", eos_id)
+
+                if curr_tokens.item() == eos_id:
+                    break
+
+                # curr_tokens替代当前step的pad embeddings
+                curr_packed_sequence[step + 1] = self.language_model.model.embed_tokens(curr_tokens)
+
+                step += 1
+
+            generated_sequence = torch.stack([i.to(device) for i in generated_sequence], dim=0)  # [L, 1] , torch.int64
+            generated_sequence_all.append(generated_sequence)
+        return generated_sequence_all, caption, index
+
+    def validation_video_to_text_KVcache(
+        self,
+        val_attn_modes: List[str],
+        val_sample_lens: List[int],
+        val_packed_vit_tokens: Optional[torch.Tensor] = None,
+        val_vit_video_grid_thw: Optional[torch.IntTensor] = None,  # for video understanding
+        max_samples: int = 1,
+        max_length: int = 128,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
+        new_token_ids: Dict[str, int] = None,
+        pad_token_id: int = None,
+        vocab_size: int = None,
+        do_sample: bool = False,
+        temperature: float = 1.0,
+    ):
+        """Generate text for the understanding (UND) samples of a packed validation batch, using a KV cache.
+
+        Samples are visited in order and each one is assumed to own two consecutive
+        entries of ``val_attn_modes`` (indices ``2*i`` and ``2*i + 1``). A sample is
+        treated as UND only when that pair is ``("full", "causal")``; every other
+        pair is skipped. For each UND sample the ViT tokens are sliced out of
+        ``val_packed_vit_tokens`` with prefix sums of ``t * h * w`` taken from
+        ``val_vit_video_grid_thw``, pre-filled into a fresh ``NaiveCache`` and then
+        decoded with ``generate_text`` starting from the BOS token.
+
+        Args:
+            val_attn_modes: Attention mode per split; read pairwise as described above.
+            val_sample_lens: One entry per packed sample. Only its length is used, and
+                the last entry is excluded from iteration (``len - 1``) --
+                NOTE(review): presumably a trailing padding sample; confirm with the data loader.
+            val_packed_vit_tokens: ViT tokens, sliced along dim 0. Despite the ``None``
+                default it is indexed unconditionally.
+            val_vit_video_grid_thw: Rows of ``(t, h, w)``; row ``k`` belongs to the k-th
+                UND sample. Despite the ``None`` default it is indexed unconditionally.
+            max_samples: Upper bound on the number of UND samples decoded
+                (also capped at ``len(val_sample_lens) - 1``).
+            max_length: Forwarded to ``generate_text``.
+            device: Forwarded to the prepare/forward helpers.
+            dtype: Forwarded to ``forward_cache_update_vit_validation``.
+            new_token_ids: Special-token ids; ``"eos_token_id"`` is read here, the
+                helpers read further keys.
+            pad_token_id: Not used by this method.
+            vocab_size: Forwarded to ``generate_text``.
+            do_sample: Forwarded to ``generate_text``.
+            temperature: Forwarded to ``generate_text``.
+
+        Returns:
+            List with one ``generate_text`` result per decoded UND sample, in batch order.
+        """
+        # Number of ViT tokens of each sample (t * h * w).
+        sample_len = (
+            val_vit_video_grid_thw[:, 0]
+            * val_vit_video_grid_thw[:, 1]
+            * val_vit_video_grid_thw[:, 2]
+        )
+        # Prefix sums of length N + 1: [0, s1, s1 + s2, ...], so sample k spans [cu[k], cu[k + 1]).
+        cu_sample_lens = torch.cat(
+            [
+                torch.zeros(1, device=val_vit_video_grid_thw.device, dtype=sample_len.dtype),
+                sample_len.cumsum(0),
+            ],
+            dim=0,
+        )
+        # self.logger.info(f'cu_sample_lens: {cu_sample_lens}')
+
+        max_samples = min(len(val_sample_lens) - 1, max_samples)
+        generated_sequence_all = []
+
+        cnt_samples = 0
+        L = len(val_sample_lens) - 1
+        und_idx = 0  # Advances only on UND samples; indexes the UND-only tensors / prefix sums.
+
+        # self.logger.info(f'val_attn_modes: {val_attn_modes}')
+
+        for i_sample in range(L):
+            # Two attention-mode entries per sample.
+            left, right = 2 * i_sample, 2 * i_sample + 1
+            # self.logger.info(f'left: {left}, right: {right}')
+            if right >= len(val_attn_modes):   # Bounds check.
+                break
+
+            # GEN samples (('causal', 'noise')) are skipped; UND samples are kept.
+            if not (val_attn_modes[left] == "full" and val_attn_modes[right] == "causal"):
+                # Not an UND sample: skip it.
+                continue
+
+            # Slice with the UND-specific index.
+            if und_idx + 1 >= cu_sample_lens.numel():
+                break  # Safety net: stop instead of indexing out of range on malformed data.
+
+            cnt_samples += 1
+            if cnt_samples > max_samples:
+                break
+
+            sample_start_idx = cu_sample_lens[und_idx].item()
+            sample_end_idx   = cu_sample_lens[und_idx + 1].item() # fix: index with und_idx, not i_sample
+
+            current_val_packed_vit_tokens = val_packed_vit_tokens[sample_start_idx:sample_end_idx]
+            current_val_vit_video_grid_thw = val_vit_video_grid_thw[und_idx:und_idx + 1]
+            und_idx += 1  # One UND sample consumed.
+
+            # Written for one sample at a time; processing several samples together would need changes.
+            past_key_values = NaiveCache(self.config.llm_config.num_hidden_layers)
+            # Pre-fill the cache with <start_of_image> + ViT tokens + <end_of_image>.
+            generation_input, kv_lens, ropes = self.prepare_vit_images_validation(curr_kvlens=[0], curr_rope=[0], vit_tokens = [current_val_packed_vit_tokens], new_token_ids=new_token_ids, device=device)
+            past_key_values = self.forward_cache_update_vit_validation(past_key_values, vit_vae_video_grid_thw=current_val_vit_video_grid_thw, device=device, dtype=dtype,**generation_input)
+
+            # initial start token
+            generation_input = self.prepare_start_tokens(kv_lens, ropes, new_token_ids, device=device)
+            unpacked_latent = self.generate_text(
+                past_key_values=past_key_values,
+                max_length=max_length,
+                do_sample=do_sample,
+                temperature=temperature,
+                end_token_id=new_token_ids["eos_token_id"],
+                vocab_size=vocab_size,
+                **generation_input,
+            )  # [L,1]
+            generated_sequence_all.append(unpacked_latent)
+        return generated_sequence_all
+
+    # Single-sample case: curr_kvlens: [0], curr_rope: [0], vit_tokens: [one tensor of ViT patch tokens]
+    def prepare_vit_images_validation(self, curr_kvlens, curr_rope, vit_tokens, new_token_ids, device):
+        packed_vit_token_indexes = list()
+        vit_token_seqlens, packed_vit_tokens, packed_vit_position_ids = list(), list(), list()
+        packed_text_ids, packed_text_indexes = list(), list()
+        packed_seqlens, packed_position_ids, packed_indexes = list(), list(), list()
+        packed_key_value_indexes = list()
+
+        _curr = curr = 0
+        newlens, new_rope = list(), list()
+        for vit_token, curr_kvlen, curr_position_id in zip(vit_tokens, curr_kvlens, curr_rope):
+            packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
+            curr += curr_kvlen
+
+            packed_text_ids.append(new_token_ids["start_of_image"])
+            packed_text_indexes.append(_curr)
+            packed_indexes.append(curr)
+            curr += 1
+            _curr += 1
+
+            packed_vit_tokens.append(vit_token)
+            num_img_tokens = len(vit_tokens[0]) // 4  # 实际上qwen2.5-vl还需要merge，2x2 merge成1个，
+            vit_token_seqlens.append(num_img_tokens)
+            packed_vit_token_indexes.extend(range(_curr, _curr + num_img_tokens))
+            packed_indexes.extend(range(curr, curr + num_img_tokens))
+            curr += num_img_tokens
+            _curr += num_img_tokens
+
+            packed_text_ids.append(new_token_ids['end_of_image'])
+            packed_text_indexes.append(_curr)
+            packed_indexes.append(curr)
+            curr += 1
+            _curr += 1
+
+            packed_position_ids.extend([curr_position_id] * (num_img_tokens + 2))
+            packed_seqlens.append(num_img_tokens + 2)
+            newlens.append(curr_kvlen + num_img_tokens + 2)
+            new_rope.append(curr_position_id + 1)
+
+        generation_input = {
+            "packed_text_ids": torch.tensor(packed_text_ids, dtype=torch.long, device=device),
+            "packed_text_indexes": torch.tensor(packed_text_indexes, dtype=torch.long, device=device),
+            "vit_token_seqlens": torch.tensor(vit_token_seqlens, dtype=torch.int, device=device),
+            "packed_vit_tokens": torch.cat(packed_vit_tokens, dim=0).to(device),
+            "packed_vit_token_indexes": torch.tensor(packed_vit_token_indexes, dtype=torch.long, device=device),
+            "packed_position_ids": torch.tensor(packed_position_ids, dtype=torch.long, device=device),
+            "packed_seqlens": torch.tensor(packed_seqlens, dtype=torch.int, device=device),
+            "packed_indexes": torch.tensor(packed_indexes, dtype=torch.long, device=device),
+            "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long, device=device),
+            "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int, device=device),
+        }
+
+        return generation_input, newlens, new_rope
+
+    @torch.no_grad()
+    def forward_cache_update_vit_validation(
+        self,
+        past_key_values: NaiveCache,
+        vit_vae_video_grid_thw: torch.IntTensor, ## ++
+        packed_text_ids: torch.LongTensor, ## 是否包含special token
+        packed_text_indexes: torch.LongTensor,
+        packed_vit_tokens: torch.Tensor,
+        packed_vit_token_indexes: torch.LongTensor,
+        vit_token_seqlens: torch.IntTensor,
+        packed_position_ids: torch.LongTensor,
+        packed_seqlens: torch.IntTensor,
+        packed_indexes: torch.LongTensor,
+        packed_key_value_indexes: torch.LongTensor,
+        key_values_lens: torch.IntTensor,
+        device: torch.device = None,
+        dtype: torch.dtype = None,
+    ):
+        packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids).to(dtype)
+        packed_sequence = packed_text_embedding.new_zeros((sum(packed_seqlens), self.hidden_size), dtype = dtype)
+        packed_sequence[packed_text_indexes] = packed_text_embedding
+
+        if self.vit_type in ["qwen2_5_vl", "qwen_2_5_vl_original"]:
+            # NOTE video understanding部分
+            packed_vit_token_embed = self.vit_model(
+                hidden_states=packed_vit_tokens,
+                grid_thw=vit_vae_video_grid_thw,
+            )
+            if self.vit_type in ["qwen2_5_vl"]:
+                packed_vit_token_embed = self.connector(packed_vit_token_embed).to(dtype)
+            packed_sequence[packed_vit_token_indexes] = packed_vit_token_embed
+        else:
+            raise NotImplementedError(f"{self.vit_type} is not supported")
+
+        extra_inputs = {}
+        if self.use_moe:
+            extra_inputs = {"mode": "und"}
+
+        output = self.language_model.forward_inference(
+            packed_query_sequence=packed_sequence,
+            query_lens=packed_seqlens,
+            packed_query_position_ids=packed_position_ids,
+            packed_query_indexes=packed_indexes,
+            past_key_values=past_key_values,
+            packed_key_value_indexes=packed_key_value_indexes,
+            key_values_lens=key_values_lens,
+            update_past_key_values=True,
+            is_causal=False,
+            **extra_inputs,
+        )
+        past_key_values = output.past_key_values
+
+        return past_key_values
+
+
+    def prepare_start_tokens(self, curr_kvlens, curr_rope, new_token_ids, device):
+        packed_start_tokens, packed_key_value_indexes = list(), list()
+        packed_query_position_ids = list()
+
+        curr = 0
+        for curr_kvlen, curr_position_id in zip(curr_kvlens, curr_rope):
+            packed_key_value_indexes.extend(range(curr, curr + curr_kvlen))
+            packed_start_tokens.append(new_token_ids["bos_token_id"])
+            packed_query_position_ids.append(curr_position_id)
+            curr += curr_kvlen
+
+        generation_input = {
+            "packed_start_tokens": torch.tensor(packed_start_tokens, dtype=torch.long).to(device),
+            "packed_query_position_ids": torch.tensor(packed_query_position_ids, dtype=torch.long).to(device),
+            "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int).to(device),
+            "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long).to(device),
+        }
+
+        return generation_input
+
+    @torch.no_grad()
+    def generate_text(
+        self,
+        past_key_values: NaiveCache,
+        packed_key_value_indexes: torch.LongTensor,
+        key_values_lens: torch.IntTensor,
+        packed_start_tokens: torch.LongTensor,
+        packed_query_position_ids: torch.LongTensor,
+        max_length: int,
+        do_sample: bool = False,
+        temperature: float = 1.0,
+        end_token_id: int = None,
+        vocab_size: int = None,
+    ):
+        step = 0
+        generated_sequence = []
+        curr_tokens = packed_start_tokens
+        while step < max_length:
+            generated_sequence.append(curr_tokens)
+            packed_text_embedding = self.language_model.model.embed_tokens(curr_tokens)
+            query_lens = torch.ones_like(curr_tokens)
+            packed_query_indexes = torch.cumsum(key_values_lens, dim=0) + torch.arange(0, len(key_values_lens), device=key_values_lens.device, dtype=key_values_lens.dtype)
+
+            uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
+            for i in range(len(uppacked)):
+                uppacked[i] += i
+            packed_key_value_indexes = torch.cat(uppacked, dim=0)
+
+            extra_inputs = {}
+            if self.use_moe:
+                extra_inputs = {"mode": "und"}
+
+            output = self.language_model.forward_inference(
+                packed_query_sequence=packed_text_embedding,
+                query_lens=query_lens,
+                packed_query_position_ids=packed_query_position_ids,
+                packed_query_indexes=packed_query_indexes,
+                past_key_values=past_key_values,
+                key_values_lens=key_values_lens,
+                packed_key_value_indexes=packed_key_value_indexes,
+                update_past_key_values=True,
+                is_causal=True,
+                **extra_inputs,
+            )
+            past_key_values = output.past_key_values
+            packed_query_sequence = output.packed_query_sequence
+            pred_logits = self.language_model.lm_head(packed_query_sequence)
+
+            pred_logits[:, vocab_size:] = float('-inf') # ++
+            if do_sample:
+                probs = nn.functional.softmax(pred_logits / temperature, dim=-1)
+                curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+            else:
+                curr_tokens = torch.argmax(pred_logits, dim=-1)
+
+            uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0))
+            for i in range(len(uppacked)):
+                uppacked[i] = torch.cat([uppacked[i], torch.tensor([uppacked[i][-1] + 1], device=uppacked[i].device)], dim=0)
+            packed_key_value_indexes = torch.cat(uppacked, dim=0)
+            key_values_lens = key_values_lens + 1
+            packed_query_position_ids = packed_query_position_ids + 1
+            step += 1
+
+            if end_token_id is not None and curr_tokens[0].item() == end_token_id:  # only support batch=1
+                generated_sequence.append(curr_tokens)
+                break
+
+        output_device = generated_sequence[0].device
+        return torch.stack([i.to(output_device) for i in generated_sequence], dim=0)
+
+
+
+    # ------------------------------------------------------------------------------------------------------------------------------
+    # for validation_gen_kvcache
+    def init_gen_context(self, device: torch.device, dtype: torch.dtype):
+        gen_context = {
+            'kv_lens': torch.tensor([0], device=device, dtype=dtype), #kv_lens 代表 "key-value lengths"，用于追踪注意力机制（Attention Mechanism）中已经缓存的键（key）和值（value）的序列长度。初始化为 [0] 表示在最开始，这个“记忆”是空的，还没有处理任何输入。
+            #'ropes': [0], #旋转位置编码（Rotary Position Embeddings, RoPE）的状态，RoPE 是一种给模型输入序列提供位置信息的方法。这里初始化为 [0] 同样表示从序列的第一个位置开始。
+            'past_key_values': NaiveCache(self.config.llm_config.num_hidden_layers), #用于存储过去所有时间步计算出的注意力键（key）和值（value）。在生成下一个 token 时，模型可以直接使用这些缓存好的 past_key_values，而无需重新计算整个输入序列的注意力，从而极大地提升了生成速度。
+        }
+        return gen_context
+
+
+    @torch.no_grad()
+    def validation_gen_KVcache(
+        self,
+        val_packed_text_ids: torch.LongTensor,
+        val_packed_text_indexes: torch.LongTensor,
+        val_packed_vit_tokens: torch.LongTensor,
+        val_packed_vit_token_indexes: torch.LongTensor,
+        val_sample_lens: List[int],
+        val_packed_position_ids: torch.LongTensor,
+        val_split_lens: List[int] = None,
+        val_attn_modes: List[str] = None,
+        val_sample_N_target: List[int] = None,
+        vit_video_grid_thw: Optional[torch.IntTensor] = None,  # NOTE: 仅 TI2I 时使用
+        vae_video_grid_thw: Optional[torch.IntTensor] = None,
+        video_grid_thw: Optional[torch.IntTensor] = None,
+        val_mse_loss_indexes: Optional[torch.BoolTensor] = None,
+        # for visual generation
+        val_packed_vae_token_indexes: Optional[torch.LongTensor] = None,
+        val_padded_latent: Optional[torch.Tensor] = None,
+        # val_key_frame_mask: Optional[torch.BoolTensor] = None,
+        sample_task: Optional[torch.LongTensor] = None,
+        sample_modality: Optional[torch.LongTensor] = None,
+        video_sizes: List[Tuple[int, int, int]] = [[1, 256, 256]],
+        val_padded_videos: torch.Tensor = None,
+        timestep_shift: float = 4.0,
+        num_timesteps: int = 24,
+        # cfg_text
+        cfg_interval: Optional[Tuple[float, float]] = [0, 1],
+        cfg_renorm_min: float = 0.0,
+        cfg_renorm_type: str = "global",
+        cfg_text_scale: float = 1.0,
+        cfg_vit_scale: float = 1.0, # NOTE ： 对应 cfg_vision_scale
+        device=None,
+        dtype=None,
+        new_token_ids=None,
+        BLOCK_SIZE: int = 128,
+        apply_chat_template: bool = False,
+        apply_qwen_2_5_vl_pos_emb: bool = False,
+        image_token_id: int = 151655,
+        # sample_index: Optional[torch.LongTensor] = None,
+        caption: Optional[List[str]] = None,
+        index: str = "",
+        **kwargs,
+    ):
+        cfg_vision_scale = cfg_vit_scale
+
+        pt, ph, pw = self.latent_patch_size
+
+        index_dtype = val_packed_text_ids.dtype
+
+        # --- 为在循环中切片准备累积长度 ---
+        cu_sample_lens = torch.nn.functional.pad(torch.cumsum(torch.tensor(val_sample_lens, device=device), dim=0), (1, 0))
+
+        sample_splits = map_splits_to_samples(val_sample_lens, val_split_lens)
+
+        # 每个样本中的 每个 vit token 序列长度
+        if val_packed_vit_tokens is not None and vit_video_grid_thw is not None:
+            vit_sample_len = vit_video_grid_thw[:, 0] * vit_video_grid_thw[:, 1] * vit_video_grid_thw[:, 2]  # shape: (N,) , N = 1 * 16 * 16,
+            cu_vit_sample_lens = torch.cat([torch.zeros(1, device=vit_video_grid_thw.device, dtype=vit_sample_len.dtype), vit_sample_len.cumsum(0)])
+            self.vit_model = self.vit_model.to(device=device, dtype=dtype)
+
+            val_packed_vit_tokens = torch.cat(val_packed_vit_tokens, dim=0)
+
+        x_t_all = []
+        max_samples = kwargs.get("max_samples", 16)
+        L = max(len(val_sample_lens) - 1, 1)
+        max_samples = min(L, max_samples)  # update
+
+        gen_idx = 0
+        curr_vae_split_idx, curr_vit_split_idx = 0, 0  # curr_vae_split_idx 为生成的图片索引, 在每个样本的N_target=1时与gen_idx相同； curr_vit_split_idx 为 vit split 索引
+
+        padded_videos = []
+        # self.logger.info(f"Validation start... (timesteps = {num_timesteps})")
+        for i_sample in range(L):  # fix: 需要-1
+            left, right = sample_splits[i_sample][0], sample_splits[i_sample][-1] + 1
+            # --- for interleave ---
+            current_split_lens = val_split_lens[left:right]
+            current_attn_modes = val_attn_modes[left:right]
+            N_target = val_sample_N_target[i_sample]  # 判断目标图像序列数量
+            N_noise_element = current_attn_modes.count("noise") + current_attn_modes.count("full_noise") + current_attn_modes.count("full_noise_target")
+            N_vit_split = current_attn_modes.count("full")
+
+            if right > len(val_attn_modes):
+                break
+
+            # 跳过非 GEN（这里你的逻辑是看有没有 "noise"）
+            if N_noise_element<=0:
+                curr_vit_split_idx += N_vit_split  # 推进 vit 样本指针
+                continue
+
+            # 生成数量控制：用 gen_idx 而不是 i_sample
+            if gen_idx >= max_samples:
+                break
+
+            # 1. 获取当前样本在整个批次中的切片信息
+            sample_start_idx = cu_sample_lens[i_sample]
+            sample_end_idx = cu_sample_lens[i_sample + 1]
+            current_seq_len = val_sample_lens[i_sample]
+            current_pos_ids = val_packed_position_ids[sample_start_idx:sample_end_idx]
+            i_sample_task = sample_task[sample_start_idx:sample_end_idx]
+            i_sample_modality = sample_modality[sample_start_idx:sample_end_idx]
+
+            # --- 视觉特征 嵌入 ---
+            vae_mask = (val_packed_vae_token_indexes >= sample_start_idx) & (val_packed_vae_token_indexes < sample_end_idx)
+            current_vae_token_indexes_local = val_packed_vae_token_indexes[vae_mask] - sample_start_idx
+
+            # --- vae mse token 部分 : 指示x_t 中需更新的部分的index ---
+            vae_mse_mask = (val_mse_loss_indexes >= sample_start_idx) & (val_mse_loss_indexes < sample_end_idx)
+            current_vae_mse_indexes_local = val_mse_loss_indexes[vae_mse_mask] - sample_start_idx  # 指示x_t 中需更新的部分的index
+            current_vae_mse_indexes_local_in_vae = (
+                current_vae_mse_indexes_local - current_vae_mse_indexes_local[0] + torch.where(current_vae_token_indexes_local == current_vae_mse_indexes_local[0])[0]
+            )  # TODO : 如果是多个target image 且中间有文本信息，可能需要修改
+
+            num_vid_tokens_list, vid_shape_list, vae_position_ids, curr_padded_latent = [], [], [], []
+
+            # 2. 其次生成 vit uncond 特征 （可选）
+            cfg_vision_pro = False
+            if cfg_vision_scale > 1.0 and "full" in current_attn_modes:
+                cfg_vision_pro = True
+                vision_uncond_mask =  i_sample_modality <= 1 # i_sample_modality!=4  则为 cfg_vit
+                _, vision_uncond_pos_ids, _ = self.uncond_split_pro_kvcache(vision_uncond_mask, current_text_ids, device, dtype, apply_qwen_2_5_vl_pos_emb, grid_thw_rope = grid_thw_rope[-N_target:], current_attn_modes=current_attn_modes, current_split_lens=current_split_lens, i_sample_task=i_sample_task, i_sample_modality=i_sample_modality ) # NOTE: grid_thw_rope 去掉 vit/vae condition 的项
+
+            for i_target in range(N_noise_element):
+                T, H, W = video_sizes[curr_vae_split_idx]  # ✅ 尺寸用 gen_idx 索引，保证与“GEN 样本序列顺序”一致
+                t = (T - 1) // self.latent_downsample_temporal + 1
+                h = H // self.latent_downsample_spatial
+                w = W // self.latent_downsample_spatial
+
+                vid_shape_list.append([t, h, w])
+                num_vid_tokens_list.append(t * h * w)
+
+                # prepare packed_vae_position_ids
+                # 使用3D感知的位置编码函数
+                if self.config.interpolate_pos:  # False
+                    # 内插
+                    vae_position_ids.append(
+                        get_flattened_position_ids_interpolate_video(
+                            t, h, w, 1, max_num_frames=self.max_num_latent_frames, max_num_patches_per_side=self.max_latent_size  # latent space的patch size为1
+                        )
+                    )
+                else:  # Adopt !!!!
+                    # 外插
+                    vae_position_ids.append(
+                        get_flattened_position_ids_extrapolate_video(t, h, w, max_latent_size=self.max_latent_size)  # latent space的patch size为1  # NOT USED in 外插
+                    )
+
+                # 当存在condition 的 vae token 时， 计算当前的 padded_latent
+                if len(current_vae_mse_indexes_local) != len(current_vae_token_indexes_local):
+                    padded_latent_ = val_padded_latent[curr_vae_split_idx]  # (T,H,W,C)
+
+                    patches = rearrange(padded_latent_, "(t pt) (h ph) (w pw) c -> (t h w) (pt ph pw c)", t=t, pt=pt, h=h, ph=ph, w=w, pw=pw)
+                    curr_padded_latent.append(patches)
+
+                if val_padded_videos is not None:
+                    padded_videos.append(val_padded_videos[curr_vae_split_idx])
+
+                curr_vae_split_idx += 1
+
+            num_vid_tokens = sum(num_vid_tokens_list)
+            vae_position_ids = torch.cat(vae_position_ids, 0)
+            if curr_padded_latent != []:
+                curr_padded_latent = torch.cat(curr_padded_latent, dim=0).to(dtype)
+
+            # 2. 为当前样本重建输入序列和注意力掩码
+            current_sequence = torch.zeros((current_seq_len, self.hidden_size), device=device, dtype=dtype)
+
+            # --- 文本部分 ---
+            text_mask = (val_packed_text_indexes >= sample_start_idx) & (val_packed_text_indexes < sample_end_idx)
+            current_text_indexes_local = val_packed_text_indexes[text_mask] - sample_start_idx
+
+            current_text_ids = val_packed_text_ids[sample_start_idx:sample_end_idx]
+
+            # ++ 如果修改 val_data 和train_data对齐即不使用
+            current_text_embedding = self.language_model.model.embed_tokens(current_text_ids).to(dtype=dtype)
+
+            current_sequence[current_text_indexes_local] = current_text_embedding[current_text_indexes_local]
+
+            # --- vit部分: 支持 ti2i ---
+            if N_vit_split != 0:
+                vit_sample_start_idx = cu_vit_sample_lens[curr_vit_split_idx]
+                vit_sample_end_idx = cu_vit_sample_lens[curr_vit_split_idx + N_vit_split]
+                current_val_packed_vit_tokens = val_packed_vit_tokens[vit_sample_start_idx:vit_sample_end_idx].to(dtype)
+                current_val_vit_video_grid_thw = vit_video_grid_thw[curr_vit_split_idx : curr_vit_split_idx + N_vit_split]
+                curr_vit_split_idx += N_vit_split  # 推进 vit 样本指针
+
+                if self.vit_type in ["qwen2_5_vl", "qwen_2_5_vl_original"]:
+                    packed_vit_token_embed = self.vit_model(hidden_states=current_val_packed_vit_tokens, grid_thw=current_val_vit_video_grid_thw)
+                    if self.vit_type in ["qwen2_5_vl"]:
+                        packed_vit_token_embed = self.connector(packed_vit_token_embed).to(dtype)
+                else:
+                    raise NotImplementedError(f"{self.vit_type} is not supported")
+
+                vit_mask = (val_packed_vit_token_indexes >= sample_start_idx) & (val_packed_vit_token_indexes < sample_end_idx)
+                current_vit_indexes_local = val_packed_vit_token_indexes[vit_mask] - sample_start_idx
+                current_sequence[current_vit_indexes_local] = packed_vit_token_embed
+
+            # --- 关键：与训练一致 → pad 到 BLOCK_SIZE 的倍数，并让输入/掩码/length 全一致 ---
+            current_seq_len_pad = (current_seq_len + BLOCK_SIZE - 1) // BLOCK_SIZE * BLOCK_SIZE
+            current_pad = current_seq_len_pad - current_seq_len
+            if current_pad > 0:
+                current_split_lens = current_split_lens + [current_pad]
+                current_attn_modes = current_attn_modes + ["causal"]
+
+            # NOTE: 固定种子
+            validation_noise_seed = kwargs.get("validation_noise_seed", -1)
+            if validation_noise_seed > 0:
+                generator = torch.Generator(device=device).manual_seed(validation_noise_seed + get_global_rank() * max_samples + i_sample)  # 构造seed
+            else:
+                generator = None
+            x_t = torch.randn(num_vid_tokens, self.patch_latent_dim, generator=generator, device=device, dtype=dtype)  # [1*t*h*w, pt*ph*pw*C]
+
+            if curr_padded_latent != []:  # 存在 vae_condition
+                curr_padded_latent[current_vae_mse_indexes_local_in_vae] = x_t[current_vae_mse_indexes_local_in_vae]
+                x_t = curr_padded_latent
+
+            timesteps = torch.linspace(1, 0, num_timesteps + 1, device=x_t.device)  # fix: 加1
+            timesteps = timestep_shift * timesteps / (1 + (timestep_shift - 1) * timesteps)
+            dts = timesteps[:-1] - timesteps[1:]
+            timesteps = timesteps[:-1]
+
+            if apply_qwen_2_5_vl_pos_emb:
+                grid_thw_rope = video_grid_thw[i_sample]
+
+                # Qwen-VL 中求解方法， TODO: rope_deltas在一次性求完所有 token 的 position_ids 时有作用吗
+                current_pos_ids, _ = self.language_model.get_rope_index(
+                    input_ids=current_text_ids.unsqueeze(0),
+                    image_grid_thw=grid_thw_rope, #vae_video_grid_thw[gen_idx : gen_idx + 1],  # NOTE: 这里是全局索引
+                    video_grid_thw=grid_thw_rope, #vae_video_grid_thw[gen_idx : gen_idx + 1],  # video_grid_thw,
+                    second_per_grid_ts=[1.0]*len(grid_thw_rope),  # second_per_grid_ts,
+                    attention_mask=torch.ones([1, len(current_text_ids)], dtype=torch.long, device=device),  # attention_mask, 全1掩码？
+                )  # current_packed_position_ids: [3, 1, L]
+                # mrope 上区分  ref image vae特征与video vae特征
+                current_pos_ids = shift_position_ids(current_pos_ids, pos_shift = 1000, attn_modes = current_attn_modes, split_lens = current_split_lens, shift_attn_mode=['full_noise',"full"], pro_type = 10, i_sample_task=i_sample_task, i_sample_modality=i_sample_modality)
+
+            if cfg_text_scale > 1.0:
+                uncond_mask = i_sample_modality!=0
+                _, uncond_pos_ids, _ = self.uncond_split_pro_kvcache(uncond_mask, current_text_ids, device, dtype, apply_qwen_2_5_vl_pos_emb, grid_thw_rope = grid_thw_rope, current_attn_modes=current_attn_modes, current_split_lens=current_split_lens, i_sample_task=i_sample_task, i_sample_modality=i_sample_modality)
+
+
+            extra_inputs = {}  # {'mode': "und"}
+            if self.use_moe: # NOTE: packed_und_token_indexes 的计算可能有问题，在x2v的任务中
+                if N_vit_split != 0:
+                    packed_und_token_indexes = torch.cat([current_text_indexes_local, current_vit_indexes_local], dim=0)
+                else:
+                    packed_und_token_indexes = current_text_indexes_local
+                extra_inputs.update(
+                    packed_und_token_indexes=packed_und_token_indexes.to(dtype=index_dtype),
+                    packed_gen_token_indexes=current_vae_token_indexes_local.to(dtype=index_dtype),
+                )
+
+            timestep = torch.zeros(x_t.shape[0], device=x_t.device)
+            timestep[current_vae_mse_indexes_local_in_vae] = torch.tensor([1.] * current_vae_mse_indexes_local_in_vae.shape[0], device=x_t.device)
+
+            # --- 存入 视觉特征 编码 （vae condition）---
+            timestep_embed = self.time_embedder(timestep)
+            latent_pos_embed = self.latent_pos_embed(vae_position_ids)
+            vae_embed = self.vae2llm(x_t) + timestep_embed + latent_pos_embed
+            vae_embed = vae_embed.to(current_sequence.dtype)
+            current_sequence[current_vae_token_indexes_local] = vae_embed
+
+            ## ++++++++= for kv cache
+            gen_context = self.init_gen_context(device=device, dtype=torch.int32) # gen_context: 初始化kv_lens，ropes，past_key_values
+            cfg_text_context = deepcopy(gen_context)  #在添加当前文本之前使用，对应“没有包含这段新文本指令”的上下文，作为负向引导（negative prompt）使用，
+            cfg_vision_context = deepcopy(gen_context )
+            # current_cond_len = current_vae_mse_indexes_local[0]  # 处理到第一个 noise latent 前, 即前面的condition 共有 current_cond_len 个元素
+
+            current_cond_start, current_cond_end = 0, 0
+
+            self.language_model.eval()
+            self.eval()
+            for i_attn_mode_, current_cond_len in zip(current_attn_modes, current_split_lens):
+                current_cond_end += current_cond_len
+                if i_attn_mode_ == "noise": # 求解  noise latent 前 的 kv cache
+                    vae_in_packed_sequence_index = torch.arange(current_cond_start, current_cond_end, dtype=torch.long, device=device) # vae split 在 packed_sequence 中的索引
+                    packed_seqlens_vae = current_cond_len
+
+                    # 生成用于 vae split 的 extra_inputs
+                    target_packed_vae_token_indexes = torch.arange(1, current_cond_len-1, dtype=torch.long, device=device)
+                    target_packed_text_indexes = torch.tensor([0, current_cond_len-1], dtype=torch.long, device=device)
+
+                    break
+
+                if i_attn_mode_ == 'causal':
+                    is_causal = True
+                else:
+                    is_causal = False
+
+                gen_context = self.update_gen_context(current_sequence, current_pos_ids, gen_context, extra_inputs, current_cond_start, current_cond_end, current_cond_len, device, dtype, is_causal = is_causal)
+                if cfg_text_scale > 1.0 and i_sample_modality[current_cond_start] != 0:
+                    cfg_text_context = self.update_gen_context(current_sequence, current_pos_ids, cfg_text_context, extra_inputs, current_cond_start, current_cond_end, current_cond_len, device, dtype, is_causal = is_causal)
+                if cfg_vision_scale > 1.0 and i_sample_modality[current_cond_start] > 1: # i_sample_modality[current_cond_start] != 4 则为 cfg_vit
+                    cfg_vision_context = self.update_gen_context(current_sequence, current_pos_ids, cfg_vision_context, extra_inputs, current_cond_start, current_cond_end, current_cond_len, device, dtype, is_causal = is_causal)
+
+                current_cond_start = current_cond_end
+
+
+            for _ in range(1):
+                timestep = torch.zeros(x_t.shape[0], device=x_t.device)
+                # for group-by-group generation
+
+                for i, timestep_ in enumerate(timesteps):
+
+                    timestep[current_vae_mse_indexes_local_in_vae] = torch.tensor([timestep_] * current_vae_mse_indexes_local_in_vae.shape[0], device=x_t.device)
+                    if timestep_ > cfg_interval[0] and timestep_ <= cfg_interval[1]:
+                        cfg_text_scale_ = cfg_text_scale
+                        cfg_vision_scale_ = cfg_vision_scale # 默认 vit_uncond 和text_uncond 都采用 同一 cfg_interval
+                    else:
+                        cfg_text_scale_ = 1.0
+                        cfg_vision_scale_ = 1.0
+
+                    # --- 视觉特征 编码 ---
+                    timestep_embed = self.time_embedder(timestep)
+                    latent_pos_embed = self.latent_pos_embed(vae_position_ids)
+                    vae_embed = self.vae2llm(x_t) + timestep_embed + latent_pos_embed
+                    vae_embed = vae_embed.to(current_sequence.dtype)
+
+                    # 一切正常再做赋值
+                    current_sequence[current_vae_token_indexes_local] = vae_embed
+
+                    # 提取出VAE部分的值
+                    packed_sequence_vae = current_sequence[vae_in_packed_sequence_index]
+
+                    extra_inputs_vae = {}
+                    if self.use_moe:
+                        extra_inputs_vae = {"mode": "gen", "packed_vae_token_indexes": target_packed_vae_token_indexes, "packed_text_indexes": target_packed_text_indexes}
+
+
+                    v_t_output = self.language_model.forward_inference(
+                        packed_query_sequence=packed_sequence_vae,  # [1026, 1536]
+                        query_lens=torch.tensor([packed_seqlens_vae],dtype=torch.int32, device=device),  # [1]
+                        packed_query_position_ids=current_pos_ids[:, :, current_cond_start:current_cond_end],  # [1026]
+                        packed_query_indexes=vae_in_packed_sequence_index,  # [1026]
+                        past_key_values=gen_context['past_key_values'],  # <class 'modeling.lance.qwen2_navit.NaiveCache'>
+                        key_values_lens=gen_context['kv_lens'],  # [1]
+                        packed_key_value_indexes=torch.arange(0,gen_context['kv_lens'][0], dtype=torch.int64, device=device),  # [76]
+                        update_past_key_values=False,
+                        is_causal=False,
+                        **extra_inputs_vae,
+                    )
+
+                    v_t = self.llm2vae(v_t_output.packed_query_sequence)
+                    v_t = v_t[target_packed_vae_token_indexes]
+
+                    # --- 引入 cfg ---
+                    if cfg_text_scale_ > 1.0:
+                        cfg_text_output = self.language_model.forward_inference(
+                            packed_query_sequence=packed_sequence_vae,
+                            query_lens=torch.tensor([packed_seqlens_vae],dtype=torch.int32, device=device),
+                            packed_query_position_ids=uncond_pos_ids[:,:,cfg_text_context['kv_lens'][0]:cfg_text_context['kv_lens'][0]+packed_seqlens_vae],
+                            packed_query_indexes=vae_in_packed_sequence_index - sum(i_sample_modality==0), # 对应 packed_sequence_vae 在整个cfg序列中的index
+                            past_key_values=cfg_text_context['past_key_values'],
+                            key_values_lens=cfg_text_context['kv_lens'],
+                            packed_key_value_indexes=torch.arange(0,cfg_text_context['kv_lens'][0], dtype=torch.int64, device=device),
+                            update_past_key_values=False,
+                            is_causal=False,
+                            **extra_inputs_vae,
+                        )
+                        cfg_text_v_t = self.llm2vae(cfg_text_output.packed_query_sequence)
+                        cfg_text_v_t = cfg_text_v_t[target_packed_vae_token_indexes]
+
+                        if cfg_vision_pro:
+                            cfg_vision_output = self.language_model.forward_inference(
+                                packed_query_sequence=packed_sequence_vae,
+                                query_lens=torch.tensor([packed_seqlens_vae],dtype=torch.int32, device=device),
+                                packed_query_position_ids=vision_uncond_pos_ids[:,:,cfg_vision_context['kv_lens'][0]:cfg_vision_context['kv_lens'][0]+packed_seqlens_vae],
+                                packed_query_indexes=vae_in_packed_sequence_index - sum(i_sample_modality==4), # 对应 packed_sequence_vae 在整个cfg序列中的index
+                                past_key_values=cfg_vision_context['past_key_values'],
+                                key_values_lens=cfg_vision_context['kv_lens'],
+                                packed_key_value_indexes=torch.arange(0,cfg_vision_context['kv_lens'][0], dtype=torch.int64, device=device),
+                                update_past_key_values=False,
+                                is_causal=False,
+                                **extra_inputs_vae,
+                            )
+
+                            cfg_text_vision_v_t = self.llm2vae(cfg_vision_output.packed_query_sequence)
+                            cfg_text_vision_v_t = cfg_text_vision_v_t[target_packed_vae_token_indexes]
+
+                            v_t_ = cfg_text_vision_v_t + cfg_text_scale_ * (v_t - cfg_text_v_t) + cfg_vision_scale_  * (cfg_text_v_t - cfg_text_vision_v_t)
+                        else:
+                            v_t_ = cfg_text_v_t + cfg_text_scale_ * (v_t - cfg_text_v_t)
+
+                        # NOTE norm is computed over all dimensions, thus currently only supports batch_size = 1 with navit
+                        if cfg_renorm_type == "global":
+                            norm_v_t = torch.norm(v_t)
+                            norm_v_t_ = torch.norm(v_t_)
+                            scale = (norm_v_t / (norm_v_t_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
+                        elif cfg_renorm_type == "channel":
+                            norm_v_t = torch.norm(v_t, dim=-1, keepdim=True)
+                            norm_v_t_ = torch.norm(v_t_, dim=-1, keepdim=True)
+                            scale = (norm_v_t / (norm_v_t_ + 1e-8)).clamp(min=cfg_renorm_min, max=1.0)
+                        elif cfg_renorm_type.lower() in ("", "none", "null"):
+                            scale = 1
+                        else:
+                            raise NotImplementedError(f"{cfg_renorm_type} is not suppoprted")
+                        v_t = v_t_ * scale
+
+                    x_t[current_vae_mse_indexes_local_in_vae] = x_t[current_vae_mse_indexes_local_in_vae] - v_t.to(x_t.device) * dts[i]  # velocity pointing from data to noise
+
+            # ---- 每个样本各自重排到 [T,H,W,C]，避免用最后一个样本的 t/h/w 去重排整批 ----
+            curr_seq_target, patch = 0, []
+            for i_target in range(N_noise_element):
+
+                pt, ph, pw = self.latent_patch_size
+                t, h, w = vid_shape_list[i_target]
+                len_target = t * h * w
+
+                x_t_ =  rearrange(x_t[curr_seq_target : curr_seq_target + len_target], "(t h w) (pt ph pw c) -> (t pt) (h ph) (w pw) c", t=t, h=h, w=w, pt=pt, ph=ph, pw=pw)
+
+                patch.append(x_t_)
+                curr_seq_target += len_target
+            # patch = torch.cat(patch, dim=0)
+            x_t_all.append(patch)
+            # ✅ 成功处理一个 GEN 样本，推进 gen_idx
+            gen_idx += 1
+
+        # self.logger.info(f"Validation step done with {len(x_t_all)} samples.")
+
+        if caption != None:
+            return x_t_all, [caption], padded_videos, index
+
+        # if padded_videos != []:
+        #     return x_t_all, padded_videos
+
+        return x_t_all
+
+    def get_uncond_attn_modes_split_lens(self, current_attn_modes, current_split_lens, uncond_mask):
+        # Select the splits that belong entirely to the unconditional sequence.
+        # A split is kept only when every entry of uncond_mask inside its span is True.
+        kept_modes, kept_lens = [], []
+        span_begin = 0
+        for split_idx, seg_len in enumerate(current_split_lens):
+            span_end = span_begin + seg_len
+            keep_split = uncond_mask[span_begin:span_end].all()
+            # Move the cursor first, so kept and skipped splits advance it identically.
+            span_begin = span_end
+            if not keep_split:
+                continue
+            kept_modes.append(current_attn_modes[split_idx])
+            kept_lens.append(seg_len)
+
+        return kept_modes, kept_lens
+
+
+
+
+    def uncond_split_pro_kvcache(
+        self,
+        uncond_mask,
+        current_text_ids,
+        device,
+        dtype,
+        apply_qwen_2_5_vl_pos_emb=False,
+        uncond_pos_ids=None,
+        grid_thw_rope=None,
+        current_attn_modes=None,
+        current_split_lens=None,
+        i_sample_task=None,
+        i_sample_modality=None,
+    ):
+        """
+        Build the text ids and position ids of the unconditional (CFG) sequence of one sample.
+
+        uncond_mask: mask used to index current_text_ids; the selected tokens form the uncond sequence.
+        apply_qwen_2_5_vl_pos_emb: when True, position ids are recomputed with get_rope_index and
+            shift_position_ids from grid_thw_rope / current_attn_modes / current_split_lens /
+            i_sample_task / i_sample_modality (the last two are filtered by uncond_mask first).
+        uncond_pos_ids: precomputed position ids, required only when apply_qwen_2_5_vl_pos_emb is False;
+            they are converted to a long tensor on `device` and truncated to the uncond length.
+        dtype: not used by this method.
+
+        Returns (uncond_text_ids, uncond_pos_ids, uncond_seq_len).
+        """
+        uncond_text_ids = current_text_ids[uncond_mask]
+        uncond_seq_len = len(uncond_text_ids)
+
+        if apply_qwen_2_5_vl_pos_emb:
+            uncond_pos_ids, _ = self.language_model.get_rope_index(
+                input_ids=uncond_text_ids.unsqueeze(0),
+                image_grid_thw=grid_thw_rope,
+                video_grid_thw=grid_thw_rope,
+                second_per_grid_ts=[1.0] * len(grid_thw_rope),
+                attention_mask=torch.ones([1, len(uncond_text_ids)], dtype=torch.long, device=device),  # all-ones mask
+            )
+            # Restrict splits and per-token labels to the uncond tokens, then shift the mrope ids (original note: separates ref-image VAE features from video VAE features).
+            uncond_attn_modes, uncond_split_lens = self.get_uncond_attn_modes_split_lens( current_attn_modes, current_split_lens, uncond_mask)
+            i_sample_task = i_sample_task[uncond_mask]
+            i_sample_modality = i_sample_modality[uncond_mask]
+            uncond_pos_ids = shift_position_ids(uncond_pos_ids, pos_shift = 1000, attn_modes = uncond_attn_modes, split_lens = uncond_split_lens, shift_attn_mode=['full_noise',"full"], pro_type = 10, i_sample_task=i_sample_task, i_sample_modality=i_sample_modality)
+        else:
+            if uncond_pos_ids is None:
+                raise ValueError("uncond_pos_ids must be provided when apply_qwen_2_5_vl_pos_emb is False")
+            uncond_pos_ids = torch.tensor(uncond_pos_ids, dtype=torch.long, device=device)[:uncond_seq_len]
+
+        return uncond_text_ids, uncond_pos_ids, uncond_seq_len
+
+
+
+    def update_gen_context(self, current_sequence, current_pos_ids, gen_context, extra_inputs, current_cond_start, current_cond_end, current_cond_len, device, dtype, is_causal = True):
+        # Feed the condition chunk [current_cond_start, current_cond_end) of current_sequence to the language model
+        # and append its keys/values to gen_context, which is updated in place and returned.
+        # NOTE(review): needs both packed_*_token_indexes keys in extra_inputs; the visible caller only sets them when self.use_moe is True.
+        gen_token_indexes = extra_inputs["packed_gen_token_indexes"]
+        und_token_indexes = extra_inputs["packed_und_token_indexes"]
+        in_chunk_gen = (gen_token_indexes >= current_cond_start) & (gen_token_indexes < current_cond_end)
+        in_chunk_und = (und_token_indexes >= current_cond_start) & (und_token_indexes < current_cond_end)
+        # Re-base onto the fed chunk. Use current_cond_start, not kv_lens: a CFG context that skipped earlier
+        # chunks has kv_lens < current_cond_start, which would push these indexes past the end of the chunk.
+        extra_inputs_cond = {
+            "packed_vae_token_indexes": gen_token_indexes[in_chunk_gen] - current_cond_start,
+            "packed_text_indexes": und_token_indexes[in_chunk_und] - current_cond_start,
+        }
+        # Any generation (VAE) token inside the chunk selects "gen" mode; otherwise "und".
+        mode_ = "gen" if extra_inputs_cond["packed_vae_token_indexes"].shape[0] > 0 else "und"
+        # Only the condition chunk is run; its keys/values are stored right after the cached ones.
+        output = self.language_model.forward_inference(
+            packed_query_sequence=current_sequence[current_cond_start:current_cond_end],
+            query_lens=torch.tensor([current_cond_len],dtype=torch.int32, device=device),
+            packed_query_position_ids=current_pos_ids[:, :, current_cond_start:current_cond_end],
+            packed_query_indexes=torch.arange(gen_context['kv_lens'][0],gen_context['kv_lens'][0] + current_cond_len, dtype=torch.long, device=device), # slots of the new tokens
+            past_key_values=gen_context['past_key_values'],
+            packed_key_value_indexes=torch.arange(0,gen_context['kv_lens'][0], dtype=torch.int64, device=device), # slots of the cached (past KV) tokens
+            key_values_lens=gen_context['kv_lens'],
+            update_past_key_values=True,
+            is_causal=is_causal,
+            mode = mode_,
+            **extra_inputs_cond
+        )
+        gen_context['past_key_values'] = output.past_key_values
+        gen_context['kv_lens'] += current_cond_len
+        return gen_context
diff --git a/modeling/lance/modeling_utils.py b/modeling/lance/modeling_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e2c359de2559ee0adaabfe8802304a887fba411
--- /dev/null
+++ b/modeling/lance/modeling_utils.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import math
+
+import numpy as np
+import torch
+from torch import nn
+from transformers.activations import ACT2FN
+
+# --------------------------------------------------------
+# 2D sine-cosine position embedding
+# References:
+# DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
+# --------------------------------------------------------
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+    """Return the (grid_size**2 [+ extra_tokens], embed_dim) sin-cos table of a square grid."""
+    axis_coords = np.arange(grid_size, dtype=np.float32)
+    # meshgrid takes the width axis first, so plane 0 varies along w and plane 1 along h.
+    mesh = np.stack(np.meshgrid(axis_coords, axis_coords), axis=0)
+    mesh = mesh.reshape([2, 1, grid_size, grid_size])
+    table = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
+    if not (cls_token and extra_tokens > 0):
+        return table
+    # Prepend all-zero rows for the extra tokens.
+    return np.concatenate([np.zeros([extra_tokens, embed_dim]), table], axis=0)
+
+
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """Encode the two planes of `grid` with 1D sin-cos embeddings and join them feature-wise."""
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+    # Each plane gets half of the channels; plane 0 fills the first half, plane 1 the second.
+    per_plane = [get_1d_sincos_pos_embed_from_grid(half_dim, grid[plane]) for plane in (0, 1)]
+    # (H*W, D/2) + (H*W, D/2) -> (H*W, D)
+    joined = np.concatenate(per_plane, axis=1)
+    return joined
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    Sin-cos embedding of a set of scalar positions.
+
+    embed_dim: number of output channels per position (must be even)
+    pos: array of positions; it is flattened to size (M,)
+    returns: array of shape (M, embed_dim), sine half followed by cosine half
+    """
+    assert embed_dim % 2 == 0
+    half_dim = embed_dim // 2
+    # Frequencies decay geometrically from 1 down towards 1/10000.
+    exponents = np.arange(half_dim, dtype=np.float64) / (embed_dim / 2.)
+    inv_freq = 1. / 10000**exponents  # (D/2,)
+
+    flat_pos = pos.reshape(-1)  # (M,)
+    # Outer product position x frequency -> (M, D/2)
+    angles = flat_pos[:, None] * inv_freq[None, :]
+    # (M, D/2) sine and (M, D/2) cosine side by side -> (M, D)
+    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)
+
+
+def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
+    """
+    Build 3D sin-cos positional embeddings from a stacked (t, h, w) coordinate grid.
+
+    The t and h axes each get the largest even size not above embed_dim // 3; w gets the rest.
+    """
+    assert embed_dim % 2 == 0, "Embedding dimension must be even for 3D embeddings"
+
+    third = embed_dim // 3
+    even_third = third - (third % 2)  # round down to an even channel count
+    axis_dims = (even_third, even_third, embed_dim - 2 * even_third)  # (t, h, w)
+    assert axis_dims[2] % 2 == 0
+
+    # One 1D embedding per axis, each of shape (T*H*W, axis_dim), joined along the channel axis.
+    return np.concatenate(
+        [get_1d_sincos_pos_embed_from_grid(axis_dim, grid[axis]) for axis, axis_dim in enumerate(axis_dims)], axis=1
+    )
+
+
+def get_3d_sincos_pos_embed(embed_dim, t, h, w):
+    """
+    Build the 3D sin-cos positional embedding table for a (t, h, w) volume, rows ordered t-major.
+    """
+    # One float32 coordinate vector per axis, in (t, h, w) order.
+    axes = [np.arange(extent, dtype=np.float32) for extent in (t, h, w)]
+    # "ij" indexing keeps the axis order, so every coordinate plane has shape (t, h, w).
+    planes = np.meshgrid(*axes, indexing="ij")
+    # Stack the planes into a single [3, t, h, w] grid.
+    coord_grid = np.stack(planes, axis=0)
+    return get_3d_sincos_pos_embed_from_grid(embed_dim, coord_grid)
+
+
+# --------------------------------------------------------
+# TimestepEmbedder
+# Reference:
+# DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
+# --------------------------------------------------------
+class TimestepEmbedder(nn.Module):
+    """
+    Maps scalar timesteps to hidden_size vectors: sinusoidal features followed by a two-layer MLP.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.frequency_embedding_size = frequency_embedding_size
+        layers = [
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        ]
+        self.mlp = nn.Sequential(*layers)
+
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Sinusoidal features of (possibly fractional) timesteps.
+
+        :param t: a 1-D Tensor of N timesteps, one per batch element.
+        :param dim: the number of output features.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, dim) Tensor: cosines, then sines, then one zero column when dim is odd.
+        """
+        half = dim // 2
+        ramp = torch.arange(start=0, end=half, dtype=torch.float32)
+        freqs = torch.exp(-math.log(max_period) * ramp / half).to(device=t.device)
+        angles = t[:, None].float() * freqs[None]
+        features = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
+        if not dim % 2:
+            return features
+        # Odd dim: append one zero column so the width equals dim.
+        return torch.cat([features, torch.zeros_like(features[:, :1])], dim=-1)
+
+    def forward(self, t):
+        # The MLP output width is hidden_size (original note: aligned with the LLM hidden size).
+        return self.mlp(self.timestep_embedding(t, self.frequency_embedding_size))
+
+
+class MLPconnector(nn.Module):
+    """Two-layer projection (fc1 -> activation -> fc2) from in_dim to out_dim features."""
+    def __init__(self, in_dim: int, out_dim: int, hidden_act: str):
+        super().__init__()
+        self.activation_fn = ACT2FN[hidden_act]  # activation looked up by name
+        self.fc1 = nn.Linear(in_dim, out_dim)
+        self.fc2 = nn.Linear(out_dim, out_dim)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # fc1 changes the width to out_dim; fc2 keeps it.
+        projected = self.activation_fn(self.fc1(hidden_states))
+        return self.fc2(projected)
+
+
+class PositionEmbedding(nn.Module):
+    """Frozen 2D sin-cos position table for a square grid, looked up by flat position id."""
+    def __init__(self, max_num_patch_per_side, hidden_size):
+        super().__init__()
+        self.max_num_patch_per_side = max_num_patch_per_side
+        self.hidden_size = hidden_size
+        num_positions = max_num_patch_per_side ** 2
+        # Stored as a parameter with requires_grad=False, i.e. excluded from training.
+        self.pos_embed = nn.Parameter(torch.zeros(num_positions, hidden_size), requires_grad=False)
+        self._init_weights()
+
+    def _init_weights(self):
+        # Overwrite the zero placeholder with the sin-cos table, cast to float32.
+        table = get_2d_sincos_pos_embed(self.hidden_size, self.max_num_patch_per_side)
+        self.pos_embed.data.copy_(torch.from_numpy(table).float())
+
+    def forward(self, position_ids):
+        return self.pos_embed[position_ids]
+
+
+class PositionEmbedding3D(nn.Module):
+    def __init__(self, max_latent_num_frames, max_latent_size, hidden_size):
+        super().__init__()
+        self.max_num_latent_frames = max_latent_num_frames  # extent of the t axis
+        self.max_latent_size = max_latent_size  # shared extent of the h and w axes
+        self.hidden_size = hidden_size
+        num_positions = max_latent_num_frames * (max_latent_size**2)  # one row per (t, h, w) cell
+        self.pos_embed = nn.Parameter(torch.zeros(num_positions, hidden_size), requires_grad=False)
+        self._init_weights()
+
+    def _init_weights(self):
+        table = get_3d_sincos_pos_embed(self.hidden_size, self.max_num_latent_frames, self.max_latent_size, self.max_latent_size)
+        self.pos_embed.data.copy_(torch.from_numpy(table).float())  # replace the zero placeholder with the float32 sin-cos table
+
+    def forward(self, position_ids):
+        return self.pos_embed[position_ids]
diff --git a/modeling/lance/qwen2_navit.py b/modeling/lance/qwen2_navit.py
new file mode 100644
index 0000000000000000000000000000000000000000..070157b0d8c17a3e8ec858c1fc70ed59c2a9e79c
--- /dev/null
+++ b/modeling/lance/qwen2_navit.py
@@ -0,0 +1,1297 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright (c) 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+
+from dataclasses import dataclass
+from functools import partial
+from typing import List, Optional, Tuple
+from einops import rearrange
+
+import torch
+from torch import nn
+from torch.nn.attention import SDPBackend, sdpa_kernel
+from torch.nn.attention.flex_attention import flex_attention
+from torch.nn.functional import scaled_dot_product_attention
+from transformers.utils import ModelOutput
+
+from flash_attn import flash_attn_varlen_func
+from modeling.qwen2.modeling_qwen2 import (
+    Qwen2Attention,
+    Qwen2MLP,
+    Qwen2PreTrainedModel,
+    Qwen2RMSNorm,
+    Qwen2RotaryEmbedding,
+    apply_rotary_pos_emb,
+)
+from modeling.qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VLRotaryEmbedding,
+    apply_multimodal_rotary_pos_emb,
+)
+from modeling.qwen2.configuration_qwen2 import Qwen2Config
+
+# Raise the TorchDynamo cache-size limits before anything is compiled.
+# NOTE(review): presumably needed because packed sequence shapes / block masks vary
+# between calls and would otherwise exhaust the default limits — confirm.
+torch._dynamo.config.cache_size_limit = 512
+torch._dynamo.config.accumulated_cache_size_limit = 4096
+# Rebind the module-level name so every later use of `flex_attention` in this file
+# goes through the torch.compile-wrapped version.
+flex_attention = torch.compile(flex_attention)
+
+class NaiveCache:
+    def __init__(self, num_layers):
+        self.key_cache = {k: None for k in range(num_layers)}
+        self.value_cache = {k: None for k in range(num_layers)}
+
+    @property
+    def num_layers(self):
+        return len(self.key_cache)
+
+    @property
+    def seq_lens(self):
+        if self.key_cache[0] is not None:
+            return self.key_cache[0].shape[0]
+        else:
+            return 0
+
+
+@dataclass
+class BaseNavitOutputWithPast(ModelOutput):
+    """Output container pairing a packed query sequence with its key/value cache."""
+
+    # Both fields default to None.
+    packed_query_sequence: torch.FloatTensor = None
+    past_key_values: Optional[NaiveCache] = None
+
+
+def pad_sequence(tensor, pad_size):
+    H, L, D = tensor.shape
+    pad_tensor = tensor.new_zeros((H, pad_size, D))
+    return torch.cat([tensor, pad_tensor], dim=1)
+
+
+class PackedAttention(Qwen2Attention):
+    # TODO: 暂未使用，qknorm未更新相关逻辑
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        if self.config.qk_norm:
+            self.q_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.k_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        else:
+            self.q_norm = nn.Identity()
+            self.k_norm = nn.Identity()
+
+    def forward(self, *args, **kwargs):
+        if self.training or kwargs.get("mode_forward") == "validation":
+            return self.forward_train(*args, **kwargs)
+        else:
+            return self.forward_inference(*args, **kwargs)
+
+    def forward_train(
+        self,
+        packed_sequence: torch.Tensor,
+        sample_lens: List[int],
+        attention_mask: List[torch.Tensor],
+        packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        **kwargs
+    ):
+        packed_query_states = self.q_proj(packed_sequence).view(-1, self.num_heads, self.head_dim)
+        packed_key_states = self.k_proj(packed_sequence).view(-1, self.num_key_value_heads, self.head_dim)
+        packed_value_states = self.v_proj(packed_sequence).view(-1, self.num_key_value_heads, self.head_dim)
+
+        packed_query_states = self.q_norm(packed_query_states)
+        packed_key_states = self.k_norm(packed_key_states)
+
+        packed_cos, packed_sin = packed_position_embeddings
+        if kwargs.get("apply_qwen_2_5_vl_pos_emb"):  # kwargs.get("vit_type") == 'qwen_2_5_vl_original':
+            packed_query_states = rearrange(packed_query_states, "l (b h) d -> b h l d", h=self.num_heads)
+            packed_key_states = rearrange(packed_key_states, "l (b h) d -> b h l d", h=self.num_key_value_heads)
+            packed_query_states, packed_key_states = apply_multimodal_rotary_pos_emb(
+                packed_query_states, packed_key_states, packed_cos, packed_sin, self.config.rope_scaling["mrope_section"]
+            )
+            packed_query_states = rearrange(packed_query_states, "b h l d -> l (b h) d")
+            packed_key_states = rearrange(packed_key_states, "b h l d -> l (b h) d")
+        else:
+            packed_query_states, packed_key_states = apply_rotary_pos_emb(packed_query_states, packed_key_states, packed_cos, packed_sin, unsqueeze_dim=1)
+
+        if isinstance(attention_mask, List):
+            packed_key_states = packed_key_states[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
+            packed_key_states = packed_key_states.reshape(-1, self.num_heads, self.head_dim)
+            packed_value_states = packed_value_states[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
+            packed_value_states = packed_value_states.reshape(-1, self.num_heads, self.head_dim)
+
+            unpacked_query_states = packed_query_states.transpose(0, 1).split(sample_lens, dim=1)
+            unpacked_key_states = packed_key_states.transpose(0, 1).split(sample_lens, dim=1)
+            unpacked_value_states = packed_value_states.transpose(0, 1).split(sample_lens, dim=1)
+            upacked_attn_output = []
+            for query_states, key_states, value_states, attention_mask_per_sample in zip(unpacked_query_states, unpacked_key_states, unpacked_value_states, attention_mask):
+                with sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
+                    attn_output = scaled_dot_product_attention(
+                        query_states.to(torch.bfloat16).unsqueeze(0),
+                        key_states.to(torch.bfloat16).unsqueeze(0),
+                        value_states.to(torch.bfloat16).unsqueeze(0),
+                        attention_mask_per_sample.to(torch.bfloat16).unsqueeze(0),
+                    )
+                upacked_attn_output.append(attn_output.squeeze(0))
+            packed_attn_output = torch.cat(upacked_attn_output, dim=1)
+        else:
+            pad_size = sum(sample_lens) - packed_query_states.shape[0]
+            packed_query_states = pad_sequence(packed_query_states.permute(1, 0, 2), pad_size)
+            packed_key_states = pad_sequence(packed_key_states.permute(1, 0, 2), pad_size)
+            packed_value_states = pad_sequence(packed_value_states.permute(1, 0, 2), pad_size)
+            packed_attn_output = flex_attention(
+                packed_query_states.unsqueeze(0),
+                packed_key_states.unsqueeze(0),
+                packed_value_states.unsqueeze(0),
+                enable_gqa=True,
+                block_mask=attention_mask,
+            )
+            end_index = packed_attn_output.shape[2] - pad_size
+            packed_attn_output = packed_attn_output[0, :, :end_index, :]
+
+        packed_attn_output = packed_attn_output.transpose(0, 1).reshape(-1, self.hidden_size)
+        packed_attn_output = self.o_proj(packed_attn_output)
+
+        return packed_attn_output
+
+    def forward_inference(
+        self,
+        packed_query_sequence: torch.Tensor,
+        query_lens: torch.Tensor,
+        packed_query_position_embeddings: torch.Tensor,
+        packed_query_indexes: torch.Tensor,
+        past_key_values: Optional[NaiveCache] = None,
+        key_values_lens: Optional[torch.Tensor] = None,
+        packed_key_value_indexes: Optional[torch.Tensor] = None,
+        update_past_key_values=True,
+        is_causal=True,
+        **kwargs
+    ):
+        packed_query_states = self.q_proj(packed_query_sequence).view(-1, self.num_heads, self.head_dim)
+        packed_key_states = self.k_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
+        packed_value_states = self.v_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
+
+        packed_query_states = self.q_norm(packed_query_states)
+        packed_key_states = self.k_norm(packed_key_states)
+
+        packed_cos, packed_sin = packed_query_position_embeddings
+        packed_query_states, packed_key_states = apply_rotary_pos_emb(packed_query_states, packed_key_states, packed_cos, packed_sin, unsqueeze_dim=1)
+
+        packed_query_states = packed_query_states.to(torch.bfloat16)
+        packed_key_states = packed_key_states.to(torch.bfloat16)
+        packed_value_states = packed_value_states.to(torch.bfloat16)
+
+        if past_key_values is not None and past_key_values.key_cache[self.layer_idx] is not None:
+            past_key_states = past_key_values.key_cache[self.layer_idx]
+            past_value_states = past_key_values.value_cache[self.layer_idx]
+
+            seqlens = sum(query_lens) + sum(key_values_lens)
+            merged_key_states = past_key_states.new_zeros((seqlens, self.num_key_value_heads, self.head_dim))
+            merged_value_states = past_key_states.new_zeros((seqlens, self.num_key_value_heads, self.head_dim))
+            merged_key_states[packed_query_indexes] = packed_key_states
+            merged_key_states[packed_key_value_indexes] = past_key_states
+            merged_value_states[packed_query_indexes] = packed_value_states
+            merged_value_states[packed_key_value_indexes] = past_value_states
+            key_values_lens = key_values_lens + query_lens
+        else:
+            merged_key_states = packed_key_states
+            merged_value_states = packed_value_states
+            key_values_lens = query_lens
+
+        cu_seqlens_q = torch.nn.functional.pad(torch.cumsum(query_lens, dim=0), (1, 0))
+        cu_seqlens_k = torch.nn.functional.pad(torch.cumsum(key_values_lens, dim=0), (1, 0))
+
+        packed_attn_output = flash_attn_varlen_func(
+            q=packed_query_states,
+            k=merged_key_states,
+            v=merged_value_states,
+            cu_seqlens_q=cu_seqlens_q.to(torch.int32),
+            cu_seqlens_k=cu_seqlens_k.to(torch.int32),
+            max_seqlen_q=max(query_lens).item(),
+            max_seqlen_k=max(key_values_lens).item(),
+            causal=is_causal,
+        )
+        packed_attn_output = packed_attn_output.reshape(-1, self.hidden_size)
+        packed_attn_output = self.o_proj(packed_attn_output)
+
+        if update_past_key_values:
+            past_key_values.key_cache[self.layer_idx] = merged_key_states
+            past_key_values.value_cache[self.layer_idx] = merged_value_states
+
+        return packed_attn_output, past_key_values
+
+
+class PackedAttentionMoT(Qwen2Attention):
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        if self.config.qk_norm_und or self.config.qk_norm_gen:
+            # NOTE: 拆开初始化
+            # 理解
+            self.q_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps) if self.config.qk_norm_und else nn.Identity()
+            self.k_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps) if self.config.qk_norm_und else nn.Identity()
+
+            # 生成
+            self.q_norm_moe_gen = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps) if self.config.qk_norm_gen else nn.Identity()
+            self.k_norm_moe_gen = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps) if self.config.qk_norm_gen else nn.Identity()
+        else:
+            # NOTE: 不拆开初始化
+            if self.config.qk_norm:
+                self.q_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+                self.k_norm = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+                self.q_norm_moe_gen = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+                self.k_norm_moe_gen = Qwen2RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            else:
+                self.q_norm = nn.Identity()
+                self.k_norm = nn.Identity()
+                self.q_norm_moe_gen = nn.Identity()
+                self.k_norm_moe_gen = nn.Identity()
+
+        self.q_proj_moe_gen = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj_moe_gen = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj_moe_gen = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj_moe_gen = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+        self.layer_idx = layer_idx
+
+    def forward(self, *args, **kwargs):
+        if self.training or kwargs.get("mode_forward") == "validation":
+            return self.forward_train(*args, **kwargs)
+        else:
+            return self.forward_inference(*args, **kwargs)
+
+    def forward_train(
+        self,
+        packed_sequence: torch.Tensor,
+        sample_lens: List[int],
+        attention_mask,
+        packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        packed_und_token_indexes: torch.LongTensor,
+        packed_gen_token_indexes: torch.LongTensor,
+        mode=None,
+        **kwargs,
+    ):
+        packed_query_states = packed_sequence.new_zeros((packed_sequence.shape[0], self.num_heads * self.head_dim))
+        packed_key_states = packed_sequence.new_zeros((packed_sequence.shape[0], self.num_key_value_heads * self.head_dim))
+        packed_value_states = packed_sequence.new_zeros((packed_sequence.shape[0], self.num_key_value_heads * self.head_dim))
+
+        packed_sequence_und = packed_sequence[packed_und_token_indexes]
+        packed_sequence_gen = packed_sequence[packed_gen_token_indexes]
+
+        packed_query_states[packed_und_token_indexes] = self.q_proj(packed_sequence_und)
+        packed_query_states[packed_gen_token_indexes] = self.q_proj_moe_gen(packed_sequence_gen)
+
+        packed_key_states[packed_und_token_indexes] = self.k_proj(packed_sequence_und)
+        packed_key_states[packed_gen_token_indexes] = self.k_proj_moe_gen(packed_sequence_gen)
+
+        packed_value_states[packed_und_token_indexes] = self.v_proj(packed_sequence_und)
+        packed_value_states[packed_gen_token_indexes] = self.v_proj_moe_gen(packed_sequence_gen)
+
+
+        packed_query_states = packed_query_states.view(-1, self.num_heads, self.head_dim)
+        packed_key_states = packed_key_states.view(-1, self.num_key_value_heads, self.head_dim)
+        packed_value_states = packed_value_states.view(-1, self.num_key_value_heads, self.head_dim)
+        if self.config.freeze_und:
+            packed_value_states[packed_und_token_indexes] = packed_value_states[packed_und_token_indexes].detach()
+
+        packed_query_states_ = packed_query_states.new_zeros(packed_query_states.shape)
+        packed_key_states_ = packed_key_states.new_zeros(packed_key_states.shape)
+
+        packed_query_states_[packed_und_token_indexes] = self.q_norm(packed_query_states[packed_und_token_indexes])
+        if self.config.freeze_und:
+            packed_query_states_[packed_und_token_indexes] = packed_query_states_[packed_und_token_indexes].detach()
+        packed_query_states_[packed_gen_token_indexes] = self.q_norm_moe_gen(packed_query_states[packed_gen_token_indexes])
+
+        packed_key_states_[packed_und_token_indexes] = self.k_norm(packed_key_states[packed_und_token_indexes])
+        if self.config.freeze_und:
+            packed_key_states_[packed_und_token_indexes] = packed_key_states_[packed_und_token_indexes].detach()
+        packed_key_states_[packed_gen_token_indexes] = self.k_norm_moe_gen(packed_key_states[packed_gen_token_indexes])
+
+        packed_cos, packed_sin = packed_position_embeddings
+        if kwargs.get("apply_qwen_2_5_vl_pos_emb"):  # kwargs.get("vit_type") == 'qwen_2_5_vl_original':
+            packed_query_states_ = rearrange(packed_query_states_, "l (b h) d -> b h l d", h=self.num_heads)
+            packed_key_states_ = rearrange(packed_key_states_, "l (b h) d -> b h l d", h=self.num_key_value_heads)
+            packed_query_states_, packed_key_states_ = apply_multimodal_rotary_pos_emb(
+                packed_query_states_, packed_key_states_, packed_cos, packed_sin, self.config.rope_scaling["mrope_section"]
+            )
+            packed_query_states_ = rearrange(packed_query_states_, "b h l d -> l (b h) d")
+            packed_key_states_ = rearrange(packed_key_states_, "b h l d -> l (b h) d")
+        else:
+            packed_query_states_, packed_key_states_ = apply_rotary_pos_emb(packed_query_states_, packed_key_states_, packed_cos, packed_sin, unsqueeze_dim=1)
+
+
+        if isinstance(attention_mask, List):
+            packed_key_states_ = packed_key_states_[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
+            packed_key_states_ = packed_key_states_.reshape(-1, self.num_heads, self.head_dim)
+            packed_value_states = packed_value_states[:, :, None, :].repeat(1, 1, self.num_key_value_groups, 1)
+            packed_value_states = packed_value_states.reshape(-1, self.num_heads, self.head_dim)
+
+            unpacked_query_states = packed_query_states_.transpose(0, 1).split(sample_lens, dim=1)
+            unpacked_key_states = packed_key_states_.transpose(0, 1).split(sample_lens, dim=1)
+            unpacked_value_states = packed_value_states.transpose(0, 1).split(sample_lens, dim=1)
+            upacked_attn_output = []
+            for query_states, key_states, value_states, attention_mask_per_sample in zip(unpacked_query_states, unpacked_key_states, unpacked_value_states, attention_mask):
+                with sdpa_kernel(backends=[SDPBackend.EFFICIENT_ATTENTION]):
+                    attn_output = scaled_dot_product_attention(
+                        query_states.to(torch.bfloat16).unsqueeze(0),
+                        key_states.to(torch.bfloat16).unsqueeze(0),
+                        value_states.to(torch.bfloat16).unsqueeze(0),
+                        attention_mask_per_sample.to(torch.bfloat16).unsqueeze(0),
+                    )
+                upacked_attn_output.append(attn_output.squeeze(0))
+            packed_attn_output = torch.cat(upacked_attn_output, dim=1)
+
+            packed_attn_output = packed_attn_output.transpose(0, 1).reshape(-1, self.num_heads * self.head_dim)
+            packed_attn_output_ = packed_attn_output.new_zeros(packed_attn_output.shape)
+            packed_attn_output_[packed_und_token_indexes] = self.o_proj(packed_attn_output[packed_und_token_indexes])
+            packed_attn_output_[packed_gen_token_indexes] = self.o_proj_moe_gen(packed_attn_output[packed_gen_token_indexes])
+        else:  # USED !!!
+            pad_size = sum(sample_lens) - packed_query_states.shape[0]
+            packed_query_states_ = pad_sequence(packed_query_states_.permute(1, 0, 2), pad_size)
+            packed_key_states_ = pad_sequence(packed_key_states_.permute(1, 0, 2), pad_size)
+            packed_value_states = pad_sequence(packed_value_states.permute(1, 0, 2), pad_size)
+            packed_attn_output = flex_attention(
+                packed_query_states_.unsqueeze(0),  # 1, num_head, L, head_dim
+                packed_key_states_.unsqueeze(0),
+                packed_value_states.unsqueeze(0),
+                enable_gqa=True,
+                block_mask=attention_mask,
+            )
+            end_index = packed_attn_output.shape[2] - pad_size
+            packed_attn_output = packed_attn_output[0, :, :end_index, :]
+
+            packed_attn_output = packed_attn_output.transpose(0, 1).reshape(-1, self.num_heads * self.head_dim)
+            packed_attn_output_ = packed_attn_output.new_zeros(packed_attn_output.shape)
+            packed_attn_output_[packed_und_token_indexes] = self.o_proj(packed_attn_output[packed_und_token_indexes])
+            packed_attn_output_[packed_gen_token_indexes] = self.o_proj_moe_gen(packed_attn_output[packed_gen_token_indexes])
+
+        return packed_attn_output_
+
+    def forward_inference(
+        self,
+        packed_query_sequence: torch.Tensor,
+        query_lens: torch.Tensor,
+        packed_query_position_embeddings: torch.Tensor,
+        packed_query_indexes: torch.Tensor,
+        past_key_values: Optional[NaiveCache] = None,
+        key_values_lens: Optional[torch.Tensor] = None,
+        packed_key_value_indexes: Optional[torch.Tensor] = None,
+        update_past_key_values=True,
+        is_causal=True,
+        mode="und",
+        packed_vae_token_indexes=None,
+        packed_text_indexes=None,
+        **kwargs
+    ):
+        if mode == "und":
+            packed_query_states = self.q_proj(packed_query_sequence).view(-1, self.num_heads, self.head_dim)
+            packed_key_states = self.k_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
+            packed_value_states = self.v_proj(packed_query_sequence).view(-1, self.num_key_value_heads, self.head_dim)
+            packed_query_states = self.q_norm(packed_query_states)
+            packed_key_states = self.k_norm(packed_key_states)
+        elif mode == "gen":
+            packed_query_sequence = packed_query_sequence.to(torch.bfloat16)
+            packed_query_states = packed_query_sequence.new_zeros((packed_query_sequence.shape[0], self.num_heads * self.head_dim))
+            packed_key_states = packed_query_sequence.new_zeros((packed_query_sequence.shape[0], self.num_key_value_heads * self.head_dim))
+            packed_value_states = packed_query_sequence.new_zeros((packed_query_sequence.shape[0], self.num_key_value_heads * self.head_dim))
+
+            packed_text_query_sequence = packed_query_sequence[packed_text_indexes]
+            packed_vae_query_sequence = packed_query_sequence[packed_vae_token_indexes]
+
+            packed_query_states[packed_text_indexes] = self.q_proj(packed_text_query_sequence)
+            packed_query_states[packed_vae_token_indexes] = self.q_proj_moe_gen(packed_vae_query_sequence)
+
+            packed_key_states[packed_text_indexes] = self.k_proj(packed_text_query_sequence)
+            packed_key_states[packed_vae_token_indexes] = self.k_proj_moe_gen(packed_vae_query_sequence)
+
+            packed_value_states[packed_text_indexes] = self.v_proj(packed_text_query_sequence)
+            packed_value_states[packed_vae_token_indexes] = self.v_proj_moe_gen(packed_vae_query_sequence)
+
+            packed_query_states = packed_query_states.view(-1, self.num_heads, self.head_dim)
+            packed_key_states = packed_key_states.view(-1, self.num_key_value_heads, self.head_dim)
+            packed_value_states = packed_value_states.view(-1, self.num_key_value_heads, self.head_dim)
+
+            packed_query_states = packed_query_states.to(torch.float32)
+            packed_query_states[packed_text_indexes] = self.q_norm(packed_query_states[packed_text_indexes])
+            packed_query_states[packed_vae_token_indexes] = self.q_norm_moe_gen(packed_query_states[packed_vae_token_indexes])
+
+            packed_key_states = packed_key_states.to(torch.float32)
+            packed_key_states[packed_text_indexes] = self.k_norm(packed_key_states[packed_text_indexes])
+            packed_key_states[packed_vae_token_indexes] = self.k_norm_moe_gen(packed_key_states[packed_vae_token_indexes])
+
+        packed_cos, packed_sin = packed_query_position_embeddings
+        if kwargs.get("apply_qwen_2_5_vl_pos_emb"):
+            packed_query_states = rearrange(packed_query_states, "l (b h) d -> b h l d", h=self.num_heads)
+            packed_key_states = rearrange(packed_key_states, "l (b h) d -> b h l d", h=self.num_key_value_heads)
+            packed_query_states, packed_key_states = apply_multimodal_rotary_pos_emb(
+                packed_query_states, packed_key_states, packed_cos, packed_sin, self.config.rope_scaling["mrope_section"]
+            )
+            packed_query_states = rearrange(packed_query_states, "b h l d -> l (b h) d")
+            packed_key_states = rearrange(packed_key_states, "b h l d -> l (b h) d")
+
+        else:
+            packed_query_states, packed_key_states = apply_rotary_pos_emb(packed_query_states, packed_key_states, packed_cos, packed_sin, unsqueeze_dim=1)
+
+        packed_query_states = packed_query_states.to(torch.bfloat16)
+        packed_key_states = packed_key_states.to(torch.bfloat16)
+        packed_value_states = packed_value_states.to(torch.bfloat16)
+
+        if past_key_values is not None and past_key_values.key_cache[self.layer_idx] is not None:
+            past_key_states = past_key_values.key_cache[self.layer_idx]
+            past_value_states = past_key_values.value_cache[self.layer_idx]
+
+            seqlens = sum(query_lens) + sum(key_values_lens)
+            merged_key_states = past_key_states.new_zeros(size=[seqlens, self.num_key_value_heads, self.head_dim])
+            merged_value_states = past_key_states.new_zeros(size=[seqlens, self.num_key_value_heads, self.head_dim])
+            merged_key_states[packed_query_indexes] = packed_key_states
+            merged_key_states[packed_key_value_indexes] = past_key_states
+            merged_value_states[packed_query_indexes] = packed_value_states
+            merged_value_states[packed_key_value_indexes] = past_value_states
+            key_values_lens = key_values_lens + query_lens
+        else:
+            merged_key_states = packed_key_states
+            merged_value_states = packed_value_states
+            key_values_lens = query_lens
+
+        cu_seqlens_q = torch.nn.functional.pad(torch.cumsum(query_lens, dim=0), (1, 0))
+        cu_seqlens_k = torch.nn.functional.pad(torch.cumsum(key_values_lens, dim=0), (1, 0))
+
+        packed_attn_output = flash_attn_varlen_func(
+            q=packed_query_states,
+            k=merged_key_states,
+            v=merged_value_states,
+            cu_seqlens_q=cu_seqlens_q.to(torch.int32),
+            cu_seqlens_k=cu_seqlens_k.to(torch.int32),
+            max_seqlen_q=max(query_lens).item(),
+            max_seqlen_k=max(key_values_lens).item(),
+            causal=is_causal,
+        )
+        packed_attn_output = packed_attn_output.reshape(-1, self.hidden_size)
+        if mode == "und":
+            packed_attn_output = self.o_proj(packed_attn_output)
+        elif mode == "gen":
+            packed_attn_output[packed_text_indexes] = self.o_proj(packed_attn_output[packed_text_indexes])
+            packed_attn_output[packed_vae_token_indexes] = self.o_proj_moe_gen(packed_attn_output[packed_vae_token_indexes])
+
+        if update_past_key_values:
+            past_key_values.key_cache[self.layer_idx] = merged_key_states
+            past_key_values.value_cache[self.layer_idx] = merged_value_states
+
+        return packed_attn_output, past_key_values
+
+
+class Qwen2DecoderLayer(nn.Module):
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = PackedAttention(config, layer_idx)
+
+        self.mlp = Qwen2MLP(config)
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(self, *args, **kwargs):
+        if self.training or kwargs.get("mode_forward") == "validation":
+            return self.forward_train(*args, **kwargs)
+        else:
+            return self.forward_inference(*args, **kwargs)
+
+    def forward_train(
+        self,
+        packed_sequence: torch.Tensor,
+        sample_lens: List[int],
+        attention_mask,
+        packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        **kwargs,
+    ) -> torch.Tensor:
+
+        residual = packed_sequence
+        packed_sequence = self.input_layernorm(packed_sequence)
+
+        # Self Attention
+        packed_sequence = self.self_attn(
+            packed_sequence=packed_sequence,
+            sample_lens=sample_lens,
+            attention_mask=attention_mask,
+            packed_position_embeddings=packed_position_embeddings,
+            **kwargs,
+        )
+        packed_sequence = residual + packed_sequence
+
+        # Fully Connected
+        residual = packed_sequence
+        packed_sequence = self.post_attention_layernorm(packed_sequence)
+        packed_sequence = self.mlp(packed_sequence)
+        packed_sequence = residual + packed_sequence
+
+        return packed_sequence
+
+    def forward_inference(
+        self,
+        packed_query_sequence: torch.Tensor,
+        query_lens: torch.Tensor,
+        packed_query_position_embeddings: torch.Tensor,
+        packed_query_indexes: torch.Tensor,
+        past_key_values: Optional[NaiveCache] = None,
+        key_values_lens: Optional[torch.Tensor] = None,
+        packed_key_value_indexes: Optional[torch.Tensor] = None,
+        update_past_key_values=True,
+        is_causal=True,
+        **kwargs
+    ) -> BaseNavitOutputWithPast:
+
+        residual = packed_query_sequence
+        packed_query_sequence = self.input_layernorm(packed_query_sequence)
+
+        # Self Attention
+        packed_query_sequence, past_key_values = self.self_attn(
+            packed_query_sequence=packed_query_sequence,
+            query_lens=query_lens,
+            packed_query_position_embeddings=packed_query_position_embeddings,
+            packed_query_indexes=packed_query_indexes,
+            past_key_values=past_key_values,
+            key_values_lens=key_values_lens,
+            packed_key_value_indexes=packed_key_value_indexes,
+            update_past_key_values=update_past_key_values,
+            is_causal=is_causal,
+            **kwargs
+        )
+        packed_query_sequence = residual + packed_query_sequence
+
+        # Fully Connected
+        residual = packed_query_sequence
+        packed_query_sequence = self.post_attention_layernorm(packed_query_sequence)
+        packed_query_sequence = self.mlp(packed_query_sequence)
+        packed_query_sequence = residual + packed_query_sequence
+
+        return packed_query_sequence, past_key_values
+
+
+class Qwen2MoTDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_idx: Optional[int] = None,
+        attn_module: Optional[PackedAttentionMoT] = PackedAttentionMoT,
+    ):
+        """Build the attention module plus two MLPs and two pairs of RMSNorms.
+
+        Args:
+            config: Read for ``hidden_size``, ``freeze_und`` and ``rms_norm_eps``; also
+                passed to the attention module and both MLPs.
+            layer_idx: Forwarded to the attention module.
+            attn_module: Callable invoked as ``attn_module(config, layer_idx)`` to
+                build the attention module.
+                NOTE(review): annotated ``Optional[...]`` but it is called
+                unconditionally, so ``None`` would fail — confirm the intended type.
+        """
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.freeze_und = config.freeze_und
+
+        self.self_attn: PackedAttentionMoT = attn_module(config, layer_idx)
+
+        # Unsuffixed modules are applied to the "und" token rows in forward_train;
+        # the `*_moe_gen` twins are applied to the "gen" token rows.
+        self.mlp = Qwen2MLP(config)
+        self.mlp_moe_gen = Qwen2MLP(config)
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm_moe_gen = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm_moe_gen = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(self, *args, **kwargs):
+        if self.training or kwargs.get("mode_forward") == "validation":
+            return self.forward_train(*args, **kwargs)
+        else:
+            return self.forward_inference(*args, **kwargs)
+
+    def forward_train(
+        self,
+        packed_sequence: torch.Tensor,
+        sample_lens: List[int],
+        attention_mask,
+        packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        packed_und_token_indexes: torch.LongTensor,
+        packed_gen_token_indexes: torch.LongTensor,
+        **kwargs,
+    ) -> torch.Tensor:
+
+        residual = packed_sequence
+        packed_sequence_ = packed_sequence.new_zeros(packed_sequence.shape)
+        packed_sequence_[packed_und_token_indexes] = self.input_layernorm(packed_sequence[packed_und_token_indexes])
+        packed_sequence_[packed_gen_token_indexes] = self.input_layernorm_moe_gen(packed_sequence[packed_gen_token_indexes])
+
+        # Self Attention
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(device=packed_sequence_.device)
+
+        packed_sequence_ = self.self_attn(
+            packed_sequence=packed_sequence_,
+            sample_lens=sample_lens,
+            attention_mask=attention_mask,
+            packed_position_embeddings=packed_position_embeddings,
+            packed_und_token_indexes=packed_und_token_indexes,
+            packed_gen_token_indexes=packed_gen_token_indexes,
+            **kwargs,
+        )
+        if self.freeze_und:
+            packed_sequence_[packed_und_token_indexes] = packed_sequence_[packed_und_token_indexes].detach()
+        packed_sequence = residual + packed_sequence_
+
+        # Fully Connected
+        residual = packed_sequence
+        packed_sequence_ = packed_sequence.new_zeros(packed_sequence.shape)
+        packed_sequence_[packed_und_token_indexes] = self.mlp(self.post_attention_layernorm(packed_sequence[packed_und_token_indexes]))
+        if self.freeze_und:
+            packed_sequence_[packed_und_token_indexes] = packed_sequence_[packed_und_token_indexes].detach()
+
+        packed_sequence_[packed_gen_token_indexes] = self.mlp_moe_gen(self.post_attention_layernorm_moe_gen(packed_sequence[packed_gen_token_indexes]))
+        packed_sequence = residual + packed_sequence_
+
+        return packed_sequence
+
+    def forward_inference(
+        self,
+        packed_query_sequence: torch.Tensor,
+        query_lens: torch.Tensor,
+        packed_query_position_embeddings: torch.Tensor,
+        packed_query_indexes: torch.Tensor,
+        past_key_values: Optional[NaiveCache] = None,
+        key_values_lens: Optional[torch.Tensor] = None,
+        packed_key_value_indexes: Optional[torch.Tensor] = None,
+        update_past_key_values=True,
+        is_causal=True,
+        mode="und",
+        packed_vae_token_indexes=None,
+        packed_text_indexes=None,
+        **kwargs
+    ) -> BaseNavitOutputWithPast:
+
+        residual = packed_query_sequence
+        if mode == "und":
+            packed_query_sequence = self.input_layernorm(packed_query_sequence)
+        elif mode == "gen":
+            packed_query_sequence_ = torch.zeros_like(packed_query_sequence)
+            packed_query_sequence_[packed_text_indexes] = self.input_layernorm(packed_query_sequence[packed_text_indexes])
+            packed_query_sequence_[packed_vae_token_indexes] = self.input_layernorm_moe_gen(packed_query_sequence[packed_vae_token_indexes])
+            packed_query_sequence = packed_query_sequence_
+
+        # Self Attention
+        packed_query_sequence, past_key_values = self.self_attn(
+            packed_query_sequence=packed_query_sequence,
+            query_lens=query_lens,
+            packed_query_position_embeddings=packed_query_position_embeddings,
+            packed_query_indexes=packed_query_indexes,
+            past_key_values=past_key_values,
+            key_values_lens=key_values_lens,
+            packed_key_value_indexes=packed_key_value_indexes,
+            update_past_key_values=update_past_key_values,
+            is_causal=is_causal,
+            mode=mode,
+            packed_vae_token_indexes=packed_vae_token_indexes,
+            packed_text_indexes=packed_text_indexes,
+            **kwargs,
+        )
+        packed_query_sequence = residual + packed_query_sequence
+
+        # Fully Connected
+        residual = packed_query_sequence
+        if mode == "und":
+            packed_query_sequence = self.post_attention_layernorm(packed_query_sequence)
+            packed_query_sequence = self.mlp(packed_query_sequence)
+        elif mode == "gen":
+            packed_text_query_sequence = packed_query_sequence[packed_text_indexes]
+            packed_vae_query_sequence = packed_query_sequence[packed_vae_token_indexes]
+            packed_text_query_sequence = self.post_attention_layernorm(packed_text_query_sequence).to(torch.bfloat16)
+            packed_vae_query_sequence = self.post_attention_layernorm_moe_gen(packed_vae_query_sequence).to(torch.bfloat16)
+
+            packed_query_sequence_ = torch.zeros_like(packed_query_sequence).to(torch.bfloat16)
+            packed_query_sequence_[packed_text_indexes] = self.mlp(packed_text_query_sequence)
+            packed_query_sequence_[packed_vae_token_indexes] = self.mlp_moe_gen(packed_vae_query_sequence)
+            packed_query_sequence = packed_query_sequence_
+
+        packed_query_sequence = residual + packed_query_sequence
+        return packed_query_sequence, past_key_values
+
+
+class Qwen2MoEDecoderLayer(nn.Module):
+    """Decoder layer with shared attention/norms and two MLP branches.
+
+    ``self.mlp`` processes the rows selected by ``packed_und_token_indexes``
+    (training) or ``packed_text_indexes`` (inference, ``mode="gen"``), and every
+    row for inference with ``mode="und"``. ``self.mlp_moe_gen`` processes the
+    rows selected by ``packed_gen_token_indexes`` / ``packed_vae_token_indexes``.
+    Attention and both RMSNorm modules are shared by all rows.
+    """
+
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        """Create the attention module, the two MLP branches and the shared norms.
+
+        Args:
+            config: model config; ``hidden_size`` and ``rms_norm_eps`` are read here.
+            layer_idx: forwarded to ``PackedAttention``.
+        """
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = PackedAttention(config, layer_idx)
+
+        self.mlp = Qwen2MLP(config)
+        self.mlp_moe_gen = Qwen2MLP(config)
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(self, *args, **kwargs):
+        """Dispatch to ``forward_train`` (training mode or
+        ``mode_forward="validation"``) or to ``forward_inference`` otherwise."""
+        if self.training or kwargs.get("mode_forward") == "validation":
+            return self.forward_train(*args, **kwargs)
+        else:
+            return self.forward_inference(*args, **kwargs)
+
+    def forward_train(
+        self,
+        packed_sequence: torch.Tensor,
+        sample_lens: List[int],
+        attention_mask,
+        packed_position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        packed_und_token_indexes: torch.LongTensor,
+        packed_gen_token_indexes: torch.LongTensor,
+        mode=None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Training/validation pass over a packed sequence.
+
+        Args:
+            packed_sequence: hidden states; indexed along dim 0 by the index tensors.
+            sample_lens: forwarded to ``self.self_attn``.
+            attention_mask: forwarded to ``self.self_attn``.
+            packed_position_embeddings: forwarded to ``self.self_attn``.
+            packed_und_token_indexes: rows routed through ``self.mlp``.
+            packed_gen_token_indexes: rows routed through ``self.mlp_moe_gen``.
+            mode: accepted for signature compatibility; not used.
+            **kwargs: extra keywords from the caller. ``Qwen2Model`` always
+                injects ``apply_qwen_2_5_vl_pos_emb`` and ``forward`` passes
+                ``mode_forward`` through, so without this catch-all the layer
+                raised ``TypeError``. They are forwarded to attention, as
+                ``Qwen2MoTDecoderLayer`` does.
+                NOTE(review): assumes ``PackedAttention`` accepts these
+                keywords — confirm.
+
+        Returns:
+            The updated hidden states, with the same shape as ``packed_sequence``.
+        """
+
+        residual = packed_sequence
+        packed_sequence = self.input_layernorm(packed_sequence)
+
+        # Self Attention
+        packed_sequence = self.self_attn(
+            packed_sequence=packed_sequence,
+            sample_lens=sample_lens,
+            attention_mask=attention_mask,
+            packed_position_embeddings=packed_position_embeddings,
+            **kwargs,
+        )
+        packed_sequence = residual + packed_sequence
+
+        # Fully Connected
+        residual = packed_sequence
+        packed_sequence = self.post_attention_layernorm(packed_sequence)
+
+        # Rows that appear in neither index tensor stay zero in this buffer.
+        packed_sequence_new = packed_sequence.new_zeros(packed_sequence.shape)
+        packed_sequence_und = self.mlp(packed_sequence[packed_und_token_indexes])
+        packed_sequence_gen = self.mlp_moe_gen(packed_sequence[packed_gen_token_indexes])
+        packed_sequence_new[packed_und_token_indexes] = packed_sequence_und
+        packed_sequence_new[packed_gen_token_indexes] = packed_sequence_gen
+
+        packed_sequence = residual + packed_sequence_new
+
+        return packed_sequence
+
+    def forward_inference(
+        self,
+        packed_query_sequence: torch.Tensor,
+        query_lens: torch.Tensor,
+        packed_query_position_embeddings: torch.Tensor,
+        packed_query_indexes: torch.Tensor,
+        past_key_values: Optional[NaiveCache] = None,
+        key_values_lens: Optional[torch.Tensor] = None,
+        packed_key_value_indexes: Optional[torch.Tensor] = None,
+        update_past_key_values=True,
+        is_causal=True,
+        mode="und",
+        packed_vae_token_indexes=None,
+        packed_text_indexes=None,
+        **kwargs,
+    ) -> BaseNavitOutputWithPast:
+        """Inference pass over a packed query sequence.
+
+        With ``mode="und"`` every row goes through ``self.mlp``. With
+        ``mode="gen"`` the rows at ``packed_text_indexes`` go through
+        ``self.mlp`` and the rows at ``packed_vae_token_indexes`` through
+        ``self.mlp_moe_gen``. For any other ``mode`` no MLP is applied and the
+        normalized state is added to the residual as is.
+
+        ``**kwargs`` catches the extra keywords that ``Qwen2Model`` always
+        passes (e.g. ``apply_qwen_2_5_vl_pos_emb``), which previously raised
+        ``TypeError``; they are forwarded to attention.
+        NOTE(review): assumes ``PackedAttention`` accepts these keywords — confirm.
+
+        Returns:
+            A ``(packed_query_sequence, past_key_values)`` tuple (the code
+            returns a plain tuple, not a ``BaseNavitOutputWithPast`` instance).
+        """
+
+        residual = packed_query_sequence
+        packed_query_sequence = self.input_layernorm(packed_query_sequence)
+
+        # Self Attention
+        packed_query_sequence, past_key_values = self.self_attn(
+            packed_query_sequence=packed_query_sequence,
+            query_lens=query_lens,
+            packed_query_position_embeddings=packed_query_position_embeddings,
+            packed_query_indexes=packed_query_indexes,
+            past_key_values=past_key_values,
+            key_values_lens=key_values_lens,
+            packed_key_value_indexes=packed_key_value_indexes,
+            update_past_key_values=update_past_key_values,
+            is_causal=is_causal,
+            **kwargs,
+        )
+        packed_query_sequence = residual + packed_query_sequence
+
+        # Fully Connected
+        residual = packed_query_sequence
+        packed_query_sequence = self.post_attention_layernorm(packed_query_sequence)
+        if mode == "und":
+            packed_query_sequence = self.mlp(packed_query_sequence)
+        elif mode == "gen":
+            # NOTE(review): the scatter buffer dtype is hard-coded to bfloat16.
+            packed_query_sequence_ = torch.zeros_like(packed_query_sequence).to(torch.bfloat16)
+            packed_query_sequence_[packed_text_indexes] = self.mlp(packed_query_sequence[packed_text_indexes])
+            packed_query_sequence_[packed_vae_token_indexes] = self.mlp_moe_gen(packed_query_sequence[packed_vae_token_indexes])
+            packed_query_sequence = packed_query_sequence_
+        packed_query_sequence = residual + packed_query_sequence
+
+        return packed_query_sequence, past_key_values
+
+
+# Registry of decoder-layer factories, looked up by the `config.layer_module` string.
+Decoder_layer_dict = dict(
+    Qwen2DecoderLayer=Qwen2DecoderLayer,
+    Qwen2MoEDecoderLayer=Qwen2MoEDecoderLayer,
+    Qwen2MoTDecoderLayer=partial(Qwen2MoTDecoderLayer, attn_module=PackedAttentionMoT),
+)
+
+
+class Qwen2Model(Qwen2PreTrainedModel):
+    """Packed-sequence Qwen2 backbone: token embedding, decoder stack and final norm(s)."""
+
+    def __init__(self, config: Qwen2Config):
+        """Build the embedding table, the decoder layers, the final norm(s) and the rotary module."""
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        # Layer names containing "Mo" select the dual-branch code path.
+        self.use_moe = "Mo" in config.layer_module
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        build_layer = Decoder_layer_dict[config.layer_module]
+        # Building the layer stack is the slow part of construction.
+        self.layers = nn.ModuleList(
+            [build_layer(config, idx) for idx in range(config.num_hidden_layers)]
+        )
+
+        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        if self.use_moe:
+            self.norm_moe_gen = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.apply_qwen_2_5_vl_pos_emb = config.apply_qwen_2_5_vl_pos_emb
+        rotary_cls = Qwen2_5_VLRotaryEmbedding if self.apply_qwen_2_5_vl_pos_emb else Qwen2RotaryEmbedding
+        self.rotary_emb = rotary_cls(config=config)
+
+        # self.post_init() is deliberately not called here (too slow, not used in inference).
+
+    def forward(self, *args, **kwargs):
+        """Route to ``forward_train`` (training mode or ``mode_forward="validation"``),
+        otherwise to ``forward_inference``."""
+        use_train_path = self.training or kwargs.get("mode_forward") == "validation"
+        handler = self.forward_train if use_train_path else self.forward_inference
+        return handler(*args, **kwargs)
+
+    def forward_train(
+        self,
+        packed_sequence: torch.Tensor,
+        sample_lens: List[int],
+        attention_mask,
+        packed_position_ids: torch.Tensor,
+        packed_und_token_indexes: Optional[torch.LongTensor] = None,
+        packed_gen_token_indexes: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Run the decoder stack on a packed training sequence and apply the final norm(s).
+
+        When ``use_moe`` is set, ``packed_und_token_indexes`` is required and a
+        missing ``packed_gen_token_indexes`` is replaced by an empty index tensor.
+        ``apply_qwen_2_5_vl_pos_emb`` is added to ``kwargs`` before they are
+        handed to every decoder layer.
+        """
+        if self.config.freeze_und:
+            packed_sequence[packed_und_token_indexes] = packed_sequence[packed_und_token_indexes].detach()
+
+        # Position embeddings are computed once and shared by all decoder layers.
+        if self.apply_qwen_2_5_vl_pos_emb:
+            packed_position_embeddings = self.rotary_emb(packed_sequence.unsqueeze(0), packed_position_ids)
+        else:
+            cos, sin = self.rotary_emb(packed_sequence, packed_position_ids.unsqueeze(0))
+            packed_position_embeddings = (cos.squeeze(0), sin.squeeze(0))
+        kwargs["apply_qwen_2_5_vl_pos_emb"] = self.apply_qwen_2_5_vl_pos_emb
+
+        branch_inputs = {}
+        if self.use_moe:
+            assert packed_und_token_indexes is not None
+            if packed_gen_token_indexes is None:
+                packed_gen_token_indexes = packed_und_token_indexes.new_ones(size=[0])
+            branch_inputs["packed_und_token_indexes"] = packed_und_token_indexes
+            branch_inputs["packed_gen_token_indexes"] = packed_gen_token_indexes
+
+        for layer in self.layers:
+            packed_sequence = layer(
+                packed_sequence=packed_sequence,
+                sample_lens=sample_lens,
+                attention_mask=attention_mask,
+                packed_position_embeddings=packed_position_embeddings,
+                **branch_inputs,
+                **kwargs,
+            )
+
+        if not self.use_moe:
+            return self.norm(packed_sequence)
+
+        # Dual-branch final norm: und rows use `norm`, gen rows use `norm_moe_gen`.
+        normed = torch.zeros_like(packed_sequence)
+        normed[packed_und_token_indexes] = self.norm(packed_sequence[packed_und_token_indexes]).to(dtype=packed_sequence.dtype)
+        if self.config.freeze_und:
+            normed[packed_und_token_indexes] = normed[packed_und_token_indexes].detach()
+        normed[packed_gen_token_indexes] = self.norm_moe_gen(packed_sequence[packed_gen_token_indexes]).to(dtype=packed_sequence.dtype)
+        return normed
+
+    def forward_inference(
+        self,
+        packed_query_sequence: torch.Tensor,
+        query_lens: torch.Tensor,
+        packed_query_position_ids: torch.Tensor,
+        packed_query_indexes: torch.Tensor,
+        past_key_values: Optional[NaiveCache] = None,
+        key_values_lens: Optional[torch.Tensor] = None,
+        packed_key_value_indexes: Optional[torch.Tensor] = None,
+        update_past_key_values=True,
+        is_causal=True,
+        mode="und",
+        packed_vae_token_indexes=None,
+        packed_text_indexes=None,
+        **kwargs,
+    ) -> BaseNavitOutputWithPast:
+        """Run the decoder stack on a packed query sequence and wrap the result.
+
+        When ``use_moe`` is set, ``mode`` is passed to every layer, and for
+        ``mode="gen"`` both ``packed_vae_token_indexes`` and
+        ``packed_text_indexes`` are required and passed as well.
+        """
+        # Position embeddings are computed once and shared by all decoder layers.
+        if self.apply_qwen_2_5_vl_pos_emb:
+            packed_query_position_embeddings = self.rotary_emb(packed_query_sequence.unsqueeze(0), packed_query_position_ids)
+        else:
+            cos, sin = self.rotary_emb(packed_query_sequence, packed_query_position_ids.unsqueeze(0))
+            packed_query_position_embeddings = (cos.squeeze(0), sin.squeeze(0))
+        kwargs["apply_qwen_2_5_vl_pos_emb"] = self.apply_qwen_2_5_vl_pos_emb
+
+        branch_inputs = {}
+        if self.use_moe:
+            branch_inputs["mode"] = mode
+            if mode == "gen":
+                assert packed_vae_token_indexes is not None
+                assert packed_text_indexes is not None
+                branch_inputs["packed_vae_token_indexes"] = packed_vae_token_indexes
+                branch_inputs["packed_text_indexes"] = packed_text_indexes
+
+        for layer in self.layers:
+            packed_query_sequence, past_key_values = layer(
+                packed_query_sequence=packed_query_sequence,
+                query_lens=query_lens,
+                packed_query_position_embeddings=packed_query_position_embeddings,
+                packed_query_indexes=packed_query_indexes,
+                past_key_values=past_key_values,
+                key_values_lens=key_values_lens,
+                packed_key_value_indexes=packed_key_value_indexes,
+                update_past_key_values=update_past_key_values,
+                is_causal=is_causal,
+                **branch_inputs,
+                **kwargs,
+            )
+
+        if not self.use_moe or mode == "und":
+            packed_query_sequence = self.norm(packed_query_sequence)
+        elif mode == "gen":
+            # Text rows use `norm`, VAE rows use `norm_moe_gen`.
+            normed = torch.zeros_like(packed_query_sequence)
+            normed[packed_text_indexes] = self.norm(packed_query_sequence[packed_text_indexes])
+            normed[packed_vae_token_indexes] = self.norm_moe_gen(packed_query_sequence[packed_vae_token_indexes])
+            packed_query_sequence = normed
+
+        return BaseNavitOutputWithPast(
+            packed_query_sequence=packed_query_sequence,
+            past_key_values=past_key_values,
+        )
+
+
+class Qwen2ForCausalLM(Qwen2PreTrainedModel):
+    """``Qwen2Model`` backbone plus an ``lm_head`` projection.
+
+    Also provides helpers for untying/initializing/freezing weights and for
+    computing Qwen-VL style 3D rope position ids.
+    """
+
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: Qwen2Config):
+        """Create the backbone and the bias-free ``lm_head`` linear layer."""
+        super().__init__(config)
+        self.model: Qwen2Model = Qwen2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        # self.post_init() # NOTE too slow, not used in inference
+
+    # === Added: untie + clone (avoids save errors caused by the tied-weight alias) ===
+    def untie_lm_head(self):
+        """Give ``lm_head`` its own copy of the input embedding weights.
+
+        If both weights share storage, ``lm_head.weight`` is replaced by a
+        cloned parameter. Tying is then disabled on the config and
+        ``_tied_weights_keys`` is cleared on this instance.
+        """
+        in_emb = self.get_input_embeddings()
+        out_emb = self.get_output_embeddings()
+        if out_emb.weight.data.data_ptr() == in_emb.weight.data.data_ptr():
+            with torch.no_grad():
+                out_emb.weight = torch.nn.Parameter(in_emb.weight.detach().clone())
+        # Prevent any later automatic re-tie.
+        self.config.tie_word_embeddings = False
+        if hasattr(self, "_tied_weights_keys"):
+            self._tied_weights_keys = []
+
+    # === Added: after the vocab grows, copy the new rows from the input table to lm_head (values only) ===
+    def copy_new_token_rows_to_lm_head(self, num_new_tokens: int):
+        """Copy the last ``num_new_tokens`` embedding rows into ``lm_head``.
+
+        Does nothing when ``num_new_tokens`` is falsy or not positive.
+        """
+        if not (num_new_tokens and num_new_tokens > 0):
+            return
+        in_emb = self.get_input_embeddings()
+        out_emb = self.get_output_embeddings()
+        with torch.no_grad():
+            out_emb.weight[-num_new_tokens:].copy_(in_emb.weight[-num_new_tokens:])
+
+    def init_moe(self):
+        """Initialize every ``*_moe_gen`` parameter from its un-suffixed counterpart.
+
+        The counterpart name is obtained by removing ``"_moe_gen"`` from the
+        parameter name; a warning is printed when it does not exist.
+        """
+        # Build the state dict once; its tensors alias the live parameters, so a
+        # single snapshot is equivalent to rebuilding it on every iteration.
+        state = self.state_dict()
+        for name, param in self.named_parameters():
+            if "moe_gen" not in name:
+                continue
+            original_name = name.replace("_moe_gen", "")
+            if original_name in state:
+                param.data.copy_(state[original_name].data)
+            else:
+                print(f"Warning: {original_name} not found in state_dict, skipping copy.")
+
+    def freeze_llm_params(self):
+        """Switch to eval mode and disable gradients for every parameter."""
+        self.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def freeze_embed_tokens(self):
+        """Disable gradients for the input embedding table."""
+        for name, param in self.model.embed_tokens.named_parameters():
+            param.requires_grad = False
+
+    def freeze_lm_head(self):
+        """Disable gradients for ``lm_head``."""
+        for name, param in self.lm_head.named_parameters():
+            param.requires_grad = False
+
+    def freeze_und_params(self):
+        """Freeze the understanding branch: every parameter whose name lacks ``"moe_gen"``."""
+        for name, param in self.named_parameters():
+            if "moe_gen" not in name:
+                param.requires_grad = False
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def forward(self, *args, **kwargs):
+        """Dispatch to ``forward_train`` (training mode or
+        ``mode_forward="validation"``) or to ``forward_inference`` otherwise."""
+        if self.training or kwargs.get("mode_forward") == "validation":
+            return self.forward_train(*args, **kwargs)
+        else:
+            return self.forward_inference(*args, **kwargs)
+
+    def forward_train(
+        self,
+        packed_sequence: torch.Tensor,
+        sample_lens: List[int],
+        attention_mask,
+        packed_position_ids: torch.Tensor,
+        packed_und_token_indexes: Optional[torch.LongTensor] = None,
+        packed_gen_token_indexes: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Forward all arguments to ``self.model.forward`` and return its output.
+
+        ``lm_head`` is not applied here.
+        """
+        outputs = self.model.forward(
+            packed_sequence=packed_sequence,
+            sample_lens=sample_lens,
+            packed_position_ids=packed_position_ids,
+            attention_mask=attention_mask,
+            packed_und_token_indexes=packed_und_token_indexes,
+            packed_gen_token_indexes=packed_gen_token_indexes,
+            **kwargs,
+        )
+        return outputs
+
+    def forward_inference(
+        self,
+        packed_query_sequence: torch.Tensor,
+        query_lens: torch.Tensor,
+        packed_query_position_ids: torch.Tensor,
+        packed_query_indexes: torch.Tensor,
+        past_key_values: Optional[NaiveCache] = None,
+        key_values_lens: Optional[torch.Tensor] = None,
+        packed_key_value_indexes: Optional[torch.Tensor] = None,
+        update_past_key_values=True,
+        is_causal=True,
+        mode="und",
+        packed_vae_token_indexes=None,
+        packed_text_indexes=None,
+        **kwargs,
+    ) -> BaseNavitOutputWithPast:
+        """Forward all arguments to ``self.model.forward`` and return its output.
+
+        ``lm_head`` is not applied here.
+        """
+
+        outputs = self.model.forward(
+            packed_query_sequence=packed_query_sequence,
+            query_lens=query_lens,
+            packed_query_position_ids=packed_query_position_ids,
+            packed_query_indexes=packed_query_indexes,
+            past_key_values=past_key_values,
+            key_values_lens=key_values_lens,
+            packed_key_value_indexes=packed_key_value_indexes,
+            update_past_key_values=update_past_key_values,
+            is_causal=is_causal,
+            mode=mode,
+            packed_vae_token_indexes=packed_vae_token_indexes,
+            packed_text_indexes=packed_text_indexes,
+            **kwargs,
+        )
+
+        return outputs
+
+    # Added: compute the rope index for Qwen-VL
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        second_per_grid_ts: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        image_token_id: Optional[int] = None,
+        video_token_id: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+        Explanation:
+            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+            Examples:
+                input_ids: [T T T T T], here T is for text.
+                temporal position_ids: [0, 1, 2, 3, 4]
+                height position_ids: [0, 1, 2, 3, 4]
+                width position_ids: [0, 1, 2, 3, 4]
+
+            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+            and 1D rotary position embedding for text part.
+            Examples:
+                Temporal (Time): 3 patches, representing different segments of the video in time.
+                Height: 2 patches, dividing each frame vertically.
+                Width: 2 patches, dividing each frame horizontally.
+                We also have some important parameters:
+                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that consecutive temporal patches differ by 50 in their temporal position IDs.
+                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+                text temporal position_ids: [101, 102, 103, 104, 105]
+                text height position_ids: [101, 102, 103, 104, 105]
+                text width position_ids: [101, 102, 103, 104, 105]
+                Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+                it.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+            second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
+                The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
+                When omitted, 1.0 is used for every video.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                May be omitted when it would be all ones.
+            image_token_id (`int`, *optional*):
+                Id of the image placeholder token. Defaults to `self.config.image_token_id`.
+            video_token_id (`int`, *optional*):
+                Id of the video placeholder token. Defaults to `self.config.video_token_id`.
+
+        Returns:
+            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
+            mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`)
+        """
+        spatial_merge_size = self.config.vision_config['spatial_merge_size']
+        # Honor explicitly passed token ids; previously the arguments were
+        # unconditionally overwritten by the config values and had no effect.
+        if image_token_id is None:
+            image_token_id = self.config.image_token_id
+        if video_token_id is None:
+            video_token_id = self.config.video_token_id
+        vision_start_token_id = self.config.vision_start_token_id
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )  # (3, batch_size, sequence_length)
+            image_index, video_index = 0, 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            for i, sample_ids in enumerate(total_input_ids):
+                # Keep only the non-padding tokens of this sample.
+                sample_ids = sample_ids[attention_mask[i] == 1]
+                image_nums, video_nums = 0, 0
+                vision_start_indices = torch.argwhere(sample_ids == vision_start_token_id).squeeze(1)
+                # The token right after each vision-start marker tells whether the segment is an image or a video.
+                vision_tokens = sample_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (vision_tokens == video_token_id).sum()
+                input_tokens = sample_ids.tolist()
+                llm_pos_ids_list: list = []
+                st = 0
+                remain_images, remain_videos = image_nums, video_nums
+                # Walk the vision segments in order of appearance. ed_image / ed_video are the
+                # indexes of the next image / video placeholder token at or after `st`
+                # (or a value past the end when none remains).
+                for _ in range(image_nums + video_nums):
+                    if image_token_id in input_tokens and remain_images > 0:
+                        ed_image = input_tokens.index(image_token_id, st)
+                    else:
+                        ed_image = len(input_tokens) + 1
+                    if video_token_id in input_tokens and remain_videos > 0:
+                        ed_video = input_tokens.index(video_token_id, st)
+                    else:
+                        ed_video = len(input_tokens) + 1
+                    if ed_image < ed_video:
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        # Images have no temporal extent: all temporal offsets become 0.
+                        second_per_grid_t = 0
+                        image_index += 1
+                        remain_images -= 1
+                        ed = ed_image
+
+                    else:
+                        t, h, w = (
+                            video_grid_thw[video_index][0],
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+                        if second_per_grid_ts is not None:
+                            second_per_grid_t = second_per_grid_ts[video_index]
+                        else:
+                            second_per_grid_t = 1.0
+                        video_index += 1
+                        remain_videos -= 1
+                        ed = ed_video
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_merge_size,
+                        w.item() // spatial_merge_size,
+                    )
+                    # Text tokens between the previous segment and this vision segment.
+                    text_len = ed - st
+
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                    range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+                    expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
+
+                    time_tensor = expanded_range * second_per_grid_t * self.config.vision_config['tokens_per_second']
+
+                    time_tensor_long = time_tensor.long()
+                    t_index = time_tensor_long.flatten()
+
+                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+                # Trailing text after the last vision segment.
+                if st < len(input_tokens):
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    text_len = len(input_tokens) - st
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=total_input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            # Text-only path: plain 1D positions replicated over the three rope axes.
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
diff --git a/modeling/qwen2/__init__.py b/modeling/qwen2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..58a070f8136040af828773e4870eedd3587c10c3
--- /dev/null
+++ b/modeling/qwen2/__init__.py
@@ -0,0 +1,63 @@
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import TYPE_CHECKING
+
+from transformers.utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_tokenizers_available,
+    is_torch_available,
+)
+
+
+_import_structure = {  # submodule name -> public names it provides; handed to _LazyModule at the end of this file
+    "configuration_qwen2": ["Qwen2Config"],
+    "tokenization_qwen2": ["Qwen2Tokenizer"],
+}
+# The fast tokenizer is only advertised when is_tokenizers_available() returns True.
+try:
+    if not is_tokenizers_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["tokenization_qwen2_fast"] = ["Qwen2TokenizerFast"]
+# The model classes are only advertised when is_torch_available() returns True.
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_qwen2"] = [
+        "Qwen2ForCausalLM",
+        "Qwen2Model",
+        "Qwen2PreTrainedModel",
+    ]
+
+# NOTE(review): every branch under TYPE_CHECKING below is `pass`, so no names are imported for static type checkers.
+if TYPE_CHECKING:
+    pass
+
+    try:
+        if not is_tokenizers_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        pass
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        pass
+
+
+else:
+    import sys
+    # At runtime this module object is replaced by a _LazyModule built from _import_structure.
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/modeling/qwen2/configuration_qwen2.py b/modeling/qwen2/configuration_qwen2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fda7ba23be69ce895f5be5689aea9924c5a9004a
--- /dev/null
+++ b/modeling/qwen2/configuration_qwen2.py
@@ -0,0 +1,260 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""Qwen2 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class _Qwen2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
+    Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Beyond the arguments listed below, `is_causal` and `_attn_implementation` are accepted and stored unchanged.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen2Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+        max_window_layers (`int`, *optional*, defaults to 28):
+            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import Qwen2Model, Qwen2Config
+
+    >>> # Initializing a Qwen2 style configuration
+    >>> configuration = Qwen2Config()
+
+    >>> # Initializing a model from the Qwen2-7B style configuration
+    >>> model = Qwen2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        attention_dropout=0.0,
+        is_causal=True,  # stored as self.is_causal below
+        _attn_implementation="flash_attention_2",  # stored as self._attn_implementation below
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window if use_sliding_window else None  # window size is dropped when SWA is off
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        self.is_causal = is_causal
+        self._attn_implementation = _attn_implementation
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]  # writes into the caller-supplied dict
+        rope_config_validation(self)  # runs before super().__init__, on the attributes assigned above
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+class Qwen2Config(_Qwen2Config):  # _Qwen2Config plus qk-norm, layer-module, freeze and position-embedding options
+    model_type = "qwen2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Every parameter up to `_attn_implementation` is forwarded unchanged to _Qwen2Config.__init__.
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=28,
+        attention_dropout=0.0,
+        is_causal=True,
+        _attn_implementation="flash_attention_2",
+        qk_norm=True,  # stored as self.qk_norm; also the fallback for qk_norm_und / qk_norm_gen below
+        layer_module="Qwen2DecoderLayer",  # stored as a string; presumably a layer class name resolved elsewhere — TODO confirm
+        freeze_und=False,  # stored as self.freeze_und; its consumer is not visible here — TODO confirm meaning
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            hidden_act=hidden_act,
+            max_position_embeddings=max_position_embeddings,
+            initializer_range=initializer_range,
+            rms_norm_eps=rms_norm_eps,
+            use_cache=use_cache,
+            tie_word_embeddings=tie_word_embeddings,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            use_sliding_window=use_sliding_window,
+            sliding_window=sliding_window,
+            max_window_layers=max_window_layers,
+            attention_dropout=attention_dropout,
+            is_causal=is_causal,
+            _attn_implementation=_attn_implementation,
+            **kwargs,
+        )
+        self.qk_norm = qk_norm
+        self.layer_module = layer_module
+        self.freeze_und = freeze_und
+
+        # Split qk-norm switches: read from the same kwargs already forwarded to super().__init__; default to qk_norm.
+        self.qk_norm_und = kwargs.get("qk_norm_und", qk_norm)
+        self.qk_norm_gen = kwargs.get("qk_norm_gen", qk_norm)
+
+        # Optional kwarg, False when absent; presumably enables Qwen2.5-VL-style position embeddings — TODO confirm.
+        self.apply_qwen_2_5_vl_pos_emb = kwargs.get("apply_qwen_2_5_vl_pos_emb", False)
+
+        # # Newly added algorithm-design section (currently disabled)
+        # self.mixture_kvdepth_type = kwargs.get("mixture_kvdepth_type", "none")
diff --git a/modeling/qwen2/modeling_qwen2.py b/modeling/qwen2/modeling_qwen2.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b02f923d0d8c80441be1a25e8948784d6ccf28
--- /dev/null
+++ b/modeling/qwen2/modeling_qwen2.py
@@ -0,0 +1,946 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""PyTorch Qwen2 model."""
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_qwen2 import Qwen2Config
+# from qwen2.modeling_qwen2 import Qwen2Config
+
+
+if is_flash_attn_2_available():
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
+
+
+logger = logging.get_logger(__name__)
+
+
+_CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B"
+_CONFIG_FOR_DOC = "Qwen2Config"
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Qwen2
+class Qwen2RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last axis with a learned per-channel gain (no mean, no bias)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # Statistics are taken in float32; the result is cast back before the gain is applied.
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normed = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normed.to(orig_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2
+class Qwen2RotaryEmbedding(nn.Module):  # forward() returns the (cos, sin) rotary tables for the given position ids
+    def __init__(
+        self,
+        dim=None,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        rope_type="default",
+        config: Optional[Qwen2Config] = None,
+    ):
+        super().__init__()
+        # TODO (joao): remove the `if` below, only used for BC
+        self.rope_kwargs = {}
+        if config is None:  # legacy path: parameters come from the individual keyword arguments
+            logger.warning_once(
+                "`Qwen2RotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                "`config` argument. All other arguments will be removed in v4.46"
+            )
+            self.rope_kwargs = {
+                "rope_type": rope_type,
+                "factor": scaling_factor,
+                "dim": dim,
+                "base": base,
+                "max_position_embeddings": max_position_embeddings,
+            }
+            self.rope_type = rope_type
+            self.max_seq_len_cached = max_position_embeddings
+            self.original_max_seq_len = max_position_embeddings
+        else:  # config-driven path: rope_kwargs stays empty
+            # BC: "rope_type" was originally "type"
+            if config.rope_scaling is not None:
+                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+            else:
+                self.rope_type = "default"
+            self.max_seq_len_cached = config.max_position_embeddings
+            self.original_max_seq_len = config.max_position_embeddings
+        # NOTE(review): the next line discards the rope_type chosen above; the "default" init function is always used.
+        self.rope_type = "default"
+        # As a result `"dynamic" in self.rope_type` in forward() is always False — confirm this is intentional.
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq  # kept so _dynamic_frequency_update can restore it
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1  # largest position id + 1, used as the current sequence length
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len, **self.rope_kwargs
+            )
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):  # `x` is only read for its device and dtype
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()  # assumes 2D (batch, seq) position_ids — TODO confirm
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last axis
+            cos = emb.cos()
+            sin = emb.sin()
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the half that moves to the front."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors by the given rotary position embedding.
+
+    Each tensor `t` in (q, k) is mapped to `t * cos + rotate_half(t) * sin`, after a singleton axis has been
+    inserted into `cos` and `sin` at `unsqueeze_dim` so that they broadcast against q and k.
+
+    Args:
+        q (`torch.Tensor`): Query tensor.
+        k (`torch.Tensor`): Key tensor.
+        cos (`torch.Tensor`): Cosine part of the rotary embedding.
+        sin (`torch.Tensor`): Sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused; the function body never reads it.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis at which `cos` and `sin` receive the extra dimension. Use 1 when q/k are laid out as
+            [batch_size, heads, seq_len, head_dim] and cos/sin as [batch_size, seq_len, head_dim]; use 2 when
+            q/k are laid out as [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(torch.Tensor)`: `(q_embed, k_embed)`, the rotated query and key tensors.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+    q_embed, k_embed = ((t * cos_b) + (rotate_half(t) * sin_b) for t in (q, k))
+    return q_embed, k_embed
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2
+class Qwen2MLP(nn.Module):  # gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x)), all projections bias-free
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size, self.intermediate_size = config.hidden_size, config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        gated = self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)
+        return self.down_proj(gated)
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Duplicate every key/value head `n_rep` times, matching torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+        return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+class Qwen2Attention(nn.Module):
+    """
+    Eager multi-head attention with rotary position embeddings and grouped key/value heads (see `repeat_kv`).
+    `forward` expects precomputed `position_embeddings`; `__init__` does not create a `rotary_emb` submodule.
+    """
+
+    def __init__(self, config: Qwen2Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config: Qwen2Config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # query heads per key/value head
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = config.is_causal
+        self.attention_dropout = config.attention_dropout
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Eager softmax attention; returns `(attn_output, attn_weights or None, past_key_value)`."""
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if position_embeddings is None:
+            # `__init__` never creates `self.rotary_emb`; fail clearly unless one was attached externally.
+            if getattr(self, "rotary_emb", None) is None:
+                raise ValueError(
+                    "`position_embeddings` (cos, sin) is required: this module has no `rotary_emb` to build it."
+                )
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous().reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, (attn_weights if output_attentions else None), past_key_value
+
+
+class Qwen2FlashAttention2(Qwen2Attention):
+    """
+    Qwen2 flash attention module, following Qwen2 attention module. This module inherits from `Qwen2Attention`
+    as the weights of the module stay untouched. The only required change is in the forward pass, which must
+    correctly call the public API of flash attention and deal with padding tokens in case the input contains
+    any of them. Sliding window attention is only requested for layers whose index is at least
+    `config.max_window_layers`, and only when `config.use_sliding_window` and `config.sliding_window` are set.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+    ):
+        """
+        Run flash attention over `hidden_states` and return `(attn_output, None, past_key_value)`. The attention
+        weights slot is always `None`, whatever `output_attentions` is: this path never computes the weights.
+        """
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the lower-precision dtype just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Reshape to the layout expected by Flash Attention (swap the head and sequence axes back)
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        use_swa = (
+            self.config.use_sliding_window
+            and getattr(self.config, "sliding_window", None) is not None
+            and self.layer_idx >= self.config.max_window_layers
+        )
+        sliding_window = self.config.sliding_window if use_swa else None
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=sliding_window,
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        # No attention weights exist on this path; return None rather than a name that may be unbound.
+        return attn_output, None, past_key_value
+
+
+QWEN2_ATTENTION_CLASSES = {  # indexed by `config._attn_implementation` in Qwen2DecoderLayer.__init__
+    "eager": Qwen2Attention,
+    "flash_attention_2": Qwen2FlashAttention2,
+}  # no other backend is registered here; any other key raises KeyError at that lookup
+
+
+class Qwen2DecoderLayer(nn.Module):
+    """Pre-norm decoder block: RMSNorm -> self-attention -> residual add, then RMSNorm -> MLP -> residual add."""
+
+    def __init__(self, config: Qwen2Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        attn_impl = config._attn_implementation
+        if attn_impl != "flash_attention_2" and config.sliding_window:
+            logger.warning_once(
+                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
+                "unexpected results may be encountered."
+            )
+        self.self_attn = QWEN2_ATTENTION_CLASSES[attn_impl](config, layer_idx)
+
+        self.mlp = Qwen2MLP(config)
+        norm_eps = config.rms_norm_eps
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Run the attention and MLP sub-blocks of this layer on `hidden_states`.
+
+        Args:
+            hidden_states (`torch.FloatTensor`): layer input of shape `(batch, seq_len, embed_dim)`.
+            attention_mask (`torch.FloatTensor`, *optional*): mask of size `(batch, sequence_length)` in which
+                padding elements are marked with 0.
+            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states.
+            output_attentions (`bool`, *optional*):
+                If truthy, the attention weights returned by the attention module are appended to the output.
+            use_cache (`bool`, *optional*):
+                If truthy, the key/value cache returned by the attention module is appended to the output.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs that are accepted and ignored (used by FSDP and other methods that inject code
+                into the model).
+
+        Returns:
+            `tuple`: `(hidden_states,)`, followed by the attention weights when `output_attentions` is set and by
+            the cache object when `use_cache` is set, in that order.
+        """
+
+        # Attention sub-block: normalize first, then add the result back onto the untouched input.
+        attn_out, attn_probs, kv_cache = self.self_attn(
+            hidden_states=self.input_layernorm(hidden_states),
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+        after_attn = hidden_states + attn_out
+
+        # Feed-forward sub-block, with the same pre-norm + residual pattern.
+        after_mlp = after_attn + self.mlp(self.post_attention_layernorm(after_attn))
+
+        # Optional extras follow the hidden states in a fixed order: attention weights, then the cache.
+        extras = []
+        if output_attentions:
+            extras.append(attn_probs)
+        if use_cache:
+            extras.append(kv_cache)
+        return (after_mlp, *extras)
+
+
+QWEN2_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Qwen2Config`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""  # shared class-level docstring text, passed to the @add_start_docstrings decorators in this module
+
+
+@add_start_docstrings(
+    "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
+    QWEN2_START_DOCSTRING,
+)
+class Qwen2PreTrainedModel(PreTrainedModel):
+    config_class = Qwen2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Qwen2DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+
+    def _init_weights(self, module):
+        pass  # NOTE: deliberately a no-op for quick loading during inference; the disabled initializer is kept below
+        # std = self.config.initializer_range
+        # if isinstance(module, nn.Linear):
+        #     module.weight.data.normal_(mean=0.0, std=std)
+        #     if module.bias is not None:
+        #         module.bias.data.zero_()
+        # elif isinstance(module, nn.Embedding):
+        #     module.weight.data.normal_(mean=0.0, std=std)
+        #     if module.padding_idx is not None:
+        #         module.weight.data[module.padding_idx].zero_()
+
+
+QWEN2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance, see our
+            [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+"""  # forward() argument docs, attached through @add_start_docstrings_to_model_forward in this module
+
+
+@add_start_docstrings(
+    "The bare Qwen2 Model outputting raw hidden-states without any specific head on top.",
+    QWEN2_START_DOCSTRING,
+)
+class Qwen2Model(Qwen2PreTrainedModel):
+    """
+    Token embedding, a stack of *config.num_hidden_layers* [`Qwen2DecoderLayer`] blocks, and a final RMSNorm.
+
+    Args:
+        config: Qwen2Config
+    """
+
+    def __init__(self, config: Qwen2Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Qwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._attn_implementation = config._attn_implementation
+        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Qwen2RotaryEmbedding(config=config)
+
+        self.gradient_checkpointing = False
+        # Weight initialization / final processing is deliberately skipped: too slow and not needed for inference.
+        # self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # Backward compatibility: wrap a missing/legacy (non-`Cache`) `past_key_values` and return the legacy format
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        if attention_mask is not None and 0.0 in attention_mask:  # forwarded only if some entry is 0
+            causal_mask = attention_mask
+        else:
+            causal_mask = None
+
+        hidden_states = inputs_embeds
+        # create position embeddings once; the same tuple is passed to every decoder layer
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]  # after attn weights, if any
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add the final, normalized hidden states after the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class Qwen2ForCausalLM(Qwen2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Qwen2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Weight initialization / final processing is deliberately skipped: too slow and not needed for inference.
+        # self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
+        **loss_kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+            num_logits_to_keep (`int`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
+
+        >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # run the base decoder; index 0 of its output (tuple or ModelOutput) is the final hidden states
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+        # Only project the positions that are needed; `-0:` slices everything, so the default 0 keeps all logits
+        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/modeling/qwen2/tokenization_qwen2.py b/modeling/qwen2/tokenization_qwen2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b91f93866b800825bdfabbd765de0ebf532140b
--- /dev/null
+++ b/modeling/qwen2/tokenization_qwen2.py
@@ -0,0 +1,341 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""Tokenization classes for Qwen2."""
+
+import json
+import os
+import unicodedata
+from functools import lru_cache
+from typing import Optional, Tuple
+
+import regex as re
+
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {  # exposed on the tokenizer class as `vocab_files_names`
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
+# The pattern uses `\p{L}` / `\p{N}` Unicode property classes, which need the `regex` package imported above as `re`.
+PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+
+
+@lru_cache()
+# Adapted from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
+def bytes_to_unicode():
+    """
+    Build the reversible byte -> unicode-character table used by byte-level BPE.
+
+    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the same code point. Every other
+    byte (whitespace/control characters the BPE code cannot handle) is assigned, in increasing byte order, the next
+    unused code point starting at 256. The result is a dict with one entry per byte value, cached via `lru_cache`.
+    """
+    kept_as_is = [
+        *range(ord("!"), ord("~") + 1),
+        *range(ord("¡"), ord("¬") + 1),
+        *range(ord("®"), ord("ÿ") + 1),
+    ]
+    table = {byte: chr(byte) for byte in kept_as_is}
+    next_code = 2**8
+    # Remaining bytes are remapped to fresh code points above the single-byte range.
+    for byte in range(2**8):
+        if byte in table:
+            continue
+        table[byte] = chr(next_code)
+        next_code += 1
+    return table
+
+
+# Adapted from transformers.models.gpt2.tokenization_gpt2.get_pairs
+def get_pairs(word):
+    """
+    Collect every pair of adjacent symbols in `word` into a set.
+
+    `word` is a sequence (normally a tuple) of symbols, each symbol being a variable-length string. The first
+    symbol is read unconditionally, so an empty `word` raises `IndexError`; a single-symbol `word` yields an
+    empty set.
+    """
+    head = word[0]
+    tail = word[1:]
+    # Zip the symbols with their successors; `zip` stops at the shorter side, i.e. at the last symbol.
+    return set(zip((head, *tail), tail))
+
+
+class Qwen2Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import Qwen2Tokenizer
+
+    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
+    >>> tokenizer("Hello world")["input_ids"]
+    [9707, 1879]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [21927, 1879]
+    ```
+    This is expected.
+
+    You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*):
+            The beginning of sequence token. Not applicable for this tokenizer.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
+            tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
+        split_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not the special tokens should be split during the tokenization process. The default behavior is
+            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
+            ['<|endoftext|>']`. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give `['<',
+            '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token=None,
+        eos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        clean_up_tokenization_spaces=False,
+        split_special_tokens=False,
+        **kwargs,
+    ):
+        # Qwen vocab does not contain control tokens; added tokens need to be special
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        bpe_merges = []
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            for i, line in enumerate(merges_handle):
+                line = line.strip()
+                if (i == 0 and line.startswith("#version:")) or not line:
+                    continue
+                bpe_merges.append(tuple(line.split()))
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        # NOTE: the cache can grow without bound and will get really large for long running processes
+        # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
+        # not a memory leak but appears as one.
+        # GPT2Tokenizer has the same problem, so let's be consistent.
+        self.cache = {}
+
+        self.pat = re.compile(PRETOKENIZE_REGEX)
+
+        if kwargs.get("add_prefix_space", False):
+            logger.warning_once(
+                f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
+            )
+
+        super().__init__(
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            unk_token=unk_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            split_special_tokens=split_special_tokens,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.encoder)
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    def decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
+        # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
+        return super().decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    def prepare_for_tokenization(self, text, **kwargs):
+        text = unicodedata.normalize("NFC", text)
+        return (text, kwargs)
diff --git a/modeling/qwen2/tokenization_qwen2_fast.py b/modeling/qwen2/tokenization_qwen2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..34f820f0ea8d2d7ce74e8b3a098586f62761f6d6
--- /dev/null
+++ b/modeling/qwen2/tokenization_qwen2_fast.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+"""Tokenization classes for Qwen2."""
+
+from typing import Optional, Tuple
+
+from transformers.tokenization_utils import AddedToken
+from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
+from transformers.utils import logging
+from .tokenization_qwen2 import Qwen2Tokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {  # constructor argument name -> file name; used as Qwen2TokenizerFast.vocab_files_names
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    "tokenizer_file": "tokenizer.json",
+}
+
+
+MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}  # not referenced anywhere else in this module
+
+
+class Qwen2TokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import Qwen2TokenizerFast
+
+    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
+    >>> tokenizer("Hello world")["input_ids"]
+    [9707, 1879]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [21927, 1879]
+    ```
+    This is expected.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead. Not applicable to this tokenizer.
+        bos_token (`str`, *optional*):
+            The beginning of sequence token. Not applicable for this tokenizer.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = Qwen2Tokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<|endoftext|>",
+        bos_token=None,
+        eos_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        **kwargs,
+    ):
+        # vocab_file and merges_file are forwarded to the base class in case a slow
+        # tokenizer needs to be initialized; other settings can be configured through files.
+        # String-valued unk/bos/eos/pad tokens are wrapped below as special, non-normalized
+        # AddedToken objects (no lstrip/rstrip) and forwarded too, following GPT2TokenizerFast.
+
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
diff --git a/modeling/qwen2_5_vl/configuration_qwen2_5_vl.py b/modeling/qwen2_5_vl/configuration_qwen2_5_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..87f06cc648888b449bc2ad145b02fad321d825de
--- /dev/null
+++ b/modeling/qwen2_5_vl/configuration_qwen2_5_vl.py
@@ -0,0 +1,266 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+
+
+class Qwen2_5_VLVisionConfig(PretrainedConfig):  # settings of the vision encoder (the `vision_config` sub-config)
+    model_type = "qwen2_5_vl"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=32,
+        hidden_size=3584,
+        hidden_act="silu",
+        intermediate_size=3420,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        tokens_per_second=4,
+        window_size=112,
+        out_hidden_size=3584,
+        fullatt_block_indexes=[7, 15, 23, 31],
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # `fullatt_block_indexes` is copied below so that instances never alias the shared default list.
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.tokens_per_second = tokens_per_second
+        self.window_size = window_size
+        self.fullatt_block_indexes = None if fullatt_block_indexes is None else list(fullatt_block_indexes)
+        self.out_hidden_size = out_hidden_size
+
+
+class Qwen2_5_VLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen2_5_VLModel`]. It is used to instantiate a
+    Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 152064):
+            Vocabulary size of the Qwen2_5_VL model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen2_5_VLModel`]
+        hidden_size (`int`, *optional*, defaults to 8192):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 29568):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 80):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is `None`, it falls back to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        use_sliding_window (`bool`, *optional*, defaults to `False`):
+            Whether to use sliding window attention.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
+        max_window_layers (`int`, *optional*, defaults to 80):
+            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        vision_config (`Dict`, *optional*):
+            The config for the visual encoder initialization.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+
+    ```python
+    >>> from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLConfig
+
+    >>> # Initializing a Qwen2_5_VL style configuration
+    >>> configuration = Qwen2_5_VLConfig()
+
+    >>> # Initializing a model from the Qwen2-VL-7B style configuration
+    >>> model = Qwen2_5_VLForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen2_5_vl"
+    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen2_5_VL`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=8192,
+        intermediate_size=29568,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=80,
+        attention_dropout=0.0,
+        vision_config=None,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict) or vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"](**(vision_config or {}))
+        else:  # already a vision config object: keep it instead of leaving the attribute unset
+            self.vision_config = vision_config
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
+        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
+        # TODO: @raushan update config in the hub
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            if self.rope_scaling["type"] == "mrope":
+                self.rope_scaling["type"] = "default"
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self, ignore_keys={"mrope_section"})
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+__all__ = ["Qwen2_5_VLConfig"]
diff --git a/modeling/qwen2_5_vl/modeling_qwen2_5_vl.py b/modeling/qwen2_5_vl/modeling_qwen2_5_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f7ef34be717d587ef9493a52ea6b9d5e5254a5d
--- /dev/null
+++ b/modeling/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -0,0 +1,2127 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+from transformers.activations import ACT2FN
+
+# from ...activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
+from transformers.generation import GenerationMixin
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    is_torchdynamo_compiling,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_qwen2_5_vl import Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig
+
+
+if is_flash_attn_2_available():  # flash-attn is an optional dependency
+    from flash_attn import flash_attn_varlen_func
+    from flash_attn.layers.rotary import apply_rotary_emb
+
+else:  # flash-attn unavailable: bind both names to None so they still exist at module level
+    flash_attn_varlen_func = None
+    apply_rotary_emb = None
+
+
+if is_flash_attn_2_available():  # same optional-dependency check as the flash_attn imports
+    from transformers.modeling_flash_attention_utils import _flash_attention_forward
+else:
+    _flash_attention_forward = None  # keep the name defined (as None) when flash-attn is missing
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Qwen2_5_VLConfig"
+
+
+class Qwen2_5_VLMLP(nn.Module):  # gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x))
+    def __init__(self, config, bias: bool = False):
+        super().__init__()
+        width, inner = config.hidden_size, config.intermediate_size
+        self.hidden_size, self.intermediate_size = width, inner
+        self.gate_proj = nn.Linear(width, inner, bias=bias)  # `bias` toggles the bias of all three linears
+        self.up_proj = nn.Linear(width, inner, bias=bias)
+        self.down_proj = nn.Linear(inner, width, bias=bias)
+        self.act_fn = ACT2FN[config.hidden_act]  # activation looked up by its config name
+
+    def forward(self, hidden_state):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+
+
+class Qwen2_5_VisionPatchEmbed(nn.Module):
+    # Projects flattened patches to `embed_dim` with a bias-free Conv3d whose stride equals its kernel.
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_channels: int = 3,
+        embed_dim: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.in_channels = in_channels
+        self.embed_dim = embed_dim
+        # kernel == stride, so each (temporal, height, width) patch yields exactly one output position
+        patch_shape = [temporal_patch_size, patch_size, patch_size]
+        self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=patch_shape, stride=patch_shape, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # One patch per row, cast to the conv weight's dtype before the projection.
+        patches = hidden_states.view(
+            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
+        ).to(dtype=self.proj.weight.dtype)
+        return self.proj(patches).view(-1, self.embed_dim)
+
+
+class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim  # 0, 2/dim, 4/dim, ...
+        self.register_buffer("inv_freq", 1.0 / (theta**exponents), persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        # Row p of the result is p * inv_freq, for p in [0, seqlen).
+        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        return torch.outer(positions, self.inv_freq)
+
+
+class Qwen2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Root-mean-square norm over the last dim with a learnable scale (equivalent to T5LayerNorm)
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        # Normalise in float32, then cast back to the input dtype before applying the scale.
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normed = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normed.to(hidden_states.dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Qwen2_5_VLPatchMerger(nn.Module):
+    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
+        super().__init__()
+        # Width of spatial_merge_size**2 rows of size context_dim laid side by side.
+        merged_width = context_dim * (spatial_merge_size**2)
+        self.hidden_size = merged_width
+        self.ln_q = Qwen2RMSNorm(context_dim, eps=1e-6)
+        layers = [nn.Linear(merged_width, merged_width), nn.GELU(), nn.Linear(merged_width, dim)]
+        self.mlp = nn.Sequential(*layers)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Normalise each row, regroup rows into hidden_size-wide rows, then project to `dim`.
+        merged = self.ln_q(x).view(-1, self.hidden_size)
+        return self.mlp(merged)
+
+
+def apply_rotary_pos_emb_flashatt(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Only the first half of the last dim of cos/sin is handed to flash-attn's `apply_rotary_emb`.
+    half_cos, half_sin = (table.chunk(2, dim=-1)[0].contiguous() for table in (cos, sin))
+    # Rotate in float32, then cast each result back to the dtype of its own input.
+    rotated = [apply_rotary_emb(t.float(), half_cos, half_sin).type_as(t) for t in (q, k)]
+    return rotated[0], rotated[1]
+
+
+class Qwen2_5_VLVisionFlashAttention2(nn.Module):
+    # Vision attention that calls `flash_attn_varlen_func` with `cu_seqlens` as both query and key offsets.
+    def __init__(self, dim: int, num_heads: int = 16) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        n_tokens = hidden_states.shape[0]
+        # Fused projection split into three (n_tokens, num_heads, -1) tensors.
+        q, k, v = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+        else:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            angles = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = angles.cos().float()
+            sin = angles.sin().float()
+        # The rotary helper is called with a leading axis of size 1, which is removed again afterwards.
+        q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
+        q, k = q.squeeze(0), k.squeeze(0)
+
+        # Largest gap between consecutive `cu_seqlens` entries.
+        longest = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        attended = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, longest, longest)
+        return self.proj(attended.reshape(n_tokens, -1))
+
+
+def rotate_half(x):
+    """Split the last dim into halves (a, b) and return (-b, a) concatenated along that dim."""
+    mid = x.shape[-1] // 2
+    front, back = x[..., :mid], x[..., mid:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Apply `x * cos + rotate_half(x) * sin` to q and k in float32, returning each in its original dtype."""
+    # cos/sin get a singleton axis at position -2 so they broadcast over that axis of q and k.
+    cos, sin = cos.unsqueeze(-2), sin.unsqueeze(-2)
+
+    def _rotate(t: torch.Tensor) -> torch.Tensor:
+        t32 = t.float()
+        return ((t32 * cos) + (rotate_half(t32) * sin)).to(t.dtype)
+
+    return _rotate(q), _rotate(k)
+
+
+class Qwen2_5_VLVisionAttention(nn.Module):
+    # Eager vision attention: explicit matmul/softmax with an additive mask built from `cu_seqlens`.
+    def __init__(self, dim: int, num_heads: int = 16) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        n_tokens = hidden_states.shape[0]
+        q, k, v = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+        else:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            angles = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = angles.cos().float()
+            sin = angles.sin().float()
+        q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+
+        # Start fully masked (dtype minimum), then zero each [cu_seqlens[i - 1], cu_seqlens[i]) square.
+        additive_mask = torch.full(
+            [1, n_tokens, n_tokens], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
+        )
+        for i in range(1, len(cu_seqlens)):
+            lo, hi = cu_seqlens[i - 1], cu_seqlens[i]
+            additive_mask[..., lo:hi, lo:hi] = 0
+
+        # Heads become the leading axis for the batched matmuls.
+        q, k, v = q.transpose(0, 1), k.transpose(0, 1), v.transpose(0, 1)
+        scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
+        scores = scores + additive_mask
+        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)
+        context = torch.matmul(probs, v).transpose(0, 1)
+        context = context.reshape(n_tokens, -1)
+        return self.proj(context)
+
+
+class Qwen2_5_VLVisionSdpaAttention(nn.Module):
+    # Vision attention through `F.scaled_dot_product_attention` with a boolean mask built from `cu_seqlens`.
+    def __init__(self, dim: int, num_heads: int = 16) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        n_tokens = hidden_states.shape[0]
+        q, k, v = self.qkv(hidden_states).reshape(n_tokens, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        if position_embeddings is not None:
+            cos, sin = position_embeddings
+        else:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            angles = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = angles.cos().float()
+            sin = angles.sin().float()
+        q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+
+        # Boolean mask: True only inside each [cu_seqlens[i - 1], cu_seqlens[i]) square.
+        allowed = torch.zeros([1, n_tokens, n_tokens], device=q.device, dtype=torch.bool)
+        for i in range(1, len(cu_seqlens)):
+            lo, hi = cu_seqlens[i - 1], cu_seqlens[i]
+            allowed[..., lo:hi, lo:hi] = True
+        # Heads become the leading axis before the fused attention call.
+        q, k, v = q.transpose(0, 1), k.transpose(0, 1), v.transpose(0, 1)
+        context = F.scaled_dot_product_attention(q, k, v, allowed, dropout_p=0.0)
+        context = context.transpose(0, 1).reshape(n_tokens, -1)
+        return self.proj(context)
+
+
+QWEN2_5_VL_VISION_ATTENTION_CLASSES = dict(  # attention implementation name -> vision attention class
+    eager=Qwen2_5_VLVisionAttention,
+    flash_attention_2=Qwen2_5_VLVisionFlashAttention2,
+    sdpa=Qwen2_5_VLVisionSdpaAttention,
+)
+
+
+class Qwen2_5_VLVisionBlock(nn.Module):
+    # Residual block: x + attn(norm1(x)) followed by x + mlp(norm2(x)).
+    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
+        super().__init__()
+        self.norm1 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
+        self.norm2 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
+        attention_cls = QWEN2_5_VL_VISION_ATTENTION_CLASSES[attn_implementation]
+        self.attn = attention_cls(config.hidden_size, num_heads=config.num_heads)
+        self.mlp = Qwen2_5_VLMLP(config, bias=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        attn_delta = self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = hidden_states + attn_delta
+        return hidden_states + self.mlp(self.norm2(hidden_states))
+
+
+Qwen2_5_VL_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Qwen2_5_VLConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""  # shared docstring text, passed to `add_start_docstrings` on Qwen2_5_VLPreTrainedModel
+
+
+@add_start_docstrings(
+    "The bare Qwen2_5_VL Model outputting raw hidden-states without any specific head on top.",
+    Qwen2_5_VL_START_DOCSTRING,
+)
+class Qwen2_5_VLPreTrainedModel(PreTrainedModel):  # common base class carrying the class-level settings below
+    config_class = Qwen2_5_VLConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_static_cache = False  # TODO (joao): fix. torch.compile failing probably due to `cache_positions`
+    # Weight initialisation is a deliberate no-op here; the disabled initialiser is kept below for reference.
+    def _init_weights(self, module):
+        pass  # NOTE: not used in inference
+        # std = self.config.initializer_range
+        # if isinstance(module, (nn.Linear, nn.Conv3d)):
+        #     module.weight.data.normal_(mean=0.0, std=std)
+        #     if module.bias is not None:
+        #         module.bias.data.zero_()
+        # elif isinstance(module, nn.Embedding):
+        #     module.weight.data.normal_(mean=0.0, std=std)
+        #     if module.padding_idx is not None:
+        #         module.weight.data[module.padding_idx].zero_()
+
+
+class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):  # patch embed -> attention blocks -> merger
+    config_class = Qwen2_5_VLVisionConfig
+    _no_split_modules = ["Qwen2_5_VLVisionBlock"]
+
+    def __init__(self, config, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.patch_size
+        self.fullatt_block_indexes = config.fullatt_block_indexes
+        self.window_size = config.window_size
+        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size  # patches per merge group
+
+        self.patch_embed = Qwen2_5_VisionPatchEmbed(
+            patch_size=config.patch_size,
+            temporal_patch_size=config.temporal_patch_size,
+            in_channels=config.in_channels,
+            embed_dim=config.hidden_size,
+        )
+
+        head_dim = config.hidden_size // config.num_heads
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)  # looked up once for h and once for w
+
+        self.blocks = nn.ModuleList(
+            [Qwen2_5_VLVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
+        )
+        self.merger = Qwen2_5_VLPatchMerger(
+            dim=config.out_hidden_size,
+            context_dim=config.hidden_size,
+            spatial_merge_size=config.spatial_merge_size,
+        )
+        self.gradient_checkpointing = False
+
+    def rot_pos_emb(self, grid_thw):  # returns one row of rotary angles per patch
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)  # row index of every cell of the (h, w) grid
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)  # cells of one merge tile become adjacent
+            hpos_ids = hpos_ids.flatten()
+            # Same re-ordering for the column index.
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))  # (row, col) pairs, tiled t times
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()  # largest h or w over all entries
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)  # row and column lookups side by side
+        return rotary_pos_emb
+
+    def get_window_index(self, grid_thw):  # returns (window-ordered merge-group ids, cumulative window offsets)
+        window_index: list = []
+        cu_window_seqlens: list = [0]
+        window_index_id = 0
+        vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size
+
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h, llm_grid_w = (
+                grid_h // self.spatial_merge_size,
+                grid_w // self.spatial_merge_size,
+            )
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size  # a full window if already divisible
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)  # -100 marks padding cells
+            index_padded = index_padded.reshape(
+                grid_t,
+                num_windows_h,
+                vit_merger_window_size,
+                num_windows_w,
+                vit_merger_window_size,
+            )
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t,
+                num_windows_h * num_windows_w,
+                vit_merger_window_size,
+                vit_merger_window_size,
+            )
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)  # non-padding cells per window (may be 0)
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]  # ids in window-by-window order
+            window_index.append(index_new + window_index_id)
+            cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]  # offsets in patches
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()  # id offset for the next grid_thw entry
+        window_index = torch.cat(window_index, dim=0)
+
+        return window_index, cu_window_seqlens
+
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input patches; `patch_embed` views them as `(-1, in_channels, temporal_patch_size, patch_size, patch_size)`.
+            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height and width of feature shape of each image in LLM.
+
+        Returns:
+            `torch.Tensor`: output of `self.merger`, re-ordered with the inverse of the window permutation.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=hidden_states.device,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)  # drops repeats left by empty windows
+
+        seq_len, _ = hidden_states.size()
+        hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        hidden_states = hidden_states[window_index, :, :]  # permute whole merge groups into window order
+        hidden_states = hidden_states.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]  # same permutation for the rotary angles
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)  # leading 0 so consecutive entries delimit one h*w frame
+
+        for layer_num, blk in enumerate(self.blocks):
+            if layer_num in self.fullatt_block_indexes:  # listed layers use per-frame offsets, the rest use windows
+                cu_seqlens_now = cu_seqlens
+            else:
+                cu_seqlens_now = cu_window_seqlens
+            if self.gradient_checkpointing and self.training:
+                hidden_states = self._gradient_checkpointing_func(
+                    blk.__call__, hidden_states, cu_seqlens_now, None, position_embeddings  # None -> rotary_pos_emb
+                )
+            else:
+                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens_now, position_embeddings=position_embeddings)
+
+        hidden_states = self.merger(hidden_states)
+        reverse_indices = torch.argsort(window_index)  # inverse of the window permutation
+        hidden_states = hidden_states[reverse_indices, :]
+
+        return hidden_states
+
+
+class Qwen2_5_VLRotaryEmbedding(nn.Module):
+    def __init__(self, config: Qwen2_5_VLConfig, device=None):
+        super().__init__()
+        # Builds the `inv_freq` buffer once from the config; forward() turns position ids into cos/sin tables.
+        # # BC: "rope_type" was originally "type"
+        # if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+        #     self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        # else:
+        #     self.rope_type = "default"
+
+        # HACK: always use the "default" RoPE type; the `config.rope_scaling` lookup above is disabled.
+        self.rope_type = "default"
+
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            # `rope_kwargs` is never defined on this module, so expanding `**self.rope_kwargs` here would raise
+            # AttributeError; the init function is called with config, device and seq_len only.
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+
+        if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len:  # reset
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):  # returns (cos, sin), both cast to x.dtype
+        if "dynamic" in self.rope_type:  # never true while rope_type is forced to "default" in __init__
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block. In contrast to other models, Qwen2_5_VL has different position ids for thw grids
+        # So we expand the inv_freq to shape (3, ...)
+        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
+        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Qwen2MLP(nn.Module):
+    # Bias-free gated MLP: down_proj(act_fn(gate_proj(x)) * up_proj(x)).
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        d_model, d_ff = config.hidden_size, config.intermediate_size
+        self.hidden_size, self.intermediate_size = d_model, d_ff
+        self.gate_proj = nn.Linear(d_model, d_ff, bias=False)
+        self.up_proj = nn.Linear(d_model, d_ff, bias=False)
+        self.down_proj = nn.Linear(d_ff, d_model, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
+
+    Explanation:
+        Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
+        sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
+        vision embedding part, we apply rotary position embedding on temporal, height and width dimension separately.
+        Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
+        For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
+        height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
+        difference with modern LLMs.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding. Its first dimension is indexed with 0, 1 and 2
+            (presumably the temporal, height and width streams - TODO confirm against the caller).
+        sin (`torch.Tensor`): The sine part of the rotary embedding, indexed the same way as `cos`.
+            Both tables are split along their last dimension and unsqueezed at `unsqueeze_dim` before use.
+        mrope_section(`List(int)`):
+            Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
+            The list is repeated twice before it is used to split the last dimension of `cos` and `sin`.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    # Chunk i of the last dim (sizes from the doubled section list) is taken from leading index i % 3.
+    sections = mrope_section * 2
+
+    def _mix(table):
+        picked = [chunk[i % 3] for i, chunk in enumerate(table.split(sections, dim=-1))]
+        return torch.cat(picked, dim=-1).unsqueeze(unsqueeze_dim)
+
+    cos, sin = _mix(cos), _mix(sin)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Duplicate each key/value head `n_rep` times, like torch.repeat_interleave(x, dim=1, repeats=n_rep). It
+    maps (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+    return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+
+
+class Qwen2_5_VLAttention(nn.Module):
+    """
+    Eager multi-headed attention from the 'Attention Is All You Need' paper, with grouped key/value heads and
+    multimodal rotary position embeddings. No sliding window is applied here beyond what `attention_mask` encodes.
+    """
+
+    def __init__(self, config: Qwen2_5_VLConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "lead to errors during the forward call, if caching is used. Please make sure to provide a "
+                "`layer_idx` when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # query heads per key/value head
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+        self.rope_scaling = config.rope_scaling  # forward() reads its "mrope_section" entry
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+        self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Return (attn_output, attn_weights or None, past_key_value); `position_embeddings` (cos, sin) is required."""
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # (bsz, q_len, heads * head_dim) -> (bsz, heads, q_len, head_dim)
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_multimodal_rotary_pos_emb(
+            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+        )
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # Fix precision issues in Qwen2-VL float16 inference: +/-inf scores are replaced with zeros before the softmax
+        # to prevent NaN propagation. NOTE(review): this also zeroes (un-masks) scores that hit -inf; confirm intended.
+        if query_states.dtype == torch.float16:
+            attn_weights = torch.where(torch.isinf(attn_weights), torch.zeros_like(attn_weights), attn_weights)
+
+        # upcast attention to fp32 for the softmax, then cast back to the query dtype
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
+    """
+    Qwen2_5_VL flash attention module, following the Qwen2_5_VL attention module. It inherits from
+    `Qwen2_5_VLAttention` as the weights of the module stay untouched. The only required change is on the forward
+    pass, which needs to correctly call the public API of flash attention and deal with padding tokens in case the
+    input contains any of them. Sliding window attention is applied only when `config.use_sliding_window` is set,
+    `config.sliding_window` is not None and `layer_idx >= config.max_window_layers`. Weights are never returned.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for ROCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ):
+        """Return (attn_output, None, past_key_value); this path never produces attention weights."""
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+
+        # Rotary cos/sin tables are computed by the caller; `position_embeddings` must not be None here.
+        cos, sin = position_embeddings
+        query_states, key_states = apply_multimodal_rotary_pos_emb(
+            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+        )
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Reshape to the layout expected by Flash Attention: (bsz, seq_len, heads, head_dim)
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        if (
+            self.config.use_sliding_window
+            and getattr(self.config, "sliding_window", None) is not None
+            and self.layer_idx >= self.config.max_window_layers
+        ):
+            sliding_window = self.config.sliding_window
+        else:
+            sliding_window = None
+
+        attn_output = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            sliding_window=sliding_window,
+            is_causal=self.is_causal,
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        # `_flash_attention_forward` yields no weights; bind the name unconditionally so `output_attentions=True` works.
+        attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+class Qwen2_5_VLSdpaAttention(Qwen2_5_VLAttention):
+    """
+    Qwen2_5_VL attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `Qwen2_5_VLAttention` as the weights of the module stay untouched. The only changes are on the forward pass to
+    adapt to the SDPA API; with `output_attentions=True` the parent's eager forward is used instead.
+    """
+
+    # Adapted from Qwen2Attention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "Qwen2_5_VLModel is using Qwen2_5_VLSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings  # must be provided: a None value fails to unpack here
+        query_states, key_states = apply_multimodal_rotary_pos_emb(
+            query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
+        )
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+        is_causal = True if causal_mask is None and q_len > 1 else False
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value  # no attention weights are returned on the SDPA path
+
+
+QWEN2_5_VL_ATTENTION_CLASSES = dict(  # `config._attn_implementation` value -> attention class
+    eager=Qwen2_5_VLAttention,
+    flash_attention_2=Qwen2_5_VLFlashAttention2,
+    sdpa=Qwen2_5_VLSdpaAttention,
+)
+
+
+class Qwen2_5_VLDecoderLayer(nn.Module):  # pre-norm block: norm -> self-attention -> residual, norm -> MLP -> residual
+    def __init__(self, config: Qwen2_5_VLConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        if config.use_sliding_window and config._attn_implementation != "flash_attention_2":
+            logger.warning_once(
+                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
+                "unexpected results may be encountered."
+            )
+        self.self_attn = QWEN2_5_VL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
+        self.mlp = Qwen2MLP(config)
+        self.input_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): mask passed as-is to `self.self_attn`: a 2D mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0, or a prepared 4D causal mask.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)  # tuple layout: (hidden_states[, attn_weights][, present_key_value])
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+@add_start_docstrings(
+    "The bare Qwen2_5_VL Model outputting raw hidden-states without any specific head on top.",
+    Qwen2_5_VL_START_DOCSTRING,
+)
+class Qwen2_5_VLModel(Qwen2_5_VLPreTrainedModel):
+    def __init__(self, config: Qwen2_5_VLConfig):  # builds embeddings, decoder stack, final norm and rotary module
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Qwen2_5_VLDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._attn_implementation = config._attn_implementation
+        self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config)  # forward() computes shared cos/sin tables with it
+
+        self.gradient_checkpointing = False
+        # Weight initialization / final processing via `post_init()` is intentionally disabled below:
+        # self.post_init()  # NOTE too slow, not used in inference
+
+    def get_input_embeddings(self):  # accessor for the token embedding module
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):  # replaces the token embedding module with `value`
+        self.embed_tokens = value
+
+    def forward(  # embeds inputs, builds mask + rotary tables, runs every decoder layer, applies the final norm
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # XOR: true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # torch.jit.trace() doesn't support cache objects in the output
+        if use_cache and past_key_values is None and not torch.jit.is_tracing():
+            past_key_values = DynamicCache()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if cache_position is None:  # default: positions continue right after the tokens already in the cache
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # the hard coded `3` is for temporal, height and width.
+        if position_ids is None:
+            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)  # (3, batch, seq)
+        elif position_ids.dim() == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)  # same ids on all 3 axes
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # recorded before the layer runs
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,  # remaining args are positional, in decoder layer forward order
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]  # cache follows attn weights
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+
+        if not return_dict:  # tuple output silently drops the entries that are None
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def _update_causal_mask(  # returns None, the given mask unchanged (flash attention), or a prepared causal mask
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and past_key_values is not None:  # right-padding check (assumes 0/1 mask)
+                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
+                if is_padding_right:
+                    raise ValueError(
+                        "You are attempting to perform batched generation with padding_side='right'"
+                        " this may lead to unexpected behaviour for Flash Attention version of Qwen2_5_VL. Make sure to "
+                        " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                    )
+            if attention_mask is not None and 0.0 in attention_mask:  # only keep a mask that contains zeros
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if (
+            self.config._attn_implementation == "sdpa"
+            and not (using_static_cache or using_sliding_window_cache)
+            and not output_attentions
+        ):
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                sliding_window=self.config.sliding_window,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min  # most negative finite value of `dtype`
+        sequence_length = input_tensor.shape[1]
+        # SlidingWindowCache or StaticCache
+        if using_sliding_window_cache or using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        # DynamicCache or no cache
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            device=device,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+            config=self.config,
+            past_key_values=past_key_values,
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        config: Qwen2_5_VLConfig,
+        past_key_values: Cache,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            device (`torch.device`):
+                The device to plcae the 4D attention mask on.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+            config (`Qwen2_5_VLConfig`):
+                The model's configuration class
+            past_key_values (`Cache`):
+                The cache class that is being used currently to generate
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+            )
+            diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            if config.sliding_window is not None:
+                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
+                # the check is needed to verify is current checkpoint was trained with sliding window or not
+                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
+                    sliding_attend_mask = torch.arange(target_length, device=device) <= (
+                        cache_position.reshape(-1, 1) - config.sliding_window
+                    )
+                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
+            causal_mask *= diagonal_attend_mask
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                if attention_mask.shape[-1] > target_length:
+                    attention_mask = attention_mask[:, :target_length]
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+        return causal_mask
+
+
+@dataclass
+class Qwen2_5_VLCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for Qwen2_5_VL causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope.
+    """
+
+    # Every field defaults to None, so an output can be built with any subset populated.
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    # NOTE(review): annotated as a list while the docstring describes a tuple of tuples — confirm which
+    # container the decoder actually returns.
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+
+
+QWEN2_5_VL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)`):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
+            [`Qwen2_5_VLImageProcessor`] for processing images.
+        pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)`):
+            The tensors corresponding to the input videos. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
+            [`Qwen2_5_VLImageProcessor`] for processing videos.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+            The rope index difference between sequence length and multimodal rope.
+"""
+
+
+class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    config_class = Qwen2_5_VLConfig
+    _no_split_modules = ["Qwen2_5_VLDecoderLayer", "Qwen2_5_VLVisionBlock"]
+
+    def __init__(self, config):
+        """
+        Build the vision tower, the language model and the LM head from `config`.
+
+        Args:
+            config: Model configuration. `vision_config`, `hidden_size` and `vocab_size` are read here; the
+                whole object is also passed to the parent class and to `Qwen2_5_VLModel`.
+        """
+        super().__init__(config)
+        # Vision encoder, built from the nested vision configuration.
+        self.visual = Qwen2_5_VisionTransformerPretrainedModel._from_config(config.vision_config)
+        # Language model that consumes the merged text/vision embeddings.
+        self.model = Qwen2_5_VLModel(config)
+        self.vocab_size = config.vocab_size
+        # Projects hidden states to vocabulary logits (no bias).
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.rope_deltas = None  # cache rope_deltas here
+
+        # Initialize weights and apply final processing
+        # NOTE(review): `post_init()` is deliberately disabled below, so whatever initialization or weight
+        # tying it performs does not run — presumably weights always come from a checkpoint; confirm.
+        # self.post_init() # NOTE too slow, not used in inference
+
+    def get_input_embeddings(self):
+        """Return the object stored at `self.model.embed_tokens`, which `forward` calls on `input_ids`."""
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        """Replace `self.model.embed_tokens` with `value`."""
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        """Return the LM head (`self.lm_head`), the layer `forward` applies to hidden states to get logits."""
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the LM head (`self.lm_head`) with `new_embeddings`."""
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        """Replace the wrapped language model (`self.model`) with `decoder`."""
+        self.model = decoder
+
+    def get_decoder(self):
+        """Return the wrapped language model (`self.model`)."""
+        return self.model
+
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        second_per_grid_ts: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+        Explanation:
+            Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+
+            For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+            Examples:
+                input_ids: [T T T T T], here T is for text.
+                temporal position_ids: [0, 1, 2, 3, 4]
+                height position_ids: [0, 1, 2, 3, 4]
+                width position_ids: [0, 1, 2, 3, 4]
+
+            For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+            and 1D rotary position embeddin for text part.
+            Examples:
+                Temporal (Time): 3 patches, representing different segments of the video in time.
+                Height: 2 patches, dividing each frame vertically.
+                Width: 2 patches, dividing each frame horizontally.
+                We also have some important parameters:
+                fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+                tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will be have a difference of 50 in the temporal position IDs.
+                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+                text temporal position_ids: [101, 102, 103, 104, 105]
+                text height position_ids: [101, 102, 103, 104, 105]
+                text width position_ids: [101, 102, 103, 104, 105]
+                Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+                it.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+            second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
+                The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+        Returns:
+            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
+            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
+        """
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        video_token_id = self.config.video_token_id
+        vision_start_token_id = self.config.vision_start_token_id
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            image_index, video_index = 0, 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            for i, input_ids in enumerate(total_input_ids):
+                input_ids = input_ids[attention_mask[i] == 1]
+                image_nums, video_nums = 0, 0
+                vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
+                vision_tokens = input_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (vision_tokens == video_token_id).sum()
+                input_tokens = input_ids.tolist()
+                llm_pos_ids_list: list = []
+                st = 0
+                remain_images, remain_videos = image_nums, video_nums
+                for _ in range(image_nums + video_nums):
+                    if image_token_id in input_tokens and remain_images > 0:
+                        ed_image = input_tokens.index(image_token_id, st)
+                    else:
+                        ed_image = len(input_tokens) + 1
+                    if video_token_id in input_tokens and remain_videos > 0:
+                        ed_video = input_tokens.index(video_token_id, st)
+                    else:
+                        ed_video = len(input_tokens) + 1
+                    if ed_image < ed_video:
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        second_per_grid_t = 0
+                        image_index += 1
+                        remain_images -= 1
+                        ed = ed_image
+
+                    else:
+                        t, h, w = (
+                            video_grid_thw[video_index][0],
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+                        if second_per_grid_ts is not None:
+                            second_per_grid_t = second_per_grid_ts[video_index]
+                        else:
+                            second_per_grid_t = 1.0
+                        video_index += 1
+                        remain_videos -= 1
+                        ed = ed_video
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_merge_size,
+                        w.item() // spatial_merge_size,
+                    )
+                    text_len = ed - st
+
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                    range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+                    expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
+
+                    time_tensor = expanded_range * second_per_grid_t * self.config.vision_config.tokens_per_second
+
+                    time_tensor_long = time_tensor.long()
+                    t_index = time_tensor_long.flatten()
+
+                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+                if st < len(input_tokens):
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    text_len = len(input_tokens) - st
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
+
+    @add_start_docstrings_to_model_forward(QWEN2_5_VL_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=Qwen2_5_VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        rope_deltas: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        second_per_grid_ts: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+        >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+
+        >>> messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            },
+        ]
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+        ```"""
+
+        # Fall back to the configuration for any output flag the caller left unset.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Vision features are merged only when embeddings are computed here; if the caller supplies
+        # `inputs_embeds`, `pixel_values` / `pixel_values_videos` are not used by this method.
+        if inputs_embeds is None:
+            inputs_embeds = self.model.embed_tokens(input_ids)
+            if pixel_values is not None:
+                pixel_values = pixel_values.type(self.visual.dtype)
+                image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+                # Each image placeholder token must receive exactly one feature row.
+                n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
+                n_image_features = image_embeds.shape[0]
+                if n_image_tokens != n_image_features:
+                    raise ValueError(
+                        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+                    )
+
+                # Broadcast the placeholder mask over the hidden dimension and scatter the features in.
+                mask = input_ids == self.config.image_token_id
+                mask_unsqueezed = mask.unsqueeze(-1)
+                mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
+                image_mask = mask_expanded.to(inputs_embeds.device)
+
+                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
+                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
+            if pixel_values_videos is not None:
+                # Same procedure as for images, using the video placeholder token.
+                pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
+                video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
+                n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
+                n_video_features = video_embeds.shape[0]
+                if n_video_tokens != n_video_features:
+                    raise ValueError(
+                        f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
+                    )
+
+                mask = input_ids == self.config.video_token_id
+                mask_unsqueezed = mask.unsqueeze(-1)
+                mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
+                video_mask = mask_expanded.to(inputs_embeds.device)
+
+                video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
+                inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+
+            if attention_mask is not None:
+                attention_mask = attention_mask.to(inputs_embeds.device)
+
+        # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
+        if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
+            # calculate RoPE index once per generation in the pre-fill stage only
+            if (
+                (cache_position is not None and cache_position[0] == 0)
+                or self.rope_deltas is None
+                or (past_key_values is None or past_key_values.get_seq_length() == 0)
+            ):
+                position_ids, rope_deltas = self.get_rope_index(
+                    input_ids,
+                    image_grid_thw,
+                    video_grid_thw,
+                    second_per_grid_ts,
+                    attention_mask,
+                )
+                # Cached on the module so later decoding steps can reuse it (see the else branch).
+                self.rope_deltas = rope_deltas
+            # then use the prev pre-calculated rope-deltas to get the correct position ids
+            else:
+                batch_size, seq_length, _ = inputs_embeds.shape
+                # NOTE(review): without `cache_position` the offset is 0, so positions restart at 0 for
+                # this step — confirm callers always provide `cache_position` when decoding with a cache.
+                delta = (
+                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    if cache_position is not None
+                    else 0
+                )
+                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+                if cache_position is not None:  # otherwise `deltas` is an int `0`
+                    # Repeat the cached deltas when the batch is a multiple of the cached batch size.
+                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                position_ids = position_ids.add(delta)
+                # The same positions are used for all three rope axes.
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+
+        # Only `inputs_embeds` is forwarded: the embeddings were computed above or supplied by the caller.
+        outputs = self.model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # Upcast to float if we need to compute the loss to avoid potential precision issues
+            logits = logits.float()
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            # Tuple output: (loss,) is prepended only when a loss was computed.
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        # The returned `rope_deltas` is the value cached on the module, not the `rope_deltas` argument.
+        return Qwen2_5_VLCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            rope_deltas=self.rope_deltas,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        second_per_grid_ts=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model.
+        # Returns the dict of forward kwargs for one generation step; extra `**kwargs` are accepted but not used.
+        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+        # Exception 1: when passing `inputs_embeds`, `input_ids` may be missing entries
+        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+        #              (we can't check exception 3 while compiling)
+        # Exception 4: If `inputs_embeds` are passed then slice them through `cache_position`, to keep only the
+        # unprocessed tokens and generate the first token for each sequence. Later use the generated input ids.
+        if past_key_values is not None:
+            if inputs_embeds is not None and input_ids.shape[1] == 0:  # Exception 4
+                inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]
+            elif (
+                inputs_embeds is not None  # Exception 1
+                or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1])  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+
+        if cache_position[0] != 0:  # not at position 0 any more: pixel inputs are withheld from the model
+            pixel_values = None
+            pixel_values_videos = None
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
+            model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
+        else:
+            model_inputs = {"input_ids": input_ids, "inputs_embeds": None}
+
+        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
+            if model_inputs["inputs_embeds"] is not None:
+                batch_size, sequence_length, _ = inputs_embeds.shape
+                device = inputs_embeds.device
+            else:
+                batch_size, sequence_length = input_ids.shape
+                device = input_ids.device
+
+            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
+                attention_mask,
+                sequence_length=sequence_length,
+                target_length=past_key_values.get_max_cache_shape(),
+                dtype=self.lm_head.weight.dtype,
+                device=device,
+                cache_position=cache_position,
+                batch_size=batch_size,
+                config=self.config,
+                past_key_values=past_key_values,
+            )
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,
+                "pixel_values_videos": pixel_values_videos,
+                "image_grid_thw": image_grid_thw,
+                "video_grid_thw": video_grid_thw,
+                "cache_position": cache_position,
+                "second_per_grid_ts": second_per_grid_ts,
+            }
+        )
+        return model_inputs
+
+    def _get_image_nums_and_video_nums(
+        self,
+        input_ids: Optional[torch.LongTensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
+        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+        Returns:
+            image_nums (`torch.Tensor` of shape `(batch_size,)`): image tokens directly preceded by a vision-start token.
+            video_nums (`torch.Tensor` of shape `(batch_size,)`): video tokens directly preceded by a vision-start token.
+        """
+        image_token_id = self.config.image_token_id
+        video_token_id = self.config.video_token_id
+        vision_start_token_id = self.config.vision_start_token_id
+
+        vision_start_mask = input_ids == vision_start_token_id
+        vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)  # shifted right by one (last column wraps to 0)
+        image_mask = input_ids == image_token_id
+        video_mask = input_ids == video_token_id
+        image_nums = torch.sum(vision_first_mask & image_mask, dim=1)  # reduce over the sequence axis
+        video_nums = torch.sum(vision_first_mask & video_mask, dim=1)
+
+        return image_nums, video_nums
+
+    def _expand_inputs_for_generation(
+        self,
+        expand_size: int = 1,
+        is_encoder_decoder: bool = False,
+        input_ids: Optional[torch.LongTensor] = None,
+        **model_kwargs,
+    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
+        # Overwritten -- Support for expanding tensors without a batch size dimension
+        # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_ts
+        # pixel_values.shape[0] is sum(seqlen_images for samples)
+        # image_grid_thw.shape[0] is sum(num_images for samples)
+
+        if expand_size == 1:  # nothing to expand
+            return input_ids, model_kwargs
+
+        visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
+
+        def _expand_dict_for_generation_visual(dict_to_expand):  # handles only the keys listed in `visual_keys`
+            image_grid_thw = model_kwargs.get("image_grid_thw", None)
+            video_grid_thw = model_kwargs.get("video_grid_thw", None)
+            image_nums, video_nums = self._get_image_nums_and_video_nums(input_ids)
+
+            def _repeat_interleave_samples(x, lengths, repeat_times):  # tile each per-sample slice along dim 0
+                samples = torch.split(x, lengths)
+                repeat_args = [repeat_times] + [1] * (x.dim() - 1)
+                result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
+                return result
+
+            for key in dict_to_expand:
+                if key == "pixel_values":
+                    # split images into samples
+                    samples = torch.split(image_grid_thw, list(image_nums))
+                    # compute the sequence length of images for each sample
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "image_grid_thw":
+                    # get the num of images for each sample
+                    lengths = list(image_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "pixel_values_videos":  # same scheme as "pixel_values", driven by the video grids
+                    samples = torch.split(video_grid_thw, list(video_nums))
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "video_grid_thw":  # same scheme as "image_grid_thw"
+                    lengths = list(video_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "second_per_grid_ts":  # a plain list: round-trip through a tensor to reuse the helper
+                    if not isinstance(dict_to_expand[key], list):
+                        raise TypeError(
+                            f"Expected value for key '{key}' to be a list, but got {type(dict_to_expand[key])} instead."
+                        )
+                    tensor = torch.tensor(dict_to_expand[key])
+                    lengths = list(video_nums)
+                    tensor = _repeat_interleave_samples(tensor, lengths=lengths, repeat_times=expand_size)
+                    dict_to_expand[key] = tensor.tolist()
+            return dict_to_expand
+
+        def _expand_dict_for_generation(dict_to_expand):  # non-visual tensors: repeat_interleave along dim 0
+            for key in dict_to_expand:
+                if (
+                    key != "cache_position"
+                    and dict_to_expand[key] is not None
+                    and isinstance(dict_to_expand[key], torch.Tensor)
+                    and key not in visual_keys
+                ):
+                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
+            return dict_to_expand
+
+        # input_ids is required for expanding visual inputs
+        # If input_ids is unavailable, visual inputs will not be used; therefore, there is no need to expand visual inputs.
+        if input_ids is not None and input_ids.numel() != 0:
+            model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
+
+        if input_ids is not None:  # expanded only after the visual pass, which reads the un-expanded input_ids
+            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+
+        model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+        if is_encoder_decoder:
+            if model_kwargs.get("encoder_outputs") is None:
+                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+
+        return input_ids, model_kwargs
+
+
+__all__ = ["Qwen2_5_VLForConditionalGeneration", "Qwen2_5_VLModel", "Qwen2_5_VLPreTrainedModel"]
diff --git a/modeling/vae/wan/model.py b/modeling/vae/wan/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..15bc9935d8032725ec6e2d79b3c79eb65d79af38
--- /dev/null
+++ b/modeling/vae/wan/model.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+__all__ = ['WanVideoVAE']
+
+from typing import List
+import torch
+from torch import Tensor
+from einops import rearrange
+
+from common.utils.logging import get_logger
+from common.utils.distributed import get_device
+from common.utils.misc import AutoEncoderParams
+from .vae2_2 import Wan2_2_VAE
+
+
+def reparameterize(mu, log_var):
+    sigma = torch.exp(0.5 * log_var)  # log-variance -> standard deviation
+    noise = torch.randn_like(sigma)  # one standard-normal draw shaped like sigma
+    return mu + sigma * noise
+
+
+class WanVideoVAE(object):
+    """List-in/list-out wrapper around `Wan2_2_VAE`: encodes samples to channel-last latents and decodes them back."""
+    __version__ = "v2.2"
+    __name__ = "WanVideoVAE"
+    __logger__ = None
+
+    def __init__(self, config_path: str = "", **kwargs) -> None:
+        """Set up logger, VAE and config. kwargs read: `dtype` (default bfloat16), `use_sample` (default True)."""
+        if self.__class__.__logger__ is None:
+            self.__class__.__logger__ = get_logger(self.__class__.__name__)
+        self.logger = self.__class__.__logger__  # created once, shared by all instances; `config_path` is unused
+        self.dtype = kwargs.get("dtype", torch.bfloat16)
+        self.configure_vae_model()
+        self.use_sample = kwargs.get("use_sample", True)
+
+        # wan vae2.2 config is equal to seedance vae
+        self.vae_config = AutoEncoderParams(
+            downsample_spatial=16,
+            downsample_temporal=4,
+            z_channels=48,
+            # scale_factor=1.0,
+            # shift_factor=0.012,
+        )
+
+    def configure_vae_model(self):
+        """Instantiate `self.vae` on the current device from the configured (or default) checkpoint path."""
+        device = get_device()
+        try:  # preferred source: the VAE path from path_default.yaml
+            from config.config_factory import get_model_path
+            vae_path = get_model_path("vae.wan")
+        except Exception as exc:
+            # Best-effort fallback to the default path; logged so a broken config does not go unnoticed.
+            vae_path = "downloads/Wan2.2_VAE.pth"
+            self.logger.warning(f"get_model_path('vae.wan') failed ({exc!r}); falling back to {vae_path}")
+
+        self.vae: Wan2_2_VAE = Wan2_2_VAE(vae_pth=vae_path, device=device, dtype=self.dtype)
+        # self.vae.requires_grad_(False).eval()
+        # self.vae.to(device=get_device())
+
+    @torch.no_grad()
+    def vae_encode(self, samples: List[Tensor], **kwargs) -> List[Tensor]:
+        """Encode each sample (expected C,T,H,W) into a channel-last latent; extra kwargs are ignored."""
+        device = get_device()
+        latents = []
+        with torch.autocast(device_type="cuda", dtype=self.dtype):
+            for x in samples:
+                x = x.to(device=device).unsqueeze(0)  # 1CTHW
+                u, log_var = self.vae.encode(x)  # [1,48,t,h,w], [1,48,t,h,w]
+
+                if self.use_sample:
+                    u = reparameterize(u, log_var)  # [1,48,t,h,w]
+
+                u = rearrange(u, "b c ... -> b ... c")  # -> [1,t,h,w,48], channel-last for compatibility
+
+                latents.append(u.squeeze(0))  # -> [t,h,w,48]
+
+            return latents
+
+    @torch.no_grad()
+    def vae_decode(self, latents: List[Tensor], **kwargs) -> List[Tensor]:
+        """Decode each channel-last latent back to a channel-first sample; extra kwargs are ignored."""
+        device = get_device()
+        samples = []
+        with torch.autocast(device_type="cuda", dtype=self.dtype):
+            for u in latents:
+                u = u.unsqueeze(0).to(device=device)  # -> [1,t,h,w,48]
+                u = rearrange(u, "b ... c -> b c ...")  # -> [1,48,t,h,w]
+
+                x_hat = self.vae.decode(u)  # -> [1,3,T,H,W]
+
+                samples.append(x_hat.squeeze(0))  # -> List[[3,T,H,W]]
+
+            return samples
diff --git a/modeling/vae/wan/vae2_2.py b/modeling/vae/wan/vae2_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..02234537f2c16cb331bf68a65d071b1f873c88ca
--- /dev/null
+++ b/modeling/vae/wan/vae2_2.py
@@ -0,0 +1,1022 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+import logging
+
+import torch
+
+# import torch.cuda.amp as amp
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+__all__ = [
+    "Wan2_2_VAE",
+]
+
+CACHE_T = 2  # number of trailing frames sliced (x[:, :, -CACHE_T:]) when a causal-conv cache entry is built
+
+
+class CausalConv3d(nn.Conv3d):
+    """
+    Causal 3d convolution.
+
+    All temporal padding (2 * padding[0]) is applied in front of the sequence through `F.pad`;
+    the convolution's own padding is reset to zero.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        pad_t, pad_h, pad_w = self.padding
+        # F.pad pairs run from the last dim backwards: (W, W, H, H, T_front, T_back).
+        self._padding = (pad_w, pad_w, pad_h, pad_h, 2 * pad_t, 0)
+        self.padding = (0, 0, 0)
+
+    def forward(self, x, cache_x=None):
+        """Pad and convolve `x`; `cache_x` (earlier frames) is prepended along dim 2 when given."""
+        pad_w0, pad_w1, pad_h0, pad_h1, pad_front, pad_back = self._padding
+        # Cached frames stand in for (part of) the zero padding at the front.
+        if cache_x is not None and pad_front > 0:
+            cached = cache_x.to(x.device)
+            x = torch.cat([cached, x], dim=2)
+            pad_front -= cached.shape[2]
+        padded = F.pad(x, [pad_w0, pad_w1, pad_h0, pad_h1, pad_front, pad_back])
+
+        return super().forward(padded)
+
+
+class RMS_norm(nn.Module):
+    """L2-normalise along the channel axis, then multiply by sqrt(dim) and `gamma` and add `bias`."""
+
+    def __init__(self, dim, channel_first=True, images=True, bias=False):
+        super().__init__()
+        shape = ((dim, 1, 1) if images else (dim, 1, 1, 1)) if channel_first else (dim,)
+        self.channel_first = channel_first
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.ones(shape))
+        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
+
+    def forward(self, x):
+        axis = 1 if self.channel_first else -1  # channel axis
+        return F.normalize(x, dim=axis) * self.scale * self.gamma + self.bias
+
+
+class Upsample(nn.Upsample):
+    """`nn.Upsample` that interpolates in float32 and casts the result back to the input's type."""
+
+    def forward(self, x):
+        # Fix bfloat16 support for nearest neighbor interpolation: compute on a float32 copy.
+        upsampled = super().forward(x.float())
+        return upsampled.type_as(x)
+
+
+class Resample(nn.Module):
+    """Spatial 2x up/down-sampling applied frame by frame; the `*3d` modes add a causal temporal conv."""
+
+    def __init__(self, dim, mode):
+        assert mode in ("none", "upsample2d", "upsample3d", "downsample2d", "downsample3d")
+        super().__init__()
+        self.dim = dim
+        self.mode = mode
+
+        # layers
+        if mode == "upsample2d":
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                nn.Conv2d(dim, dim, 3, padding=1),
+            )
+        elif mode == "upsample3d":
+            self.resample = nn.Sequential(
+                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                nn.Conv2d(dim, dim, 3, padding=1),
+                # nn.Conv2d(dim, dim//2, 3, padding=1)
+            )
+            self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
+        elif mode == "downsample2d":
+            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+        elif mode == "downsample3d":
+            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+            self.time_conv = CausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
+        else:
+            self.resample = nn.Identity()
+
+    def forward(self, x, feat_cache=None, feat_idx=None):
+        """Resample `x` of shape (b, c, t, h, w); frames are folded into the batch for the 2D `resample` stack.
+
+        The temporal conv of the `*3d` modes runs only when `feat_cache` (a list of per-layer slots) is given;
+        the slot used is `feat_cache[feat_idx[0]]`, and `feat_idx[0]` is incremented in place afterwards.
+        """
+        # A fresh counter per call: a shared `[0]` default would keep its increments across calls.
+        feat_idx = [0] if feat_idx is None else feat_idx
+        b, c, t, h, w = x.size()
+        if self.mode == "upsample3d":
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = "Rep"
+                    feat_idx[0] += 1
+                else:
+                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
+                        # cache last frame of last two chunk
+                        cache_x = torch.cat(
+                            [
+                                feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
+                                cache_x,
+                            ],
+                            dim=2,
+                        )
+                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
+                        cache_x = torch.cat(
+                            [torch.zeros_like(cache_x).to(cache_x.device), cache_x],
+                            dim=2,
+                        )
+                    if feat_cache[idx] == "Rep":
+                        x = self.time_conv(x)
+                    else:
+                        x = self.time_conv(x, feat_cache[idx])
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+                    x = x.reshape(b, 2, c, t, h, w)
+                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
+                    x = x.reshape(b, c, t * 2, h, w)
+        t = x.shape[2]
+        x = rearrange(x, "b c t h w -> (b t) c h w")
+        x = self.resample(x)
+        x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
+
+        if self.mode == "downsample3d":
+            if feat_cache is not None:
+                idx = feat_idx[0]
+                if feat_cache[idx] is None:
+                    feat_cache[idx] = x.clone()
+                    feat_idx[0] += 1
+                else:
+                    cache_x = x[:, :, -1:, :, :].clone()
+                    x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
+                    feat_cache[idx] = cache_x
+                    feat_idx[0] += 1
+        return x
+
+    def init_weight(self, conv):  # zero kernel except an identity (eye) at temporal tap 1; bias zeroed
+        conv_weight = conv.weight.detach().clone()
+        nn.init.zeros_(conv_weight)
+        c1, c2, t, h, w = conv_weight.size()
+        init_matrix = torch.eye(c1, c2)
+        conv_weight.data[:, :, 1, 0, 0] = init_matrix  # * 0.5
+        conv.weight = nn.Parameter(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+    def init_weight2(self, conv):  # zero kernel except identity at the last tap for both output halves; bias zeroed
+        conv_weight = conv.weight.data.detach().clone()
+        nn.init.zeros_(conv_weight)
+        c1, c2, t, h, w = conv_weight.size()
+        init_matrix = torch.eye(c1 // 2, c2)
+        conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
+        conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
+        conv.weight = nn.Parameter(conv_weight)
+        nn.init.zeros_(conv.bias.data)
+
+
+class ResidualBlock(nn.Module):
+    """Two RMS_norm -> SiLU -> CausalConv3d stages (dropout before the second conv) added to a shortcut."""
+
+    def __init__(self, in_dim, out_dim, dropout=0.0):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+
+        # layers
+        self.residual = nn.Sequential(
+            RMS_norm(in_dim, images=False),
+            nn.SiLU(),
+            CausalConv3d(in_dim, out_dim, 3, padding=1),
+            RMS_norm(out_dim, images=False),
+            nn.SiLU(),
+            nn.Dropout(dropout),
+            CausalConv3d(out_dim, out_dim, 3, padding=1),
+        )
+        self.shortcut = CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
+
+    def forward(self, x, feat_cache=None, feat_idx=None):
+        """Apply the block to `x`; with `feat_cache`, each causal conv reads slot `feat_idx[0]` and advances it.
+        `feat_idx=None` means a fresh `[0]` per call (a shared list default would leak its count across calls).
+        """
+        feat_idx = [0] if feat_idx is None else feat_idx
+        h = self.shortcut(x)
+        for layer in self.residual:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    # fewer than 2 frames available: prepend the last frame of the previously cached chunk
+                    prev_last = feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device)
+                    cache_x = torch.cat([prev_last, cache_x], dim=2)
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x + h
+
+
+class AttentionBlock(nn.Module):
+    """
+    Single-head self-attention over the h*w positions of each frame (frames are folded into the batch, no mask).
+    """
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+        # layers
+        self.norm = RMS_norm(dim)
+        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+        self.proj = nn.Conv2d(dim, dim, 1)
+
+        # zero-init the output projection's weight (its bias keeps the default init)
+        nn.init.zeros_(self.proj.weight)
+
+    def forward(self, x):
+        identity = x
+        b, c, t, h, w = x.size()
+        x = rearrange(x, "b c t h w -> (b t) c h w")
+        x = self.norm(x)
+        # compute query, key, value: each of shape (b*t, 1, h*w, c)
+        q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(0, 1, 3, 2).contiguous().chunk(3, dim=-1)
+
+        # apply attention
+        x = F.scaled_dot_product_attention(
+            q,
+            k,
+            v,
+        )
+        x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
+
+        # output: project, restore (b, c, t, h, w) and add the residual
+        x = self.proj(x)
+        x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
+        return x + identity
+
+
+def patchify(x, patch_size):
+    """Fold each `patch_size` x `patch_size` spatial patch of a 4-D or 5-D tensor into the channel axis.
+
+    Returns `x` itself when `patch_size == 1`; raises ValueError for any other rank.
+    """
+    if patch_size == 1:
+        return x
+    ndim = x.dim()
+    if ndim not in (4, 5):
+        raise ValueError(f"Invalid input shape: {x.shape}")
+    if ndim == 4:
+        pattern = "b c (h q) (w r) -> b (c r q) h w"
+    else:
+        pattern = "b c f (h q) (w r) -> b (c r q) f h w"
+
+    return rearrange(x, pattern, q=patch_size, r=patch_size)
+
+
+def unpatchify(x, patch_size):
+    """Move the `patch_size`**2 channel groups of a 4-D or 5-D tensor back onto the spatial axes."""
+    if patch_size == 1:
+        return x
+
+    # Ranks other than 4 and 5 have no pattern and are returned unchanged.
+    patterns = {
+        4: "b (c r q) h w -> b c (h q) (w r)",
+        5: "b (c r q) f h w -> b c f (h q) (w r)",
+    }
+    pattern = patterns.get(x.dim())
+    if pattern is None:
+        return x
+    return rearrange(x, pattern, q=patch_size, r=patch_size)
+
+
+class AvgDown3D(nn.Module):
+    """Parameter-free downsample: folds factor_t x factor_s x factor_s blocks into channels, then group-averages."""
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        factor_t,
+        factor_s=1,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.factor_t = factor_t
+        self.factor_s = factor_s
+        self.factor = self.factor_t * self.factor_s * self.factor_s
+
+        assert in_channels * self.factor % out_channels == 0
+        self.group_size = in_channels * self.factor // out_channels  # folded channels averaged per output channel
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t  # frames missing to a multiple of factor_t
+        pad = (0, 0, 0, 0, pad_t, 0)  # F.pad order (W, W, H, H, T_front, T_back): pad only the front of time
+        x = F.pad(x, pad)
+        B, C, T, H, W = x.shape
+        x = x.view(
+            B,
+            C,
+            T // self.factor_t,
+            self.factor_t,
+            H // self.factor_s,
+            self.factor_s,
+            W // self.factor_s,
+            self.factor_s,
+        )
+        x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()  # -> (B, C, factor_t, factor_s, factor_s, T', H', W')
+        x = x.view(
+            B,
+            C * self.factor,
+            T // self.factor_t,
+            H // self.factor_s,
+            W // self.factor_s,
+        )
+        x = x.view(
+            B,
+            self.out_channels,
+            self.group_size,
+            T // self.factor_t,
+            H // self.factor_s,
+            W // self.factor_s,
+        )
+        x = x.mean(dim=2)  # average each group of `group_size` folded channels
+        return x
+
+
+class DupUp3D(nn.Module):
+    """Parameter-free upsample: duplicates channels, then unfolds them onto the T/H/W axes by factor_t/factor_s."""
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        factor_t,
+        factor_s=1,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+        self.factor_t = factor_t
+        self.factor_s = factor_s
+        self.factor = self.factor_t * self.factor_s * self.factor_s
+
+        assert out_channels * self.factor % in_channels == 0
+        self.repeats = out_channels * self.factor // in_channels  # copies of each input channel
+
+    def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
+        x = x.repeat_interleave(self.repeats, dim=1)  # -> out_channels * factor channels
+        x = x.view(
+            x.size(0),
+            self.out_channels,
+            self.factor_t,
+            self.factor_s,
+            self.factor_s,
+            x.size(2),
+            x.size(3),
+            x.size(4),
+        )
+        x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()  # -> (B, out, T, factor_t, H, factor_s, W, factor_s)
+        x = x.view(
+            x.size(0),
+            self.out_channels,
+            x.size(2) * self.factor_t,
+            x.size(4) * self.factor_s,
+            x.size(6) * self.factor_s,
+        )
+        if first_chunk:  # drop the first factor_t - 1 frames of the upsampled result
+            x = x[:, :, self.factor_t - 1 :, :, :]
+        return x
+
+
+class Down_ResidualBlock(nn.Module):
+    """`mult` ResidualBlocks (plus a Resample when `down_flag`) summed with an AvgDown3D shortcut of the input."""
+    def __init__(self, in_dim, out_dim, dropout, mult, temperal_downsample=False, down_flag=False):
+        super().__init__()
+        # Shortcut path with downsample
+        self.avg_shortcut = AvgDown3D(
+            in_dim,
+            out_dim,
+            factor_t=2 if temperal_downsample else 1,
+            factor_s=2 if down_flag else 1,
+        )
+
+        # Main path with residual blocks and downsample
+        downsamples = []
+        for _ in range(mult):
+            downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+            in_dim = out_dim
+
+        # Add the final downsample block
+        if down_flag:
+            mode = "downsample3d" if temperal_downsample else "downsample2d"
+            downsamples.append(Resample(out_dim, mode=mode))
+
+        self.downsamples = nn.Sequential(*downsamples)
+
+    def forward(self, x, feat_cache=None, feat_idx=None):
+        """Run the main path and add the shortcut; `feat_idx=None` starts a fresh `[0]` slot counter per call."""
+        feat_idx = [0] if feat_idx is None else feat_idx  # avoids a shared mutable default that is mutated in place
+        x_copy = x.clone()
+        for module in self.downsamples:
+            x = module(x, feat_cache, feat_idx)
+        return x + self.avg_shortcut(x_copy)
+
+
+class Up_ResidualBlock(nn.Module):
+    """`mult` ResidualBlocks (plus a Resample when `up_flag`); with `up_flag` a DupUp3D shortcut is added."""
+    def __init__(self, in_dim, out_dim, dropout, mult, temperal_upsample=False, up_flag=False):
+        super().__init__()
+        # Shortcut path with upsample
+        if up_flag:
+            self.avg_shortcut = DupUp3D(
+                in_dim,
+                out_dim,
+                factor_t=2 if temperal_upsample else 1,
+                factor_s=2 if up_flag else 1,
+            )
+        else:
+            self.avg_shortcut = None
+
+        # Main path with residual blocks and upsample
+        upsamples = []
+        for _ in range(mult):
+            upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
+            in_dim = out_dim
+
+        # Add the final upsample block
+        if up_flag:
+            mode = "upsample3d" if temperal_upsample else "upsample2d"
+            upsamples.append(Resample(out_dim, mode=mode))
+        self.upsamples = nn.Sequential(*upsamples)
+
+    def forward(self, x, feat_cache=None, feat_idx=None, first_chunk=False):
+        """Run the main path (adding the shortcut if any); `feat_idx=None` starts a fresh `[0]` counter per call."""
+        feat_idx = [0] if feat_idx is None else feat_idx  # avoids a shared mutable default that is mutated in place
+        x_main = x.clone()
+        for module in self.upsamples:
+            x_main = module(x_main, feat_cache, feat_idx)
+        if self.avg_shortcut is not None:
+            x_shortcut = self.avg_shortcut(x, first_chunk)
+            return x_main + x_shortcut
+        return x_main
+
+
+class Encoder3d(nn.Module):
+    """conv1 (12 input channels) -> Down_ResidualBlock stages -> middle (res, attention, res) -> head to `z_dim`."""
+
+    def __init__(
+        self,
+        dim=128,
+        z_dim=4,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_downsample=[True, True, False],
+        dropout=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+
+        # dimensions
+        dims = [dim * u for u in [1] + dim_mult]
+
+        # init block
+        self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
+
+        # downsample blocks
+        downsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            t_down_flag = temperal_downsample[i] if i < len(temperal_downsample) else False
+            downsamples.append(
+                Down_ResidualBlock(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    dropout=dropout,
+                    mult=num_res_blocks,
+                    temperal_downsample=t_down_flag,
+                    down_flag=i != len(dim_mult) - 1,
+                )
+            )
+        self.downsamples = nn.Sequential(*downsamples)
+
+        # middle blocks
+        self.middle = nn.Sequential(
+            ResidualBlock(out_dim, out_dim, dropout),
+            AttentionBlock(out_dim),
+            ResidualBlock(out_dim, out_dim, dropout),
+        )
+
+        # # output blocks
+        self.head = nn.Sequential(
+            RMS_norm(out_dim, images=False),
+            nn.SiLU(),
+            CausalConv3d(out_dim, z_dim, 3, padding=1),
+        )
+
+    def forward(self, x, feat_cache=None, feat_idx=None):
+        """Encode `x`; with `feat_cache`, causal convs use slots from `feat_idx[0]` on (fresh `[0]` if None)."""
+        feat_idx = [0] if feat_idx is None else feat_idx  # avoids a shared mutable default that is mutated in place
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                cache_x = torch.cat(
+                    [
+                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
+                        cache_x,
+                    ],
+                    dim=2,
+                )
+            x = self.conv1(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        ## downsamples
+        for layer in self.downsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## middle
+        for layer in self.middle:
+            if isinstance(layer, ResidualBlock) and feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## head
+        for layer in self.head:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    cache_x = torch.cat(
+                        [
+                            feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
+                            cache_x,
+                        ],
+                        dim=2,
+                    )
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+
+        return x
+
+
+class Decoder3d(nn.Module):
+
+    def __init__(
+        self,
+        dim=128,
+        z_dim=4,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_upsample=[False, True, True],
+        dropout=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_upsample = temperal_upsample
+
+        # dimensions
+        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        scale = 1.0 / 2 ** (len(dim_mult) - 2)
+        # init block
+        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
+
+        # middle blocks
+        self.middle = nn.Sequential(
+            ResidualBlock(dims[0], dims[0], dropout),
+            AttentionBlock(dims[0]),
+            ResidualBlock(dims[0], dims[0], dropout),
+        )
+
+        # upsample blocks
+        upsamples = []
+        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+            t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False
+            upsamples.append(
+                Up_ResidualBlock(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    dropout=dropout,
+                    mult=num_res_blocks + 1,
+                    temperal_upsample=t_up_flag,
+                    up_flag=i != len(dim_mult) - 1,
+                )
+            )
+        self.upsamples = nn.Sequential(*upsamples)
+
+        # output blocks
+        self.head = nn.Sequential(
+            RMS_norm(out_dim, images=False),
+            nn.SiLU(),
+            CausalConv3d(out_dim, 12, 3, padding=1),
+        )
+
+    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
+        if feat_cache is not None:
+            idx = feat_idx[0]
+            cache_x = x[:, :, -CACHE_T:, :, :].clone()
+            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                cache_x = torch.cat(
+                    [
+                        feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
+                        cache_x,
+                    ],
+                    dim=2,
+                )
+            x = self.conv1(x, feat_cache[idx])
+            feat_cache[idx] = cache_x
+            feat_idx[0] += 1
+        else:
+            x = self.conv1(x)
+
+        for layer in self.middle:
+            if isinstance(layer, ResidualBlock) and feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx)
+            else:
+                x = layer(x)
+
+        ## upsamples
+        for layer in self.upsamples:
+            if feat_cache is not None:
+                x = layer(x, feat_cache, feat_idx, first_chunk)
+            else:
+                x = layer(x)
+
+        ## head
+        for layer in self.head:
+            if isinstance(layer, CausalConv3d) and feat_cache is not None:
+                idx = feat_idx[0]
+                cache_x = x[:, :, -CACHE_T:, :, :].clone()
+                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                    cache_x = torch.cat(
+                        [
+                            feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device),
+                            cache_x,
+                        ],
+                        dim=2,
+                    )
+                x = layer(x, feat_cache[idx])
+                feat_cache[idx] = cache_x
+                feat_idx[0] += 1
+            else:
+                x = layer(x)
+        return x
+
+
+def count_conv3d(model):
+    count = 0
+    for m in model.modules():
+        if isinstance(m, CausalConv3d):
+            count += 1
+    return count
+
+
+class WanVAE_(nn.Module):
+
+    def __init__(
+        self,
+        dim=160,
+        dec_dim=256,
+        z_dim=16,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_downsample=[True, True, False],
+        dropout=0.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.z_dim = z_dim
+        self.dim_mult = dim_mult
+        self.num_res_blocks = num_res_blocks
+        self.attn_scales = attn_scales
+        self.temperal_downsample = temperal_downsample
+        self.temperal_upsample = temperal_downsample[::-1]
+
+        # modules
+        self.encoder = Encoder3d(
+            dim,
+            z_dim * 2,
+            dim_mult,
+            num_res_blocks,
+            attn_scales,
+            self.temperal_downsample,
+            dropout,
+        )
+        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
+        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
+        self.decoder = Decoder3d(
+            dec_dim,
+            z_dim,
+            dim_mult,
+            num_res_blocks,
+            attn_scales,
+            self.temperal_upsample,
+            dropout,
+        )
+
+    def forward(self, x, scale=[0, 1]):
+        mu = self.encode(x, scale)
+        x_recon = self.decode(mu, scale)
+        return x_recon, mu
+
+    def encode(self, x, scale):
+        self.clear_cache()
+        x = patchify(x, patch_size=2)
+        t = x.shape[2]
+        iter_ = 1 + (t - 1) // 4
+        for i in range(iter_):
+            self._enc_conv_idx = [0]
+            if i == 0:
+                out = self.encoder(
+                    x[:, :, :1, :, :],
+                    feat_cache=self._enc_feat_map,
+                    feat_idx=self._enc_conv_idx,
+                )
+            else:
+                out_ = self.encoder(
+                    x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :],
+                    feat_cache=self._enc_feat_map,
+                    feat_idx=self._enc_conv_idx,
+                )
+                out = torch.cat([out, out_], 2)
+        mu, log_var = self.conv1(out).chunk(2, dim=1)
+        if isinstance(scale[0], torch.Tensor):
+            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(1, self.z_dim, 1, 1, 1)
+        else:
+            mu = (mu - scale[0]) * scale[1]
+        self.clear_cache()
+        return mu, log_var
+
+    def decode(self, z, scale):
+        self.clear_cache()
+        if isinstance(scale[0], torch.Tensor):
+            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(1, self.z_dim, 1, 1, 1) # 这里会扩充维度
+        else:
+            z = z / scale[1] + scale[0]
+        iter_ = z.shape[2]
+        x = self.conv2(z)
+        for i in range(iter_):
+            self._conv_idx = [0]
+            if i == 0:
+                out = self.decoder(
+                    x[:, :, i : i + 1, :, :],
+                    feat_cache=self._feat_map,
+                    feat_idx=self._conv_idx,
+                    first_chunk=True,
+                )
+            else:
+                out_ = self.decoder(
+                    x[:, :, i : i + 1, :, :],
+                    feat_cache=self._feat_map,
+                    feat_idx=self._conv_idx,
+                )
+                out = torch.cat([out, out_], 2)
+        out = unpatchify(out, patch_size=2)
+        self.clear_cache()
+        return out
+
+    def reparameterize(self, mu, log_var):
+        std = torch.exp(0.5 * log_var)
+        eps = torch.randn_like(std)
+        return eps * std + mu
+
+    def sample(self, imgs, deterministic=False):
+        mu, log_var = self.encode(imgs)
+        if deterministic:
+            return mu
+        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
+        return mu + std * torch.randn_like(std)
+
+    def clear_cache(self):
+        self._conv_num = count_conv3d(self.decoder)
+        self._conv_idx = [0]
+        self._feat_map = [None] * self._conv_num
+        # cache encode
+        self._enc_conv_num = count_conv3d(self.encoder)
+        self._enc_conv_idx = [0]
+        self._enc_feat_map = [None] * self._enc_conv_num
+
+
+def _video_vae(pretrained_path=None, z_dim=16, dim=160, device="cpu", **kwargs):
+    # params
+    cfg = dict(
+        dim=dim,
+        z_dim=z_dim,
+        dim_mult=[1, 2, 4, 4],
+        num_res_blocks=2,
+        attn_scales=[],
+        temperal_downsample=[True, True, True],
+        dropout=0.0,
+    )
+    cfg.update(**kwargs)
+
+    # init model
+    with torch.device("meta"):
+        model = WanVAE_(**cfg)
+
+    # load checkpoint
+    logging.info(f"loading {pretrained_path}")
+    model.load_state_dict(torch.load(pretrained_path, map_location=device, weights_only=True), assign=True)
+
+    return model
+
+
+class Wan2_2_VAE:
+
+    def __init__(
+        self,
+        z_dim=48,
+        c_dim=160,
+        vae_pth=None,
+        dim_mult=[1, 2, 4, 4],
+        temperal_downsample=[False, True, True],
+        dtype=torch.float,
+        device="cuda",
+    ):
+
+        self.dtype = dtype
+        # self.device = device
+
+        mean = torch.tensor(
+            [
+                -0.2289,
+                -0.0052,
+                -0.1323,
+                -0.2339,
+                -0.2799,
+                0.0174,
+                0.1838,
+                0.1557,
+                -0.1382,
+                0.0542,
+                0.2813,
+                0.0891,
+                0.1570,
+                -0.0098,
+                0.0375,
+                -0.1825,
+                -0.2246,
+                -0.1207,
+                -0.0698,
+                0.5109,
+                0.2665,
+                -0.2108,
+                -0.2158,
+                0.2502,
+                -0.2055,
+                -0.0322,
+                0.1109,
+                0.1567,
+                -0.0729,
+                0.0899,
+                -0.2799,
+                -0.1230,
+                -0.0313,
+                -0.1649,
+                0.0117,
+                0.0723,
+                -0.2839,
+                -0.2083,
+                -0.0520,
+                0.3748,
+                0.0152,
+                0.1957,
+                0.1433,
+                -0.2944,
+                0.3573,
+                -0.0548,
+                -0.1681,
+                -0.0667,
+            ],
+            dtype=dtype,
+            device=device,
+        )
+        std = torch.tensor(
+            [
+                0.4765,
+                1.0364,
+                0.4514,
+                1.1677,
+                0.5313,
+                0.4990,
+                0.4818,
+                0.5013,
+                0.8158,
+                1.0344,
+                0.5894,
+                1.0901,
+                0.6885,
+                0.6165,
+                0.8454,
+                0.4978,
+                0.5759,
+                0.3523,
+                0.7135,
+                0.6804,
+                0.5833,
+                1.4146,
+                0.8986,
+                0.5659,
+                0.7069,
+                0.5338,
+                0.4889,
+                0.4917,
+                0.4069,
+                0.4999,
+                0.6866,
+                0.4093,
+                0.5709,
+                0.6065,
+                0.6415,
+                0.4944,
+                0.5726,
+                1.2042,
+                0.5458,
+                1.6887,
+                0.3971,
+                1.0600,
+                0.3943,
+                0.5537,
+                0.5444,
+                0.4089,
+                0.7468,
+                0.7744,
+            ],
+            dtype=dtype,
+            device=device,
+        )
+        self.scale = [mean, 1.0 / std]
+
+        # init model
+        self.model = (
+            _video_vae(
+                pretrained_path=vae_pth,
+                z_dim=z_dim,
+                dim=c_dim,
+                dim_mult=dim_mult,
+                temperal_downsample=temperal_downsample,
+            )
+            .eval()
+            .requires_grad_(False)
+            .to(device=device, dtype=dtype)
+        )
+
+    def encode(self, video):
+        try:
+            # if not isinstance(videos, list):
+                # raise TypeError("videos should be a list")
+            # with amp.autocast(dtype=self.dtype):
+            # with torch.autocast(dtype=self.dtype):
+            u, log_var = self.model.encode(video, self.scale) # [B,48,t,h,w], [B,48,t,h,w]
+            return u.float(), log_var.float()
+        except TypeError as e:
+            logging.info(e)
+            return None
+
+    def decode(self, u):
+        try:
+            # if not isinstance(zs, list):
+                # raise TypeError("zs should be a list")
+            # with amp.autocast(dtype=self.dtype):
+            # with torch.autocast(dtype=self.dtype):
+            return self.model.decode(u, self.scale).float().clamp_(-1, 1)
+        except TypeError as e:
+            logging.info(e)
+            return None
diff --git a/modeling/vit/qwen2_5_vl_vit.py b/modeling/vit/qwen2_5_vl_vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..01501ca7de669ffb37516c4b8224162acec9072e
--- /dev/null
+++ b/modeling/vit/qwen2_5_vl_vit.py
@@ -0,0 +1,494 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# coding: utf-8
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple
+from transformers import Qwen2_5_VLPreTrainedModel
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
+from transformers.utils import is_flash_attn_2_available
+from transformers.activations import ACT2FN
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_varlen_func
+    from flash_attn.layers.rotary import apply_rotary_emb
+
+else:
+    flash_attn_varlen_func = None
+    apply_rotary_emb = None
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    orig_q_dtype = q.dtype
+    orig_k_dtype = k.dtype
+    q, k = q.float(), k.float()
+    cos, sin = cos.unsqueeze(-2), sin.unsqueeze(-2)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    q_embed = q_embed.to(orig_q_dtype)
+    k_embed = k_embed.to(orig_k_dtype)
+    return q_embed, k_embed
+
+class Qwen2_5_VLVisionAttention(nn.Module):
+    def __init__(self, dim: int, num_heads: int = 16) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        seq_length = hidden_states.shape[0]
+        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        if position_embeddings is None:
+            # logger.warning_once(
+            #     "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+            #     "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+            #     "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+            #     "removed and `position_embeddings` will be mandatory."
+            # )
+            emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = emb.cos().float()
+            sin = emb.sin().float()
+        else:
+            cos, sin = position_embeddings
+        q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+
+        attention_mask = torch.full(
+            [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
+        )
+        for i in range(1, len(cu_seqlens)):
+            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+
+        q = q.transpose(0, 1)
+        k = k.transpose(0, 1)
+        v = v.transpose(0, 1)
+        attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
+        attn_weights = attn_weights + attention_mask
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
+        attn_output = torch.matmul(attn_weights, v)
+        attn_output = attn_output.transpose(0, 1)
+        attn_output = attn_output.reshape(seq_length, -1)
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+def apply_rotary_pos_emb_flashatt(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    cos = cos.chunk(2, dim=-1)[0].contiguous()
+    sin = sin.chunk(2, dim=-1)[0].contiguous()
+    q_embed = apply_rotary_emb(q.float(), cos, sin).type_as(q)
+    k_embed = apply_rotary_emb(k.float(), cos, sin).type_as(k)
+    return q_embed, k_embed
+
+class Qwen2_5_VLVisionFlashAttention2(nn.Module):
+    def __init__(self, dim: int, num_heads: int = 16) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        seq_length = hidden_states.shape[0]
+        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        if position_embeddings is None:
+            # logger.warning_once(
+            #     "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+            #     "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+            #     "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+            #     "removed and `position_embeddings` will be mandatory."
+            # )
+            emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = emb.cos().float()
+            sin = emb.sin().float()
+        else:
+            cos, sin = position_embeddings
+            cos, sin = cos.float(), sin.float()  # NOTE BAGEL中报错, AssertionError: Input and cos/sin must have the same dtype, got torch.float32 and torch.bfloat16
+        q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
+        q = q.squeeze(0)
+        k = k.squeeze(0)
+
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
+            seq_length, -1
+        )
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+class Qwen2_5_VLVisionSdpaAttention(nn.Module):
+    def __init__(self, dim: int, num_heads: int = 16) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=True)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        seq_length = hidden_states.shape[0]
+        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        if position_embeddings is None:
+            # logger.warning_once(
+            #     "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+            #     "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+            #     "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+            #     "removed and `position_embeddings` will be mandatory."
+            # )
+            emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+            cos = emb.cos().float()
+            sin = emb.sin().float()
+        else:
+            cos, sin = position_embeddings
+        q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+
+        attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
+        for i in range(1, len(cu_seqlens)):
+            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
+        q = q.transpose(0, 1)
+        k = k.transpose(0, 1)
+        v = v.transpose(0, 1)
+        attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
+        attn_output = attn_output.transpose(0, 1)
+        attn_output = attn_output.reshape(seq_length, -1)
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+# Maps an attention-implementation name to the attention module class;
+# Qwen2_5_VLVisionBlock indexes this with its ``attn_implementation`` argument.
+QWEN2_5_VL_VISION_ATTENTION_CLASSES = {
+    "eager": Qwen2_5_VLVisionAttention,
+    "flash_attention_2": Qwen2_5_VLVisionFlashAttention2,
+    "sdpa": Qwen2_5_VLVisionSdpaAttention,
+}
+
+class Qwen2_5_VisionPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_channels: int = 3,
+        embed_dim: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.in_channels = in_channels
+        self.embed_dim = embed_dim
+
+        kernel_size = [temporal_patch_size, patch_size, patch_size]
+        self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.proj.weight.dtype
+        hidden_states = hidden_states.view(
+            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
+        )
+        hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
+        return hidden_states
+
+class Qwen2_5_VisionRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+class Qwen2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Qwen2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+class Qwen2_5_VLMLP(nn.Module):
+    def __init__(self, config, bias: bool = False):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
+
+class Qwen2_5_VLVisionBlock(nn.Module):
+    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
+        super().__init__()
+        self.norm1 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
+        self.norm2 = Qwen2RMSNorm(config.hidden_size, eps=1e-6)
+        self.attn = QWEN2_5_VL_VISION_ATTENTION_CLASSES[attn_implementation](
+            config.hidden_size, num_heads=config.num_heads
+        )
+        self.mlp = Qwen2_5_VLMLP(config, bias=True)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        hidden_states = hidden_states + self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            position_embeddings=position_embeddings,
+        )
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+class Qwen2_5_VLPatchMerger(nn.Module):
+    def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        self.ln_q = Qwen2RMSNorm(context_dim, eps=1e-6)
+        self.mlp = nn.Sequential(
+            nn.Linear(self.hidden_size, self.hidden_size),
+            nn.GELU(),
+            nn.Linear(self.hidden_size, dim),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
+        return x
+
+class Qwen2_5_VisionTransformerPretrainedModel(Qwen2_5_VLPreTrainedModel):
+    config_class = Qwen2_5_VLVisionConfig
+    _no_split_modules = ["Qwen2_5_VLVisionBlock"]
+
+    def __init__(self, config, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.patch_size
+        self.fullatt_block_indexes = config.fullatt_block_indexes
+        self.window_size = config.window_size
+        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
+
+        self.patch_embed = Qwen2_5_VisionPatchEmbed(
+            patch_size=config.patch_size,
+            temporal_patch_size=config.temporal_patch_size,
+            in_channels=config.in_channels,
+            embed_dim=config.hidden_size,
+        )
+
+        head_dim = config.hidden_size // config.num_heads
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList(
+            [Qwen2_5_VLVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
+        )
+        self.merger = Qwen2_5_VLPatchMerger(
+            dim=config.out_hidden_size,
+            context_dim=config.hidden_size,
+            spatial_merge_size=config.spatial_merge_size,
+        )
+        # 将原来2*2个patch合并成一个token。通过先把2*2个token的结果concat起来，然后经过一个mlp层做hidden_size维度的映射得到
+        self.gradient_checkpointing = False
+
+    def rot_pos_emb(self, grid_thw):  # Returns the rotary embeddings gathered for every patch of every grid in grid_thw
+        pos_ids = []
+        for t, h, w in grid_thw:  # Handle each grid (t frames, height h, width w) separately and build 2-D position ids
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)  # Row (height) index of every cell in the (h, w) grid
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)  # Reorder so the cells of each merge block end up contiguous
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)  # Column (width) index of every cell in the (h, w) grid
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)  # Same block-wise ordering as hpos_ids
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))  # (row, col) pairs, repeated for each of the t frames
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()  # Largest height or width over all grids
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)  # Rotary embedding table built for the largest grid size
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)  # Look up the table with each patch's (row, col) ids and flatten the pair into one vector
+        return rotary_pos_emb
+
+    def get_window_index(self, grid_thw):  # Returns (window_index tensor, cu_window_seqlens list) that group merged cells into windows
+        window_index: list = []  # Per-grid tensors of merged-cell ids listed window by window
+        cu_window_seqlens: list = [0]  # Cumulative window boundaries, counted in patches
+        window_index_id = 0  # Number of merged cells contributed by the grids processed so far
+        vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size  # Window side length in merged cells
+
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h, llm_grid_w = (
+                grid_h // self.spatial_merge_size,
+                grid_w // self.spatial_merge_size,
+            )
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)  # Id of every merged cell
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size  # Equals a full window when already divisible
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)  # -100 marks padded cells
+            index_padded = index_padded.reshape(
+                grid_t,
+                num_windows_h,
+                vit_merger_window_size,
+                num_windows_w,
+                vit_merger_window_size,
+            )
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t,
+                num_windows_h * num_windows_w,
+                vit_merger_window_size,
+                vit_merger_window_size,
+            )
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)  # Count of real (non-padded) cells in each window
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]  # Drop the padding, keeping window-major order
+            window_index.append(index_new + window_index_id)  # Shift ids past the cells of earlier grids
+            cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]  # Scale cells to patches and continue from the last boundary
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+        window_index = torch.cat(window_index, dim=0)
+
+        return window_index, cu_window_seqlens
+
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input handed to `self.patch_embed`, whose output is unpacked as a 2-D `(seq_len, dim)` tensor.
+            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height and width grid sizes of each image or video.
+
+        Returns:
+            `torch.Tensor`: the merger output, put back into the order that preceded the window reordering.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)  # Compute the rotary position embedding (RoPE) for every input image/video
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=hidden_states.device,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)  # Collapse repeated boundaries left by windows with no real cells
+
+        seq_len, _ = hidden_states.size()
+        hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)  # Group patches by merge unit
+        hidden_states = hidden_states[window_index, :, :]  # Reorder the groups window by window
+        hidden_states = hidden_states.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]  # Apply the same reordering to the rotary embeddings
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)  # Duplicate along the last dimension before taking cos/sin
+        position_embeddings = (emb.cos(), emb.sin())
+
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)  # Prepend 0 so the boundaries start at the beginning of the sequence
+
+        for layer_num, blk in enumerate(self.blocks):
+            if layer_num in self.fullatt_block_indexes:
+                cu_seqlens_now = cu_seqlens  # Per-frame boundaries (h * w patches per frame)
+            else:
+                cu_seqlens_now = cu_window_seqlens  # Per-window boundaries
+            if self.gradient_checkpointing and self.training:
+                hidden_states = self._gradient_checkpointing_func(
+                    blk.__call__, hidden_states, cu_seqlens_now, None, position_embeddings
+                )
+            else:
+                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens_now, position_embeddings=position_embeddings)
+
+        hidden_states = self.merger(hidden_states) # Original author's shape note: L x 1280 -> L//4 x 2048 (sizes depend on the config)
+        reverse_indices = torch.argsort(window_index)  # Inverse of the window_index permutation
+        hidden_states = hidden_states[reverse_indices, :] # Shape unchanged by the reordering (still L//4 x 2048 in the note above)
+
+        return hidden_states
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e3c73af7d22f2ec19d42e51a0e7ea7fbf3f7d2b4
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,66 @@
+absl-py==0.15.0
+accelerate==1.13.0
+addict==2.4.0
+albumentations==1.4.3
+annotated-types==0.7.0
+bitsandbytes==0.49.2
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+click==8.3.3
+Cython==3.0.11
+decord==0.6.0
+einops==0.8.1
+einops-exts==0.0.4
+filelock==3.16.1
+flash-attn==2.6.3
+fsspec==2023.6.0
+ftfy==6.1.1
+h5py==3.12.1
+huggingface-hub==0.29.1
+imageio==2.34.0
+imageio-ffmpeg==0.5.1
+Jinja2==3.1.3
+joblib==1.4.2
+kornia==0.8.2
+librosa==0.10.2.post1
+markupsafe==2.1.5
+numpy==1.24.4
+omegaconf==2.3.0
+opencv-python==4.7.0.72
+opt_einsum==3.4.0
+packaging==26.1
+peft==0.5.0
+pillow==11.0.0
+protobuf==3.20.3
+psutil==5.9.4
+pycparser==2.23
+pydantic==2.11.10
+pydantic_core==2.33.2
+PyYAML==6.0
+qwen-vl-utils==0.0.14
+requests==2.32.3
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.10.1
+sentencepiece==0.1.99
+soundfile==0.12.1
+tabulate==0.9.0
+tenacity==8.2.2
+termcolor==2.5.0
+tiktoken==0.7.0
+timm==0.6.13
+tokenizers==0.21.4
+torch-fidelity==0.3.0
+torchlibrosa==0.1.0
+torchmetrics==1.3.2
+tqdm==4.67.3
+transformers-stream-generator==0.0.5
+triton==3.1.0
+typing_extensions==4.15.0
+urllib3==1.26.20
+webdataset==0.2.48
+yacs==0.1.8
+zipp==3.23.1
+httpx==0.23.3
+gpustat
\ No newline at end of file
diff --git a/setup_env.sh b/setup_env.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b218f195fc3c25f4520537c6e2cda984fd6ceece
--- /dev/null
+++ b/setup_env.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# setup_env.sh - install the pinned Python packages this project needs.
+# Usage: ./setup_env.sh [python_path]   (python_path defaults to python3)
+# The script stops at the first failing step.
+
+set -euo pipefail  # strict mode: abort on errors, unset variables and failed pipeline stages
+
+# Ignore UserWarning messages attributed to the wandb.apis.public module.
+export PYTHONWARNINGS="ignore::UserWarning:wandb.apis.public"
+
+# --- Configuration ---
+PYTHON=${1:-python3}
+TIMEOUT=300  # seconds allowed per pip invocation (enforced by `timeout`)
+TORCH_INDEX_URL=${TORCH_INDEX_URL:-https://download.pytorch.org/whl/cu124}  # index serving the +cu124 builds
+
+# Key packages, installed one at a time after requirements.txt.
+KEY_PACKAGES=(
+    "transformers==4.49.0"  # NOTE: transformers==4.53.1 has problems when loading the language model weights
+    "diffusers==0.29.1"
+    "torch==2.5.1+cu124"
+    "torchvision==0.20.1+cu124"
+    "torchaudio==2.5.1+cu124"
+    "gradio==5.35"
+)
+
+# --- Main flow: first remove pynvml if present (a failed uninstall is deliberately ignored) ---
+echo ">>> 开始卸载pynvml..."
+"$PYTHON" -m pip uninstall -y pynvml || true
+
+# Install everything listed in requirements.txt (resolved relative to the current directory).
+echo ">>> 开始从requirements.txt安装软件包..."
+timeout "$TIMEOUT" "$PYTHON" -m pip install --upgrade --no-cache-dir -r requirements.txt
+
+# Install the key packages individually; the extra index lets pip resolve the +cu124 pins.
+echo ">>> 开始安装关键软件包..."
+for pkg in "${KEY_PACKAGES[@]}"; do
+    echo "--- 正在安装: $pkg ---"
+    timeout "$TIMEOUT" "$PYTHON" -m pip install --upgrade --no-cache-dir --extra-index-url "$TORCH_INDEX_URL" "$pkg"
+done
+
+# Every step succeeded.
+echo "✓ 所有包均已成功安装或更新。"
\ No newline at end of file