bazaar-research commited on
Commit
0a7036f
·
verified ·
1 Parent(s): 90764a1

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. .gitignore +2 -0
  3. .vscode/launch.json +58 -0
  4. INSTALL.md +55 -0
  5. LICENSE.txt +201 -0
  6. LingBot_VA_paper.pdf +3 -0
  7. Makefile +5 -0
  8. README.md +371 -0
  9. assets/teaser.mp4 +3 -0
  10. assets/teaser_v3.png +3 -0
  11. debug/place_fan/call1_reset.msgpack +3 -0
  12. debug/place_fan/call2.msgpack +3 -0
  13. debug/place_fan/call3.msgpack +3 -0
  14. evaluation/robotwin/calc_stat.py +132 -0
  15. evaluation/robotwin/eval_polict_client_openpi.py +696 -0
  16. evaluation/robotwin/geometry.py +463 -0
  17. evaluation/robotwin/launch_client.sh +40 -0
  18. evaluation/robotwin/launch_client_multigpus.sh +81 -0
  19. evaluation/robotwin/launch_server.sh +15 -0
  20. evaluation/robotwin/launch_server_multigpus.sh +31 -0
  21. evaluation/robotwin/msgpack_numpy.py +57 -0
  22. evaluation/robotwin/test_render.py +81 -0
  23. evaluation/robotwin/websocket_client_policy.py +108 -0
  24. example/franka/observation.images.cam_high.png +0 -0
  25. example/franka/observation.images.cam_left_wrist.png +0 -0
  26. example/franka/observation.images.cam_right_wrist.png +0 -0
  27. example/robotwin/observation.images.cam_high.png +0 -0
  28. example/robotwin/observation.images.cam_left_wrist.png +0 -0
  29. example/robotwin/observation.images.cam_right_wrist.png +0 -0
  30. lingbot_robotwin_policy.py +506 -0
  31. pyproject.toml +61 -0
  32. requirements.txt +11 -0
  33. script/run_launch_va_server_sync.sh +34 -0
  34. wan_va/__init__.py +2 -0
  35. wan_va/configs/__init__.py +12 -0
  36. wan_va/configs/shared_config.py +13 -0
  37. wan_va/configs/va_franka_cfg.py +59 -0
  38. wan_va/configs/va_franka_i2va.py +11 -0
  39. wan_va/configs/va_robotwin_cfg.py +54 -0
  40. wan_va/configs/va_robotwin_i2va.py +11 -0
  41. wan_va/distributed/__init__.py +1 -0
  42. wan_va/distributed/fsdp.py +42 -0
  43. wan_va/distributed/util.py +29 -0
  44. wan_va/modules/__init__.py +7 -0
  45. wan_va/modules/model.py +580 -0
  46. wan_va/modules/utils.py +95 -0
  47. wan_va/utils/Simple_Remote_Infer/LEGAL.md +7 -0
  48. wan_va/utils/Simple_Remote_Infer/README.md +16 -0
  49. wan_va/utils/Simple_Remote_Infer/deploy/__init__.py +0 -0
  50. wan_va/utils/Simple_Remote_Infer/deploy/image_tools.py +66 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ LingBot_VA_paper.pdf filter=lfs diff=lfs merge=lfs -text
37
+ assets/teaser.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ assets/teaser_v3.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.pyc
2
+ visualization/
.vscode/launch.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+
8
+ {
9
+ "name": "Wan Server",
10
+ "type": "debugpy",
11
+ "request": "launch",
12
+ "program": "${file}",
13
+ "console": "integratedTerminal",
14
+ "justMyCode": false,
15
+ "args": [
16
+ "--config-name",
17
+ "robotwin",
18
+ "--port",
19
+ "29056",
20
+ "--save_root",
21
+ "visualization/",
22
+ "--debug_infer_once"
23
+ ],
24
+ "env": {
25
+ "CUDA_VISIBLE_DEVICES": "6"
26
+ }
27
+ },
28
+ {
29
+ "name": "Robotwin Client",
30
+ "type": "debugpy",
31
+ "request": "launch",
32
+ "module": "evaluation.robotwin.eval_polict_client_openpi",
33
+ "console": "integratedTerminal",
34
+ "justMyCode": false,
35
+ "cwd": "${workspaceFolder}",
36
+ "args": [
37
+ "--config", "policy/ACT/deploy_policy.yml",
38
+ "--overrides",
39
+ "--task_name", "adjust_bottle",
40
+ "--task_config", "demo_clean",
41
+ "--train_config_name", "0",
42
+ "--model_name", "0",
43
+ "--ckpt_setting", "0",
44
+ "--seed", "0",
45
+ "--policy_name", "ACT",
46
+ "--save_root", "./results",
47
+ "--video_guidance_scale", "5",
48
+ "--action_guidance_scale", "1",
49
+ "--test_num", "100",
50
+ "--port", "29056"
51
+ ],
52
+ "env": {
53
+ "LD_LIBRARY_PATH": "/usr/lib64:/usr/lib:${env:LD_LIBRARY_PATH}",
54
+ "XLA_PYTHON_CLIENT_MEM_FRACTION": "0.9"
55
+ }
56
+ }
57
+ ]
58
+ }
INSTALL.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Installation Guide
2
+
3
+ ## Install with pip
4
+
5
+ ```bash
6
+ pip install .
7
+ pip install .[dev] # Also installs the dev tools
8
+ ```
9
+
10
+ ## Install with Poetry
11
+
12
+ Ensure you have [Poetry](https://python-poetry.org/docs/#installation) installed on your system.
13
+
14
+ To install all dependencies:
15
+
16
+ ```bash
17
+ poetry install
18
+ ```
19
+
20
+ ### Handling `flash-attn` Installation Issues
21
+
22
+ If `flash-attn` fails due to **PEP 517 build issues**, you can try one of the following fixes.
23
+
24
+ #### No-Build-Isolation Installation (Recommended)
25
+ ```bash
26
+ poetry run pip install --upgrade pip setuptools wheel
27
+ poetry run pip install flash-attn --no-build-isolation
28
+ poetry install
29
+ ```
30
+
31
+ #### Install from Git (Alternative)
32
+ ```bash
33
+ poetry run pip install git+https://github.com/Dao-AILab/flash-attention.git
34
+ ```
35
+
36
+ ---
37
+
38
+ ### Running the Model
39
+
40
+ Once the installation is complete, you can run **Wan2.2** using:
41
+
42
+ ```bash
43
+ poetry run python generate.py --task t2v-A14B --size '1280*720' --ckpt_dir ./Wan2.2-T2V-A14B --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
44
+ ```
45
+
46
+ #### Test
47
+ ```bash
48
+ bash tests/test.sh
49
+ ```
50
+
51
+ #### Format
52
+ ```bash
53
+ black .
54
+ isort .
55
+ ```
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
LingBot_VA_paper.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e791faff04ff10eccb62eef7952eabbcb2c654abc4d73f4b4a8d1f683a6e48ba
3
+ size 7834795
Makefile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .PHONY: format
2
+
3
+ format:
4
+ isort wan_va
5
+ yapf -i -r *.py wan_va
README.md ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <h1 align="center">LingBot-VA: Causal World Modeling for Robot Control</h1>
2
+
3
+ <p align="center">
4
+ <a href="https://arxiv.org/abs/2601.21998"><img src="https://img.shields.io/static/v1?label=Paper&message=PDF&color=red&logo=arxiv"></a>
5
+ <a href="https://technology.robbyant.com/lingbot-va"><img src="https://img.shields.io/badge/Project-Website-blue"></a>
6
+ <a href="https://huggingface.co/collections/robbyant/lingbot-va"><img src="https://img.shields.io/static/v1?label=%F0%9F%A4%97%20Model&message=HuggingFace&color=orange"></a>
7
+ <a href="https://modelscope.cn/collections/Robbyant/LingBot-VA"><img src="https://img.shields.io/static/v1?label=%F0%9F%A4%96%20Model&message=ModelScope&color=purple"></a>
8
+ <a href="LICENSE.txt"><img src="https://img.shields.io/badge/License-Apache--2.0-green"></a>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <img src="assets/teaser_v3.png" width="100%">
13
+ </p>
14
+
15
+
16
+
17
+ https://github.com/user-attachments/assets/cec7b7a6-953b-4fa4-8f1a-47efc1fce547
18
+
19
+
20
+
21
+
22
+ ## 💫 Meet **LingBot-VA**! We've built an AR diffusion framework for simultaneous world modeling and action! 🤖✨
23
+
24
+ **LingBot-VA** focuses on:
25
+ - **Autoregressive Video-Action World Modeling**: Architecturally unifies visual dynamics prediction and action inference within a single interleaved sequence while maintaining their conceptual distinction.
26
+ - **High-efficiency Execution**: A dual-stream mixture-of-transformers (MoT) architecture with Asynchronous Execution and KV Cache.
27
+ - **Long-Horizon Performance and Generalization**: High improvements in sample efficiency, long-horizon success rates, and generalization to novel scenes.
28
+
29
+ # 🚀 News
30
+ - **[2026-01-29]** Weights and code for shared backbone released! Please stay tuned for our separated version!
31
+
32
+
33
+
34
+
35
+ ---
36
+
37
+
38
+
39
+ # 📦 Model Download
40
+ - **Pretrained Checkpoints for Post-Training**
41
+
42
+ | Model Name | Huggingface Repository | ModelScope Repository | Description |
43
+ | :--- | :--- | :--- | :--- |
44
+ | lingbot-va-base &nbsp; | [🤗 robbyant/lingbot-va-base &nbsp;](https://huggingface.co/robbyant/lingbot-va-base) | [🤖 Robbyant/lingbot-va-base &nbsp;](https://modelscope.cn/models/Robbyant/lingbot-va-base) | LingBot-VA w/ shared backbone|
45
+ | lingbot-va-posttrain-robotwin &nbsp; | [🤗 robbyant/lingbot-va-posttrain-robotwin &nbsp;](https://huggingface.co/robbyant/lingbot-va-posttrain-robotwin) | [🤖 Robbyant/lingbot-va-posttrain-robotwin &nbsp;](https://modelscope.cn/models/Robbyant/lingbot-va-posttrain-robotwin) | LingBot-VA-Posttrain-Robotwin w/ shared backbone|
46
+ ---
47
+
48
+ # 🛠️ Quick Start
49
+
50
+ ## Installation
51
+ **Requirements**
52
+ • Python == 3.10.16
53
+ • Pytorch == 2.9.0
54
+ • CUDA 12.6
55
+
56
+ ```bash
57
+ pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0 --index-url https://download.pytorch.org/whl/cu126
58
+ pip install websockets einops diffusers==0.36.0 transformers==4.55.2 accelerate msgpack opencv-python matplotlib ftfy easydict
59
+ pip install flash-attn --no-build-isolation
60
+ ```
61
+
62
+
63
+ ## Deploying LingBot-VA for Inference
64
+ LingBot-VA supports both standalone execution and Server-Client architecture which separates the model environment from simulation. By isolating dependencies, the design avoids package clashes and supports distributed inference on GPUs, clusters, and other devices.
65
+
66
+ <!-- ### Standalone Inference
67
+ ```python
68
+ python inference.py
69
+ ```
70
+ This processes the example data from `examples/0/` and saves visualizations to `result/`. -->
71
+
72
+ ### Evaluation on RoboTwin-2.0
73
+
74
+ **Preparing the Environment**
75
+
76
+ You can follow the official instructions from the original RoboTwin-2.0 repository:
77
+ [https://robotwin-platform.github.io/doc/usage/robotwin-install.html](https://robotwin-platform.github.io/doc/usage/robotwin-install.html)
78
+
79
+
80
+ In summary:
81
+
82
+ 1.
83
+ ```bash
84
+ sudo apt install libvulkan1 mesa-vulkan-drivers vulkan-tools
85
+ ```
86
+
87
+ 2.
88
+ ```bash
89
+ git clone https://github.com/RoboTwin-Platform/RoboTwin.git && cd RoboTwin
90
+ ```
91
+
92
+ 3. modify script/requirements.txt
93
+ ```bash
94
+ transforms3d==0.4.2
95
+ sapien==3.0.0b1
96
+ scipy==1.10.1
97
+ mplib==0.2.1
98
+ gymnasium==0.29.1
99
+ trimesh==4.4.3
100
+ open3d==0.18.0
101
+ imageio==2.34.2
102
+ pydantic
103
+ zarr
104
+ openai
105
+ huggingface_hub==0.36.2
106
+ h5py
107
+ # For Description Generation
108
+ azure==4.0.0
109
+ azure-ai-inference
110
+ pyglet<2
111
+ wandb
112
+ moviepy
113
+ imageio
114
+ termcolor
115
+ av
116
+ matplotlib
117
+ ffmpeg
118
+ ```
119
+
120
+ 4. modify line 8 of script/_install.sh:
121
+ ```bash
122
+ pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" --no-build-isolation
123
+ ```
124
+
125
+ 5. run:
126
+ ```bash
127
+ bash script/_install.sh
128
+ ```
129
+
130
+ 6. run:
131
+ ```bash
132
+ bash script/_download_assets.sh
133
+ ```
134
+
135
+ **Deploying the Inference Server**
136
+ ```bash
137
+ # single GPU
138
+ bash evaluation/robotwin/launch_server.sh
139
+
140
+ # multi-GPU
141
+ bash evaluation/robotwin/launch_server_multigpus.sh
142
+ ```
143
+
144
+ **Executing the Inference Client**
145
+ ```bash
146
+ # single GPU
147
+ task_name="adjust_bottle";
148
+ save_root="results/";
149
+ bash evaluation/robotwin/launch_client.sh ${save_root} ${task_name}
150
+
151
+ # multi-GPU
152
+ save_root="results/"
153
+ task_group_id=0;
154
+ bash evaluation/robotwin/launch_client_multigpus.sh ${save_root} ${task_group_id}
155
+ ```
156
+
157
+ Related experiment results will be saved in `/path/to/your/RoboTwin/${save_root}`. Please note that an `eval_result` folder is also generated. This is a native output from RoboTwin and is identical to the contents in the results folder; it can be safely ignored.
158
+ It is important to note that the inference server and client must be deployed on the same machine. For launching multi-GPU client, we padded the original 50 tasks to 56 via duplication and partitioned them into 7 groups to align with the 8-GPU configuration of our inference node. You can specify the `task_group_id` (0-6) to select a particular group for inference. For detailed grouping configurations, please refer to `evaluation/robotwin/launch_client_multigpus.sh`.
159
+
160
+ ### Run Image to Video-Action Generation
161
+
162
+ We also provide a script for image to video-action generation:
163
+
164
+ ```bash
165
+ NGPU=1 CONFIG_NAME='robotwin_i2va' bash script/run_launch_va_server_sync.sh
166
+ ```
167
+
168
+
169
+
170
+ ---
171
+
172
+ # 📊 Performance
173
+
174
+ We evaluate our model on both simulation benchmarks and real-world scenarios, and achieve state-of-the-art performance.
175
+
176
+ ## Simulation Evaluation
177
+
178
+ - **RoboTwin 2.0**
179
+
180
+ We are the first to push RoboTwin 2.0 performance past the 90% threshold!
181
+ <table style="border-collapse: collapse; width: auto; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif; font-size: 13px; line-height: 1.2;">
182
+ <!-- Metric notes -->
183
+ <p style="font-size: 12px; color: #666; margin-bottom: 5px;">* All metrics are reported in percentage (%). Higher values are <b>bolded</b>.</p>
184
+ <thead>
185
+ <tr style="border-top: 2px solid black; border-bottom: 1px solid black;">
186
+ <th align="left" style="padding: 6px 12px; white-space: nowrap;">Method (Average 50 Tasks)</th>
187
+ <th align="center" style="padding: 6px 12px;">Easy SR (%)</th>
188
+ <th align="center" style="padding: 6px 12px;">Hard SR (%)</th>
189
+ </tr>
190
+ </thead>
191
+ <tbody>
192
+ <tr>
193
+ <td style="padding: 4px 12px; white-space: nowrap;">X-VLA</td>
194
+ <td align="center">72.9</td>
195
+ <td align="center">72.8</td>
196
+ </tr>
197
+ <tr>
198
+ <td style="padding: 4px 12px; white-space: nowrap;">&pi;<sub>0</sub></td>
199
+ <td align="center">65.9</td>
200
+ <td align="center">58.4</td>
201
+ </tr>
202
+ <tr>
203
+ <td style="padding: 4px 12px; white-space: nowrap;">&pi;<sub>0.5</sub></td>
204
+ <td align="center">82.7</td>
205
+ <td align="center">76.8</td>
206
+ </tr>
207
+ <tr>
208
+ <td style="padding: 4px 12px; white-space: nowrap;">Motus</td>
209
+ <td align="center"><u>88.7</u></td>
210
+ <td align="center"><u>87.0</u></td>
211
+ </tr>
212
+ <tr style="border-top: 1px solid black; border-bottom: 2px solid black;">
213
+ <td style="padding: 6px 12px; white-space: nowrap;"><b>LingBot-VA (Ours)</b></td>
214
+ <td align="center"><b>92.9</b> <small>(+4.2)</small></td>
215
+ <td align="center"><b>91.6</b> <small>(+4.6)</small></td>
216
+ </tr>
217
+ </tbody>
218
+ </table>
219
+
220
+
221
+ - **LIBERO**
222
+
223
+ <table style="border-collapse: collapse; width: auto; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif; font-size: 13px; line-height: 1.2;">
224
+ <!-- Metric notes -->
225
+ <p style="font-size: 12px; color: #666; margin-bottom: 5px;">* All metrics are reported in percentage (%). Higher values are <b>bolded</b>.</p>
226
+ <thead>
227
+ <tr style="border-top: 2px solid black; border-bottom: 1px solid black;">
228
+ <th align="left" style="padding: 6px 10px; border-right: 1px solid black; white-space: nowrap;">Methods</th>
229
+ <th align="center" style="padding: 6px 8px;">Spatial</th>
230
+ <th align="center" style="padding: 6px 8px;">Object</th>
231
+ <th align="center" style="padding: 6px 8px;">Goal</th>
232
+ <th align="center" style="padding: 6px 8px;">Long</th>
233
+ <th align="center" style="padding: 6px 8px;">Avg</th>
234
+ </tr>
235
+ </thead>
236
+ <tbody>
237
+ <tr>
238
+ <td style="padding: 4px 10px; border-right: 1px solid black; white-space: nowrap;">&pi;<sub>0</sub></td>
239
+ <td align="center">96.8</td><td align="center">98.8</td><td align="center">95.8</td><td align="center">85.2</td><td align="center">94.1</td>
240
+ </tr>
241
+ <tr>
242
+ <td style="padding: 4px 10px; border-right: 1px solid black; white-space: nowrap;">&pi;<sub>0.5</sub></td>
243
+ <td align="center">98.8</td><td align="center">98.2</td><td align="center">98.0</td><td align="center">92.4</td><td align="center">96.9</td>
244
+ </tr>
245
+ <tr>
246
+ <td style="padding: 4px 10px; border-right: 1px solid black; white-space: nowrap;">OpenVLA</td>
247
+ <td align="center">84.7</td><td align="center">88.4</td><td align="center">79.2</td><td align="center">53.7</td><td align="center">76.5</td>
248
+ </tr>
249
+ <tr>
250
+ <td style="padding: 4px 10px; border-right: 1px solid black; white-space: nowrap;">X-VLA</td>
251
+ <td align="center">98.2</td><td align="center">98.6</td><td align="center">97.8</td><td align="center">97.6</td><td align="center">98.1</td>
252
+ </tr>
253
+ <tr style="border-top: 1.5px solid black; border-bottom: 2px solid black;">
254
+ <td style="padding: 5px 10px; border-right: 1px solid black; white-space: nowrap;"><b>LingBot-VA (Ours)</b></td>
255
+ <td align="center"><b>98.5 &plusmn; 0.3</b></td>
256
+ <td align="center"><b>99.6 &plusmn; 0.3</b></td>
257
+ <td align="center"><b>97.2 &plusmn; 0.2</b></td>
258
+ <td align="center"><b>98.5 &plusmn; 0.5</b></td>
259
+ <td align="center"><b>98.5</b></td>
260
+ </tr>
261
+ </tbody>
262
+ </table>
263
+
264
+
265
+
266
+ &nbsp;
267
+
268
+ ## Real-world Deployment
269
+
270
+ Six manipulation tasks across three categories: long-horizon tasks (Make Breakfast, Pick Screws), precision tasks (Insert Tube, Unpack Delivery), and deformable & articulated object
271
+ manipulation (Fold Clothes, Fold Pants). Our method achieves state-of-the-art performance on both metrics (Progress Rate and Success Rate) with <b>only 50 trials</b> per task, substantially outperforming strong baseline &pi;<sub>0.5</sub>.
272
+
273
+ <div style="text-align: left; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif; line-height: 1.6;">
274
+
275
+ <!-- Part 1: Progress Score (PS) explanation -->
276
+ <div style="margin-bottom: 5px;"><strong>Progress Score (PS):</strong> The average score across all trials divided by the maximum possible score, expressed as a percentage:</div>
277
+
278
+ PS = Average_Progress / Max_Steps &times; 100%
279
+
280
+ <!-- Part 2: Success Rate (SR) explanation -->
281
+ <div style="margin-bottom: 5px;"><strong>Success Rate (SR):</strong> The number of successful trials divided by the total number of trials, expressed as a percentage:</div>
282
+
283
+ SR = Successful_Trials / N &times; 100%
284
+
285
+ </div>
286
+
287
+
288
+
289
+ <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif;">
290
+ <!-- Metric notes -->
291
+ <p style="font-size: 12px; color: #666; margin-bottom: 5px;">* All metrics are reported in percentage (%). Higher values are <b>bolded</b>.</p>
292
+
293
+ <table style="border-collapse: collapse; width: auto; font-size: 13px; line-height: 1.2;">
294
+ <thead>
295
+ <tr style="border-top: 2px solid black;">
296
+ <th rowspan="2" align="left" style="padding: 4px 10px; border-bottom: 1px solid black; white-space: nowrap;"><b>Task</b></th>
297
+ <th colspan="2" style="padding: 4px 10px; border-bottom: 1px solid black;">Make Breakfast</th>
298
+ <th colspan="2" style="padding: 4px 10px; border-bottom: 1px solid black;">Pick Screws</th>
299
+ <th colspan="2" style="padding: 4px 10px; border-bottom: 1px solid black;">Insert Tube</th>
300
+ <th colspan="2" style="padding: 4px 10px; border-bottom: 1px solid black;">Unpack Delivery</th>
301
+ <th colspan="2" style="padding: 4px 10px; border-bottom: 1px solid black;">Fold Clothes</th>
302
+ <th colspan="2" style="padding: 4px 10px; border-bottom: 1px solid black;">Fold Pants</th>
303
+ </tr>
304
+ <tr style="border-bottom: 1px solid black;">
305
+ <th style="padding: 4px 8px;">PS</th>
306
+ <th style="padding: 4px 8px;">SR</th>
307
+ <th style="padding: 4px 8px;">PS</th>
308
+ <th style="padding: 4px 8px;">SR</th>
309
+ <th style="padding: 4px 8px;">PS</th>
310
+ <th style="padding: 4px 8px;">SR</th>
311
+ <th style="padding: 4px 8px;">PS</th>
312
+ <th style="padding: 4px 8px;">SR</th>
313
+ <th style="padding: 4px 8px;">PS</th>
314
+ <th style="padding: 4px 8px;">SR</th>
315
+ <th style="padding: 4px 8px;">PS</th>
316
+ <th style="padding: 4px 8px;">SR</th>
317
+ </tr>
318
+ </thead>
319
+ <tbody>
320
+ <tr>
321
+ <td style="padding: 6px 10px; white-space: nowrap;">&pi;<sub>0.5</sub></td>
322
+ <td align="center">73.0</td><td align="center">70.0</td>
323
+ <td align="center">74.0</td><td align="center">50.0</td>
324
+ <td align="center">79.2</td><td align="center">30.0</td>
325
+ <td align="center">73.0</td><td align="center">25.0</td>
326
+ <td align="center"><b>62.9</b></td><td align="center">30.0</td>
327
+ <td align="center">30.0</td><td align="center">30.0</td>
328
+ </tr>
329
+ <tr style="border-bottom: 2px solid black;">
330
+ <td style="padding: 6px 10px; white-space: nowrap;"><b>LingBot-VA (Ours)</b></td>
331
+ <td align="center"><b>97.0</b></td><td align="center"><b>75.0</b></td>
332
+ <td align="center"><b>82.5</b></td><td align="center"><b>70.0</b></td>
333
+ <td align="center"><b>85.8</b></td><td align="center"><b>40.0</b></td>
334
+ <td align="center"><b>84.5</b></td><td align="center"><b>65.0</b></td>
335
+ <td align="center">48.8</td><td align="center"><b>35.0</b></td>
336
+ <td align="center"><b>76.7</b></td><td align="center"><b>70.0</b></td>
337
+ </tr>
338
+ </tbody>
339
+ </table>
340
+ </div>
341
+
342
+
343
+ # 🪪 License
344
+
345
+ This project is released under the Apache License 2.0. See [LICENSE](LICENSE.txt) file for details.
346
+
347
+ # 📚Citation
348
+
349
+ ```bibtex
350
+ @article{lingbot-va2026,
351
+ title={Causal World Modeling for Robot Control},
352
+ author={Li, Lin and Zhang, Qihang and Luo, Yiming and Yang, Shuai and Wang, Ruilin and Han, Fei and Yu, Mingrui and Gao, Zelin and Xue, Nan and Zhu, Xing and Shen, Yujun and Xu, Yinghao},
353
+ journal={arXiv preprint arXiv:2601.21998},
354
+ year={2026}
355
+ }
356
+ ```
357
+
358
+ # 🧩 Acknowledgments
359
+
360
+ This work builds upon several excellent open-source projects:
361
+
362
+ - [Wan-Video](https://github.com/Wan-Video) - Vision transformer backbone
363
+ - [MoT](https://github.com/facebookresearch/Mixture-of-Transformers) - Mixture-of-Transformers architecture
364
+ - The broader open-source computer vision and robotics communities
365
+
366
+ ---
367
+
368
+ For questions, discussions, or collaborations:
369
+
370
+ - **Issues**: Open an [issue](https://github.com/robbyant/lingbot-va/issues) on GitHub
371
+ - **Email**: Contact Dr. [Qihang Zhang](https://zqh0253.github.io/) (liuhuan.zqh@antgroup.com) or Dr. [Lin Li](https://lilin-hitcrt.github.io/) (fengchang.ll@antgroup.com)
assets/teaser.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b23b4170e7784b82d8a6287451e886ac88fd0d7d841c55c4bc290b068c9f394
3
+ size 12144486
assets/teaser_v3.png ADDED

Git LFS Details

  • SHA256: f27d456c7af839b5929aeffaffa9ac2cb9a3b69e5e0df2440b37b4f241aa933d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.11 MB
debug/place_fan/call1_reset.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0d8e7cb0edd8bb74e138e0cc15f50470a7db8316c4ac39e8a2178fde2e1a3da
3
+ size 140
debug/place_fan/call2.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23fc9654673dfad3b4c70a545782bab385b9b18f439fbfb6005b582de0e7005e
3
+ size 691918
debug/place_fan/call3.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0a92955b91b046dc90ee8e1c671be2938f53d90712245b126b20e2a569651c6
3
+ size 2769097
evaluation/robotwin/calc_stat.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
def compute_success_rates(root_dir: str, true_suffix="True.mp4", false_suffix="False.mp4"):
    """Scan each sub-directory of *root_dir* for result videos and tally outcomes.

    Every ``*.mp4`` found (recursively) below a sub-directory is classified by
    filename suffix: a success if it ends with *true_suffix*, otherwise a
    failure if it ends with *false_suffix*; any other ``.mp4`` is ignored.

    Args:
        root_dir: Directory whose immediate sub-directories are per-task result folders.
        true_suffix: Filename suffix marking a successful episode video.
        false_suffix: Filename suffix marking a failed episode video.

    Returns:
        list[tuple]: One ``(folder_name, n_success, n_failure, n_total, rate)``
        per sub-directory (sorted by path), where ``rate`` is ``None`` when
        no matching videos were found.

    Raises:
        FileNotFoundError: If *root_dir* does not exist.
    """
    base = Path(root_dir)
    if not base.exists():
        raise FileNotFoundError(f"Root dir not found: {base}")

    stats = []
    subdirs = sorted(child for child in base.iterdir() if child.is_dir())
    for subdir in subdirs:
        names = [video.name for video in subdir.rglob("*.mp4")]
        n_true = sum(1 for name in names if name.endswith(true_suffix))
        # Mirror the original if/elif: a name matching both suffixes counts as success.
        n_false = sum(
            1 for name in names
            if not name.endswith(true_suffix) and name.endswith(false_suffix)
        )
        n_total = n_true + n_false
        success_rate = n_true / n_total if n_total else None
        stats.append((subdir.name, n_true, n_false, n_total, success_rate))
    return stats
25
+
26
+
27
# Task -> difficulty class mapping (1 / 2 / 3), used by print_table for
# the per-class mean success-rate summary rows.
TASK_CLASS = {
    "adjust_bottle": 1,
    "beat_block_hammer": 1,
    "blocks_ranking_rgb": 3,
    "blocks_ranking_size": 3,
    "click_alarmclock": 1,
    "click_bell": 1,
    "dump_bin_bigbin": 1,
    "grab_roller": 1,
    "handover_block": 2,
    "handover_mic": 2,
    "hanging_mug": 2,
    "lift_pot": 1,
    "move_can_pot": 1,
    "move_pillbottle_pad": 1,
    "move_playingcard_away": 1,
    "move_stapler_pad": 1,
    "open_laptop": 1,
    "open_microwave": 1,
    "pick_diverse_bottles": 2,
    "pick_dual_bottles": 2,
    "place_a2b_left": 1,
    "place_a2b_right": 1,
    "place_bread_basket": 1,
    "place_bread_skillet": 2,
    "place_burger_fries": 2,
    "place_can_basket": 2,
    "place_cans_plasticbox": 2,
    "place_container_plate": 1,
    "place_dual_shoes": 2,
    "place_empty_cup": 1,
    "place_fan": 1,
    "place_mouse_pad": 1,
    "place_object_basket": 2,
    "place_object_scale": 1,
    "place_object_stand": 1,
    "place_phone_stand": 1,
    "place_shoe": 1,
    "press_stapler": 1,
    "put_bottles_dustbin": 3,
    "put_object_cabinet": 2,
    "rotate_qrcode": 1,
    "scan_object": 2,
    "shake_bottle_horizontally": 1,
    "shake_bottle": 1,
    "stack_blocks_three": 3,
    "stack_blocks_two": 2,
    "stack_bowls_three": 3,
    "stack_bowls_two": 2,
    "stamp_seal": 1,
    "turn_switch": 1,
}
80
+
81
def mean_rate_of(results_subset):
    """Average the success-rate column (index 4) of result tuples.

    Entries whose rate is ``None`` (no recorded trials) are ignored.
    Returns ``None`` when every entry was skipped or the input is empty.
    """
    valid = [entry[4] for entry in results_subset if entry[4] is not None]
    if not valid:
        return None
    return sum(valid) / len(valid)
84
+
85
+
86
def print_table(results):
    """Pretty-print per-task success statistics plus overall / per-class means.

    Expects ``(folder, n_true, n_false, n_total, rate)`` tuples as produced by
    ``compute_success_rates``; difficulty classes come from ``TASK_CLASS``.
    """

    def fmt_rate(value):
        # Shared "N/A or right-aligned percentage" rendering for every rate cell.
        return "N/A" if value is None else f"{value*100:9.2f}%"

    # Sort by success rate: rows without a rate (N/A) go last, the rest high-to-low.
    ordered = sorted(results, key=lambda row: (row[4] is None, -(row[4] or 0.0)))

    print(f"{'folder':30s} {'True':>6s} {'False':>6s} {'Total':>6s} {'SuccessRate':>12s} {'Class':>6s}")
    print("-" * 90)

    for folder, t, f, total, rate in ordered:
        cls = TASK_CLASS.get(folder, None)
        cls_str = "N/A" if cls is None else str(cls)
        print(f"{folder:30s} {t:6d} {f:6d} {total:6d} {fmt_rate(rate):>12s} {cls_str:>6s}")

    print("-" * 90)

    # Overall mean across every row.
    print(f"{'MEAN (ALL)':30s} {'':6s} {'':6s} {'':6s} {fmt_rate(mean_rate_of(ordered)):>12s}")

    # Per-difficulty-class means (classes 1 / 2 / 3).
    for cls_id in (1, 2, 3):
        class_rows = [row for row in ordered if TASK_CLASS.get(row[0]) == cls_id]
        print(f"{('MEAN (CLASS '+str(cls_id)+')'):30s} {'':6s} {'':6s} {'':6s} {fmt_rate(mean_rate_of(class_rows)):>12s}")

    # Tasks missing from the TASK_CLASS mapping, if any.
    unmapped = [row for row in ordered if row[0] not in TASK_CLASS]
    if unmapped:
        print(f"{'MEAN (UNKNOWN)':30s} {'':6s} {'':6s} {'':6s} {fmt_rate(mean_rate_of(unmapped)):>12s}")
119
+
120
+
121
if __name__ == "__main__":
    import sys

    roots = sys.argv[1:]
    if not roots:
        # Fix: the usage message previously referenced a non-existent "a.py";
        # point it at this script's actual filename.
        raise SystemExit("Usage: python calc_stat.py <root_folder1> [<root_folder2> ...]")

    # Aggregate results across every root folder given on the command line,
    # then print one combined table.
    all_results = []
    for root_dir in roots:
        all_results.extend(compute_success_rates(root_dir))

    print_table(all_results)
evaluation/robotwin/eval_polict_client_openpi.py ADDED
@@ -0,0 +1,696 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import subprocess
4
+ import matplotlib.pyplot as plt
5
+ from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
6
+ import cv2
7
+ from pathlib import Path
8
+
9
+ robowin_root = Path("/group/ossdphi_algo_scratch_11/weicxu/pythonproject/RoboTwin")
10
+ if str(robowin_root) not in sys.path:
11
+ sys.path.insert(0, str(robowin_root))
12
+
13
+
14
+ import os
15
+ os.chdir(robowin_root)
16
+
17
+ from envs import CONFIGS_PATH
18
+ from envs.utils.create_actor import UnStableError
19
+
20
+ import numpy as np
21
+ from pathlib import Path
22
+ from collections import deque
23
+ import traceback
24
+
25
+ import yaml
26
+ from datetime import datetime
27
+ import importlib
28
+ import argparse
29
+ import pdb
30
+ from evaluation.robotwin.geometry import euler2quat
31
+ import numpy as np
32
+
33
+ from description.utils.generate_episode_instructions import *
34
+ import traceback
35
+
36
+ import imageio
37
+ import numpy as np
38
+ from pathlib import Path
39
+ from scipy.spatial.transform import Rotation as R
40
+ import json
41
+ from pathlib import Path
42
+
43
+ from evaluation.robotwin.websocket_client_policy import WebsocketClientPolicy
44
+ from evaluation.robotwin.test_render import Sapien_TEST
45
+
46
def write_json(data: dict, fpath: Path) -> None:
    """Serialize *data* to *fpath* as human-readable JSON.

    Missing parent directories are created on demand, so callers never need
    to pre-create the output tree.

    Args:
        data (dict): The dictionary to write.
        fpath (Path): The path to the output JSON file.
    """
    fpath.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(fpath, "w") as sink:
        sink.write(payload)
58
+
59
def add_title_bar(img, text, font_scale=0.8, thickness=2):
    """Prepend a 40-px black banner with centered white *text* above *img*.

    Args:
        img: H x W x 3 image array.
        text: Caption rendered in the banner.
        font_scale: OpenCV font scale for the caption.
        thickness: Stroke thickness for the caption.

    Returns:
        A new array of height H + 40 with the banner stacked on top.
    """
    height, width, _ = img.shape
    bar_height = 40

    # Black background strip the same width as the image.
    banner = np.zeros((bar_height, width, 3), dtype=np.uint8)

    # Measure the text so it can be centered inside the banner.
    (text_w, text_h), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, thickness)
    origin = ((width - text_w) // 2, (bar_height + text_h) // 2 - 5)

    cv2.putText(banner, text, origin, cv2.FONT_HERSHEY_SIMPLEX,
                font_scale, (255, 255, 255), thickness, cv2.LINE_AA)

    return np.vstack([banner, img])
76
+
77
def quaternion_to_euler(quat):
    """Convert a quaternion to 'xyz' Euler angles via scipy.

    Args:
        quat: Quaternion in ``[rx, ry, rz, rw]`` (scalar-last) order — the
            layout ``scipy.spatial.transform.Rotation.from_quat`` expects.

    Returns:
        ndarray: ``[roll, pitch, yaw]`` in radians.
    """
    return R.from_quat(quat).as_euler('xyz', degrees=False)
87
+
88
def visualize_action_step(action_history, step_idx, window=50):
    """Render a 2x2 dashboard of recent dual-arm actions as an RGB image.

    Subplot 1: Left arm XYZ position + gripper
    Subplot 2: Left arm Euler angles (roll, pitch, yaw) converted from quaternion
    Subplot 3: Right arm XYZ position + gripper
    Subplot 4: Right arm Euler angles (roll, pitch, yaw) converted from quaternion

    Args:
        action_history: Sequence of 16-dim action vectors laid out as
            [left_x, left_y, left_z, left_rx, left_ry, left_rz, left_rw, left_gripper,
             right_x, right_y, right_z, right_rx, right_ry, right_rz, right_rw, right_gripper].
        step_idx: Index of the newest action to display.
        window: Number of trailing steps shown (sliding window).

    Returns:
        np.ndarray: H x W x 3 uint8 image of the rendered figure.
    """
    # Create four subplots, sharing the X-axis
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 8), dpi=100, sharex=True)

    # 1. Determine slice range
    start = max(0, step_idx - window)
    end = step_idx + 1

    # 2. Get data subset
    history_subset = np.array(action_history)[start:end]

    # 3. Generate X-axis based on actual data length
    actual_len = len(history_subset)
    x_axis = range(start, start + actual_len)

    # Only draw curves when there is data with the full 16-dim layout;
    # otherwise an empty 2x2 grid is still rendered and returned.
    if actual_len > 0 and history_subset.shape[1] >= 16:
        # Convert quaternions to Euler angles
        left_euler = []
        right_euler = []

        for action in history_subset:
            # Left arm quaternion to Euler angles
            left_quat = action[3:7]  # [rx, ry, rz, rw]
            left_rpy = quaternion_to_euler(left_quat)
            left_euler.append(left_rpy)

            # Right arm quaternion to Euler angles
            right_quat = action[11:15]  # [rx, ry, rz, rw]
            right_rpy = quaternion_to_euler(right_quat)
            right_euler.append(right_rpy)

        left_euler = np.array(left_euler)
        right_euler = np.array(right_euler)

        # --- Left Arm ---
        # Subplot 1: Left Arm Translation (XYZ) + Gripper
        ax1.plot(x_axis, history_subset[:, 0], label='left_x', color='r', linewidth=1.5)
        ax1.plot(x_axis, history_subset[:, 1], label='left_y', color='g', linewidth=1.5)
        ax1.plot(x_axis, history_subset[:, 2], label='left_z', color='b', linewidth=1.5)
        ax1.plot(x_axis, history_subset[:, 7], label='left_grip', color='orange',
                 linestyle=':', linewidth=2, alpha=0.8)
        ax1.set_ylabel('Position (m)')
        ax1.legend(loc='upper right', fontsize='x-small', ncol=4)
        ax1.grid(True, alpha=0.3)
        ax1.set_title(f"Step {step_idx}: Left Arm Position & Gripper")

        # Subplot 2: Left Arm Euler Angles (Roll, Pitch, Yaw)
        ax2.plot(x_axis, left_euler[:, 0], label='left_roll', color='c', linewidth=1.5)
        ax2.plot(x_axis, left_euler[:, 1], label='left_pitch', color='m', linewidth=1.5)
        ax2.plot(x_axis, left_euler[:, 2], label='left_yaw', color='y', linewidth=1.5)
        ax2.set_ylabel('Rotation (rad)')
        ax2.legend(loc='upper right', fontsize='x-small', ncol=3)
        ax2.grid(True, alpha=0.3)
        ax2.set_title("Left Arm Rotation (RPY from Quaternion)")

        # --- Right Arm ---
        # Subplot 3: Right Arm Translation (XYZ) + Gripper
        ax3.plot(x_axis, history_subset[:, 8], label='right_x', color='r', linewidth=1.5, linestyle='--')
        ax3.plot(x_axis, history_subset[:, 9], label='right_y', color='g', linewidth=1.5, linestyle='--')
        ax3.plot(x_axis, history_subset[:, 10], label='right_z', color='b', linewidth=1.5, linestyle='--')
        ax3.plot(x_axis, history_subset[:, 15], label='right_grip', color='orange',
                 linestyle=':', linewidth=2, alpha=0.8)
        ax3.set_ylabel('Position (m)')
        ax3.legend(loc='upper right', fontsize='x-small', ncol=4)
        ax3.grid(True, alpha=0.3)
        ax3.set_title("Right Arm Position & Gripper")

        # Subplot 4: Right Arm Euler Angles (Roll, Pitch, Yaw)
        ax4.plot(x_axis, right_euler[:, 0], label='right_roll', color='c', linewidth=1.5, linestyle='--')
        ax4.plot(x_axis, right_euler[:, 1], label='right_pitch', color='m', linewidth=1.5, linestyle='--')
        ax4.plot(x_axis, right_euler[:, 2], label='right_yaw', color='y', linewidth=1.5, linestyle='--')
        ax4.set_ylabel('Rotation (rad)')
        ax4.legend(loc='upper right', fontsize='x-small', ncol=3)
        ax4.grid(True, alpha=0.3)
        ax4.set_title("Right Arm Rotation (RPY from Quaternion)")

    # Set X-axis display range to maintain sliding window effect
    # (sharex=True propagates this limit to all four subplots).
    ax1.set_xlim(max(0, step_idx - window), max(window, step_idx))
    ax3.set_xlabel('Step')
    ax4.set_xlabel('Step')

    plt.tight_layout()
    # Rasterize the figure off-screen and strip the alpha channel.
    canvas = FigureCanvas(fig)
    canvas.draw()
    img = np.asarray(canvas.buffer_rgba())
    img = img[:, :, :3]

    # Convert to uint8
    if img.dtype != np.uint8:
        img = (img * 255).astype(np.uint8)

    plt.close(fig)
    return img
192
+
193
+
194
def save_comparison_video(real_obs_list, imagined_video, action_history, save_path, fps=15):
    """Write a stacked comparison video to *save_path*.

    Each output frame has two rows: on top, the three real camera views
    (head / left wrist / right wrist) concatenated horizontally; below, the
    corresponding imagined (generated) frame when available, otherwise a
    "Coming soon" placeholder.

    Args:
        real_obs_list: Formatted observation dicts with the
            ``observation.images.cam_*`` keys (see ``format_obs``).
        imagined_video: List of frame arrays concatenated along time, or
            ``None`` to render only placeholders.
        action_history: Recorded raw actions.
            NOTE(review): currently unused in this function.
        save_path: Output video file path.
        fps: Output frame rate.
    """
    if not real_obs_list:
        return

    n_real = len(real_obs_list)
    if imagined_video is not None:
        # Chunks of generated frames are concatenated into one time axis.
        imagined_video = np.concatenate(imagined_video, 0)
        n_imagined = len(imagined_video)
    else:
        n_imagined = 0
    n_frames = n_real # Based on real observation frames

    print(f"Saving video: Real {n_real} frames, Imagined {n_imagined} frames...")

    final_frames = []

    for i in range(n_frames):
        obs = real_obs_list[i]
        cam_high = obs["observation.images.cam_high"]
        cam_left = obs["observation.images.cam_left_wrist"]
        cam_right = obs["observation.images.cam_right_wrist"]

        # All three views are scaled to the head camera's height before stacking.
        base_h = cam_high.shape[0]

        # NOTE(review): redefined on every iteration; could be hoisted out of the loop.
        def resize_h(img, h):
            if img.shape[0] != h:
                w = int(img.shape[1] * h / img.shape[0])
                return cv2.resize(img, (w, h))
            return img

        row_real = np.hstack([
            resize_h(cam_high, base_h),
            resize_h(cam_left, base_h),
            resize_h(cam_right, base_h)
        ])

        # Assumes non-uint8 inputs are floats in [0, 1] — TODO confirm upstream dtype.
        if row_real.dtype != np.uint8:
            row_real = (row_real * 255).astype(np.uint8)

        row_real = add_title_bar(row_real, "Real Observation (High / Left / Right)")

        target_width = row_real.shape[1]

        if imagined_video is not None and i < n_imagined:
            img_frame = imagined_video[i]
            # Normalize generated frames to uint8 ([0, 1] floats are rescaled).
            if img_frame.dtype != np.uint8 and img_frame.max() <= 1.0001:
                img_frame = (img_frame * 255).astype(np.uint8)
            elif img_frame.dtype != np.uint8:
                img_frame = img_frame.astype(np.uint8)

            h = int(img_frame.shape[0] * target_width / img_frame.shape[1])
            row_imagined = cv2.resize(img_frame, (target_width, h))
        else:
            # No generated frame for this step: draw a gray placeholder row.
            row_imagined = np.zeros((300, target_width, 3), dtype=np.uint8)
            cv2.putText(row_imagined, "Coming soon", (target_width//2 - 100, 150),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (100, 100, 100), 2)

        row_imagined = add_title_bar(row_imagined, "Imagined Video Stream")
        full_frame = np.vstack([row_real, row_imagined])
        final_frames.append(full_frame)

    imageio.mimsave(save_path, final_frames, fps=fps)
    print(f"Combined video saved to: {save_path}")
257
+
258
+
259
def class_decorator(task_name):
    """Dynamically import ``envs.<task_name>`` and instantiate its task class.

    The RoboTwin convention is that module ``envs.foo`` defines a class ``foo``.

    Args:
        task_name: Name of the task (module and class share this name).

    Returns:
        An instance of the task environment class.

    Raises:
        SystemExit: If the module does not define a class named *task_name*.
        ImportError: If the ``envs.<task_name>`` module cannot be imported.
    """
    envs_module = importlib.import_module(f"envs.{task_name}")
    try:
        env_class = getattr(envs_module, task_name)
        env_instance = env_class()
    except AttributeError as e:
        # Fix: the original bare ``except:`` also swallowed genuine errors raised
        # by the environment constructor. Only a missing class now maps to
        # "No Task"; constructor failures propagate with their real traceback.
        raise SystemExit("No Task") from e
    return env_instance
267
+
268
+
269
def eval_function_decorator(policy_name, model_name):
    """Resolve attribute *model_name* from the module named *policy_name*.

    Args:
        policy_name: Importable module path of the policy package.
        model_name: Attribute (class or function) to fetch from that module.

    Returns:
        The resolved attribute.

    Raises:
        ImportError: If the policy module cannot be imported.
        AttributeError: If the module lacks *model_name*.
    """
    # Fix: the original ``except ImportError as e: raise e`` was a no-op that
    # only truncated the traceback; letting the exception propagate naturally
    # is equivalent and clearer.
    policy_module = importlib.import_module(policy_name)
    return getattr(policy_module, model_name)
275
+
276
def get_camera_config(camera_type):
    """Look up the parameters for *camera_type* in the shared camera YAML config.

    Args:
        camera_type: Key in ``task_config/_camera_config.yml``.

    Returns:
        dict: That camera's configuration section.

    Raises:
        FileNotFoundError: If the camera config file is missing.
        KeyError: If *camera_type* is not defined in the file.
    """
    camera_config_path = os.path.join(robowin_root, "task_config/_camera_config.yml")

    # Fix: explicit exceptions instead of ``assert`` so the validation
    # survives ``python -O`` (asserts are stripped under optimization).
    if not os.path.isfile(camera_config_path):
        raise FileNotFoundError("task config file is missing")

    with open(camera_config_path, "r", encoding="utf-8") as f:
        args = yaml.load(f.read(), Loader=yaml.FullLoader)

    if camera_type not in args:
        raise KeyError(f"camera {camera_type} is not defined")
    return args[camera_type]
286
+
287
+
288
def get_embodiment_config(robot_file):
    """Load and return ``config.yml`` from the embodiment directory *robot_file*."""
    config_path = os.path.join(robot_file, "config.yml")
    with open(config_path, "r", encoding="utf-8") as stream:
        return yaml.load(stream.read(), Loader=yaml.FullLoader)
293
+
294
+
295
def main(usr_args):
    """Build the task environment from the YAML configs and run evaluation.

    Loads the task / camera / embodiment configuration, prints the resolved
    settings, connects to the policy server over websocket, runs
    ``eval_policy`` and writes the aggregate success rate to ``_result.txt``.

    Args:
        usr_args (dict): Parsed CLI options; must contain ``task_name``,
            ``task_config``, ``ckpt_setting``, ``save_root``, ``policy_name``,
            ``video_guidance_scale``, ``action_guidance_scale``, ``seed``,
            ``test_num`` and ``port``.
    """
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    task_name = usr_args["task_name"]
    task_config = usr_args["task_config"]
    ckpt_setting = usr_args["ckpt_setting"]
    save_root = usr_args["save_root"]
    policy_name = usr_args["policy_name"]
    video_guidance_scale = usr_args["video_guidance_scale"]
    action_guidance_scale = usr_args["action_guidance_scale"]
    instruction_type = 'seen'
    save_dir = None
    video_save_dir = None
    video_size = None

    with open(f"./task_config/{task_config}.yml", "r", encoding="utf-8") as f:
        args = yaml.load(f.read(), Loader=yaml.FullLoader)

    args['task_name'] = task_name
    args["task_config"] = task_config
    args["ckpt_setting"] = ckpt_setting
    args["save_root"] = save_root

    embodiment_type = args.get("embodiment")
    embodiment_config_path = os.path.join(CONFIGS_PATH, "_embodiment_config.yml")

    with open(embodiment_config_path, "r", encoding="utf-8") as f:
        _embodiment_types = yaml.load(f.read(), Loader=yaml.FullLoader)

    def get_embodiment_file(embodiment_type):
        # Resolve the robot description directory for one embodiment entry.
        robot_file = _embodiment_types[embodiment_type]["file_path"]
        if robot_file is None:
            # Fix: ``raise "No embodiment files"`` raised a TypeError at
            # runtime (strings are not exceptions).
            raise FileNotFoundError("No embodiment files")
        return robot_file

    # Consistent with the embodiment config above: use os.path.join rather
    # than string concatenation (works whether CONFIGS_PATH ends in a slash).
    with open(os.path.join(CONFIGS_PATH, "_camera_config.yml"), "r", encoding="utf-8") as f:
        _camera_config = yaml.load(f.read(), Loader=yaml.FullLoader)

    head_camera_type = args["camera"]["head_camera_type"]
    args["head_camera_h"] = _camera_config[head_camera_type]["h"]
    args["head_camera_w"] = _camera_config[head_camera_type]["w"]

    # One entry = same robot for both arms; three entries = left robot,
    # right robot, and the distance between them.
    if len(embodiment_type) == 1:
        args["left_robot_file"] = get_embodiment_file(embodiment_type[0])
        args["right_robot_file"] = get_embodiment_file(embodiment_type[0])
        args["dual_arm_embodied"] = True
    elif len(embodiment_type) == 3:
        args["left_robot_file"] = get_embodiment_file(embodiment_type[0])
        args["right_robot_file"] = get_embodiment_file(embodiment_type[1])
        args["embodiment_dis"] = embodiment_type[2]
        args["dual_arm_embodied"] = False
    else:
        # Fix: was ``raise "embodiment items should be 1 or 3"`` (TypeError).
        raise ValueError("embodiment items should be 1 or 3")

    args["left_embodiment_config"] = get_embodiment_config(args["left_robot_file"])
    args["right_embodiment_config"] = get_embodiment_config(args["right_robot_file"])

    if len(embodiment_type) == 1:
        embodiment_name = str(embodiment_type[0])
    else:
        embodiment_name = str(embodiment_type[0]) + "+" + str(embodiment_type[1])

    save_dir = Path(f"eval_result/{task_name}/{policy_name}/{task_config}/{ckpt_setting}/{current_time}")
    save_dir.mkdir(parents=True, exist_ok=True)

    if args["eval_video_log"]:
        video_save_dir = save_dir
        camera_config = get_camera_config(args["camera"]["head_camera_type"])
        video_size = str(camera_config["w"]) + "x" + str(camera_config["h"])
        video_save_dir.mkdir(parents=True, exist_ok=True)
        args["eval_video_save_dir"] = video_save_dir

    # Human-readable dump of the resolved configuration (ANSI-colored labels).
    print("============= Config =============\n")
    print("\033[95mMessy Table:\033[0m " + str(args["domain_randomization"]["cluttered_table"]))
    print("\033[95mRandom Background:\033[0m " + str(args["domain_randomization"]["random_background"]))
    if args["domain_randomization"]["random_background"]:
        print(" - Clean Background Rate: " + str(args["domain_randomization"]["clean_background_rate"]))
    print("\033[95mRandom Light:\033[0m " + str(args["domain_randomization"]["random_light"]))
    if args["domain_randomization"]["random_light"]:
        print(" - Crazy Random Light Rate: " + str(args["domain_randomization"]["crazy_random_light_rate"]))
    print("\033[95mRandom Table Height:\033[0m " + str(args["domain_randomization"]["random_table_height"]))
    print("\033[95mRandom Head Camera Distance:\033[0m " + str(args["domain_randomization"]["random_head_camera_dis"]))

    print("\033[94mHead Camera Config:\033[0m " + str(args["camera"]["head_camera_type"]) + ", " +
          str(args["camera"]["collect_head_camera"]))
    print("\033[94mWrist Camera Config:\033[0m " + str(args["camera"]["wrist_camera_type"]) + ", " +
          str(args["camera"]["collect_wrist_camera"]))
    print("\033[94mEmbodiment Config:\033[0m " + embodiment_name)
    print("\n==================================")

    TASK_ENV = class_decorator(args["task_name"])
    args["policy_name"] = policy_name
    usr_args["left_arm_dim"] = len(args["left_embodiment_config"]["arm_joints_name"][0])
    usr_args["right_arm_dim"] = len(args["right_embodiment_config"]["arm_joints_name"][1])

    seed = usr_args["seed"]

    # Offset the seed range so different --seed values never overlap.
    st_seed = 10000 * (1 + seed)
    suc_nums = []
    test_num = usr_args["test_num"]

    model = WebsocketClientPolicy(port=usr_args['port'])

    st_seed, suc_num = eval_policy(task_name,
                                   TASK_ENV,
                                   args,
                                   model,
                                   st_seed,
                                   test_num=test_num,
                                   video_size=video_size,
                                   instruction_type=instruction_type,
                                   save_visualization=True,
                                   video_guidance_scale=video_guidance_scale,
                                   action_guidance_scale=action_guidance_scale)
    suc_nums.append(suc_num)

    file_path = os.path.join(save_dir, "_result.txt")
    with open(file_path, "w") as file:
        file.write(f"Timestamp: {current_time}\n\n")
        file.write(f"Instruction Type: {instruction_type}\n\n")
        file.write("\n".join(map(str, np.array(suc_nums) / test_num)))

    print(f"Data has been saved to {file_path}")
418
+
419
def format_obs(observation, prompt):
    """Flatten a raw environment observation into the policy-server schema.

    Args:
        observation: Nested observation dict with camera RGB images and the
            joint-state vector.
        prompt: Language instruction to attach under the ``task`` key.

    Returns:
        dict: LeRobot-style flat keys expected by the policy server.
    """
    cameras = observation["observation"]
    return {
        "observation.images.cam_high": cameras["head_camera"]["rgb"],  # H,W,3
        "observation.images.cam_left_wrist": cameras["left_camera"]["rgb"],
        "observation.images.cam_right_wrist": cameras["right_camera"]["rgb"],
        "observation.state": observation["joint_action"]["vector"],
        "task": prompt,
    }
427
+
428
def add_eef_pose(new_pose, init_pose):
    """Compose a relative end-effector pose with an initial pose.

    Both inputs are 8-vectors ``[x, y, z, qx, qy, qz, qw, gripper]``.
    Translations are summed element-wise, orientations are composed as
    ``init * new`` (scipy rotation product), and the gripper value is taken
    from *new_pose* unchanged.
    """
    delta_rot = R.from_quat(new_pose[3:7][None])
    base_rot = R.from_quat(init_pose[3:7][None])
    composed_quat = (base_rot * delta_rot).as_quat().reshape(-1)
    composed_trans = init_pose[:3] + new_pose[:3]
    return np.concatenate([composed_trans, composed_quat, new_pose[7:8]])
434
+
435
def add_init_pose(new_pose, init_pose):
    """Apply ``add_eef_pose`` independently to the left arm (first 8 dims)
    and the right arm (remaining dims) of a dual-arm 16-dim pose vector."""
    composed = (
        add_eef_pose(new_pose[:8], init_pose[:8]),
        add_eef_pose(new_pose[8:], init_pose[8:]),
    )
    return np.concatenate(composed)
439
+
440
+ def eval_policy(task_name,
441
+ TASK_ENV,
442
+ args,
443
+ model,
444
+ st_seed,
445
+ test_num=100,
446
+ video_size=None,
447
+ instruction_type=None,
448
+ save_visualization=False,
449
+ video_guidance_scale=5.0,
450
+ action_guidance_scale=5.0):
451
+ print(f"\033[34mTask Name: {args['task_name']}\033[0m")
452
+ print(f"\033[34mPolicy Name: {args['policy_name']}\033[0m")
453
+
454
+ expert_check = True
455
+ TASK_ENV.suc = 0
456
+ TASK_ENV.test_num = 0
457
+
458
+ now_id = 0
459
+ succ_seed = 0
460
+ suc_test_seed_list = []
461
+
462
+
463
+ now_seed = st_seed
464
+ clear_cache_freq = args["clear_cache_freq"]
465
+
466
+ args["eval_mode"] = True
467
+
468
+ while succ_seed < test_num:
469
+ render_freq = args["render_freq"]
470
+ args["render_freq"] = 0
471
+
472
+ if expert_check:
473
+ try:
474
+ TASK_ENV.setup_demo(now_ep_num=now_id, seed=now_seed, is_test=True, **args)
475
+ episode_info = TASK_ENV.play_once()
476
+ TASK_ENV.close_env()
477
+ except UnStableError as e:
478
+ TASK_ENV.close_env()
479
+ now_seed += 1
480
+ args["render_freq"] = render_freq
481
+ continue
482
+ except Exception as e:
483
+ TASK_ENV.close_env()
484
+ now_seed += 1
485
+ args["render_freq"] = render_freq
486
+ print(f"error occurs ! {e}")
487
+ traceback.print_exc()
488
+ continue
489
+
490
+ if (not expert_check) or (TASK_ENV.plan_success and TASK_ENV.check_success()):
491
+ succ_seed += 1
492
+ suc_test_seed_list.append(now_seed)
493
+ else:
494
+ now_seed += 1
495
+ args["render_freq"] = render_freq
496
+ continue
497
+
498
+ args["render_freq"] = render_freq
499
+
500
+ TASK_ENV.setup_demo(now_ep_num=now_id, seed=now_seed, is_test=True, **args)
501
+ episode_info_list = [episode_info["info"]]
502
+ results = generate_episode_descriptions(args["task_name"], episode_info_list, test_num)
503
+ instruction = np.random.choice(results[0][instruction_type])
504
+ TASK_ENV.set_instruction(instruction=instruction) # set language instruction
505
+
506
+ if TASK_ENV.eval_video_path is not None:
507
+ ffmpeg = subprocess.Popen(
508
+ [
509
+ "ffmpeg",
510
+ "-y",
511
+ "-loglevel",
512
+ "error",
513
+ "-f",
514
+ "rawvideo",
515
+ "-pixel_format",
516
+ "rgb24",
517
+ "-video_size",
518
+ video_size,
519
+ "-framerate",
520
+ "10",
521
+ "-i",
522
+ "-",
523
+ "-pix_fmt",
524
+ "yuv420p",
525
+ "-vcodec",
526
+ "libx264",
527
+ "-crf",
528
+ "23",
529
+ f"{TASK_ENV.eval_video_path}/episode{TASK_ENV.test_num}.mp4",
530
+ ],
531
+ stdin=subprocess.PIPE,
532
+ )
533
+ TASK_ENV._set_eval_video_ffmpeg(ffmpeg)
534
+
535
+ succ = False
536
+
537
+ prompt = TASK_ENV.get_instruction()
538
+ ret = model.infer(dict(reset = True, prompt=prompt, save_visualization=save_visualization))
539
+
540
+ first = True
541
+ full_obs_list = []
542
+ gen_video_list = []
543
+ full_action_history = []
544
+
545
+ initial_obs = TASK_ENV.get_obs()
546
+ inint_eef_pose = initial_obs['endpose']['left_endpose'] + \
547
+ [initial_obs['endpose']['left_gripper']] + \
548
+ initial_obs['endpose']['right_endpose'] + \
549
+ [initial_obs['endpose']['right_gripper']]
550
+ inint_eef_pose = np.array(inint_eef_pose, dtype=np.float64)
551
+ initial_formatted_obs = format_obs(initial_obs, prompt)
552
+ full_obs_list.append(initial_formatted_obs)
553
+ first_obs = None
554
+ while TASK_ENV.take_action_cnt<TASK_ENV.step_lim:
555
+ if first:
556
+ observation = TASK_ENV.get_obs()
557
+ first_obs = format_obs(observation, prompt)
558
+
559
+ ret = model.infer(dict(obs=first_obs, prompt=prompt, save_visualization=save_visualization, video_guidance_scale=video_guidance_scale, action_guidance_scale=action_guidance_scale)) #(TASK_ENV, model, observation)
560
+ action = ret['action']
561
+ if 'video' in ret:
562
+ imagined_video = ret['video']
563
+ gen_video_list.append(imagined_video)
564
+ key_frame_list = []
565
+
566
+ assert action.shape[2] % 4 == 0
567
+ action_per_frame = action.shape[2] // 4
568
+
569
+ start_idx = 1 if first else 0
570
+ for i in range(start_idx, action.shape[1]):
571
+ for j in range(action.shape[2]):
572
+ raw_action_step = action[:, i, j].flatten()
573
+ full_action_history.append(raw_action_step)
574
+
575
+ ee_action = action[:, i, j]
576
+ if action.shape[0] == 14:
577
+ ee_action = np.concatenate([
578
+ ee_action[:3],
579
+ euler2quat(ee_action[3], ee_action[4], ee_action[5]),
580
+ ee_action[6:10],
581
+ euler2quat(ee_action[10], ee_action[11], ee_action[12]),
582
+ ee_action[13:14]
583
+ ])
584
+ elif action.shape[0] == 16:
585
+ ee_action = add_init_pose(ee_action, inint_eef_pose)
586
+ ee_action = np.concatenate([
587
+ ee_action[:3],
588
+ ee_action[3:7] / np.linalg.norm(ee_action[3:7]),
589
+ ee_action[7:11],
590
+ ee_action[11:15] / np.linalg.norm(ee_action[11:15]),
591
+ ee_action[15:16]
592
+ ])
593
+ else:
594
+ raise NotImplementedError
595
+ TASK_ENV.take_action(ee_action, action_type='ee')
596
+
597
+ if (j+1) % action_per_frame == 0:
598
+ obs = format_obs(TASK_ENV.get_obs(), prompt)
599
+ full_obs_list.append(obs)
600
+ key_frame_list.append(obs)
601
+
602
+ first = False
603
+
604
+ model.infer(dict(obs = key_frame_list, compute_kv_cache=True, imagine=False, save_visualization=save_visualization, state=action))
605
+
606
+ if TASK_ENV.eval_success:
607
+ succ = True
608
+ break
609
+
610
+
611
+ vis_dir = Path(args['save_root']) / f'stseed-{st_seed}' / 'visualization' / task_name
612
+ vis_dir.mkdir(parents=True, exist_ok=True)
613
+ video_name = f"{TASK_ENV.test_num}_{prompt.replace(' ', '_')}_{succ}.mp4"
614
+ out_img_file = vis_dir / video_name
615
+ save_comparison_video(
616
+ real_obs_list=full_obs_list,
617
+ imagined_video=None, #gen_video_list,
618
+ action_history=full_action_history,
619
+ save_path=str(out_img_file),
620
+ fps=15 # Suggest adjusting fps based on simulation step
621
+ )
622
+ if TASK_ENV.eval_video_path is not None:
623
+ TASK_ENV._del_eval_video_ffmpeg()
624
+
625
+ if succ:
626
+ TASK_ENV.suc += 1
627
+ print("\033[92mSuccess!\033[0m")
628
+ else:
629
+ print("\033[91mFail!\033[0m")
630
+
631
+ now_id += 1
632
+ TASK_ENV.close_env(clear_cache=((succ_seed + 1) % clear_cache_freq == 0))
633
+
634
+ if TASK_ENV.render_freq:
635
+ TASK_ENV.viewer.close()
636
+
637
+ TASK_ENV.test_num += 1
638
+
639
+ save_dir = Path(args['save_root']) / f'stseed-{st_seed}' / 'metrics' / task_name
640
+ save_dir.mkdir(parents=True, exist_ok=True)
641
+ out_json_file = save_dir / 'res.json'
642
+ write_json({
643
+ "succ_num": float(TASK_ENV.suc),
644
+ "total_num": float(TASK_ENV.test_num),
645
+ "succ_rate": float(TASK_ENV.suc / TASK_ENV.test_num),
646
+ }, out_json_file)
647
+
648
+ print(
649
+ f"\033[93m{task_name}\033[0m | \033[94m{args['policy_name']}\033[0m | \033[92m{args['task_config']}\033[0m | \033[91m{args['ckpt_setting']}\033[0m\n"
650
+ f"Success rate: \033[96m{TASK_ENV.suc}/{TASK_ENV.test_num}\033[0m => \033[95m{round(TASK_ENV.suc/TASK_ENV.test_num*100, 1)}%\033[0m, current seed: \033[90m{now_seed}\033[0m\n"
651
+ )
652
+ now_seed += 1
653
+
654
+ return now_seed, TASK_ENV.suc
655
+
656
+
657
def parse_args_and_config():
    """Parse command-line arguments and merge a YAML config with CLI overrides.

    Command line shape::

        --config path/to/cfg.yml [--port N] [--save_root DIR] ...
        --overrides --key1 value1 --key2 value2 ...

    Everything after ``--overrides`` is consumed as flat ``--key value`` pairs
    and merged into the YAML config. Values are parsed as Python literals when
    possible (numbers, booleans, lists, ...), otherwise kept as strings.

    Returns:
        dict: the loaded YAML config updated with the override pairs.
            NOTE(review): the dedicated argparse options (port, save_root,
            guidance scales, test_num) are NOT merged into the returned dict;
            the launch scripts pass them after ``--overrides`` instead.
    """
    import ast

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, required=True)
    parser.add_argument("--overrides", nargs=argparse.REMAINDER)
    parser.add_argument("--port", type=int, default=8000, help='remote policy socket port.')
    parser.add_argument("--save_root", type=str, default="results/default_vis_path")
    parser.add_argument("--video_guidance_scale", type=float, default=5.0)
    parser.add_argument("--action_guidance_scale", type=float, default=5.0)
    parser.add_argument("--test_num", type=int, default=100)
    args = parser.parse_args()

    with open(args.config, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # Parse overrides
    def parse_override_pairs(pairs):
        """Turn ["--k1", "v1", "--k2", "v2", ...] into {"k1": v1, "k2": v2, ...}."""
        if len(pairs) % 2 != 0:
            raise ValueError(f"--overrides expects key/value pairs, got odd count: {pairs}")
        override_dict = {}
        for i in range(0, len(pairs), 2):
            key = pairs[i].lstrip("-")
            value = pairs[i + 1]
            try:
                # Safely interpret literal values; unlike eval() this cannot
                # execute arbitrary code supplied on the command line.
                value = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                pass  # keep plain strings (e.g. task names) as-is
            override_dict[key] = value
        return override_dict

    if args.overrides:
        overrides = parse_override_pairs(args.overrides)
        config.update(overrides)

    return config
689
+
690
+
691
if __name__ == "__main__":

    # Sanity-check that the SAPIEN renderer initialises before running the
    # (long) evaluation; Sapien_TEST exits the process on render failure.
    Sapien_TEST()
    # Load the YAML config merged with CLI --overrides, then run evaluation.
    usr_args = parse_args_and_config()
    main(usr_args)
696
+
evaluation/robotwin/geometry.py ADDED
@@ -0,0 +1,463 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mostly copied from transforms3d library
3
+
4
+ """
5
+
6
+ import math
7
+
8
+ import numpy as np
9
+
10
+ _FLOAT_EPS = np.finfo(np.float64).eps
11
+
12
+ # axis sequences for Euler angles
13
+ _NEXT_AXIS = [1, 2, 0, 1]
14
+
15
+ # map axes strings to/from tuples of inner axis, parity, repetition, frame
16
+ _AXES2TUPLE = {
17
+ "sxyz": (0, 0, 0, 0),
18
+ "sxyx": (0, 0, 1, 0),
19
+ "sxzy": (0, 1, 0, 0),
20
+ "sxzx": (0, 1, 1, 0),
21
+ "syzx": (1, 0, 0, 0),
22
+ "syzy": (1, 0, 1, 0),
23
+ "syxz": (1, 1, 0, 0),
24
+ "syxy": (1, 1, 1, 0),
25
+ "szxy": (2, 0, 0, 0),
26
+ "szxz": (2, 0, 1, 0),
27
+ "szyx": (2, 1, 0, 0),
28
+ "szyz": (2, 1, 1, 0),
29
+ "rzyx": (0, 0, 0, 1),
30
+ "rxyx": (0, 0, 1, 1),
31
+ "ryzx": (0, 1, 0, 1),
32
+ "rxzx": (0, 1, 1, 1),
33
+ "rxzy": (1, 0, 0, 1),
34
+ "ryzy": (1, 0, 1, 1),
35
+ "rzxy": (1, 1, 0, 1),
36
+ "ryxy": (1, 1, 1, 1),
37
+ "ryxz": (2, 0, 0, 1),
38
+ "rzxz": (2, 0, 1, 1),
39
+ "rxyz": (2, 1, 0, 1),
40
+ "rzyz": (2, 1, 1, 1),
41
+ }
42
+
43
+ _TUPLE2AXES = dict((v, k) for k, v in _AXES2TUPLE.items())
44
+
45
+ # For testing whether a number is close to zero
46
+ _EPS4 = np.finfo(float).eps * 4.0
47
+
48
+
49
def mat2euler(mat, axes="sxyz"):
    """Return Euler angles from rotation matrix for specified axis sequence.

    Note that many Euler angle triplets can describe one matrix.

    Parameters
    ----------
    mat : array-like shape (3, 3) or (4, 4)
        Rotation matrix or affine.
    axes : str, optional
        Axis specification; one of 24 axis sequences as string or encoded
        tuple - e.g. ``sxyz`` (the default).

    Returns
    -------
    ai : float
        First rotation angle (according to `axes`).
    aj : float
        Second rotation angle (according to `axes`).
    ak : float
        Third rotation angle (according to `axes`).

    Examples
    --------
    >>> R0 = euler2mat(1, 2, 3, 'syxz')
    >>> al, be, ga = mat2euler(R0, 'syxz')
    >>> R1 = euler2mat(al, be, ga, 'syxz')
    >>> np.allclose(R0, R1)
    True
    """
    try:
        # String spec ("sxyz", ...) -> encoded tuple.
        firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()]
    except (AttributeError, KeyError):
        _TUPLE2AXES[axes]  # validation
        firstaxis, parity, repetition, frame = axes

    # i, j, k: the three matrix axes visited by this Euler convention.
    i = firstaxis
    j = _NEXT_AXIS[i + parity]
    k = _NEXT_AXIS[i - parity + 1]

    # Accept 3x3 or 4x4 (affine) input; only the rotation part is used.
    M = np.array(mat, dtype=np.float64, copy=False)[:3, :3]
    if repetition:
        sy = math.sqrt(M[i, j] * M[i, j] + M[i, k] * M[i, k])
        if sy > _EPS4:
            ax = math.atan2(M[i, j], M[i, k])
            ay = math.atan2(sy, M[i, i])
            az = math.atan2(M[j, i], -M[k, i])
        else:
            # Gimbal lock: second angle ~0 or ~pi; third angle set to 0.
            ax = math.atan2(-M[j, k], M[j, j])
            ay = math.atan2(sy, M[i, i])
            az = 0.0
    else:
        cy = math.sqrt(M[i, i] * M[i, i] + M[j, i] * M[j, i])
        if cy > _EPS4:
            ax = math.atan2(M[k, j], M[k, k])
            ay = math.atan2(-M[k, i], cy)
            az = math.atan2(M[j, i], M[i, i])
        else:
            # Gimbal lock: second angle ~ +/-pi/2; third angle set to 0.
            ax = math.atan2(-M[j, k], M[j, j])
            ay = math.atan2(-M[k, i], cy)
            az = 0.0

    # Undo the sign/ordering adjustments implied by parity and frame.
    if parity:
        ax, ay, az = -ax, -ay, -az
    if frame:
        ax, az = az, ax
    return ax, ay, az
116
+
117
+
118
def quat2mat(q):
    """Build the 3x3 rotation matrix for a quaternion *q* given as (w, x, y, z).

    The quaternion does not need to be normalized; the scaling is absorbed
    into the conversion. A quaternion with (near-)zero norm yields the
    identity matrix. The returned matrix rotates column vectors (applied on
    the left of coordinate vectors).

    Algorithm from http://en.wikipedia.org/wiki/Rotation_matrix#Quaternion

    Examples
    --------
    >>> import numpy as np
    >>> np.allclose(quat2mat([1, 0, 0, 0]), np.eye(3))
    True
    >>> np.allclose(quat2mat([0, 1, 0, 0]), np.diag([1, -1, -1]))
    True
    """
    w, x, y, z = q
    norm_sq = w * w + x * x + y * y + z * z
    if norm_sq < _FLOAT_EPS:
        # Degenerate quaternion: treat as no rotation.
        return np.eye(3)
    scale = 2.0 / norm_sq
    xs, ys, zs = x * scale, y * scale, z * scale
    wx, wy, wz = w * xs, w * ys, w * zs
    xx, xy, xz = x * xs, x * ys, x * zs
    yy, yz = y * ys, y * zs
    zz = z * zs
    return np.array(
        [
            [1.0 - (yy + zz), xy - wz, xz + wy],
            [xy + wz, 1.0 - (xx + zz), yz - wx],
            [xz - wy, yz + wx, 1.0 - (xx + yy)],
        ]
    )
174
+
175
+
176
def isrotation(
    R: np.ndarray,
    thresh=1e-6,
) -> bool:
    """Check whether *R* is (numerically) an orthogonal 3x3 matrix.

    Returns True when ``R.T @ R`` is within *thresh* of the identity in
    Frobenius norm. NOTE(review): this tests orthogonality only — a
    reflection (det = -1) also passes.
    """
    residual = np.identity(3, dtype=R.dtype) - np.dot(np.transpose(R), R)
    return np.linalg.norm(residual) < thresh
186
+
187
+
188
def euler2mat(ai, aj, ak, axes="sxyz"):
    """Return rotation matrix from Euler angles and axis sequence.

    Parameters
    ----------
    ai : float
        First rotation angle (according to `axes`).
    aj : float
        Second rotation angle (according to `axes`).
    ak : float
        Third rotation angle (according to `axes`).
    axes : str, optional
        Axis specification; one of 24 axis sequences as string or encoded
        tuple - e.g. ``sxyz`` (the default).

    Returns
    -------
    mat : array (3, 3)
        Rotation matrix or affine.

    Examples
    --------
    >>> R = euler2mat(1, 2, 3, 'syxz')
    >>> np.allclose(np.sum(R[0]), -1.34786452)
    True
    >>> R = euler2mat(1, 2, 3, (0, 1, 0, 1))
    >>> np.allclose(np.sum(R[0]), -0.383436184)
    True
    """
    try:
        # NOTE(review): unlike mat2euler/euler2quat this lookup does not call
        # axes.lower(), so upper-case specs raise AttributeError here and fall
        # into tuple validation — confirm whether that asymmetry is intended.
        firstaxis, parity, repetition, frame = _AXES2TUPLE[axes]
    except (AttributeError, KeyError):
        _TUPLE2AXES[axes]  # validation
        firstaxis, parity, repetition, frame = axes

    # i, j, k: the three matrix axes visited by this Euler convention.
    i = firstaxis
    j = _NEXT_AXIS[i + parity]
    k = _NEXT_AXIS[i - parity + 1]

    if frame:
        ai, ak = ak, ai
    if parity:
        ai, aj, ak = -ai, -aj, -ak

    si, sj, sk = math.sin(ai), math.sin(aj), math.sin(ak)
    ci, cj, ck = math.cos(ai), math.cos(aj), math.cos(ak)
    cc, cs = ci * ck, ci * sk
    sc, ss = si * ck, si * sk

    # Fill the matrix entry by entry; the two branches correspond to
    # repeating (e.g. zxz) vs non-repeating (e.g. xyz) axis sequences.
    M = np.eye(3)
    if repetition:
        M[i, i] = cj
        M[i, j] = sj * si
        M[i, k] = sj * ci
        M[j, i] = sj * sk
        M[j, j] = -cj * ss + cc
        M[j, k] = -cj * cs - sc
        M[k, i] = -sj * ck
        M[k, j] = cj * sc + cs
        M[k, k] = cj * cc - ss
    else:
        M[i, i] = cj * ck
        M[i, j] = sj * sc - cs
        M[i, k] = sj * cc + ss
        M[j, i] = cj * sk
        M[j, j] = sj * ss + cc
        M[j, k] = sj * cs - sc
        M[k, i] = -sj
        M[k, j] = cj * si
        M[k, k] = cj * ci
    return M
259
+
260
+
261
def euler2axangle(ai, aj, ak, axes="sxyz"):
    """Convert Euler angles (for axis sequence *axes*) to an axis/angle pair.

    Parameters
    ----------
    ai, aj, ak : float
        First, second and third rotation angles (according to `axes`).
    axes : str, optional
        Axis specification; one of 24 axis sequences as string or encoded
        tuple - e.g. ``sxyz`` (the default).

    Returns
    -------
    vector : array shape (3,)
        Axis around which rotation occurs.
    theta : scalar
        Angle of rotation.

    Examples
    --------
    >>> vec, theta = euler2axangle(0, 1.5, 0, 'szyx')
    >>> np.allclose(vec, [0, 1, 0])
    True
    >>> theta
    1.5
    """
    # Go via quaternion form, which has a direct axis/angle decomposition.
    quat = euler2quat(ai, aj, ak, axes)
    return quat2axangle(quat)
292
+
293
+
294
def euler2quat(ai, aj, ak, axes="sxyz"):
    """Return `quaternion` from Euler angles and axis sequence `axes`

    Parameters
    ----------
    ai : float
        First rotation angle (according to `axes`).
    aj : float
        Second rotation angle (according to `axes`).
    ak : float
        Third rotation angle (according to `axes`).
    axes : str, optional
        Axis specification; one of 24 axis sequences as string or encoded
        tuple - e.g. ``sxyz`` (the default).

    Returns
    -------
    quat : array shape (4,)
        Quaternion in w, x, y z (real, then vector) format

    Examples
    --------
    >>> q = euler2quat(1, 2, 3, 'ryxz')
    >>> np.allclose(q, [0.435953, 0.310622, -0.718287, 0.444435])
    True
    """
    try:
        firstaxis, parity, repetition, frame = _AXES2TUPLE[axes.lower()]
    except (AttributeError, KeyError):
        _TUPLE2AXES[axes]  # validation
        firstaxis, parity, repetition, frame = axes

    # Indices are shifted by +1 because q[0] holds the scalar (w) part.
    i = firstaxis + 1
    j = _NEXT_AXIS[i + parity - 1] + 1
    k = _NEXT_AXIS[i - parity] + 1

    if frame:
        ai, ak = ak, ai
    if parity:
        aj = -aj

    # Quaternions use half-angles.
    ai = ai / 2.0
    aj = aj / 2.0
    ak = ak / 2.0
    ci = math.cos(ai)
    si = math.sin(ai)
    cj = math.cos(aj)
    sj = math.sin(aj)
    ck = math.cos(ak)
    sk = math.sin(ak)
    cc = ci * ck
    cs = ci * sk
    sc = si * ck
    ss = si * sk

    q = np.empty((4,))
    if repetition:
        q[0] = cj * (cc - ss)
        q[i] = cj * (cs + sc)
        q[j] = sj * (cc + ss)
        q[k] = sj * (cs - sc)
    else:
        q[0] = cj * cc + sj * ss
        q[i] = cj * sc - sj * cs
        q[j] = cj * ss + sj * cc
        q[k] = cj * cs - sj * sc
    if parity:
        q[j] *= -1.0

    return q
364
+
365
+
366
def quat2axangle(quat, identity_thresh=None):
    """Decompose a (w, x, y, z) quaternion into a rotation axis and angle.

    The quaternion need not be normalized. When the vector part (x, y, z)
    is (near) zero the rotation is the identity and an arbitrary axis
    ``[1, 0, 0]`` with angle 0 is returned; non-finite input yields that
    axis with a NaN angle.

    Parameters
    ----------
    quat : 4 element sequence
        w, x, y, z forming quaternion.
    identity_thresh : None or scalar, optional
        Threshold below which the norm of the vector part is deemed to be 0.
        None (the default) estimates a threshold from the input precision.

    Returns
    -------
    vector : array shape (3,)
        Axis around which rotation occurs.
    theta : scalar
        Angle of rotation.

    Examples
    --------
    >>> vec, theta = quat2axangle([0, 1, 0, 0])
    >>> vec
    array([1., 0., 0.])
    >>> np.allclose(theta, np.pi)
    True
    """
    quat = np.asarray(quat)
    norm_sq = np.sum(quat**2)
    if not np.isfinite(norm_sq):
        return np.array([1.0, 0, 0]), float("nan")
    if identity_thresh is None:
        try:
            identity_thresh = np.finfo(norm_sq.type).eps * 3
        except (AttributeError, ValueError):  # not a numpy float type
            identity_thresh = _FLOAT_EPS * 3
    if norm_sq < _FLOAT_EPS**2:
        # Norm too small to normalize reliably: report identity rotation.
        return np.array([1.0, 0, 0]), 0.0
    if norm_sq != 1:
        quat = quat / math.sqrt(norm_sq)
    vec = quat[1:]
    vec_norm_sq = np.sum(vec**2)
    if vec_norm_sq < identity_thresh**2:
        # Vector part nearly zero -> identity rotation, arbitrary axis.
        return np.array([1.0, 0, 0]), 0.0
    # Clamp w into [-1, 1] so acos never sees a rounding overshoot.
    w_clamped = max(min(quat[0], 1), -1)
    return vec / math.sqrt(vec_norm_sq), 2 * math.acos(w_clamped)
435
+
436
+
437
def quat2euler(quaternion, axes="sxyz"):
    """Euler angles from a (w, x, y, z) `quaternion` for axis sequence `axes`.

    Parameters
    ----------
    quaternion : 4 element sequence
        w, x, y, z of quaternion.
    axes : str, optional
        Axis specification; one of 24 axis sequences as string or encoded
        tuple - e.g. ``sxyz`` (the default).

    Returns
    -------
    ai, aj, ak : float
        First, second and third rotation angles (according to `axes`).

    Examples
    --------
    >>> angles = quat2euler([0.99810947, 0.06146124, 0, 0])
    >>> np.allclose(angles, [0.123, 0, 0])
    True
    """
    # Convert through matrix form, which already knows all 24 conventions.
    rotation = quat2mat(quaternion)
    return mat2euler(rotation, axes)
evaluation/robotwin/launch_client.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch a single RoboTwin evaluation client for one task, connecting to a
# policy server on $PORT. Results are written under $save_root.
export LD_LIBRARY_PATH=/usr/lib64:/usr/lib:$LD_LIBRARY_PATH

# Reference list of task groups (8 tasks each); not used below — this script
# evaluates only the single $task_name chosen under it.
task_groups=(
    "stack_bowls_three handover_block hanging_mug scan_object lift_pot put_object_cabinet stack_blocks_three place_shoe"
    "adjust_bottle place_mouse_pad dump_bin_bigbin move_pillbottle_pad pick_dual_bottles shake_bottle place_fan turn_switch"
    "shake_bottle_horizontally place_container_plate rotate_qrcode place_object_stand put_bottles_dustbin move_stapler_pad place_burger_fries place_bread_basket"
    "pick_diverse_bottles open_microwave beat_block_hammer press_stapler click_bell move_playingcard_away open_laptop move_can_pot"
    "stack_bowls_two place_a2b_right stamp_seal place_object_basket handover_mic place_bread_skillet stack_blocks_two place_cans_plasticbox"
    "click_alarmclock blocks_ranking_size place_phone_stand place_can_basket place_object_scale place_a2b_left grab_roller place_dual_shoes"
    "place_empty_cup blocks_ranking_rgb place_empty_cup blocks_ranking_rgb place_empty_cup blocks_ranking_rgb place_empty_cup blocks_ranking_rgb"
)

save_root='./results'
task_name="adjust_bottle"

# Policy / checkpoint selection; everything after --overrides below is merged
# into the deploy_policy.yml config by eval_polict_client_openpi.
policy_name=ACT
task_config=demo_clean
train_config_name=0
model_name=0
seed=0
PORT=29056

PYTHONWARNINGS=ignore::UserWarning \
XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi --config policy/$policy_name/deploy_policy.yml \
    --overrides \
    --task_name ${task_name} \
    --task_config ${task_config} \
    --train_config_name ${train_config_name} \
    --model_name ${model_name} \
    --ckpt_setting ${model_name} \
    --seed ${seed} \
    --policy_name ${policy_name} \
    --save_root ${save_root} \
    --video_guidance_scale 5 \
    --action_guidance_scale 1 \
    --test_num 100 \
    --port ${PORT}
+ --port ${PORT}
39
+
40
+
evaluation/robotwin/launch_client_multigpus.sh ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch one evaluation client per task in a task group, spreading them over
# $num_gpus GPUs and consecutive server ports. Usage:
#   launch_client_multigpus.sh [save_root] [task_list_id] [seed] [test_num]
export LD_LIBRARY_PATH=/usr/lib64:/usr/lib:$LD_LIBRARY_PATH


save_root=${1:-'./results'}

# General parameters
policy_name=ACT
task_config=demo_clean
train_config_name=0
model_name=0
seed=${3:-0}
test_num=${4:-100}
start_port=29556
num_gpus=8

# Which of the task_groups below to run (0-based index).
task_list_id=${2:-0}

task_groups=(
    "stack_bowls_three handover_block hanging_mug scan_object lift_pot put_object_cabinet stack_blocks_three place_shoe"
    "adjust_bottle place_mouse_pad dump_bin_bigbin move_pillbottle_pad pick_dual_bottles shake_bottle place_fan turn_switch"
    "shake_bottle_horizontally place_container_plate rotate_qrcode place_object_stand put_bottles_dustbin move_stapler_pad place_burger_fries place_bread_basket"
    "pick_diverse_bottles open_microwave beat_block_hammer press_stapler click_bell move_playingcard_away open_laptop move_can_pot"
    "stack_bowls_two place_a2b_right stamp_seal place_object_basket handover_mic place_bread_skillet stack_blocks_two place_cans_plasticbox"
    "click_alarmclock blocks_ranking_size place_phone_stand place_can_basket place_object_scale place_a2b_left grab_roller place_dual_shoes"
    "place_empty_cup blocks_ranking_rgb place_empty_cup blocks_ranking_rgb place_empty_cup blocks_ranking_rgb place_empty_cup blocks_ranking_rgb"
)

if (( task_list_id < 0 || task_list_id >= ${#task_groups[@]} )); then
    echo "task_list_id out of range: $task_list_id (0..$(( ${#task_groups[@]} - 1 )))" >&2
    exit 1
fi

# Split the chosen group (a space-separated string) into an array of tasks.
read -r -a task_names <<< "${task_groups[$task_list_id]}"

echo "task_list_id=$task_list_id"
printf 'task_names (%d): %s\n' "${#task_names[@]}" "${task_names[*]}"

log_dir="./logs"
mkdir -p "$log_dir"

echo -e "\033[32mLaunching ${#task_names[@]} tasks. GPUs assigned by mod ${num_gpus}, ports starting from ${start_port} incrementing.\033[0m"

# Collect background PIDs so the whole batch can be killed later.
pid_file="pids.txt"
> "$pid_file"

batch_time=$(date +%Y%m%d_%H%M%S)

for i in "${!task_names[@]}"; do
    task_name="${task_names[$i]}"
    # Round-robin GPU assignment; each task talks to its own server port.
    gpu_id=$(( i % num_gpus ))
    port=$(( start_port + i ))

    export CUDA_VISIBLE_DEVICES=${gpu_id}

    log_file="${log_dir}/${task_name}_${batch_time}.log"

    echo -e "\033[33m[Task $i] Task: ${task_name}, GPU: ${gpu_id}, PORT: ${port}, Log: ${log_file}\033[0m"

    PYTHONWARNINGS=ignore::UserWarning \
    XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 python -m evaluation.robotwin.eval_polict_client_openpi --config policy/$policy_name/deploy_policy.yml \
        --overrides \
        --task_name ${task_name} \
        --task_config ${task_config} \
        --train_config_name ${train_config_name} \
        --model_name ${model_name} \
        --ckpt_setting ${model_name} \
        --seed ${seed} \
        --policy_name ${policy_name} \
        --save_root ${save_root} \
        --video_guidance_scale 5 \
        --action_guidance_scale 1 \
        --test_num ${test_num} \
        --port ${port} > "$log_file" 2>&1 &

    pid=$!
    echo "${pid}" | tee -a "$pid_file"
done

echo -e "\033[32mAll tasks launched. PIDs saved to ${pid_file}\033[0m"
echo -e "\033[36mTo terminate all processes, run: kill \$(cat ${pid_file})\033[0m"
+ echo -e "\033[36mTo terminate all processes, run: kill \$(cat ${pid_file})\033[0m"
evaluation/robotwin/launch_server.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ START_PORT=${START_PORT:-29056}
2
+ MASTER_PORT=${MASTER_PORT:-29061}
3
+
4
+ save_root='visualization/'
5
+ mkdir -p $save_root
6
+
7
+ python -m torch.distributed.run \
8
+ --nproc_per_node 1 \
9
+ --master_port $MASTER_PORT \
10
+ wan_va/wan_va_server.py \
11
+ --config-name robotwin \
12
+ --port $START_PORT \
13
+ --save_root $save_root
14
+
15
+
evaluation/robotwin/launch_server_multigpus.sh ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ START_PORT=${START_PORT:-29556}
2
+ MASTER_PORT=${MASTER_PORT:-29661}
3
+ LOG_DIR='./logs'
4
+ mkdir -p $LOG_DIR
5
+
6
+ save_root='./visualization/'
7
+ mkdir -p $save_root
8
+
9
+ batch_time=$(date +%Y%m%d_%H%M%S)
10
+
11
+
12
+ for i in {0..7}; do
13
+ CURRENT_PORT=$((START_PORT + i))
14
+ CURRENT_MASTER_PORT=$((MASTER_PORT + i))
15
+
16
+ LOG_FILE="${LOG_DIR}/server_${i}_${batch_time}.log"
17
+ echo "[Task ${j}] GPU: ${i} | PORT: ${CURRENT_PORT} | MASTER_PORT: ${CURRENT_MASTER_PORT} | Log: ${LOG_FILE}"
18
+
19
+ CUDA_VISIBLE_DEVICES=$i \
20
+ nohup python -m torch.distributed.run \
21
+ --nproc_per_node 1 \
22
+ --master_port $CURRENT_MASTER_PORT \
23
+ wan_va/wan_va_server.py \
24
+ --config-name robotwin \
25
+ --save_root $save_root \
26
+ --port $CURRENT_PORT > $LOG_FILE 2>&1 &
27
+ sleep 2;
28
+ done
29
+
30
+ echo "All 8 instances have been launched in the background."
31
+ wait
evaluation/robotwin/msgpack_numpy.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Adds NumPy array support to msgpack.
2
+
3
+ msgpack is good for (de)serializing data over a network for multiple reasons:
4
+ - msgpack is secure (as opposed to pickle/dill/etc which allow for arbitrary code execution)
5
+ - msgpack is widely used and has good cross-language support
6
+ - msgpack does not require a schema (as opposed to protobuf/flatbuffers/etc) which is convenient in dynamically typed
7
+ languages like Python and JavaScript
8
+ - msgpack is fast and efficient (as opposed to readable formats like JSON/YAML/etc); I found that msgpack was ~4x faster
9
+ than pickle for serializing large arrays using the below strategy
10
+
11
+ The code below is adapted from https://github.com/lebedov/msgpack-numpy. The reason not to use that library directly is
12
+ that it falls back to pickle for object arrays.
13
+ """
14
+
15
+ import functools
16
+
17
+ import msgpack
18
+ import numpy as np
19
+
20
+
21
+ def pack_array(obj):
22
+ if (isinstance(obj, (np.ndarray, np.generic))) and obj.dtype.kind in ("V", "O", "c"):
23
+ raise ValueError(f"Unsupported dtype: {obj.dtype}")
24
+
25
+ if isinstance(obj, np.ndarray):
26
+ return {
27
+ b"__ndarray__": True,
28
+ b"data": obj.tobytes(),
29
+ b"dtype": obj.dtype.str,
30
+ b"shape": obj.shape,
31
+ }
32
+
33
+ if isinstance(obj, np.generic):
34
+ return {
35
+ b"__npgeneric__": True,
36
+ b"data": obj.item(),
37
+ b"dtype": obj.dtype.str,
38
+ }
39
+
40
+ return obj
41
+
42
+
43
+ def unpack_array(obj):
44
+ if b"__ndarray__" in obj:
45
+ return np.ndarray(buffer=obj[b"data"], dtype=np.dtype(obj[b"dtype"]), shape=obj[b"shape"])
46
+
47
+ if b"__npgeneric__" in obj:
48
+ return np.dtype(obj[b"dtype"]).type(obj[b"data"])
49
+
50
+ return obj
51
+
52
+
53
+ Packer = functools.partial(msgpack.Packer, default=pack_array)
54
+ packb = functools.partial(msgpack.packb, default=pack_array)
55
+
56
+ Unpacker = functools.partial(msgpack.Unpacker, object_hook=unpack_array)
57
+ unpackb = functools.partial(msgpack.unpackb, object_hook=unpack_array)
evaluation/robotwin/test_render.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import warnings
3
+ import os
4
+
5
+ warnings.simplefilter(action="ignore", category=FutureWarning)
6
+ warnings.simplefilter(action="ignore", category=UserWarning)
7
+ current_file_path = os.path.abspath(__file__)
8
+ parent_dir = os.path.dirname(current_file_path)
9
+
10
+ sys.path.append(os.path.join(parent_dir, "../../tools"))
11
+ import numpy as np
12
+ import pdb
13
+ import json
14
+ import torch
15
+ import sapien.core as sapien
16
+ from sapien.utils.viewer import Viewer
17
+ import gymnasium as gym
18
+ import toppra as ta
19
+ import transforms3d as t3d
20
+ from collections import OrderedDict
21
+
22
+ import sys
23
+ import warnings
24
+ import os
25
+
26
+ warnings.simplefilter(action="ignore", category=FutureWarning)
27
+ warnings.simplefilter(action="ignore", category=UserWarning)
28
+ current_file_path = os.path.abspath(__file__)
29
+ parent_dir = os.path.dirname(current_file_path)
30
+
31
+ sys.path.append(os.path.join(parent_dir, "../../tools"))
32
+ import numpy as np
33
+ import pdb
34
+ import json
35
+ import torch
36
+ import sapien.core as sapien
37
+ from sapien.utils.viewer import Viewer
38
+ import gymnasium as gym
39
+ import toppra as ta
40
+ import transforms3d as t3d
41
+ from collections import OrderedDict
42
+
43
+
44
+ class Sapien_TEST(gym.Env):
45
+
46
+ def __init__(self):
47
+ super().__init__()
48
+ ta.setup_logging("CRITICAL") # hide logging
49
+ try:
50
+ self.setup_scene()
51
+ print("\033[32m" + "Render Well" + "\033[0m")
52
+ except:
53
+ print("\033[31m" + "Render Error" + "\033[0m")
54
+ exit()
55
+
56
+ def setup_scene(self, **kwargs):
57
+ """
58
+ Set the scene
59
+ - Set up the basic scene: light source, viewer.
60
+ """
61
+ self.engine = sapien.Engine()
62
+ # declare sapien renderer
63
+ from sapien.render import set_global_config
64
+
65
+ set_global_config(max_num_materials=50000, max_num_textures=50000)
66
+ self.renderer = sapien.SapienRenderer()
67
+ # give renderer to sapien sim
68
+ self.engine.set_renderer(self.renderer)
69
+
70
+ sapien.render.set_camera_shader_dir("rt")
71
+ sapien.render.set_ray_tracing_samples_per_pixel(32)
72
+ sapien.render.set_ray_tracing_path_depth(8)
73
+ sapien.render.set_ray_tracing_denoiser("oidn")
74
+
75
+ # declare sapien scene
76
+ scene_config = sapien.SceneConfig()
77
+ self.scene = self.engine.create_scene(scene_config)
78
+
79
+
80
+ if __name__ == "__main__":
81
+ a = Sapien_TEST()
evaluation/robotwin/websocket_client_policy.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from typing import Dict, Optional, Tuple
4
+
5
+ from typing_extensions import override
6
+ import websockets.sync.client
7
+ from .msgpack_numpy import Packer, unpackb
8
+
9
+
10
class WebsocketClientPolicy:
    """Implements the Policy interface by communicating with a server over websocket.

    See WebsocketPolicyServer for a corresponding server implementation.
    """

    def __init__(self, host: str = "0.0.0.0", port: Optional[int] = None, api_key: Optional[str] = None) -> None:
        """Connect to the policy server, blocking (with retries) until it is up.

        Args:
            host: server hostname or IP.
            port: optional server port appended to the URI.
            api_key: optional API key sent as an ``Authorization`` header.
        """
        self._uri = f"ws://{host}"
        if port is not None:
            self._uri += f":{port}"
        self._packer = Packer()
        self._api_key = api_key
        self._ws, self._server_metadata = self._wait_for_server()

    def get_server_metadata(self) -> Dict:
        """Return the metadata dict the server sent on connection."""
        return self._server_metadata

    def _wait_for_server(self) -> Tuple[websockets.sync.client.ClientConnection, Dict]:
        """Retry connecting every 5 s until the server accepts and sends metadata."""
        logging.info(f"Waiting for server at {self._uri}...")
        while True:
            try:
                headers = {"Authorization": f"Api-Key {self._api_key}"} if self._api_key else None
                # Disable the websocket ping mechanism: a long-running
                # inference call would otherwise trip the keepalive timeout.
                conn = websockets.sync.client.connect(
                    self._uri,
                    compression=None,
                    max_size=None,
                    additional_headers=headers,
                    ping_interval=None,
                    close_timeout=10
                )
                metadata = unpackb(conn.recv())
                return conn, metadata
            except Exception as e:
                # Deliberately broad: the server may not be up yet, so DNS,
                # refused-connection and handshake errors all mean "retry".
                logging.info(f"Still waiting for server... (Error: {e})")
                time.sleep(5)

    @override
    def infer(self, obs: Dict) -> Dict:  # noqa: UP006
        """Send one observation dict and return the server's inference result.

        Raises:
            RuntimeError: if the server replies with a text frame (its error
                reporting channel) instead of msgpack bytes.
        """
        data = self._packer.pack(obs)
        self._ws.send(data)
        response = self._ws.recv()
        if isinstance(response, str):
            # we're expecting bytes; if the server sends a string, it's an error.
            raise RuntimeError(f"Error in inference server:\n{response}")
        return unpackb(response)

    @override
    def reset(self) -> None:
        """No client-side state to reset; kept for Policy-interface parity."""
        pass
74
+
75
if __name__ == "__main__":
    # Manual smoke test: connect to a local policy server and send one
    # random observation, then drop into an IPython shell for inspection.
    policy_on_device = WebsocketClientPolicy(port=8000)
    import torch
    import numpy as np
    from PIL import Image
    from .image_tools import convert_to_uint8
    device = torch.device("cuda")

    # Random camera frames and state in the shapes the server expects.
    # NOTE(review): shapes assume (1, 3, 224, 224) images and an 8-dim state
    # — confirm against the deployed server's config.
    base_0_rgb = np.random.randint(0, 256, size=(1, 3, 224, 224), dtype=np.uint8)
    left_wrist_0_rgb = np.random.randint(0, 256, size=(1, 3, 224, 224), dtype=np.uint8)
    state = np.random.rand(1,8).astype(np.float32)
    prompt = ["do something"]

    # observation = {
    #     "image": {
    #         "base_0_rgb": torch.from_numpy(base_0_rgb).to(device)[None],
    #         "left_wrist_0_rgb": torch.from_numpy(left_wrist_0_rgb).to(device)[None],
    #     },
    #     "state": torch.from_numpy(state).to(device)[None],
    #     "prompt": prompt,
    # }

    # The right-wrist camera reuses the left-wrist frame for this smoke test.
    observation = {
        "image": {
            "base_0_rgb": convert_to_uint8(base_0_rgb),
            "left_wrist_0_rgb": convert_to_uint8(left_wrist_0_rgb),
            "right_wrist_0_rgb": convert_to_uint8(left_wrist_0_rgb),
        },
        "state": state,
        "prompt": prompt,
    }

    policy_on_device.infer(observation)
    from IPython import embed;embed()
example/franka/observation.images.cam_high.png ADDED
example/franka/observation.images.cam_left_wrist.png ADDED
example/franka/observation.images.cam_right_wrist.png ADDED
example/robotwin/observation.images.cam_high.png ADDED
example/robotwin/observation.images.cam_left_wrist.png ADDED
example/robotwin/observation.images.cam_right_wrist.png ADDED
lingbot_robotwin_policy.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+ import random
5
+ import numpy as np
6
+ from collections import deque
7
+ import torchvision
8
+ import yaml
9
+ from types import SimpleNamespace
10
+ from packaging.version import Version
11
+ from typing import Callable, Dict, List, Optional, Type, Union, Tuple, Any, Sequence
12
+ from glob import glob
13
+ from tqdm import tqdm
14
+ from safetensors import safe_open
15
+ from safetensors.torch import load_file
16
+ from pathlib import Path
17
+ from PIL import Image
18
+ import torch
19
+ import torch.nn.functional as F
20
+ from torch import Tensor, nn
21
+
22
+
23
+ import transformers
24
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
25
+ from transformers import (
26
+ AutoConfig,
27
+ PretrainedConfig,
28
+ PreTrainedModel,
29
+ AutoProcessor,
30
+ )
31
+
32
+ from lerobot.configs.policies import PreTrainedConfig
33
+ from lingbotvla.models.vla.pi0.modeling_pi0 import PI0Policy
34
+ from lingbotvla.models.vla.pi0.modeling_lingbot_vla import LingbotVlaPolicy
35
+ from lingbotvla.data.vla_data.transform import Normalizer, prepare_images, prepare_language, prepare_state
36
+ from lingbotvla.models import build_processor
37
+
38
+
39
def set_seed_everywhere(seed: int):
    """Sets the random seed for Python, NumPy, and PyTorch functions."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for reproducible kernel selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)
48
+
49
# Seed all RNGs at import time so evaluation runs are reproducible.
set_seed_everywhere(42)

# Filesystem paths of the base VLM checkpoints, overridable via environment
# variables; keyed by the policy family chosen in QwenPiServer.load_vla().
BASE_MODEL_PATH = {
    'pi0': os.environ.get('PALIGEMMA_PATH', './paligemma-3b-pt-224/'),
    'lingbotvla': os.environ.get('QWEN25_PATH', './Qwen2.5-VL-3B-Instruct/'),
}
55
+
56
def load_model_weights(policy, path_to_pi_model, strict=True):
    """Load every ``*.safetensors`` shard under ``path_to_pi_model`` into ``policy``."""
    shard_paths = glob(os.path.join(path_to_pi_model, "*.safetensors"))
    state_dict = {}
    for shard in tqdm(shard_paths):
        with safe_open(shard, framework="pt", device="cpu") as handle:
            for name in handle.keys():
                state_dict[name] = handle.get_tensor(name)
    policy.load_state_dict(state_dict, strict=strict)
65
+
66
+
67
def center_crop_image(image: Union[np.ndarray, Image.Image]) -> Image.Image:
    """Center-crop ~90% of the image AREA and resize to a 224x224 RGB image."""
    crop_scale = 0.9
    # Keeping `crop_scale` of the area means each SIDE shrinks by sqrt(crop_scale).
    side_scale = float(np.sqrt(np.clip(crop_scale, 0.0, 1.0)))
    out_size = (224, 224)

    # Normalize the input into a PIL image.
    if isinstance(image, Image.Image):
        pil = image
    elif isinstance(image, np.ndarray):
        arr = image
        if arr.dtype.kind == "f":
            # Floats in [0, 1] are rescaled; other floats are clamped to [0, 255].
            if arr.min() >= 0.0 and arr.max() <= 1.0:
                arr = (np.clip(arr, 0.0, 1.0) * 255.0).astype(np.uint8)
            else:
                arr = np.clip(arr, 0.0, 255.0).astype(np.uint8)
        elif arr.dtype == np.uint16:
            arr = (arr / 257).astype(np.uint8)  # map 16-bit to 8-bit
        elif arr.dtype != np.uint8:
            arr = arr.astype(np.uint8)
        pil = Image.fromarray(arr)
    else:
        raise TypeError("image must be a numpy array or PIL.Image.Image")

    # Force RGB so the output channel layout is consistent.
    pil = pil.convert("RGB")
    W, H = pil.size

    # Centered crop box in integer pixels (never degenerate).
    crop_w = max(1, int(round(W * side_scale)))
    crop_h = max(1, int(round(H * side_scale)))
    left = (W - crop_w) // 2
    top = (H - crop_h) // 2
    box = (left, top, left + crop_w, top + crop_h)

    return pil.crop(box).resize(out_size, resample=Image.BILINEAR)
107
+
108
def resize_with_pad(img, width, height, pad_value=-1):
    """Resize a (b, c, h, w) batch to fit (height, width), padding left/top.

    Aspect ratio is preserved; content ends up in the bottom-right corner and
    the left/top margins are filled with ``pad_value``.
    """
    if img.ndim != 4:
        raise ValueError(f"(b,c,h,w) expected, but {img.shape}")

    # Accept channel-last input and move channels to dim 1.
    if img.shape[1] not in (1, 3) and img.shape[-1] in (1, 3):
        img = img.permute(0, 3, 1, 2)

    src_h, src_w = img.shape[2:]
    scale = max(src_w / width, src_h / height)
    new_h = int(src_h / scale)
    new_w = int(src_w / scale)

    resized = F.interpolate(
        img, size=(new_h, new_w), mode="bilinear", align_corners=False
    )

    extra_h = max(0, int(height - new_h))
    extra_w = max(0, int(width - new_w))

    # F.pad order is (left, right, top, bottom): pad only left and top.
    return F.pad(resized, (extra_w, 0, extra_h, 0), value=pad_value)
132
+
133
class PolicyPreprocessMixin:
    """Adds a ``select_action`` convenience wrapper around ``model.sample_actions``.

    The host class must provide ``self.model`` (exposing ``sample_actions``)
    and ``self.normalizer`` (exposing ``unnormalize``).
    """

    @torch.no_grad
    def select_action(
        self, observation: dict[str, Tensor], use_bf16: bool = False, vlm_causal: bool = False, noise: Tensor | None = None
    ):
        """Run one sampling pass and return the unnormalized observation dict.

        Args:
            observation: preprocessed inputs — ``images``, ``img_masks``,
                ``lang_tokens``, ``lang_masks``, ``state`` (optionally
                ``expert_imgs``); mutated in place (``'action'`` is added).
            use_bf16: cast floating inputs to bfloat16 before inference.
            vlm_causal: forwarded to ``sample_actions``.
            noise: unused; kept for interface compatibility.
        """
        self.eval()
        device = 'cuda'
        dtype = torch.bfloat16 if use_bf16 else torch.float32
        start = time.time()

        # Allow an unbatched (n_cams, c, h, w) image stack by adding a batch dim.
        if len(observation['images'].shape) == 4:
            observation['images'] = observation['images'].unsqueeze(0)
            observation['img_masks'] = observation['img_masks'].unsqueeze(0)

        # The two original call sites differed only by the optional
        # ``expert_imgs`` argument — build the argument list once instead.
        args = [
            observation['images'].to(dtype=dtype, device=device),
            observation['img_masks'].to(device=device),
            observation['lang_tokens'].unsqueeze(0).to(device=device),
            observation['lang_masks'].unsqueeze(0).to(device=device),
            observation['state'].unsqueeze(0).to(dtype=dtype, device=device),
        ]
        if 'expert_imgs' in observation:
            args.append(observation['expert_imgs'].to(dtype=dtype, device=device))
        actions = self.model.sample_actions(*args, vlm_causal=vlm_causal)

        delta_time = time.time() - start
        print(f'sample_actions cost {delta_time} s')
        # NOTE(review): only the first 14 action channels are kept here —
        # confirm this matches the robot's action dimensionality upstream.
        observation['action'] = actions.squeeze(0)[:, :14].to(dtype=torch.float32, device='cpu')
        if use_bf16:
            observation['state'] = observation['state'].to(dtype=torch.float32)
        return self.normalizer.unnormalize(observation)
177
+
178
# Inference-only variant: mixes select_action() into the LingBot-VLA policy.
class LingBotVlaInferencePolicy(PolicyPreprocessMixin, LingbotVlaPolicy):
    pass  # Only combine necessary functions
180
+
181
# Inference-only variant for PI0. NOTE: "Infernece" is a typo, kept as-is so
# existing imports of this name keep working.
class PI0InfernecePolicy(PolicyPreprocessMixin, PI0Policy):
    pass  # Only combine necessary functions
183
+
184
+
185
def merge_qwen_config(policy_config, qwen_config):
    """Copy Qwen text-model and vision settings onto ``policy_config``.

    Args:
        policy_config: object whose attributes are set in place.
        qwen_config: a HF config object (anything with ``to_dict``) or a
            plain dict of the same shape.

    Returns:
        The same ``policy_config`` object, for chaining.
    """
    if hasattr(qwen_config, 'to_dict'):
        config_dict = qwen_config.to_dict()
    else:
        config_dict = qwen_config

    # Text-model hyperparameters mirrored onto the policy config.
    text_keys = {
        "hidden_size",
        "intermediate_size",
        "num_hidden_layers",
        "num_attention_heads",
        "num_key_value_heads",
        "rms_norm_eps",
        "rope_theta",
        "vocab_size",
        "max_position_embeddings",
        "hidden_act",
        "tie_word_embeddings",
        "tokenizer_path",
    }

    for key in text_keys:
        if key in config_dict:
            setattr(policy_config, key, config_dict[key])
            print(f"✅ Merged LLM: {key} = {config_dict[key]}")

    if "vision_config" in config_dict:
        # BUG FIX: the original read ``qwen_config.vision_config`` directly,
        # which raises AttributeError when ``qwen_config`` is a plain dict —
        # a case this function explicitly supports. Prefer the attribute (a
        # config object on HF configs), fall back to the dict entry.
        policy_config.vision_config = getattr(
            qwen_config, "vision_config", config_dict["vision_config"]
        )
    else:
        print("⚠️ Warning: 'vision_config' not found in qwen_config!")

    return policy_config
217
+
218
+
219
class QwenPiServer:
    '''
    Policy wrapper to support action ensemble or chunk execution.

    Loads a trained PI0 / LingBot-VLA checkpoint plus its training-time
    configuration, and serves ``infer(observation) -> {'action': ...}`` calls
    (typically behind a WebsocketPolicyServer).

    NOTE(review): this block was reconstructed from a whitespace-mangled
    dump; statement nesting inside ``infer`` was inferred and should be
    checked against the original file.
    '''
    def __init__(
        self,
        path_to_pi_model="",
        adaptive_ensemble_alpha=0.1,
        action_ensemble_horizon=8,
        use_length=1,  # to control the execution length of the action chunk, -1 denotes using action ensemble
        chunk_ret=False,
        use_bf16=True,
        use_fp32=False,
    ) -> None:
        # Exactly one precision mode may be forced.
        assert not (use_bf16 and use_fp32), 'Bfloat16 or Float32!!!'
        self.adaptive_ensemble_alpha = adaptive_ensemble_alpha
        self.use_length = use_length
        self.chunk_ret = chunk_ret

        self.task_description = None

        self.vla = self.load_vla(path_to_pi_model)
        self.vla = self.vla.cuda().eval()
        if use_bf16:
            self.vla = self.vla.to(torch.bfloat16)
        elif use_fp32:
            self.vla.model.float()
        # Counts infer() calls; used to index into the cached action chunk.
        self.global_step = 0
        self.last_action_chunk = None
        self.use_bf16 = use_bf16
        self.use_fp32 = use_fp32

    def load_vla(self, path_to_pi_model) -> LingbotVlaPolicy:
        """Build the policy from a checkpoint dir plus its training YAML.

        Side effects: also populates ``self.processor``, ``self.data_config``,
        ``self.config`` and the action-dimension bookkeeping attributes.
        """
        print(f"loading model from: {path_to_pi_model}")
        config = PreTrainedConfig.from_pretrained(path_to_pi_model)

        # The training config is expected three directories above the
        # checkpoint: <run>/lingbotvla_cli.yaml.
        training_config_path = Path(path_to_pi_model).parent.parent.parent / 'lingbotvla_cli.yaml'
        with open(training_config_path, 'r') as f:
            training_config = yaml.safe_load(f)
        f.close()  # NOTE(review): redundant — the `with` block already closed f

        # Update model config according to training config.
        # NOTE(review): because of the getattr() default, values already
        # present on `config` take precedence over the training YAML — the
        # opposite of what the comment above suggests. Confirm intent.
        training_model_config = training_config['model']
        training_model_config.update(training_config['train'])
        for k, v in training_model_config.items():
            v = getattr(config, k, training_model_config[k])
            setattr(config, k, v)

        # Set attention_implementation to 'eager' to speed up evaluation.
        config.attention_implementation = 'eager'

        # Choose the policy family from the tokenizer path in the YAML.
        training_base_model = training_config['model']['tokenizer_path']
        if 'paligemma' in training_base_model:
            model_name = 'pi0'
            config.vocab_size = 257152  # set vocab size for paligamma
        elif 'qwen2' in training_base_model.lower():
            model_name = 'lingbotvla'
        else:
            raise ValueError(f"Unsupported base model of {path_to_pi_model}")
        base_model_path = BASE_MODEL_PATH[model_name]
        config.tokenizer_path = base_model_path
        self.model_name = model_name

        qwen_config = AutoConfig.from_pretrained(base_model_path)
        config = merge_qwen_config(config, qwen_config)

        # A non-zero vocab_size in the YAML overrides the base model's.
        if 'vocab_size' in training_config['model'] and training_config['model']['vocab_size'] != 0:
            config.vocab_size = training_config['model']['vocab_size']

        # Load processors (tokenizer + image processor).
        self.processor = build_processor(base_model_path)
        self.language_tokenizer = self.processor.tokenizer
        self.image_processor = self.processor.image_processor
        data_config = SimpleNamespace(**training_config['data'])

        print('Initializing model ... ')

        if 'paligemma' in training_base_model:
            policy = PI0InfernecePolicy(config, tokenizer_path=base_model_path)
        else:
            policy = LingBotVlaInferencePolicy(config, tokenizer_path=base_model_path)

        load_model_weights(policy, path_to_pi_model, strict=True)

        policy.feature_transform = None
        self.data_config = data_config
        self.config = config
        self.joint_max_dim = training_config['train']['max_action_dim']
        self.action_dim = training_config['train']['action_dim']
        self.chunk_size = training_config['train']['chunk_size']
        policy.action_dim = self.action_dim
        policy.chunk_size = self.chunk_size
        self.norm_stats_file = data_config.norm_stats_file
        if 'align_params' in training_config['train']:
            self.use_depth_align = True
        else:
            self.use_depth_align = False
        with open(self.norm_stats_file) as f:
            self.norm_stats = json.load(f)
        # Images pass through untouched; only state/action get normalized.
        policy.normalizer = Normalizer(
            norm_stats=self.norm_stats['norm_stats'],
            from_file=True,
            data_type='robotwin',
            norm_type={
                "observation.images.cam_high": "identity",
                "observation.images.cam_left_wrist": "identity",
                "observation.images.cam_right_wrist": "identity",
                "observation.state": self.data_config.norm_type,
                "action": self.data_config.norm_type,
            },
        )

        print('Model initialized ... ')

        return policy

    def reset(self, robo_name, path_to_pi_model=None) -> None:
        """Reset episode state; optionally hot-swap to a new checkpoint.

        Args:
            robo_name: accepted for interface compatibility; not used here.
            path_to_pi_model: when given, reloads the model from this path.
        """
        if path_to_pi_model is not None:
            self.vla = self.load_vla(path_to_pi_model)
            self.vla = self.vla.cuda().eval()
            if self.use_bf16:
                self.vla = self.vla.to(torch.bfloat16)
            elif self.use_fp32:
                self.vla.model.float()

        self.global_step = 0
        self.last_action_chunk = None

        # Backfill defaults for fields older configs may be missing.
        if getattr(self.data_config, 'norm_type', None) is None:
            self.data_config.norm_type = 'meanstd'
        if getattr(self.config, 'vlm_causal', None) is None:
            self.config.vlm_causal = False
        if getattr(self.config, 'qwenvl_bos', None) is None:
            self.config.qwenvl_bos = False

        # If the ckpt path was updated, reload the merged safetensors weights.
        # NOTE(review): load_vla() above already loaded these same weights.
        if path_to_pi_model is not None:
            all_safetensors = glob(os.path.join(path_to_pi_model, "*.safetensors"))
            merged_weights = {}

            for file_path in tqdm(all_safetensors):
                with safe_open(file_path, framework="pt", device="cpu") as f:
                    for key in f.keys():
                        merged_weights[key] = f.get_tensor(key)

            self.vla.load_state_dict(merged_weights, strict=True)

    def resize_image(self, observation):
        """In place: resize each camera image to (img_size, img_size),
        transpose HWC -> CHW and scale to floats in [0, 1]."""
        for image_feature in ['observation.images.cam_high', 'observation.images.cam_left_wrist', 'observation.images.cam_right_wrist']:
            assert image_feature in observation
            assert len(observation[image_feature].shape) == 3 and observation[image_feature].shape[-1] == 3
            image = observation[image_feature]
            img_pil = Image.fromarray(image)
            image_size = getattr(self.data_config, 'img_size', 224)
            img_pil = img_pil.resize((image_size, image_size), Image.BILINEAR)

            # img_resized shape: C*H*W
            img_resized = np.transpose(np.array(img_pil), (2, 0, 1))  # (3,224,224)
            observation[image_feature] = img_resized / 255.

    def infer(self, observation, center_crop=True):
        """Generates an action with the VLA policy."""

        # (If trained with image augmentations) Center crop image and then resize back up to original size.
        # IMPORTANT: Let's say crop scale == 0.9. To get the new height and width (post-crop), multiply
        # the original height and width by sqrt(0.9) -- not 0.9!

        # A reset request carries no observation to act on.
        if 'reset' in observation and observation['reset']:
            self.reset(robo_name=observation['robo_name'], path_to_pi_model=observation['path_to_pi_model'] if 'path_to_pi_model' in observation else None)
            return dict(action=None)

        # Mutates the incoming dict: images resized/normalized, arrays -> tensors.
        self.resize_image(observation)
        for k, v in observation.items():
            if isinstance(v, np.ndarray):
                observation[k] = torch.from_numpy(v)

        # Re-run the model only at chunk boundaries (or always if use_length == -1).
        if self.use_length == -1 or self.global_step % self.use_length == 0:
            joint_max_dim = getattr(self, 'joint_max_dim')
            action_dim = getattr(self, 'action_dim')
            chunk_size = getattr(self, 'chunk_size')
            normalized_observation = self.vla.normalizer.normalize(observation)
            base_image = (normalized_observation["observation.images.cam_high"] * 255).to(torch.uint8)
            left_wrist_image = (normalized_observation["observation.images.cam_left_wrist"] * 255).to(
                torch.uint8
            )
            right_wrist_image = (normalized_observation["observation.images.cam_right_wrist"] * 255).to(
                torch.uint8
            )
            obs_dict = {
                "image": {"base_0_rgb": base_image, "left_wrist_0_rgb": left_wrist_image, "right_wrist_0_rgb": right_wrist_image},
                "state": normalized_observation["observation.state"].to(torch.float32),
                "prompt": [observation["task"]],
            }
            state = prepare_state(self.config, obs_dict)
            lang_tokens, lang_masks = prepare_language(self.config, self.language_tokenizer, obs_dict)
            images, img_masks, _ = prepare_images(self.config, self.image_processor, obs_dict)
            observation = {
                'images': images,
                'img_masks': img_masks,
                'state': state,
                'lang_tokens': lang_tokens,
                'lang_masks': lang_masks,
            }

            if self.use_bf16:
                observation['state'] = observation['state'].to(torch.bfloat16)

            org_actions = ['action']
            assert len(org_actions) == 1, "Only support single action feature"
            if self.chunk_ret:
                # Return the whole horizon: (use_length, action_dim).
                action = self.vla.select_action(observation, self.use_bf16, self.config.vlm_causal)[org_actions[0]].float().cpu().numpy()
                action = action[:self.use_length, :self.action_dim]
            else:
                if self.use_length == -1 or self.global_step % self.use_length == 0:
                    action = self.vla.select_action(observation, self.use_bf16, self.config.vlm_causal)[org_actions[0]]
                    self.last_action_chunk = action.float().cpu().numpy()

                if self.use_length > 0:
                    action = self.last_action_chunk[self.global_step % self.use_length]
                # NOTE(review): when use_length > 0 the line above yields a
                # 1-D vector, so this 2-D slice would raise — confirm the
                # intended shape handling in the non-chunk_ret path.
                action = action[:, :self.action_dim]
        print(f"on server step: {self.global_step}")
        self.global_step += 1

        return dict(action=action)
445
+
446
+
447
+ import argparse
448
+ from .websocket_policy_server import WebsocketPolicyServer
449
+
450
+ def main():
451
+ parser = argparse.ArgumentParser(description="启动 QwenPi WebSocket 策略服务器")
452
+
453
+ parser.add_argument(
454
+ "--model_path",
455
+ type=str,
456
+ )
457
+
458
+ parser.add_argument(
459
+ "--use_length",
460
+ type=int,
461
+ default=50,
462
+ help="used length of action chunk"
463
+ )
464
+
465
+ parser.add_argument(
466
+ "--chunk_ret",
467
+ type=bool,
468
+ default=True,
469
+ help=" True: The returned action tensor includes the horizon dimension. This allows the model to output a sequence of actions for each horizon step. False: The horizon dimension is omitted. The model selects and returns the next step autonomously based on its policy."
470
+ )
471
+
472
+ parser.add_argument(
473
+ "--port",
474
+ type=int,
475
+ default=8006,
476
+ help="port of WebSocket"
477
+ )
478
+
479
+ parser.add_argument(
480
+ "--debug_infer_once",
481
+ action="store_true",
482
+ help="Run one infer with dummy observation then exit (for debugging infer() without WebSocket client)",
483
+ )
484
+
485
+ args = parser.parse_args()
486
+
487
+ model = QwenPiServer(args.model_path, use_length=args.use_length, chunk_ret=args.chunk_ret)
488
+ if args.debug_infer_once:
489
+ # 调试用:不启动 WebSocket,只跑一次 infer,可在 infer / select_action 里下断点
490
+ dummy_obs = {
491
+ "observation.images.cam_high": np.zeros((224, 224, 3), dtype=np.uint8),
492
+ "observation.images.cam_left_wrist": np.zeros((224, 224, 3), dtype=np.uint8),
493
+ "observation.images.cam_right_wrist": np.zeros((224, 224, 3), dtype=np.uint8),
494
+ "observation.state": np.zeros(model.action_dim, dtype=np.float32),
495
+ "task": "dummy task for debug",
496
+ "reset": False,
497
+ }
498
+ out = model.infer(dummy_obs)
499
+ print("debug_infer_once result keys:", out.keys())
500
+ return
501
+ model_server = WebsocketPolicyServer(model, port=args.port)
502
+ model_server.serve_forever()
503
+
504
+
505
if __name__ == "__main__":
    # CLI entry point: start the WebSocket policy server.
    main()
pyproject.toml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "LingBot_VA"
7
+ version = "0.0.0"
8
+ description = "LingBot-VA: A Pragmatic VA Foundation Model"
9
+ authors = [
10
+ { name = "Robbyant Team", email = "fengchang.ll@antgroup.com" }
11
+ ]
12
+ license = { file = "LICENSE.txt" }
13
+ readme = "README.md"
14
+ requires-python = ">=3.10,<4.0"
15
+ dependencies = [
16
+ "torch>=2.9.0",
17
+ "torchvision>=0.24.0",
18
+ "diffusers>=0.36.0",
19
+ "transformers>=4.55.4",
20
+ "tokenizers>=0.21.4",
21
+ "tqdm",
22
+ "imageio",
23
+ "easydict",
24
+ "flash_attn",
25
+ "numpy>=1.26.4,<2"
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest",
31
+ "black",
32
+ "flake8",
33
+ "isort",
34
+ "mypy",
35
+ "huggingface-hub[cli]"
36
+ ]
37
+
38
+ [project.urls]
39
+ homepage = "https://github.com/Robbyant"
40
+ documentation = "https://github.com/Robbyant"
41
+ repository = "https://github.com/Robbyant"
42
+ huggingface = "https://github.com/Robbyant"
43
+ modelscope = "https://github.com/Robbyant"
44
+ discord = "https://github.com/Robbyant"
45
+
46
+ [tool.setuptools]
47
+ packages = ["lingbot_va"]
48
+
49
+ [tool.setuptools.package-data]
50
+ "lingbot_va" = ["**/*.py"]
51
+
52
+ [tool.black]
53
+ line-length = 88
54
+
55
+ [tool.isort]
56
+ profile = "black"
57
+
58
+ [tool.mypy]
59
+ strict = true
60
+
61
+
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.9.0
2
+ torchvision>=0.24.0
3
+ torchaudio
4
+ diffusers>=0.36.0
5
+ transformers>=4.55.4
6
+ tokenizers>=0.21.4
7
+ tqdm
8
+ imageio[ffmpeg]
9
+ easydict
10
+ flash_attn
11
+ numpy>=1.26.4,<2
script/run_launch_va_server_sync.sh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/bash
# Launch the synchronous VA WebSocket server via torch.distributed.run.
# All knobs come from environment variables; extra CLI arguments are passed
# through verbatim as config overrides.

set -x

umask 007

NGPU=${NGPU:-"8"}                      # processes (GPUs) per node
MASTER_PORT=${MASTER_PORT:-"29501"}    # torch.distributed rendezvous port
PORT=${PORT:-"1106"}                   # NOTE(review): defined but not referenced below — confirm the server reads it from the environment
LOG_RANK=${LOG_RANK:-"0"}              # only this rank's output is shown
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
CONFIG_NAME=${CONFIG_NAME:-"robotwin"} # key into wan_va VA_CONFIGS

# Forward any extra CLI arguments as overrides.
overrides=""
if [ $# -ne 0 ]; then
overrides="$*"
fi

## node setting
num_gpu=${NGPU}
master_port=${MASTER_PORT}
log_rank=${LOG_RANK}
torchft_lighthouse=${TORCHFT_LIGHTHOUSE}
config_name=${CONFIG_NAME}

## cmd setting
export TOKENIZERS_PARALLELISM=false
PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" TORCHFT_LIGHTHOUSE=${torchft_lighthouse} \
python -m torch.distributed.run \
--nproc_per_node=${num_gpu} \
--local-ranks-filter=${log_rank} \
--master_port ${master_port} \
--tee 3 \
-m wan_va.wan_va_server --config-name ${config_name} $overrides
wan_va/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ from . import configs, distributed, modules
wan_va/configs/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ from .va_franka_cfg import va_franka_cfg
3
+ from .va_robotwin_cfg import va_robotwin_cfg
4
+ from .va_franka_i2va import va_franka_i2va_cfg
5
+ from .va_robotwin_i2va import va_robotwin_i2va_cfg
6
+
7
# Registry mapping CLI --config-name values to config objects.
# NOTE(review): the '*_i2av' keys are spelled with "av" while the modules and
# variables use "i2va" — confirm callers use these exact key spellings before
# renaming.
VA_CONFIGS = {
    'robotwin': va_robotwin_cfg,
    'franka': va_franka_cfg,
    'robotwin_i2av': va_robotwin_i2va_cfg,
    'franka_i2av': va_franka_i2va_cfg,
}
wan_va/configs/shared_config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
# Defaults shared by every VA config; concrete configs copy these via
# ``cfg.update(va_shared_cfg)`` and override fields afterwards.
va_shared_cfg = EasyDict()

# WebSocket server bind address.
va_shared_cfg.host = '0.0.0.0'
va_shared_cfg.port = 29536

# Model parameter dtype.
va_shared_cfg.param_dtype = torch.bfloat16
# Output directory for saved visualizations.
va_shared_cfg.save_root = './visualization'

# Patchification factors — presumably (temporal, height, width); confirm
# against the transformer's patch embedding.
va_shared_cfg.patch_size = (1, 2, 2)
wan_va/configs/va_franka_cfg.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ import torch
3
+ from easydict import EasyDict
4
+
5
+ from .shared_config import va_shared_cfg
6
+
7
# Franka-arm VA config: copies the shared defaults, then overrides below.
va_franka_cfg = EasyDict(__name__='Config: VA franka')
va_franka_cfg.update(va_shared_cfg)
# NOTE(review): this assigns infer_mode on the SHARED config object, not on
# va_franka_cfg (which copied the shared values on the line above, before
# this field existed). It therefore leaks into every config module imported
# afterwards and never appears on va_franka_cfg itself. Likely intended:
# ``va_franka_cfg.infer_mode = 'server'`` — confirm before changing.
va_shared_cfg.infer_mode = 'server'

# Placeholder: point this at the downloaded Wan2.2 checkpoint.
va_franka_cfg.wan22_pretrained_model_name_or_path = "/path/to/pretrained/model"

va_franka_cfg.attn_window = 30
va_franka_cfg.frame_chunk_size = 4
va_franka_cfg.env_type = 'none'

# Rendered frame size and action-head layout.
va_franka_cfg.height = 224
va_franka_cfg.width = 320
va_franka_cfg.action_dim = 30
va_franka_cfg.action_per_frame = 20
va_franka_cfg.obs_cam_keys = [
    'observation.images.cam_high', 'observation.images.cam_left_wrist',
    'observation.images.cam_right_wrist'
]
va_franka_cfg.guidance_scale = 5
va_franka_cfg.action_guidance_scale = 1

# Diffusion sampling steps for video and action heads.
va_franka_cfg.num_inference_steps = 5
va_franka_cfg.video_exec_step = -1
va_franka_cfg.action_num_inference_steps = 10

va_franka_cfg.snr_shift = 5.0
va_franka_cfg.action_snr_shift = 1.0

# Permutation selecting/reordering action channels for the model;
# inverse_used_action_channel_ids is the inverse mapping, with unused
# channels left at the out-of-range sentinel index len(used_ids).
va_franka_cfg.used_action_channel_ids = list(range(0, 7)) + list(range(
    28, 29)) + list(range(7, 14)) + list(range(29, 30))
inverse_used_action_channel_ids = [len(va_franka_cfg.used_action_channel_ids)
                                   ] * va_franka_cfg.action_dim
for i, j in enumerate(va_franka_cfg.used_action_channel_ids):
    inverse_used_action_channel_ids[j] = i
va_franka_cfg.inverse_used_action_channel_ids = inverse_used_action_channel_ids

# Per-channel 1%/99% quantiles used for action normalization.
va_franka_cfg.action_norm_method = 'quantiles'
va_franka_cfg.norm_stat = {
    "q01": [
        0.3051295876502991, -0.22647984325885773, 0.19957000017166138,
        -0.022680532187223434, -0.05553057789802551, -0.2693849802017212,
        -0.29341773986816405, 0.2935442328453064, -0.4431332051753998,
        0.21256473660469055, -0.7962440848350525, -0.40816226601600647,
        -0.28359392285346985, -0.44507765769958496
    ] + [0.] * 16,
    "q99": [
        0.7572150230407715, 0.47736290097236633, 0.6428080797195435,
        0.9835678935050964, 0.9927203059196472, 0.28041139245033264,
        0.47529348731040877, 0.7564866304397571, 0.04082797020673729,
        0.5355993628501885, 0.9976375699043274, 0.8973174452781656,
        0.6016915678977965, 0.5027598619461056
    ] + [0.] * 14 + [1.0, 1.0],
}
wan_va/configs/va_franka_i2va.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+ from .va_franka_cfg import va_franka_cfg
4
+
5
# Image-to-video-action (i2va) variant of the franka config: runs offline
# from example images instead of serving a live environment.
va_franka_i2va_cfg = EasyDict(__name__='Config: VA franka i2va')
va_franka_i2va_cfg.update(va_franka_cfg)

# Directory holding the three example camera frames.
va_franka_i2va_cfg.input_img_path = 'example/franka'
va_franka_i2va_cfg.num_chunks_to_infer = 10
va_franka_i2va_cfg.prompt = 'pick bunk'
va_franka_i2va_cfg.infer_mode = 'i2va'
wan_va/configs/va_robotwin_cfg.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+
4
+ from .shared_config import va_shared_cfg
5
+
6
# RoboTwin VA config: copies the shared defaults, then overrides below.
va_robotwin_cfg = EasyDict(__name__='Config: VA robotwin')
va_robotwin_cfg.update(va_shared_cfg)

# NOTE(review): user-specific absolute path baked into the repo — replace
# with your local snapshot of robbyant/lingbot-va-posttrain-robotwin.
va_robotwin_cfg.wan22_pretrained_model_name_or_path = "/group/ossdphi_algo_scratch_11/weicxu/huggingface_cache/hub/models--robbyant--lingbot-va-posttrain-robotwin/snapshots/ef7242af28caff0af2ff8e947c78806f94719a39"

va_robotwin_cfg.attn_window = 72
va_robotwin_cfg.frame_chunk_size = 2
va_robotwin_cfg.env_type = 'robotwin_tshape'

# Rendered frame size and action-head layout.
va_robotwin_cfg.height = 256
va_robotwin_cfg.width = 320
va_robotwin_cfg.action_dim = 30
va_robotwin_cfg.action_per_frame = 16
va_robotwin_cfg.obs_cam_keys = [
    'observation.images.cam_high', 'observation.images.cam_left_wrist',
    'observation.images.cam_right_wrist'
]
va_robotwin_cfg.guidance_scale = 5
va_robotwin_cfg.action_guidance_scale = 1

# Diffusion sampling steps for video and action heads.
va_robotwin_cfg.num_inference_steps = 25
va_robotwin_cfg.video_exec_step = -1
va_robotwin_cfg.action_num_inference_steps = 50

va_robotwin_cfg.snr_shift = 5.0
va_robotwin_cfg.action_snr_shift = 1.0

# Permutation selecting/reordering action channels for the model;
# inverse_used_action_channel_ids is the inverse mapping, with unused
# channels left at the out-of-range sentinel index len(used_ids).
va_robotwin_cfg.used_action_channel_ids = list(range(0, 7)) + list(
    range(28, 29)) + list(range(7, 14)) + list(range(29, 30))
inverse_used_action_channel_ids = [
    len(va_robotwin_cfg.used_action_channel_ids)
] * va_robotwin_cfg.action_dim
for i, j in enumerate(va_robotwin_cfg.used_action_channel_ids):
    inverse_used_action_channel_ids[j] = i
va_robotwin_cfg.inverse_used_action_channel_ids = inverse_used_action_channel_ids

# Per-channel 1%/99% quantiles used for action normalization.
va_robotwin_cfg.action_norm_method = 'quantiles'
va_robotwin_cfg.norm_stat = {
    "q01": [
        -0.06172713458538055, -3.6716461181640625e-05, -0.08783501386642456,
        -1, -1, -1, -1, -0.3547105032205582, -1.3113021850585938e-06,
        -0.11975435614585876, -1, -1, -1, -1
    ] + [0.] * 16,
    "q99": [
        0.3462600058317184, 0.39966784834861746, 0.14745532035827624, 1, 1, 1,
        1, 0.034201726913452024, 0.39142737388610793, 0.1792279863357542, 1, 1,
        1, 1
    ] + [0.] * 14 + [1.0, 1.0],
}
wan_va/configs/va_robotwin_i2va.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ from easydict import EasyDict
3
+ from .va_robotwin_cfg import va_robotwin_cfg
4
+
5
# Image-to-video-action (i2va) variant of the robotwin config: runs offline
# from example images instead of serving a live environment.
va_robotwin_i2va_cfg = EasyDict(__name__='Config: VA robotwin i2va')
va_robotwin_i2va_cfg.update(va_robotwin_cfg)

# Directory holding the three example camera frames.
va_robotwin_i2va_cfg.input_img_path = 'example/robotwin'
va_robotwin_i2va_cfg.num_chunks_to_infer = 10
va_robotwin_i2va_cfg.prompt = 'Grab the medium-sized white mug, rotate it, place it on the table, and hook it onto the smooth dark gray rack.'
va_robotwin_i2va_cfg.infer_mode = 'i2va'
wan_va/distributed/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
wan_va/distributed/fsdp.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import gc
3
+ from functools import partial
4
+
5
+ import torch
6
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
7
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
8
+ from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
9
+ from torch.distributed.utils import _free_storage
10
+
11
+
12
def shard_model(model,
                device_id,
                param_dtype=torch.bfloat16,
                reduce_dtype=torch.float32,
                buffer_dtype=torch.float32,
                process_group=None,
                sharding_strategy=ShardingStrategy.FULL_SHARD,
                sync_module_states=True,
                use_lora=False):
    """Wrap ``model`` in FSDP, sharding each transformer block individually.

    Args:
        model: module exposing a ``blocks`` container of transformer layers.
        device_id: CUDA device this rank shards onto.
        param_dtype / reduce_dtype / buffer_dtype: FSDP mixed-precision policy.
        process_group: optional process group (defaults to the world group).
        sharding_strategy: FSDP sharding strategy.
        sync_module_states: broadcast rank-0 weights at wrap time.
        use_lora: enable ``use_orig_params`` so adapter parameters keep their
            original views under FSDP.

    Returns:
        The FSDP-wrapped model.
    """
    model = FSDP(module=model,
                 process_group=process_group,
                 sharding_strategy=sharding_strategy,
                 # One FSDP unit per transformer block, nothing else.
                 auto_wrap_policy=partial(
                     lambda_auto_wrap_policy,
                     lambda_fn=lambda m: m in model.blocks),
                 mixed_precision=MixedPrecision(param_dtype=param_dtype,
                                                reduce_dtype=reduce_dtype,
                                                buffer_dtype=buffer_dtype),
                 device_id=device_id,
                 sync_module_states=sync_module_states,
                 # Idiom fix: was ``True if use_lora else False``.
                 use_orig_params=bool(use_lora))
    return model
34
+
35
+
36
def free_model(model):
    """Release the flat-parameter storage of every FSDP submodule of ``model``.

    Relies on the private ``FSDP._handle`` and ``_free_storage`` internals,
    so it is coupled to a specific torch version; revisit on upgrades.
    """
    for m in model.modules():
        if isinstance(m, FSDP):
            _free_storage(m._handle.flat_param.data)
    del model
    gc.collect()
    torch.cuda.empty_cache()
wan_va/distributed/util.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
+
6
+ def _configure_model(model, shard_fn, param_dtype, device):
7
+ """
8
+ TODO
9
+ """
10
+ model.eval().requires_grad_(False)
11
+ if dist.is_initialized():
12
+ dist.barrier()
13
+
14
+ if dist.is_initialized():
15
+ model = shard_fn(model)
16
+ else:
17
+ model.to(param_dtype)
18
+ model.to(device)
19
+
20
+ return model
21
+
22
+
23
def init_distributed(world_size, local_rank, rank):
    """Bind this process to its local GPU and join the NCCL process group.

    Single-process runs (``world_size`` of 1 or less) only select the
    device and skip process-group creation.
    """
    torch.cuda.set_device(local_rank)
    if world_size <= 1:
        return
    dist.init_process_group(backend="nccl",
                            init_method="env://",
                            rank=rank,
                            world_size=world_size)
wan_va/modules/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
# ``WanVAEStreamingWrapper`` was listed in ``__all__`` without being imported,
# which made ``from wan_va.modules import *`` raise AttributeError. Import it
# alongside the loaders so the declared public API is actually resolvable.
from .utils import (
    WanVAEStreamingWrapper,
    load_text_encoder,
    load_tokenizer,
    load_transformer,
    load_vae,
)

__all__ = [
    'load_transformer', 'load_text_encoder', 'load_tokenizer', 'load_vae',
    'WanVAEStreamingWrapper'
]
wan_va/modules/model.py ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ import math
3
+ from copy import deepcopy
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.models.attention import FeedForward
10
+ from diffusers.models.embeddings import (
11
+ PixArtAlphaTextProjection,
12
+ TimestepEmbedding,
13
+ Timesteps,
14
+ )
15
+ from diffusers.models.modeling_utils import ModelMixin
16
+ from diffusers.models.normalization import FP32LayerNorm
17
+ from einops import rearrange
18
+
19
+ try:
20
+ from flash_attn_interface import flash_attn_func
21
+ except:
22
+ from flash_attn import flash_attn_func
23
+
24
+ __all__ = ['WanTransformer3DModel']
25
+
26
+
27
def custom_sdpa(q, k, v):
    """Scaled dot-product attention for (B, L, H, D) layout tensors.

    ``F.scaled_dot_product_attention`` expects (B, H, L, D), so the head
    and sequence axes are swapped going in and swapped back coming out.
    """
    q_bhld, k_bhld, v_bhld = (t.transpose(1, 2) for t in (q, k, v))
    attn = F.scaled_dot_product_attention(q_bhld, k_bhld, v_bhld)
    return attn.transpose(1, 2)
31
+
32
+
33
class WanTimeTextImageEmbedding(nn.Module):
    """Per-token timestep embedding plus text-feature projection.

    Despite the name (inherited from the upstream Wan codebase), only the
    timestep and text branches are present here; there is no image branch.
    ``text_embedder`` is invoked directly by the transformer's ``forward``,
    not by this module's own ``forward``.
    """

    def __init__(
        self,
        dim,
        time_freq_dim,
        time_proj_dim,
        text_embed_dim,
        pos_embed_seq_len,
    ):
        # NOTE(review): ``pos_embed_seq_len`` is accepted but never used in
        # this implementation — presumably kept for upstream interface
        # compatibility; confirm before removing.
        super().__init__()

        self.timesteps_proj = Timesteps(num_channels=time_freq_dim,
                                        flip_sin_to_cos=True,
                                        downscale_freq_shift=0)
        self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim,
                                               time_embed_dim=dim)
        self.act_fn = nn.SiLU()
        self.time_proj = nn.Linear(dim, time_proj_dim)
        self.text_embedder = PixArtAlphaTextProjection(text_embed_dim,
                                                       dim,
                                                       act_fn="gelu_tanh")

    def forward(
        self,
        timestep: torch.Tensor,
        dtype=None,
    ):
        """Embed per-token timesteps.

        Args:
            timestep: (B, L) tensor of diffusion timesteps, one per token;
                flattened so each token gets its own sinusoidal embedding.
            dtype: Optional dtype to cast the time embedding to.

        Returns:
            Tuple ``(temb, timestep_proj)``, shaped (B, L, dim) and
            (B, L, time_proj_dim) respectively.
        """
        B, L = timestep.shape
        timestep = timestep.reshape(-1)
        timestep = self.timesteps_proj(timestep)
        # time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
        # Peek at the first linear's weight dtype instead of iterating all
        # parameters.
        time_embedder_dtype = self.time_embedder.linear_1.weight.dtype
        # Match the embedder's dtype, except for int8 (quantized) weights.
        if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
            timestep = timestep.to(time_embedder_dtype)
        temb = self.time_embedder(timestep).to(dtype=dtype)
        timestep_proj = self.time_proj(self.act_fn(temb))
        return temb.reshape(B, L, -1), timestep_proj.reshape(B, L, -1)
71
+
72
+
73
class WanRotaryPosEmbed(nn.Module):
    """3D rotary position embedding over (frame, height, width) axes.

    The head dimension is split into three sub-bands — one per axis — and a
    standard RoPE inverse-frequency table is precomputed for each band. The
    forward pass turns integer grid coordinates into complex rotation
    factors (unit-magnitude phasors) to be multiplied into Q/K.
    """

    def __init__(
        self,
        attention_head_dim,
        patch_size,
        max_seq_len,
        theta=10000.0,
    ):
        super().__init__()

        self.attention_head_dim = attention_head_dim
        self.patch_size = patch_size
        self.max_seq_len = max_seq_len
        self.theta = theta

        # Height and width each take dim // 3; the frame axis absorbs the
        # remainder so the three bands always sum to attention_head_dim.
        axis_dim = attention_head_dim // 3
        self.f_dim = attention_head_dim - 2 * axis_dim
        self.h_dim = axis_dim
        self.w_dim = axis_dim

        # Precompute once and keep out of the state dict.
        f_freqs_base, h_freqs_base, w_freqs_base = self._precompute_freqs_base()
        self.register_buffer("f_freqs_base", f_freqs_base, persistent=False)
        self.register_buffer("h_freqs_base", h_freqs_base, persistent=False)
        self.register_buffer("w_freqs_base", w_freqs_base, persistent=False)

    def _precompute_freqs_base(self):
        """Return inverse frequencies 1 / theta**(2k / dim) for each axis."""

        def inv_freq(dim):
            exponents = torch.arange(0, dim, 2)[:dim // 2].double() / dim
            return 1.0 / (self.theta ** exponents)

        return inv_freq(self.f_dim), inv_freq(self.h_dim), inv_freq(self.w_dim)

    def forward(self, grid_ids):
        """Map (B, 3, L) grid coordinates to complex rotary factors.

        Row 0/1/2 of ``grid_ids`` holds frame/height/width coordinates.
        Returns a complex tensor of shape (B, L, head_dim // 2).
        """
        with torch.no_grad():
            per_axis = (
                grid_ids[:, 0, :].unsqueeze(-1) * self.f_freqs_base,
                grid_ids[:, 1, :].unsqueeze(-1) * self.h_freqs_base,
                grid_ids[:, 2, :].unsqueeze(-1) * self.w_freqs_base,
            )
            angles = torch.cat(per_axis, dim=-1).float()
            # Unit-magnitude phasors e^{i * angle}.
            freqs_cis = torch.polar(torch.ones_like(angles), angles)

        return freqs_cis
120
+
121
+
122
class WanAttention(torch.nn.Module):
    """Multi-head attention with an optional slot-based KV cache.

    When ``cross_attention_dim_head`` is None the module is self-attention
    and owns ``attn_caches``: a dict of named caches, each a fixed pool of
    key/value slots with per-slot insertion-order ids, an occupancy mask,
    and an "is_pred" flag. Oldest slots (lowest id) are evicted first when
    the pool is full. Cross-attention instances have ``attn_caches = None``
    and all cache methods become no-ops.
    """

    def __init__(
        self,
        dim,
        heads=8,
        dim_head=64,
        eps=1e-5,
        dropout=0.0,
        cross_attention_dim_head=None,
        attn_mode='torch',
    ):
        super().__init__()
        # Select the attention kernel: PyTorch SDPA or FlashAttention.
        if attn_mode == 'torch':
            self.attn_op = custom_sdpa
        elif attn_mode == 'flashattn':
            self.attn_op = flash_attn_func
        else:
            raise ValueError(
                f"Unsupported attention mode: {attn_mode}, only support torch and flashattn"
            )

        self.inner_dim = dim_head * heads
        self.heads = heads
        self.cross_attention_dim_head = cross_attention_dim_head
        self.kv_inner_dim = self.inner_dim if cross_attention_dim_head is None else cross_attention_dim_head * heads

        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
        self.to_k = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
        self.to_v = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
        self.to_out = torch.nn.ModuleList([
            torch.nn.Linear(self.inner_dim, dim, bias=True),
            torch.nn.Dropout(dropout),
        ])
        self.norm_q = torch.nn.RMSNorm(dim_head * heads,
                                       eps=eps,
                                       elementwise_affine=True)
        self.norm_k = torch.nn.RMSNorm(dim_head * heads,
                                       eps=eps,
                                       elementwise_affine=True)
        # Self-attention keeps named KV caches; cross-attention keeps none.
        self.attn_caches = {} if cross_attention_dim_head is None else None

    def clear_pred_cache(self, cache_name):
        """Invalidate only the slots that were written as predictions."""
        if self.attn_caches is None:
            return
        cache = self.attn_caches[cache_name]
        is_pred = cache['is_pred']
        cache['mask'][is_pred] = False

    def clear_cache(self, cache_name):
        """Drop the whole named cache (it must be re-initialized before use)."""
        if self.attn_caches is None:
            return
        self.attn_caches[cache_name] = None

    def init_kv_cache(self, cache_name, total_tolen, num_head, head_dim,
                      device, dtype, batch_size):
        """Allocate an empty slot pool of ``total_tolen`` KV positions.

        NOTE(review): ``total_tolen`` looks like a typo for ``total_token``;
        kept as-is since callers pass it positionally.
        """
        if self.attn_caches is None:
            return
        self.attn_caches[cache_name] = {
            # Key/value pools, one slot per token position.
            'k':
            torch.empty([batch_size, total_tolen, num_head, head_dim],
                        device=device,
                        dtype=dtype),
            'v':
            torch.empty([batch_size, total_tolen, num_head, head_dim],
                        device=device,
                        dtype=dtype),
            # Monotonic insertion id per slot; -1 means unused.
            'id':
            torch.full((total_tolen, ), -1, device=device),
            # Occupancy mask: True where the slot holds a live entry.
            "mask":
            torch.zeros((total_tolen, ), dtype=torch.bool, device=device),
            # Marks slots written during prediction (see clear_pred_cache).
            "is_pred":
            torch.zeros((total_tolen, ), dtype=torch.bool, device=device),
        }

    def allocate_slots(self, cache_name, key_size):
        """Return ``key_size`` free slot indices, evicting oldest entries if needed."""
        cache = self.attn_caches[cache_name]
        mask = cache["mask"]
        ids = cache["id"]
        free = (~mask).nonzero(as_tuple=False).squeeze(-1)

        if free.numel() < key_size:
            used = mask.nonzero(as_tuple=False).squeeze(-1)

            # Evict the entries with the smallest (oldest) insertion ids.
            used_ids = ids[used]
            order = torch.argsort(used_ids)
            need = key_size - free.numel()
            to_free = used[order[:need]]

            mask[to_free] = False
            ids[to_free] = -1
            free = (~mask).nonzero(as_tuple=False).squeeze(-1)

        assert free.numel() >= key_size
        return free[:key_size]

    def _next_cache_id(self, cache_name):
        """Next insertion id: one past the largest live id, or 0 when empty."""
        ids = self.attn_caches[cache_name]['id']
        mask = self.attn_caches[cache_name]['mask']

        if mask.any():
            return ids[mask].max() + 1
        else:
            return torch.tensor(0, device=ids.device, dtype=ids.dtype)

    def update_cache(self, cache_name, key, value, is_pred):
        """Write ``key``/``value`` (B, L, H, D) into freshly allocated slots.

        All slots of this write share a single insertion id, so they age
        (and are evicted) together. Returns the slot indices used.
        """
        cache = self.attn_caches[cache_name]

        key_size = key.shape[1]
        slots = self.allocate_slots(cache_name, key_size)

        new_id = self._next_cache_id(cache_name)

        cache['k'][:, slots] = key
        cache['v'][:, slots] = value
        cache['mask'][slots] = True
        cache['id'][slots] = new_id
        cache['is_pred'][slots] = is_pred
        return slots

    def restore_cache(self, cache_name, slots):
        """Undo a speculative write by marking its slots free again."""
        self.attn_caches[cache_name]['mask'][slots] = False

    def forward(
        self,
        q,
        k,
        v,
        rotary_emb,
        update_cache=0,
        cache_name='pos',
    ):
        """Attend ``q`` over ``k``/``v`` (B, L, C), optionally through the KV cache.

        ``update_cache``: 0 = transient (the write is rolled back after
        attending), 1 = persist as prediction, other values persist as
        non-prediction. ``rotary_emb`` is applied to Q and K when given.
        """
        # None when this is cross-attention or the cache was cleared.
        kv_cache = self.attn_caches[
            cache_name] if self.attn_caches is not None else None

        query, key, value = self.to_q(q), self.to_k(k), self.to_v(v)
        query = self.norm_q(query)
        query = query.unflatten(2, (self.heads, -1))
        key = self.norm_k(key)
        key = key.unflatten(2, (self.heads, -1))
        value = value.unflatten(2, (self.heads, -1))
        if rotary_emb is not None:

            def apply_rotary_emb(x, freqs):
                # Pair up the last dim as (real, imag), rotate by the complex
                # phasors in float64, then flatten back to the input dtype.
                x_out = torch.view_as_complex(
                    x.to(torch.float64).reshape(x.shape[0], x.shape[1],
                                                x.shape[2], -1, 2))
                x_out = torch.view_as_real(x_out * freqs).flatten(3)
                return x_out.to(x.dtype)
            query = apply_rotary_emb(query, rotary_emb)
            key = apply_rotary_emb(key, rotary_emb)
        slots = None
        if kv_cache is not None and kv_cache['k'] is not None:
            # Insert the new K/V, then attend over every live slot in the pool.
            slots = self.update_cache(cache_name,
                                      key,
                                      value,
                                      is_pred=(update_cache == 1))
            key_pool = self.attn_caches[cache_name]['k']
            value_pool = self.attn_caches[cache_name]['v']
            mask = self.attn_caches[cache_name]['mask']
            valid = mask.nonzero(as_tuple=False).squeeze(-1)
            key = key_pool[:, valid]
            value = value_pool[:, valid]

        hidden_states = self.attn_op(query, key, value)

        if update_cache == 0:
            # Transient pass: roll back this call's cache write.
            if kv_cache is not None and kv_cache['k'] is not None:
                self.restore_cache(cache_name, slots)

        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.type_as(query)
        hidden_states = self.to_out[0](hidden_states)
        hidden_states = self.to_out[1](hidden_states)
        return hidden_states
297
+
298
+
299
class WanTransformerBlock(nn.Module):
    """One DiT-style block: self-attn → cross-attn → FFN, with AdaLN modulation.

    The six shift/scale/gate modulation vectors come from a learned table
    plus the per-token conditioning ``temb``; self-attention and the FFN are
    modulated, while cross-attention only gets an (optional) LayerNorm.
    """

    def __init__(
        self,
        dim,
        ffn_dim,
        num_heads,
        cross_attn_norm=False,
        eps=1e-6,
        attn_mode: str = "flashattn",
    ):
        super().__init__()
        self.attn_mode = attn_mode

        # 1. Self-attention (norm has no affine params: AdaLN supplies them).
        self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
        self.attn1 = WanAttention(
            dim=dim,
            heads=num_heads,
            dim_head=dim // num_heads,
            eps=eps,
            cross_attention_dim_head=None,
            attn_mode=attn_mode,
        )

        # 2. Cross-attention over the text conditioning.
        self.attn2 = WanAttention(
            dim=dim,
            heads=num_heads,
            dim_head=dim // num_heads,
            eps=eps,
            cross_attention_dim_head=dim // num_heads,
            attn_mode=attn_mode,
        )
        self.norm2 = FP32LayerNorm(
            dim, eps,
            elementwise_affine=True) if cross_attn_norm else nn.Identity()

        # 3. Feed-forward
        self.ffn = FeedForward(dim,
                               inner_dim=ffn_dim,
                               activation_fn="gelu-approximate")
        self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)

        # Learned base for the 6 AdaLN modulation vectors.
        self.scale_shift_table = nn.Parameter(
            torch.randn(1, 6, dim) / dim**0.5)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states,
        temb,
        rotary_emb,
        update_cache=0,
        cache_name='pos',
    ) -> torch.Tensor:
        """Run the block.

        Args:
            hidden_states: (B, L, C) token features.
            encoder_hidden_states: text conditioning for cross-attention.
            temb: per-token modulation input, broadcast-added to the table.
            rotary_emb: rotary factors for self-attention (None disables).
            update_cache / cache_name: forwarded to the self-attention KV cache.
        """
        # Split the modulated table into the 6 per-token AdaLN vectors.
        temb_scale_shift_table = self.scale_shift_table[None] + temb.float()
        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = \
            rearrange(temb_scale_shift_table, 'b l n c -> b n l c').chunk(6, dim=1)
        shift_msa = shift_msa.squeeze(1)
        scale_msa = scale_msa.squeeze(1)
        gate_msa = gate_msa.squeeze(1)
        c_shift_msa = c_shift_msa.squeeze(1)
        c_scale_msa = c_scale_msa.squeeze(1)
        c_gate_msa = c_gate_msa.squeeze(1)

        # 1. Self-attention (modulated, gated residual; math in float32).
        norm_hidden_states = (self.norm1(hidden_states.float()) *
                              (1. + scale_msa) +
                              shift_msa).type_as(hidden_states)
        attn_output = self.attn1(norm_hidden_states,
                                 norm_hidden_states,
                                 norm_hidden_states,
                                 rotary_emb,
                                 update_cache=update_cache,
                                 cache_name=cache_name)
        hidden_states = (hidden_states.float() +
                         attn_output * gate_msa).type_as(hidden_states)

        # 2. Cross-attention (plain residual; never cached, no rotary).
        norm_hidden_states = self.norm2(
            hidden_states.float()).type_as(hidden_states)
        attn_output = self.attn2(norm_hidden_states,
                                 encoder_hidden_states,
                                 encoder_hidden_states,
                                 None,
                                 update_cache=0,
                                 cache_name=cache_name)
        hidden_states = hidden_states + attn_output

        # 3. Feed-forward (modulated, gated residual).
        norm_hidden_states = (self.norm3(hidden_states.float()) *
                              (1. + c_scale_msa) +
                              c_shift_msa).type_as(hidden_states)

        ff_output = self.ffn(norm_hidden_states)

        hidden_states = (hidden_states.float() +
                         ff_output.float() * c_gate_msa).type_as(hidden_states)
        return hidden_states
399
+
400
+
401
class WanTransformer3DModel(ModelMixin, ConfigMixin):
    r"""Wan video-action diffusion transformer.

    A stack of :class:`WanTransformerBlock` layers shared between two input
    modes: video latents (patchified via an MLP) and action vectors
    (embedded via a linear layer). Each mode has its own timestep/text
    conditioning embedder and its own output projection; which path runs is
    selected by ``action_mode`` at forward time.
    """

    @register_to_config
    def __init__(self,
                 patch_size=[1, 2, 2],
                 num_attention_heads=24,
                 attention_head_dim=128,
                 in_channels=48,
                 out_channels=48,
                 action_dim=30,
                 text_dim=4096,
                 freq_dim=256,
                 ffn_dim=14336,
                 num_layers=30,
                 cross_attn_norm=True,
                 eps=1e-06,
                 rope_max_seq_len=1024,
                 pos_embed_seq_len=None,
                 attn_mode="torch"):
        r"""Build the transformer.

        Args:
            patch_size: (f, h, w) patchification factors for video latents.
            num_attention_heads / attention_head_dim: attention geometry;
                the model width is their product.
            in_channels / out_channels: latent channels in and out.
            action_dim: dimensionality of one action vector.
            text_dim: width of incoming text embeddings (e.g. T5 features).
            freq_dim: sinusoidal timestep embedding width.
            ffn_dim: feed-forward hidden width.
            num_layers: number of transformer blocks.
            cross_attn_norm: apply an affine LayerNorm before cross-attention.
            rope_max_seq_len: maximum sequence length for rotary embeddings.
            pos_embed_seq_len: passed through to the condition embedder
                (unused there; see WanTimeTextImageEmbedding).
            attn_mode: 'torch' (SDPA) or 'flashattn'.
        """
        super().__init__()
        self.patch_size = patch_size
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim
        inner_dim = num_attention_heads * attention_head_dim
        self.rope = WanRotaryPosEmbed(attention_head_dim, patch_size,
                                      rope_max_seq_len)
        # Video latents: one patch worth of channels -> model width.
        self.patch_embedding_mlp = nn.Linear(
            in_channels * patch_size[0] * patch_size[1] * patch_size[2],
            inner_dim)
        # Action vectors: action_dim -> model width.
        self.action_embedder = nn.Linear(action_dim, inner_dim)
        self.condition_embedder = WanTimeTextImageEmbedding(
            dim=inner_dim,
            time_freq_dim=freq_dim,
            time_proj_dim=inner_dim * 6,
            text_embed_dim=text_dim,
            pos_embed_seq_len=pos_embed_seq_len,
        )
        # Separate (identically initialized) conditioning path for actions.
        self.condition_embedder_action = deepcopy(self.condition_embedder)

        self.blocks = nn.ModuleList([
            WanTransformerBlock(inner_dim,
                                ffn_dim,
                                num_attention_heads,
                                cross_attn_norm,
                                eps,
                                attn_mode=attn_mode) for _ in range(num_layers)
        ])

        self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
        self.proj_out = nn.Linear(inner_dim,
                                  out_channels * math.prod(patch_size))
        self.action_proj_out = nn.Linear(inner_dim, action_dim)
        # Learned base for the final 2-vector (shift, scale) modulation.
        self.scale_shift_table = nn.Parameter(
            torch.randn(1, 2, inner_dim) / inner_dim**0.5)

    def clear_cache(self, cache_name):
        """Drop the named KV cache in every block's self-attention."""
        for block in self.blocks:
            block.attn1.clear_cache(cache_name)

    def clear_pred_cache(self, cache_name):
        """Invalidate only prediction-written KV entries in every block."""
        for block in self.blocks:
            block.attn1.clear_pred_cache(cache_name)

    def create_empty_cache(self, cache_name, attn_window,
                           latent_token_per_chunk, action_token_per_chunk,
                           device, dtype, batch_size):
        """Allocate per-block KV caches sized for an ``attn_window`` of chunks.

        Half the window budget goes to latent tokens and half to action
        tokens. NOTE(review): ``total_tolen`` looks like a typo for
        ``total_token``; kept to match WanAttention.init_kv_cache.
        """
        total_tolen = (attn_window // 2) * latent_token_per_chunk + (
            attn_window // 2) * action_token_per_chunk
        for block in self.blocks:
            block.attn1.init_kv_cache(cache_name, total_tolen,
                                      self.num_attention_heads,
                                      self.attention_head_dim, device, dtype, batch_size)

    def forward(
        self,
        input_dict,
        update_cache=0,
        cache_name="pos",
        action_mode=False,
    ):
        r"""Denoise one chunk of video latents or actions.

        Args:
            input_dict: dict with
                ``noisy_latents`` — (B, C, F, H, W) noisy video latents, or
                    action input when ``action_mode`` (C then plays the role
                    of ``action_dim``);
                ``timesteps`` — per-frame diffusion timesteps, expanded here
                    to one value per token;
                ``text_emb`` — text features of width ``text_dim``;
                ``grid_id`` — (B, 3, L) rotary grid coordinates.
            update_cache: forwarded to self-attention KV caching
                (0 = transient, 1 = persist as prediction).
            cache_name: which named KV cache to use.
            action_mode: select the action path (action embedder/conditioner
                and ``action_proj_out``) instead of the video-latent path.

        Returns:
            Tensor: (B, L * prod(patch_size), C_out) unpatchified latent
            prediction, or (B, L, action_dim) in action mode.
        """
        if action_mode:  # action input emb
            latent_hidden_states = rearrange(input_dict['noisy_latents'],
                                             'b c f h w -> b (f h w) c')
            latent_hidden_states = self.action_embedder(
                latent_hidden_states)  # B L1 C
        else:  # latent input emb
            latent_hidden_states = rearrange(
                input_dict['noisy_latents'],
                'b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)',
                p1=self.patch_size[0],
                p2=self.patch_size[1],
                p3=self.patch_size[2])
            latent_hidden_states = self.patch_embedding_mlp(
                latent_hidden_states)
        text_hidden_states = self.condition_embedder.text_embedder(
            input_dict["text_emb"])  # B L2 C

        latent_grid_id = input_dict['grid_id']
        rotary_emb = self.rope(latent_grid_id)[:, :, None]  # 1 L 1 C
        # Actions are not spatially patchified, so no h/w downscaling there.
        pach_scale_h, pach_scale_w = (1, 1) if action_mode else (
            self.patch_size[1], self.patch_size[2])

        # Expand per-frame timesteps to one timestep per token.
        latent_time_steps = torch.repeat_interleave(
            input_dict['timesteps'],
            (input_dict['noisy_latents'].shape[-2] // pach_scale_h) *
            (input_dict['noisy_latents'].shape[-1] // pach_scale_w), dim=1)  # L
        current_condition_embedder = self.condition_embedder_action if action_mode else self.condition_embedder
        temb, timestep_proj = current_condition_embedder(
            latent_time_steps, dtype=latent_hidden_states.dtype)
        timestep_proj = timestep_proj.unflatten(2, (6, -1))  # B L 6 C

        for block in self.blocks:
            latent_hidden_states = block(latent_hidden_states,
                                         text_hidden_states,
                                         timestep_proj,
                                         rotary_emb,
                                         update_cache=update_cache,
                                         cache_name=cache_name)
        # Final AdaLN-style (shift, scale) modulation before projection.
        temb_scale_shift_table = self.scale_shift_table[None] + temb[:, :, None, ...]
        shift, scale = rearrange(temb_scale_shift_table,
                                 'b l n c -> b n l c').chunk(2, dim=1)
        shift = shift.to(latent_hidden_states.device).squeeze(1)
        scale = scale.to(latent_hidden_states.device).squeeze(1)
        latent_hidden_states = (self.norm_out(latent_hidden_states.float()) *
                                (1. + scale) +
                                shift).type_as(latent_hidden_states)

        if action_mode:
            latent_hidden_states = self.action_proj_out(latent_hidden_states)
        else:
            latent_hidden_states = self.proj_out(latent_hidden_states)
            # Unfold each token's prediction back into prod(patch_size) tokens.
            latent_hidden_states = rearrange(latent_hidden_states,
                                             'b l (n c) -> b (l n) c',
                                             n=math.prod(self.patch_size))  #

        return latent_hidden_states
562
+
563
+
564
if __name__ == '__main__':
    # Smoke test: build the transformer at its default scale and print the
    # module tree (no forward pass, no weights loaded).
    model = WanTransformer3DModel(patch_size=[1, 2, 2],
                                  num_attention_heads=24,
                                  attention_head_dim=128,
                                  in_channels=48,
                                  out_channels=48,
                                  action_dim=30,
                                  text_dim=4096,
                                  freq_dim=256,
                                  ffn_dim=14336,
                                  num_layers=30,
                                  cross_attn_norm=True,
                                  eps=1e-6,
                                  rope_max_seq_len=1024,
                                  pos_embed_seq_len=None,
                                  attn_mode="torch")
    print(model)
wan_va/modules/utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024-2025 The Robbyant Team Authors. All rights reserved.
2
+ import torch
3
+ from diffusers import AutoencoderKLWan
4
+ from transformers import (
5
+ T5TokenizerFast,
6
+ UMT5EncoderModel,
7
+ )
8
+
9
+ from .model import WanTransformer3DModel
10
+
11
+
12
def load_vae(
    vae_path,
    torch_dtype,
    torch_device,
):
    """Load the Wan VAE checkpoint and move it onto ``torch_device``."""
    model = AutoencoderKLWan.from_pretrained(vae_path, torch_dtype=torch_dtype)
    return model.to(torch_device)
22
+
23
+
24
def load_text_encoder(
    text_encoder_path,
    torch_dtype,
    torch_device,
):
    """Load the UMT5 text encoder and move it onto ``torch_device``."""
    model = UMT5EncoderModel.from_pretrained(text_encoder_path,
                                             torch_dtype=torch_dtype)
    return model.to(torch_device)
34
+
35
+
36
def load_tokenizer(tokenizer_path):
    """Load the T5 fast tokenizer used for text conditioning."""
    return T5TokenizerFast.from_pretrained(tokenizer_path)
39
+
40
+
41
def load_transformer(
    transformer_path,
    torch_dtype,
    torch_device,
):
    """Load the WanTransformer3DModel weights and move them onto ``torch_device``."""
    transformer = WanTransformer3DModel.from_pretrained(
        transformer_path, torch_dtype=torch_dtype)
    return transformer.to(torch_device)
51
+
52
+
53
def patchify(x, patch_size):
    """Fold ``patch_size`` x ``patch_size`` spatial patches into channels.

    Maps (B, C, F, H, W) to (B, C*p*p, F, H/p, W/p); returns the input
    unchanged when ``patch_size`` is None or 1.
    """
    if patch_size in (None, 1):
        return x
    b, c, f, h, w = x.shape
    p = patch_size
    patched = x.view(b, c, f, h // p, p, w // p, p)
    # Bring the two intra-patch axes next to channels:
    # (b, c, p_w, p_h, f, h/p, w/p), then merge (c, p_w, p_h).
    patched = patched.permute(0, 1, 6, 4, 2, 3, 5).contiguous()
    return patched.view(b, c * p * p, f, h // p, w // p)
63
+
64
+
65
class WanVAEStreamingWrapper:
    """Chunk-by-chunk encoder wrapper around a Wan VAE.

    Keeps one feature-cache entry per causal conv in the encoder so that
    successive calls to :meth:`encode_chunk` see the temporal context of
    earlier chunks. Call :meth:`clear_cache` between independent videos.
    """

    def __init__(self, vae_model):
        # vae_model: a diffusers AutoencoderKLWan (or compatible) instance.
        self.vae = vae_model
        self.encoder = vae_model.encoder
        self.quant_conv = vae_model.quant_conv

        # One cache slot per WanCausalConv3d; prefer the VAE's own count
        # when it exposes one, otherwise count the modules by class name.
        if hasattr(self.vae, "_cached_conv_counts"):
            self.enc_conv_num = self.vae._cached_conv_counts["encoder"]
        else:
            count = 0
            for m in self.encoder.modules():
                if m.__class__.__name__ == "WanCausalConv3d":
                    count += 1
            self.enc_conv_num = count

        self.clear_cache()

    def clear_cache(self):
        """Reset the per-conv temporal feature caches (start of a new stream)."""
        self.feat_cache = [None] * self.enc_conv_num

    def encode_chunk(self, x_chunk):
        """Encode one chunk, reusing and updating the streaming caches.

        ``x_chunk`` is spatially patchified first when the VAE config asks
        for it. Returns the quantized encoder output (pre-sampling moments).
        """
        if hasattr(self.vae.config,
                   "patch_size") and self.vae.config.patch_size is not None:
            x_chunk = patchify(x_chunk, self.vae.config.patch_size)
        # feat_idx is a mutable cursor the encoder advances per causal conv.
        feat_idx = [0]
        out = self.encoder(x_chunk,
                           feat_cache=self.feat_cache,
                           feat_idx=feat_idx)
        enc = self.quant_conv(out)
        return enc
wan_va/utils/Simple_Remote_Infer/LEGAL.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Legal Disclaimer
2
+
3
+ Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail.
4
+
5
+ 法律免责声明
6
+
7
+ 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。
wan_va/utils/Simple_Remote_Infer/README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 通用的server-client
2
+
3
+ ## /Simple_Remote_Infer/deploy/qwenpi_policy.py
4
+
5
+ - QwenPiServer: 一个用于示范的模型,拥有init和infer方法
6
+
7
+ 将加载好的模型用`WebsocketPolicyServer`包裹,并指定端口即可
8
+
9
+ ```python
10
+ model_server = WebsocketPolicyServer(model, port=8002)
11
+ model_server.serve_forever() # 开启监听
12
+ ```
13
+
14
+ ## ./websocket_client_policy.py
15
+
16
+ 在`__main__()`中展示了如何创造一个假模型向真模型发送环境信息,只需要用`WebsocketClientPolicy`代替原有的模型即可
wan_va/utils/Simple_Remote_Infer/deploy/__init__.py ADDED
File without changes
wan_va/utils/Simple_Remote_Infer/deploy/image_tools.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from PIL import Image
3
+
4
+
5
def convert_to_uint8(img: np.ndarray) -> np.ndarray:
    """Return *img* as uint8, scaling float images by 255.

    Non-float arrays are returned untouched. Shrinking floats to uint8
    keeps images small when sent over the network.
    """
    if not np.issubdtype(img.dtype, np.floating):
        return img
    return (255 * img).astype(np.uint8)
13
+
14
+
15
def resize_with_pad(images: np.ndarray,
                    height: int,
                    width: int,
                    method=Image.BILINEAR) -> np.ndarray:
    """Resize a batch of [..., H, W, C] images to (height, width), zero-padded.

    Aspect ratio is preserved per image and the shortfall on each side is
    padded with zeros, mirroring tf.image.resize_with_pad.

    Args:
        images: Batch of images in [..., height, width, channel] layout.
        height: Target height.
        width: Target width.
        method: PIL resampling filter (bilinear by default).

    Returns:
        The resized batch in [..., height, width, channel].
    """
    # Fast path: spatial size already matches the target.
    if images.shape[-3:-1] == (height, width):
        return images

    leading_shape = images.shape[:-3]
    flat = images.reshape(-1, *images.shape[-3:])
    resized = np.stack([
        _resize_with_pad_pil(Image.fromarray(im), height, width, method=method)
        for im in flat
    ])
    return resized.reshape(*leading_shape, *resized.shape[-3:])
42
+
43
+
44
def _resize_with_pad_pil(image: Image.Image, height: int, width: int,
                         method: int) -> Image.Image:
    """Resize one PIL image to (width, height) preserving aspect, pad with zeros.

    Note PIL sizes are (width, height), unlike the numpy [h, w, c] layout
    used by the batched caller.
    """
    cur_width, cur_height = image.size
    if (cur_width, cur_height) == (width, height):
        # Already the target size; skip the resample entirely.
        return image

    # Shrink by the dominant ratio so both dims fit within the target box.
    scale = max(cur_width / width, cur_height / height)
    new_height = int(cur_height / scale)
    new_width = int(cur_width / scale)
    shrunk = image.resize((new_width, new_height), resample=method)

    # Center the shrunk image on a zero-filled canvas of the target size.
    canvas = Image.new(shrunk.mode, (width, height), 0)
    top = max(0, int((height - new_height) / 2))
    left = max(0, int((width - new_width) / 2))
    canvas.paste(shrunk, (left, top))
    assert canvas.size == (width, height)
    return canvas
+ return zero_image