Robotics
English
starVLA
vla
vision-language-action
qwen3-vl
flow-matching
pi-zero
manipulation
bridge
rt-1
oxe
Jinhuiye committed on
Commit
1982ed2
·
verified ·
1 Parent(s): c531a88

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +244 -0
  2. checkpoints/server_logs/steps_10000_pytorch_model_policy_server_6418.log +12 -0
  3. checkpoints/server_logs/steps_10000_pytorch_model_policy_server_6420.log +12 -0
  4. checkpoints/server_logs/steps_20000_pytorch_model_policy_server_6455.log +12 -0
  5. checkpoints/server_logs/steps_20000_pytorch_model_policy_server_6456.log +12 -0
  6. checkpoints/server_logs/steps_20000_pytorch_model_policy_server_6457.log +12 -0
  7. checkpoints/server_logs/steps_30000_pytorch_model_policy_server_6554.log +12 -0
  8. checkpoints/server_logs/steps_30000_pytorch_model_policy_server_6555.log +12 -0
  9. checkpoints/server_logs/steps_30000_pytorch_model_policy_server_6557.log +12 -0
  10. checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6450.log +12 -0
  11. checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6451.log +12 -0
  12. checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6453.log +12 -0
  13. checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6455.log +12 -0
  14. checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6457.log +12 -0
  15. checkpoints/server_logs/steps_50000_pytorch_model_policy_server_6555.log +12 -0
  16. checkpoints/server_logs/steps_50000_pytorch_model_policy_server_6557.log +12 -0
  17. checkpoints/server_logs/steps_60000_pytorch_model_policy_server_6418.log +12 -0
  18. checkpoints/server_logs/steps_60000_pytorch_model_policy_server_6420.log +12 -0
  19. checkpoints/steps_10000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 +0 -0
  20. checkpoints/steps_20000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 +0 -0
  21. checkpoints/steps_20000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 +0 -0
  22. checkpoints/steps_20000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 +0 -0
  23. checkpoints/steps_30000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 +0 -0
  24. checkpoints/steps_30000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 +0 -0
  25. checkpoints/steps_30000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 +0 -0
  26. checkpoints/steps_30000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 +0 -0
  27. checkpoints/steps_40000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 +0 -0
  28. checkpoints/steps_40000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 +0 -0
  29. checkpoints/steps_40000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 +0 -0
  30. checkpoints/steps_40000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run2 +0 -0
  31. checkpoints/steps_50000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 +0 -0
  32. checkpoints/steps_50000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 +0 -0
  33. checkpoints/steps_60000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 +0 -0
  34. checkpoints/steps_60000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 +0 -0
  35. config.full.yaml +125 -0
  36. config.yaml +72 -0
  37. dataset_statistics.json +264 -0
  38. run_oxe_train.sh +136 -0
  39. success_summary/raw_success.txt +29 -0
  40. success_summary/success_plot.png +0 -0
  41. success_summary/success_summary.csv +7 -0
  42. summary.jsonl +7 -0
  43. wandb/wandb/debug-internal.log +0 -0
  44. wandb/wandb/debug.log +0 -0
  45. wandb/wandb/run-20260426_011111-enstjn5q/files/output.log +322 -0
  46. wandb/wandb/run-20260426_011111-enstjn5q/files/requirements.txt +227 -0
  47. wandb/wandb/run-20260426_011111-enstjn5q/files/wandb-metadata.json +175 -0
  48. wandb/wandb/run-20260426_011111-enstjn5q/logs/debug-core.log +8 -0
  49. wandb/wandb/run-20260426_011111-enstjn5q/logs/debug-internal.log +0 -0
  50. wandb/wandb/run-20260426_011111-enstjn5q/logs/debug.log +0 -0
README.md ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: starVLA
4
+ pipeline_tag: robotics
5
+ tags:
6
+ - vla
7
+ - vision-language-action
8
+ - robotics
9
+ - qwen3-vl
10
+ - flow-matching
11
+ - pi-zero
12
+ - manipulation
13
+ - bridge
14
+ - rt-1
15
+ - oxe
16
+ datasets:
17
+ - IPEC-COMMUNITY/bridge_orig_lerobot
18
+ - IPEC-COMMUNITY/fractal20220817_data_lerobot
19
+ language:
20
+ - en
21
+ base_model:
22
+ - Qwen/Qwen3-VL-4B-Instruct
23
+ ---
24
+
25
+ # Qwen3VL-PI_v3-Bridge-RT-1
26
+
27
+ A **Vision-Language-Action (VLA)** model from the [StarVLA](https://github.com/starVLA/starVLA)
28
+ project, combining a **Qwen3-VL-4B-Instruct** backbone with a **layer-wise
29
+ cross-attention flow-matching action head** (`QwenPI_v3`). The model is
30
+ co-trained on the [Bridge V2](https://huggingface.co/datasets/IPEC-COMMUNITY/bridge_orig_lerobot)
31
+ and [RT-1 / Fractal](https://huggingface.co/datasets/IPEC-COMMUNITY/fractal20220817_data_lerobot)
32
+ slices of the Open X-Embodiment (OXE) collection, and is evaluated on the
33
+ **SimplerEnv WidowX** benchmark.
34
+
35
+ `QwenPI_v3` is StarVLA's open-weight realisation of the π₀.₅ recipe:
36
+
37
+ 1. **Layer-wise cross-DiT flow-matching action head** — every VLM layer's
38
+ hidden state participates in cross-attention with the action DiT, instead
39
+ of the DiT consuming only the last-layer feature.
40
+ 2. **Compressed Action DiT** — per-layer `LayerNorm + Linear` projectors
41
+ compress the 2560-d Qwen3-VL hidden states down to a 1024-d DiT latent,
42
+ shrinking the action-head footprint by ~6× while preserving the
43
+ layer-wise interaction.
44
+ 3. **Discretised-state language injection** — proprioceptive state is
45
+ quantised into 256 bins and appended to the instruction as plain tokens
46
+ (`[STATE] <bins> [ACTION]`), so the VLM can attend to robot state with
47
+ no additional encoder.
48
+
49
+ ---
50
+
51
+ ## Model Summary
52
+
53
+ | | |
54
+ | --- | --- |
55
+ | **Architecture** | `QwenPI_v3` (Qwen3-VL + layer-wise cross-DiT flow-matching head) |
56
+ | **VLM backbone** | [`Qwen3-VL-4B-Instruct`](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) |
57
+ | **Action head** | Layer-wise Flow-Matching DiT (36 layers, 1024 hidden, 16 heads) |
58
+ | **Action chunk** | 16 steps |
59
+ | **Action / state dim** | 7 / 7 (delta end-effector) |
60
+ | **Image resolution** | 224 × 224, single 3rd-person view |
61
+ | **Inference timesteps** | 4 (flow matching) |
62
+ | **Total parameters** | **≈ 5.07 B** |
63
+ | **License** | MIT |
64
+ | **Codebase** | [starVLA/starVLA](https://github.com/starVLA/starVLA) |
65
+
66
+ ### Parameter breakdown
67
+
68
+ | Module | Parameters | Share |
69
+ | --- | ---: | ---: |
70
+ | `qwen_vl_interface` (Qwen3-VL-4B) | 4,437,815,808 | 87.5 % |
71
+ | `action_model` (layer-wise FM DiT, hidden 1024) | 538,678,305 | 10.6 % |
72
+ | `project_layers` (per-layer 2560 → 1024 projectors) | 94,593,024 | 1.9 % |
73
+ | **Total** | **5,071,087,137** | **100 %** |
74
+
75
+ ---
76
+
77
+ ## Training Data
78
+
79
+ Co-training mixture **`bridge_rt_1`** (1 : 1 sampling):
80
+
81
+ | Dataset | Embodiment | Source |
82
+ | --- | --- | --- |
83
+ | `bridge_orig_1.0.0_lerobot` | WidowX | [IPEC-COMMUNITY/bridge_orig_lerobot](https://huggingface.co/datasets/IPEC-COMMUNITY/bridge_orig_lerobot) |
84
+ | `fractal20220817_data_0.1.0_lerobot` (RT-1) | Google Robot | [IPEC-COMMUNITY/fractal20220817_data_lerobot](https://huggingface.co/datasets/IPEC-COMMUNITY/fractal20220817_data_lerobot) |
85
+
86
+ - Action representation: **delta end-effector** (7-d, gripper included)
87
+ - Image observation: single primary RGB view, resized to 224 × 224
88
+ - Per-dataset normalisation statistics are stored in
89
+ [`dataset_statistics.json`](dataset_statistics.json).
90
+
91
+ ---
92
+
93
+ ## Training Recipe
94
+
95
+ | | |
96
+ | --- | --- |
97
+ | Total steps | 100,000 (released checkpoints up to 60k) |
98
+ | Warm-up steps | 5,000 |
99
+ | Per-device batch size | 24 |
100
+ | Hardware | 8 × NVIDIA H100 / A100 (DeepSpeed ZeRO-2) |
101
+ | Precision | bf16, mixed-precision + gradient checkpointing |
102
+ | Optimizer | AdamW (β₁ = 0.9, β₂ = 0.95, ε = 1e-8, wd = 1e-8) |
103
+ | LR (base / VLM) | 1e-5 |
104
+ | LR (action head) | 1e-4 |
105
+ | LR scheduler | `cosine_with_min_lr` (min lr 5e-7) |
106
+ | Gradient clipping | 1.0 |
107
+ | Flow-matching noise | Beta distribution (α = 1.5, β = 1.0), s = 0.999 |
108
+ | Repeated diffusion steps | 8 |
109
+ | Frozen modules | none (full fine-tuning) |
110
+ | Attention impl. | FlashAttention-2 |
111
+
112
+ The exact training config is preserved in
113
+ [`config.yaml`](config.yaml) / [`config.full.yaml`](config.full.yaml), and the
114
+ launch script in [`run_oxe_train.sh`](run_oxe_train.sh).
115
+
116
+ ---
117
+
118
+ ## Evaluation — SimplerEnv WidowX
119
+
120
+ We follow the standard SimplerEnv WidowX protocol on four pick-and-place
121
+ tasks (24 episodes per task per run). Numbers are success rates (↑).
122
+
123
+ | Step | PutCarrotOnPlate | PutEggplantInBasket | PutSpoonOnTableCloth | StackGreenCubeOnYellowCube | **Average** |
124
+ | ---: | ---: | ---: | ---: | ---: | ---: |
125
+ | 40k | **0.688** | 0.917 | 0.750 | 0.333 | 0.672 |
125
+ | 50k | 0.625 | **1.000** | **0.792** | **0.375** | **0.698** |
127
+ | 60k | 0.667 | **1.000** | 0.750 | 0.167 | 0.646 |
128
+
129
+ Best average: **69.8 %** at the 50k checkpoint
130
+ ([`steps_50000_pytorch_model.pt`](checkpoints/steps_50000_pytorch_model.pt)),
131
+ which we ship as the recommended checkpoint.
132
+
133
+ For comparison with other StarVLA frameworks on the same `bridge_rt_1`
134
+ mixture and protocol, see the [StarVLA Model Zoo](https://github.com/starVLA/starVLA/blob/main/docs/model_zoo.md).
135
+
136
+ ---
137
+
138
+ ## Repository Layout
139
+
140
+ ```
141
+ .
142
+ ├── README.md # this model card
143
+ ├── config.yaml # minimal training config
144
+ ├── config.full.yaml # fully resolved training config
145
+ ├── run_oxe_train.sh # launch script used for this run
146
+ ├── dataset_statistics.json # per-dataset action/state normalisation stats
147
+ ├── summary.jsonl # training step summary
148
+ ├── success_summary/ # SimplerEnv evaluation logs and plots
149
+ │ ├── success_summary.csv
150
+ │ ├── raw_success.txt
151
+ │ └── success_plot.png
152
+ └── checkpoints/
153
+     ├── steps_50000_pytorch_model.pt # ← recommended checkpoint
154
+     └── ... # per-step evaluation logs
155
+ ```
156
+
157
+ ---
158
+
159
+ ## How to Use
160
+
161
+ This checkpoint is consumed directly by the StarVLA training / evaluation
162
+ stack. Clone StarVLA and load the checkpoint with the framework name
163
+ `QwenPI_v3`:
164
+
165
+ ```bash
166
+ git clone https://github.com/starVLA/starVLA.git
167
+ cd starVLA
168
+ # Follow installation instructions in the StarVLA README.
169
+ ```
170
+
171
+ ```python
172
+ from huggingface_hub import snapshot_download
173
+ from starVLA.model.framework.tools import load_framework_from_checkpoint
174
+
175
+ ckpt_dir = snapshot_download("StarVLA/Qwen3VL-PI_v3-Bridge-RT-1")
176
+
177
+ policy = load_framework_from_checkpoint(
178
+ framework_name="QwenPI_v3",
179
+ config_path=f"{ckpt_dir}/config.full.yaml",
180
+ checkpoint_path=f"{ckpt_dir}/checkpoints/steps_50000_pytorch_model.pt",
181
+ )
182
+ # policy.predict_action(images, instruction, state) -> action chunk (16 × 7)
183
+ ```
184
+
185
+ For end-to-end SimplerEnv evaluation, see
186
+ [`examples/SimplerEnv`](https://github.com/starVLA/starVLA/tree/main/examples/SimplerEnv).
187
+
188
+ ---
189
+
190
+ ## Intended Use & Limitations
191
+
192
+ **Intended use.** Research on vision-language-action models and manipulation
193
+ policy learning, and use as a baseline for π-style flow-matching action heads
194
+ on top of open-weight VLMs.
195
+
196
+ **Out-of-scope / limitations.**
197
+
198
+ - Trained only on Bridge (WidowX) + RT-1 (Google Robot) with a 7-d delta-EE
199
+ action space — generalisation to other embodiments / action spaces is not
200
+ guaranteed.
201
+ - Single 224 × 224 third-person view; no wrist camera, no depth.
202
+ - Evaluated only on SimplerEnv WidowX simulation; behaviour on real robots
203
+ has not been validated for the released checkpoint.
204
+ - Inherits any biases / failure modes of the underlying Qwen3-VL-4B model.
205
+ - Not safety-tuned. Do **not** deploy on physical robots without an external
206
+ safety layer.
207
+
208
+ ---
209
+
210
+ ## Citation
211
+
212
+ If you use this checkpoint, please cite StarVLA:
213
+
214
+ ```bibtex
215
+ @article{starvla2026,
216
+ title = {StarVLA: A Lego-like Codebase for Vision-Language-Action Model Developing},
217
+ author = {StarVLA Community},
218
+ journal = {arXiv preprint arXiv:2604.05014},
219
+ year = {2026},
220
+ url = {https://arxiv.org/abs/2604.05014}
221
+ }
222
+ ```
223
+
224
+ And the underlying VLM backbone:
225
+
226
+ ```bibtex
227
+ @misc{qwen3vl,
228
+ title = {Qwen3-VL},
229
+ author = {Qwen Team},
230
+ year = {2025},
231
+ url = {https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct}
232
+ }
233
+ ```
234
+
235
+ ## Acknowledgements
236
+
237
+ - [Qwen Team](https://huggingface.co/Qwen) for the Qwen3-VL backbone.
238
+ - [Physical Intelligence](https://www.physicalintelligence.company/) for the
239
+ π₀ / π₀.₅ flow-matching action-head recipe that inspired `QwenPI_v3`.
240
+ - [Open X-Embodiment](https://robotics-transformer-x.github.io/) and
241
+ [IPEC-COMMUNITY](https://huggingface.co/IPEC-COMMUNITY) for the LeRobot
242
+ conversions of Bridge V2 and RT-1.
243
+ - [SimplerEnv](https://github.com/simpler-env/SimplerEnv) for the
244
+ evaluation protocol.
checkpoints/server_logs/steps_10000_pytorch_model_policy_server_6418.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_10000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6418
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 40984) opened
12
+ INFO:root:Connection from ('127.0.0.1', 40984) closed
checkpoints/server_logs/steps_10000_pytorch_model_policy_server_6420.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_10000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6420
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 41910) opened
12
+ INFO:root:Connection from ('127.0.0.1', 41910) closed
checkpoints/server_logs/steps_20000_pytorch_model_policy_server_6455.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_20000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6455
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 53776) opened
12
+ INFO:root:Connection from ('127.0.0.1', 53776) closed
checkpoints/server_logs/steps_20000_pytorch_model_policy_server_6456.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_20000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6456
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 57194) opened
12
+ INFO:root:Connection from ('127.0.0.1', 57194) closed
checkpoints/server_logs/steps_20000_pytorch_model_policy_server_6457.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_20000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6457
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 48144) opened
12
+ INFO:root:Connection from ('127.0.0.1', 48144) closed
checkpoints/server_logs/steps_30000_pytorch_model_policy_server_6554.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_30000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6554
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 39930) opened
12
+ INFO:root:Connection from ('127.0.0.1', 39930) closed
checkpoints/server_logs/steps_30000_pytorch_model_policy_server_6555.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_30000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6555
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 55078) opened
12
+ INFO:root:Connection from ('127.0.0.1', 55078) closed
checkpoints/server_logs/steps_30000_pytorch_model_policy_server_6557.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_30000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6557
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 52024) opened
12
+ INFO:root:Connection from ('127.0.0.1', 52024) closed
checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6450.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_40000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6450
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 47252) opened
12
+ INFO:root:Connection from ('127.0.0.1', 47252) closed
checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6451.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_40000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6451
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 34212) opened
12
+ INFO:root:Connection from ('127.0.0.1', 34212) closed
checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6453.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_40000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6453
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 47510) opened
12
+ INFO:root:Connection from ('127.0.0.1', 47510) closed
checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6455.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_40000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6455
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 47676) opened
12
+ INFO:root:Connection from ('127.0.0.1', 47676) closed
checkpoints/server_logs/steps_40000_pytorch_model_policy_server_6457.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_40000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6457
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 36590) opened
12
+ INFO:root:Connection from ('127.0.0.1', 36590) closed
checkpoints/server_logs/steps_50000_pytorch_model_policy_server_6555.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_50000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6555
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 49406) opened
12
+ INFO:root:Connection from ('127.0.0.1', 49406) closed
checkpoints/server_logs/steps_50000_pytorch_model_policy_server_6557.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_50000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6557
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 57000) opened
12
+ INFO:root:Connection from ('127.0.0.1', 57000) closed
checkpoints/server_logs/steps_60000_pytorch_model_policy_server_6418.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_60000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6418
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 43230) opened
12
+ INFO:root:Connection from ('127.0.0.1', 43230) closed
checkpoints/server_logs/steps_60000_pytorch_model_policy_server_6420.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INFO:starVLA.model.framework.share_tools:[*] Loading from local checkpoint path `results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_60000_pytorch_model.pt`
2
+ INFO:starVLA.model.framework.share_tools:[*] [apply_config_compat] normalised config from version_id=None to '0.21'
3
+ [WARNING] flash_attn not installed, falling back to sdpa
4
+
5
+ /home/jye624/Projcets/starVLA/starVLA/model/framework/base_framework.py:248: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
6
+ model_state_dict = torch.load(pretrained_checkpoint, map_location="cpu")
7
+ INFO:root:Creating server (host: dgx-44, ip: 10.22.4.152)
8
+ INFO:root:server running ...
9
+ INFO:websockets.server:server listening on 0.0.0.0:6420
10
+ INFO:websockets.server:connection open
11
+ INFO:root:Connection from ('127.0.0.1', 51438) opened
12
+ INFO:root:Connection from ('127.0.0.1', 51438) closed
checkpoints/steps_10000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_20000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_20000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_20000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_30000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_30000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_30000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_30000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_40000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_40000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_40000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_40000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run2 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_50000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_50000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_60000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/steps_60000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 ADDED
The diff for this file is too large to render. See raw diff
 
config.full.yaml ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_id: 0427_oxe_bridge_rt_1_QwenPI_v3
2
+ run_root_dir: ./results/Checkpoints
3
+ seed: 42
4
+ trackers:
5
+ - jsonl
6
+ - wandb
7
+ wandb_entity: jinhuiye
8
+ wandb_project: starVLA_simplerEnv
9
+ is_debug: false
10
+ version_id: '0.21'
11
+ framework:
12
+ name: QwenPI_v3
13
+ qwenvl:
14
+ base_vlm: /home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct
15
+ attn_implementation: flash_attention_2
16
+ vl_hidden_dim: 2560
17
+ num_vl_layers: 36
18
+ action_model:
19
+ action_model_type: DiT-B
20
+ action_dim: 7
21
+ state_dim: 7
22
+ action_horizon: 16
23
+ repeated_diffusion_steps: 8
24
+ num_inference_timesteps: 4
25
+ add_pos_embed: true
26
+ max_seq_len: 1024
27
+ num_target_vision_tokens: 32
28
+ noise_beta_alpha: 1.5
29
+ noise_beta_beta: 1.0
30
+ noise_s: 0.999
31
+ num_timestep_buckets: 1000
32
+ diffusion_model_cfg:
33
+ action_dit_hidden_dim: 1024
34
+ dropout: 0.2
35
+ final_dropout: true
36
+ interleave_self_attention: true
37
+ norm_type: ada_norm
38
+ positional_embeddings: null
39
+ attention_head_dim: 64
40
+ num_layers: 36
41
+ output_dim: 1024
42
+ cross_attention_dim: 1024
43
+ input_embedding_dim: 1024
44
+ num_attention_heads: 16
45
+ hidden_size: 1024
46
+ future_action_window_size: 15
47
+ action_hidden_dim: 1024
48
+ past_action_window_size: 0
49
+ obs_image_size:
50
+ - 224
51
+ - 224
52
+ dino:
53
+ dino_backbone: dinov2_vits14
54
+ datasets:
55
+ vlm_data:
56
+ dataset_py: vlm_datasets
57
+ dataformat: llava_json
58
+ dataset_use: sharegpt4v_coco
59
+ eval_dataset: sharegpt4v_coco
60
+ data_flatten: false
61
+ base_interval: 2
62
+ max_pixels: 307200
63
+ min_pixels: 784
64
+ model_max_length: 2048
65
+ model_type: qwen2.5vl
66
+ per_device_batch_size: 4
67
+ vla_data:
68
+ dataset_py: lerobot_datasets
69
+ data_root_dir: ./playground/Datasets/OXE_LEROBOT_DATASET
70
+ data_mix: bridge_rt_1
71
+ action_type: delta_ee
72
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
73
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
74
+ CoT_answer: bbox
75
+ default_image_resolution:
76
+ - 3
77
+ - 224
78
+ - 224
79
+ per_device_batch_size: 24
80
+ load_all_data_for_training: true
81
+ obs:
82
+ - image_0
83
+ image_size:
84
+ - 224
85
+ - 224
86
+ video_backend: torchvision_av
87
+ trainer:
88
+ epochs: 100
89
+ max_train_steps: 100000
90
+ num_warmup_steps: 5000
91
+ save_interval: 10000
92
+ eval_interval: 1000
93
+ learning_rate:
94
+ base: 1.0e-05
95
+ qwen_vl_interface: 1.0e-05
96
+ action_model: 0.0001
97
+ lr_scheduler_type: cosine_with_min_lr
98
+ scheduler_specific_kwargs:
99
+ min_lr: 5.0e-07
100
+ freeze_modules: true
101
+ loss_scale:
102
+ vla: 1.0
103
+ vlm: 0.1
104
+ max_grad_norm: 1.0
105
+ warmup_ratio: 0.1
106
+ weight_decay: 0.0
107
+ logging_frequency: 1000
108
+ gradient_clipping: 1.0
109
+ gradient_accumulation_steps: 1
110
+ optimizer:
111
+ name: AdamW
112
+ betas:
113
+ - 0.9
114
+ - 0.95
115
+ eps: 1.0e-08
116
+ weight_decay: 1.0e-08
117
+ is_resume: true
118
+ resume_epoch: null
119
+ resume_step: null
120
+ enable_gradient_checkpointing: true
121
+ enable_mixed_precision_training: true
122
+ vla_data:
123
+ video_backend: pyav
124
+ config_yaml: ./examples/SimplerEnv/train_files/starvla_cotrain_oxe.yaml
125
+ output_dir: ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3
config.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets:
2
+ vla_data:
3
+ CoT_prompt: Your task is {instruction}. To identify the key objects for your task.
4
+ Locate their bounding boxes in [x1,y1,x2,y2] format.
5
+ data_mix: bridge_rt_1
6
+ data_root_dir: ./playground/Datasets/OXE_LEROBOT_DATASET
7
+ dataset_py: lerobot_datasets
8
+ image_size:
9
+ - 224
10
+ - 224
11
+ per_device_batch_size: 24
12
+ video_backend: torchvision_av
13
+ framework:
14
+ action_model:
15
+ action_dim: 7
16
+ action_horizon: 16
17
+ add_pos_embed: true
18
+ diffusion_model_cfg:
19
+ action_dit_hidden_dim: 1024
20
+ attention_head_dim: 64
21
+ cross_attention_dim: 1024
22
+ dropout: 0.2
23
+ final_dropout: true
24
+ input_embedding_dim: 1024
25
+ interleave_self_attention: true
26
+ norm_type: ada_norm
27
+ num_attention_heads: 16
28
+ num_layers: 36
29
+ output_dim: 1024
30
+ positional_embeddings: null
31
+ max_seq_len: 1024
32
+ noise_beta_alpha: 1.5
33
+ noise_beta_beta: 1.0
34
+ noise_s: 0.999
35
+ num_inference_timesteps: 4
36
+ num_target_vision_tokens: 32
37
+ num_timestep_buckets: 1000
38
+ state_dim: 7
39
+ name: QwenPI_v3
40
+ qwenvl:
41
+ attn_implementation: flash_attention_2
42
+ base_vlm: /home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct
43
+ num_vl_layers: 36
44
+ vl_hidden_dim: 2560
45
+ output_dir: ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3
46
+ run_id: 0427_oxe_bridge_rt_1_QwenPI_v3
47
+ run_root_dir: ./results/Checkpoints
48
+ seed: 42
49
+ trainer:
50
+ eval_interval: 1000
51
+ freeze_modules: true
52
+ gradient_clipping: 1.0
53
+ is_resume: true
54
+ learning_rate:
55
+ action_model: 0.0001
56
+ base: 1.0e-05
57
+ qwen_vl_interface: 1.0e-05
58
+ logging_frequency: 1000
59
+ lr_scheduler_type: cosine_with_min_lr
60
+ max_train_steps: 100000
61
+ num_warmup_steps: 5000
62
+ optimizer:
63
+ betas:
64
+ - 0.9
65
+ - 0.95
66
+ eps: 1.0e-08
67
+ weight_decay: 1.0e-08
68
+ save_interval: 10000
69
+ scheduler_specific_kwargs:
70
+ min_lr: 5.0e-07
71
+ wandb_entity: jinhuiye
72
+ wandb_project: starVLA_simplerEnv
dataset_statistics.json ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "oxe_bridge": {
3
+ "action": {
4
+ "mean": [
5
+ 0.0001136629143729806,
6
+ 6.556845619343221e-05,
7
+ -6.319578096736223e-05,
8
+ -7.192707562353462e-05,
9
+ -0.00019508649711497128,
10
+ 0.00012040198635077104,
11
+ 0.2882896661758423
12
+ ],
13
+ "std": [
14
+ 0.006909778691848374,
15
+ 0.009684059697605607,
16
+ 0.008962926618867292,
17
+ 0.020120852281989444,
18
+ 0.021582520578222027,
19
+ 0.05472376387860533,
20
+ 0.4543604113297812
21
+ ],
22
+ "max": [
23
+ 0.41691166162490845,
24
+ 0.25864794850349426,
25
+ 0.21218234300613403,
26
+ 3.122201919555664,
27
+ 1.8618112802505493,
28
+ 6.272472858428955,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.4007510244846344,
33
+ -0.13874775171279907,
34
+ -0.22553899884223938,
35
+ -3.2010786533355713,
36
+ -1.8618112802505493,
37
+ -6.279075622558594,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.028752606511116028,
42
+ -0.041702210046350954,
43
+ -0.026096698231995105,
44
+ -0.08052822157740593,
45
+ -0.09249736212193965,
46
+ -0.20738411962985992,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.02830690816044803,
51
+ 0.04089860741049051,
52
+ 0.04018005654215808,
53
+ 0.08173405691981314,
54
+ 0.07760896608233431,
55
+ 0.20384809583425495,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "state": {
69
+ "mean": [
70
+ 0.15471647679805756,
71
+ 0.015362698584794998,
72
+ 0.032221030443906784,
73
+ 0.003244664054363966,
74
+ -0.03860040009021759,
75
+ 0.053836673498153687,
76
+ 0.0,
77
+ 0.3540630638599396
78
+ ],
79
+ "std": [
80
+ 0.16053484955349273,
81
+ 0.06677967282050112,
82
+ 0.048657228333866744,
83
+ 0.09275332557452463,
84
+ 0.1256707374939083,
85
+ 0.4122273237945067,
86
+ 0.0,
87
+ 0.4330196238719565
88
+ ],
89
+ "max": [
90
+ 0.5862360596656799,
91
+ 0.4034728705883026,
92
+ 0.3568263053894043,
93
+ 1.3517684936523438,
94
+ 1.570796251296997,
95
+ 3.141204357147217,
96
+ 0.0,
97
+ 1.1121242046356201
98
+ ],
99
+ "min": [
100
+ -0.04167502000927925,
101
+ -0.3563207685947418,
102
+ -0.15537554025650024,
103
+ -3.141592502593994,
104
+ -1.4992541074752808,
105
+ -3.14153790473938,
106
+ 0.0,
107
+ 0.04637829214334488
108
+ ],
109
+ "q01": [
110
+ 0.17102580681443214,
111
+ -0.16981234937906264,
112
+ -0.05563282176852226,
113
+ -0.36493386059999466,
114
+ -0.5418747025728226,
115
+ -1.3542919230461121,
116
+ 0.0,
117
+ 0.052190229296684265
118
+ ],
119
+ "q99": [
120
+ 0.45322125554084775,
121
+ 0.2354859386384485,
122
+ 0.19489662453532214,
123
+ 0.3779941478371616,
124
+ 0.2756884342432019,
125
+ 1.8500668883323654,
126
+ 0.0,
127
+ 1.0105689764022827
128
+ ]
129
+ },
130
+ "num_transitions": 1305714,
131
+ "num_trajectories": 53192
132
+ },
133
+ "oxe_rt1": {
134
+ "action": {
135
+ "mean": [
136
+ 0.003493865951895714,
137
+ 0.0031329391058534384,
138
+ -0.006312889512628317,
139
+ 0.021666156128048897,
140
+ -0.002877477090805769,
141
+ 0.0004563163092825562,
142
+ 0.26771068572998047
143
+ ],
144
+ "std": [
145
+ 0.04906474415809708,
146
+ 0.04229872223842697,
147
+ 0.05237597200308955,
148
+ 0.11248535895810344,
149
+ 0.09312825582599045,
150
+ 0.10319124548215834,
151
+ 0.441845103587406
152
+ ],
153
+ "max": [
154
+ 2.9984593391418457,
155
+ 22.09052848815918,
156
+ 2.7507524490356445,
157
+ 1.570636510848999,
158
+ 1.5321086645126343,
159
+ 1.5691522359848022,
160
+ 1.0
161
+ ],
162
+ "min": [
163
+ -2.0204520225524902,
164
+ -5.497899532318115,
165
+ -2.031663417816162,
166
+ -1.569917917251587,
167
+ -1.569892168045044,
168
+ -1.570419430732727,
169
+ 0.0
170
+ ],
171
+ "q01": [
172
+ -0.2245360141992569,
173
+ -0.14820106267929076,
174
+ -0.23158982083201407,
175
+ -0.3517777299880981,
176
+ -0.4192772650718689,
177
+ -0.43643518328666686,
178
+ 0.0
179
+ ],
180
+ "q99": [
181
+ 0.1782463169097901,
182
+ 0.14938431486487408,
183
+ 0.21841673687100444,
184
+ 0.5892668181657792,
185
+ 0.3527275875210766,
186
+ 0.44796794503927273,
187
+ 1.0
188
+ ],
189
+ "mask": [
190
+ true,
191
+ true,
192
+ true,
193
+ true,
194
+ true,
195
+ true,
196
+ false
197
+ ]
198
+ },
199
+ "state": {
200
+ "mean": [
201
+ 0.279946893453598,
202
+ -0.04167007654905319,
203
+ 0.3885466456413269,
204
+ 0.21306714415550232,
205
+ -0.12402169406414032,
206
+ 0.24756763875484467,
207
+ 0.04633009061217308,
208
+ 0.10487376153469086
209
+ ],
210
+ "std": [
211
+ 0.29342589017967113,
212
+ 0.09173989695598195,
213
+ 0.4256945884267932,
214
+ 0.3861626196491199,
215
+ 0.38314687041941975,
216
+ 0.4443359860932721,
217
+ 0.12639273126219563,
218
+ 0.22122626649401464
219
+ ],
220
+ "max": [
221
+ 1.0534898042678833,
222
+ 0.48018959164619446,
223
+ 1.6896663904190063,
224
+ 1.0,
225
+ 0.9999993443489075,
226
+ 0.9999874830245972,
227
+ 0.9554369449615479,
228
+ 0.9914546012878418
229
+ ],
230
+ "min": [
231
+ -0.4436439275741577,
232
+ -0.9970501065254211,
233
+ -0.006579156965017319,
234
+ 0.0,
235
+ -0.8643477559089661,
236
+ -0.7079970240592957,
237
+ -0.7688722014427185,
238
+ -0.4999994933605194
239
+ ],
240
+ "q01": [
241
+ 0.32481366634368897,
242
+ -0.2833433499932289,
243
+ 0.14107060477137565,
244
+ 0.0,
245
+ -0.6864742285013199,
246
+ -0.6808923971652985,
247
+ -0.36045609444379806,
248
+ -0.45438114255666734
249
+ ],
250
+ "q99": [
251
+ 0.8750162518024447,
252
+ 0.21247095301747337,
253
+ 1.0727114248275758,
254
+ 1.0,
255
+ 0.937787775397301,
256
+ 0.9563058441877368,
257
+ 0.4599010077118876,
258
+ 0.7216041576862335
259
+ ]
260
+ },
261
+ "num_transitions": 3786152,
262
+ "num_trajectories": 87212
263
+ }
264
+ }
run_oxe_train.sh ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #!/bin/bash
3
+ # Usage: run on a compute node with GPUs
4
+ # srun --jobid=<JOB_ID> --overlap --pty bash examples/SimplerEnv/train_files/run_oxe_train.sh
5
+ set -e
6
+
7
+ # === Conda setup ===
8
+ source /cm/shared/apps/Anaconda3/2023.09-0/etc/profile.d/conda.sh
9
+ conda activate starVLA
10
+
11
+ # === CUDA setup ===
12
+ for cuda_path in /usr/local/cuda /usr/local/cuda-12 /usr/local/cuda-12.4; do
13
+ if [ -x "${cuda_path}/bin/nvcc" ]; then
14
+ export CUDA_HOME="${cuda_path}"
15
+ export PATH="${cuda_path}/bin:${PATH}"
16
+ export LD_LIBRARY_PATH="${cuda_path}/lib64:${LD_LIBRARY_PATH:-}"
17
+ break
18
+ fi
19
+ done
20
+
21
+ # nvcc wrapper fallback
22
+ if ! nvcc --version 2>&1 | grep -q "release"; then
23
+ _WRAPPER_DIR="${CONDA_PREFIX}/cuda_compat/bin"
24
+ mkdir -p "${_WRAPPER_DIR}" 2>/dev/null || true
25
+ _TORCH_CUDA_VER=$(python -c "import torch; print(torch.version.cuda)" 2>/dev/null || echo "12.4")
26
+ _MAJOR=$(echo "${_TORCH_CUDA_VER}" | cut -d. -f1)
27
+ _MINOR=$(echo "${_TORCH_CUDA_VER}" | cut -d. -f2)
28
+ cat > "${_WRAPPER_DIR}/nvcc" << NVCC_EOF
29
+ #!/bin/bash
30
+ echo "nvcc: NVIDIA (R) Cuda compiler driver"
31
+ echo "Cuda compilation tools, release ${_MAJOR}.${_MINOR}, V${_TORCH_CUDA_VER}"
32
+ NVCC_EOF
33
+ chmod +x "${_WRAPPER_DIR}/nvcc"
34
+ export PATH="${_WRAPPER_DIR}:${PATH}"
35
+ export CUDA_HOME="${CONDA_PREFIX}/cuda_compat"
36
+ echo "[INFO] Created nvcc wrapper: CUDA ${_TORCH_CUDA_VER}"
37
+ fi
38
+
39
+ echo "[INFO] CUDA_HOME=$CUDA_HOME"
40
+ nvcc --version 2>/dev/null || echo "[WARN] nvcc not found"
41
+
42
+
43
+ # NCCL settings: detect communication errors/timeouts (e.g. during checkpoint saving) instead of hanging
44
+ export NCCL_BLOCKING_WAIT=1
45
+ export NCCL_ASYNC_ERROR_HANDLING=1
46
+ export NCCL_TIMEOUT=10000 # timeout set to 10000 seconds (~2.8 hours)
47
+ export NCCL_SOCKET_TIMEOUT_MS=360000
48
+ ###########################################################################################
49
+ # === Please modify the following paths according to your environment ===
50
+ cd /home/jye624/Projcets/starVLA
51
+
52
+ Framework_name=QwenPI_v3
53
+ freeze_module_list=''
54
+ base_vlm=/home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct
55
+ config_yaml=./examples/SimplerEnv/train_files/starvla_cotrain_oxe.yaml
56
+ oxe_data_root=./playground/Datasets/OXE_LEROBOT_DATASET
57
+ data_mix=bridge_rt_1
58
+ run_root_dir=./results/Checkpoints
59
+
60
+ run_id=0427_oxe_${data_mix}_${Framework_name}
61
+ # === End of environment variable configuration ===
62
+ ###########################################################################################
63
+
64
+
65
+ # export WANDB_MODE=disabled
66
+
67
+ output_dir=${run_root_dir}/${run_id}
68
+ mkdir -p ${output_dir}
69
+ # copy this script to the output dir (for reproducibility)
70
+ cp $0 ${output_dir}/
71
+
72
+ num_processes=${NUM_PROCESSES:-$(nvidia-smi -L | wc -l)}
73
+ attn_implementation=${ATTN_IMPLEMENTATION:-flash_attention_2}
74
+ accelerate_config_file=${ACCELERATE_CONFIG_FILE:-starVLA/config/deepseeds/deepspeed_zero2.yaml}
75
+ # Use port 0 to let the system auto-select a free port, avoiding conflicts when multiple jobs land on the same node
76
+ main_process_port=${MAIN_PROCESS_PORT:-0}
77
+
78
+ export WANDB_API_KEY=${WANDB_API_KEY:-943ecb8d26fc2b3879cbc2d667414974906aebb9}
79
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
80
+
81
+
82
+ # Fix: ensure vonneumann1 group is active for NFS file access on compute nodes
83
+ if id -nG 2>/dev/null | grep -qw vonneumann1; then
84
+ export _STARVLA_GROUP_FIX=vonneumann1
85
+ echo "[INFO] Group vonneumann1 detected, using newgrp for NFS access"
86
+ fi
87
+
88
+ # Resolve conda activation command for sub-shells (sg spawns a new shell)
89
+ CONDA_BASE=$(conda info --base 2>/dev/null || echo "${CONDA_PREFIX%/envs/*}")
90
+ CONDA_INIT="source ${CONDA_BASE}/etc/profile.d/conda.sh && conda activate ${CONDA_DEFAULT_ENV:-starVLA}"
91
+
92
+ sg vonneumann1 -c "
93
+ ${CONDA_INIT} && \
94
+ accelerate launch \
95
+ --config_file ${accelerate_config_file} \
96
+ --num_processes ${num_processes} \
97
+ starVLA/training/train_starvla.py \
98
+ --config_yaml ${config_yaml} \
99
+ --framework.name ${Framework_name} \
100
+ --framework.qwenvl.base_vlm ${base_vlm} \
101
+ --datasets.vla_data.data_root_dir ${oxe_data_root} \
102
+ --datasets.vla_data.data_mix ${data_mix} \
103
+ --datasets.vla_data.per_device_batch_size 24 \
104
+ --trainer.vla_data.video_backend pyav \
105
+ --framework.qwenvl.attn_implementation ${attn_implementation} \
106
+ --trainer.freeze_modules ${freeze_module_list} \
107
+ --trainer.max_train_steps 100000 \
108
+ --trainer.save_interval 10000 \
109
+ --trainer.logging_frequency 1000 \
110
+ --trainer.eval_interval 1000 \
111
+ --run_root_dir ${run_root_dir} \
112
+ --run_id ${run_id} \
113
+ --trainer.is_resume True \
114
+ --wandb_project starVLA_simplerEnv \
115
+ --wandb_entity jinhuiye
116
+ "
117
+
118
+
119
+
120
+ ##### Multi-Server Multi-GPU training script #####
121
+ # accelerate launch \
122
+ # --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
123
+ # --main_process_ip $MASTER_ADDR \
124
+ # --main_process_port $MASTER_PORT \
125
+ # --machine_rank $SLURM_PROCID \
126
+ # --num_machines $SLURM_NNODES \
127
+ # --num_processes=${TOTAL_GPUS} \
128
+ # starVLA/training/train_starvla.py \
129
+ # --config_yaml ${config_yaml} \
130
+ # --framework.name ${Framework_name} \
131
+ # --framework.qwenvl.base_vlm ${base_vlm} \
132
+ # --run_root_dir ${run_root_dir} \
133
+ # --run_id ${run_id} \
134
+ # --wandb_project your_project \
135
+ # --wandb_entity your_name
136
+ ##### Multi-Server Multi-GPU training script #####
success_summary/raw_success.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ steps_10000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 → Average success: 0.125
2
+ steps_10000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 → Average success: 0.2916666666666667
3
+ steps_10000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 → Average success: 0.25
4
+ steps_10000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 → Average success: 0.125
5
+ steps_20000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 → Average success: 0.375
6
+ steps_20000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 → Average success: 0.9166666666666666
7
+ steps_20000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 → Average success: 0.4166666666666667
8
+ steps_20000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 → Average success: 0.125
9
+ steps_30000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 → Average success: 0.25
10
+ steps_30000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 → Average success: 0.6666666666666666
11
+ steps_30000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 → Average success: 0.625
12
+ steps_30000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 → Average success: 0.08333333333333333
13
+ steps_40000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 → Average success: 0.625
14
+ steps_40000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run2 → Average success: 0.75
15
+ steps_40000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 → Average success: 0.875
16
+ steps_40000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run2 → Average success: 0.9583333333333334
17
+ steps_40000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 → Average success: 0.75
18
+ steps_40000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run2 → Average success: 0.75
19
+ steps_40000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 → Average success: 0.375
20
+ steps_40000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run2 → Average success: 0.2916666666666667
21
+ steps_50000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 → Average success: 0.625
22
+ steps_50000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 → Average success: 1.0
23
+ steps_50000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 → Average success: 0.7916666666666666
24
+ steps_50000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 → Average success: 0.375
25
+ steps_60000_pytorch_model_infer_PutCarrotOnPlateInScene-v0.log.run1 → Average success: 0.6666666666666666
26
+ steps_60000_pytorch_model_infer_PutEggplantInBasketScene-v0.log.run1 → Average success: 1.0
27
+ steps_60000_pytorch_model_infer_PutSpoonOnTableClothInScene-v0.log.run1 → Average success: 0.75
28
+ steps_60000_pytorch_model_infer_StackGreenCubeOnYellowCubeBakedTexInScene-v0.log.run1 → Average success: 0.16666666666666666
29
+
success_summary/success_plot.png ADDED
success_summary/success_summary.csv ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ step,PutCarrotOnPlateInScene,PutEggplantInBasketScene,PutSpoonOnTableClothInScene,StackGreenCubeOnYellowCubeBakedTexInScene,Average Across Tasks
2
+ 10000,0.125,0.2916666666666667,0.25,0.125,0.19791666666666669
3
+ 20000,0.375,0.9166666666666666,0.4166666666666667,0.125,0.4583333333333333
4
+ 30000,0.25,0.6666666666666666,0.625,0.08333333333333333,0.40624999999999994
5
+ 40000,0.6875,0.9166666666666667,0.75,0.33333333333333337,0.6718750000000001
6
+ 50000,0.625,1.0,0.7916666666666666,0.375,0.6979166666666666
7
+ 60000,0.6666666666666666,1.0,0.75,0.16666666666666666,0.6458333333333333
summary.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"steps": 10}
2
+ {"steps": 10000}
3
+ {"steps": 20000}
4
+ {"steps": 30000}
5
+ {"steps": 40000}
6
+ {"steps": 50000}
7
+ {"steps": 60000}
wandb/wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/debug.log ADDED
File without changes
wandb/wandb/run-20260426_011111-enstjn5q/files/output.log ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 04/26 [01:11:14] INFO  | >> ***** Training Configuration ***** ]8;id=471029;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=617889;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#355\355]8;;\
2
+   INFO  | >> Total optimization steps = 100000 ]8;id=844962;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=167414;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#356\356]8;;\
3
+   INFO  | >> Per device batch size = 24 ]8;id=225772;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=800581;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#357\357]8;;\
4
+   INFO  | >> Gradient accumulation steps = 1 ]8;id=376417;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=888662;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#358\358]8;;\
5
+   INFO  | >> Total batch size = 192 ]8;id=765179;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=481741;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#359\359]8;;\
6
+ 7%|█▌ | 7000/100000 [2:12:44<29:30:36, 1.14s/it, data_times=0.001, model_times=1.090]
7
+ 04/26 [01:30:39] INFO  | >> Step 1000, Loss: {'action_dit_loss': ]8;id=578856;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=307419;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
8
+   0.41287872195243835, 'mse_score': 0.006587725310098557,  
9
+   'data_time': 0.023512821993790567, 'model_time':  
10
+   1.1061716680414975, 'learning_rate': 2.0000000000000003e-06,  
11
+   'epoch': 0.03})  
12
+ 04/26 [01:49:32] INFO  | >> Step 2000, Loss: {'action_dit_loss': ]8;id=903565;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=379201;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
13
+   0.31383267045021057, 'mse_score': 0.006785546739896138,  
14
+   'data_time': 0.0012596730375662446, 'model_time':  
15
+   1.0688785739475861, 'learning_rate': 4.000000000000001e-06,  
16
+   'epoch': 0.05})  
17
+ 04/26 [02:08:28] INFO  | >> Step 3000, Loss: {'action_dit_loss': ]8;id=48050;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=693384;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
18
+   0.19595405459403992, 'mse_score': 0.00602614666734423,  
19
+   'data_time': 0.0005622160388156772, 'model_time':  
20
+   1.089529522927478, 'learning_rate': 6e-06, 'epoch': 0.08})  
21
+ 04/26 [02:27:21] INFO  | >> Step 4000, Loss: {'action_dit_loss': ]8;id=896865;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=244098;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
22
+   0.23803943395614624, 'mse_score': 0.006116567623047601,  
23
+   'data_time': 0.014886713004671037, 'model_time':  
24
+   1.0681698899716139, 'learning_rate': 8.000000000000001e-06,  
25
+   'epoch': 0.1})  
26
+ 04/26 [02:46:14] INFO  | >> Step 5000, Loss: {'action_dit_loss': ]8;id=475435;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=666563;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
27
+   0.21792519092559814, 'mse_score': 0.006198919245174953,  
28
+   'data_time': 0.01555767108220607, 'model_time':  
29
+   1.1287320599658415, 'learning_rate': 1e-05, 'epoch': 0.13})  
30
+ 04/26 [03:05:06] INFO  | >> Step 6000, Loss: {'action_dit_loss': ]8;id=372528;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=219684;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
31
+   0.2961162328720093, 'mse_score': 0.0054819346183822264,  
32
+   'data_time': 0.0010915560415014625, 'model_time':  
33
+   1.0691960570402443, 'learning_rate': 9.9974029723694e-06,  
34
+   'epoch': 0.15})  
35
+ 04/26 [03:23:59] INFO  | >> Step 7000, Loss: {'action_dit_loss': ]8;id=716751;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=679514;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
36
+   0.16125820577144623, 'mse_score': 0.005778901633762178,  
37
+   'data_time': 0.001105302944779396, 'model_time':  
38
+   1.0903944559395313, 'learning_rate': 9.98961472928918e-06,  
39
+   'epoch': 0.18})  
40
+ 04/26 [03:42:55] INFO  | >> Step 8000, Loss: {'action_dit_loss': ]8;id=560086;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=764544;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
41
+   0.24051445722579956, 'mse_score': 0.006084372599919637,  
42
+   'data_time': 0.01449370407499373, 'model_time':  
43
+   1.0696819460717961, 'learning_rate': 9.976643787088806e-06,  
44
+   'epoch': 0.2})  
45
+ 04/26 [04:01:45] INFO  | >> Step 9000, Loss: {'action_dit_loss': ]8;id=283060;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=970342;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
46
+   0.20358465611934662, 'mse_score': 0.005045601299830845,  
47
+   'data_time': 0.016380950924940407, 'model_time':  
48
+   1.0907069341046736, 'learning_rate': 9.958504329303132e-06,  
49
+   'epoch': 0.23})  
50
+ 04/26 [04:20:42] INFO  | >> Step 10000, Loss: {'action_dit_loss': ]8;id=717870;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=340035;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
51
+   0.21283088624477386, 'mse_score': 0.0054307544515246435,  
52
+   'data_time': 0.0010308929486200213, 'model_time':  
53
+   1.0539058269932866, 'learning_rate': 9.935216191162932e-06,  
54
+   'epoch': 0.25})  
55
+ ✅ Checkpoint saved at ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_10000
56
+ 04/26 [04:21:16] INFO  | >> 📊 Saving accessed configuration... ]8;id=861722;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=33659;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#258\258]8;;\
57
+   INFO  | >> ✅ Configuration files saved ]8;id=221231;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=957492;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#261\261]8;;\
58
+ 04/26 [04:40:08] INFO  | >> Step 11000, Loss: {'action_dit_loss': ]8;id=329963;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=222955;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
59
+   0.1573459506034851, 'mse_score': 0.005605340358756837,  
60
+   'data_time': 0.0005969370249658823, 'model_time':  
61
+   1.1335041610291228, 'learning_rate': 9.906804837905345e-06,  
62
+   'epoch': 0.28})  
63
+ 04/26 [04:59:02] INFO  | >> Step 12000, Loss: {'action_dit_loss': ]8;id=958972;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=674079;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
64
+   0.2110031098127365, 'mse_score': 0.005528707234632401,  
65
+   'data_time': 0.01464720896910876, 'model_time':  
66
+   1.1476604999043047, 'learning_rate': 9.873301336928028e-06,  
67
+   'epoch': 0.3})  
68
+ 04/26 [05:17:59] INFO  | >> Step 13000, Loss: {'action_dit_loss': ]8;id=258607;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=781177;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
69
+   0.18376827239990234, 'mse_score': 0.004833251592658815,  
70
+   'data_time': 0.0171306460397318, 'model_time':  
71
+   1.0867095300927758, 'learning_rate': 9.834742323817407e-06,  
72
+   'epoch': 0.33})  
73
+ 04/26 [05:36:50] INFO  | >> Step 14000, Loss: {'action_dit_loss': ]8;id=612982;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=449245;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
74
+   0.15762470662593842, 'mse_score': 0.005961580645470392,  
75
+   'data_time': 0.0005706310039386153, 'model_time':  
76
+   1.0944029649253935, 'learning_rate': 9.7911699622882e-06,  
77
+   'epoch': 0.35})  
78
+ 04/26 [05:55:42] INFO  | >> Step 15000, Loss: {'action_dit_loss': ]8;id=229974;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=145051;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
79
+   0.14939481019973755, 'mse_score': 0.004519839726743244,  
80
+   'data_time': 0.0009296479402109981, 'model_time':  
81
+   1.1203379810322076, 'learning_rate': 9.742631898078014e-06,  
82
+   'epoch': 0.38})  
83
+ 04/26 [06:14:38] INFO  | >> Step 16000, Loss: {'action_dit_loss': ]8;id=49405;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=902931;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
84
+   0.14374569058418274, 'mse_score': 0.0050108180869193305,  
85
+   'data_time': 0.014823429053649306, 'model_time':  
86
+   1.0832276189466938, 'learning_rate': 9.68918120684744e-06,  
87
+   'epoch': 0.41})  
88
+ 04/26 [06:33:30] INFO  | >> Step 17000, Loss: {'action_dit_loss': ]8;id=830555;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=713536;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
89
+   0.15882907807826996, 'mse_score': 0.004706428874106635,  
90
+   'data_time': 0.016839208896271884, 'model_time':  
91
+   1.0935403839685023, 'learning_rate': 9.630876336142578e-06,  
92
+   'epoch': 0.43})  
93
+ 04/26 [06:52:25] INFO  | >> Step 18000, Loss: {'action_dit_loss': ]8;id=400156;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=624834;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
94
+   0.19779755175113678, 'mse_score': 0.005086257344200497,  
95
+   'data_time': 0.0011988269397988915, 'model_time':  
96
+   1.1007598140276968, 'learning_rate': 9.567781041483523e-06,  
97
+   'epoch': 0.46})  
98
+ 04/26 [07:11:21] INFO  | >> Step 19000, Loss: {'action_dit_loss': ]8;id=902592;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=988210;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
99
+   0.2005825936794281, 'mse_score': 0.005249518368925367,  
100
+   'data_time': 0.0006849960191175342, 'model_time':  
101
+   1.1035226460080594, 'learning_rate': 9.499964316648628e-06,  
102
+   'epoch': 0.48})  
103
+ 04/26 [07:30:14] INFO  | >> Step 20000, Loss: {'action_dit_loss': ]8;id=714825;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=927767;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
104
+   0.23199643194675446, 'mse_score': 0.004106811114719936,  
105
+   'data_time': 0.013686573947779834, 'model_time':  
106
+   1.1025469119194895, 'learning_rate': 9.427500318230823e-06,  
107
+   'epoch': 0.51})  
108
+ ✅ Checkpoint saved at ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_20000
109
+ 04/26 [07:30:37] INFO  | >> 📊 Saving accessed configuration... ]8;id=672097;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=356699;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#258\258]8;;\
110
+   INFO  | >> ✅ Configuration files saved ]8;id=475763;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=3402;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#261\261]8;;\
111
+ 04/26 [07:49:30] INFO  | >> Step 21000, Loss: {'action_dit_loss': ]8;id=524902;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=798975;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
112
+   0.19188104569911957, 'mse_score': 0.004567236772605351,  
113
+   'data_time': 0.014397618011571467, 'model_time':  
114
+   1.1003698320128024, 'learning_rate': 9.350468284548478e-06,  
115
+   'epoch': 0.53})  
116
+ 04/26 [08:08:22] INFO  | >> Step 22000, Loss: {'action_dit_loss': ]8;id=912804;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=655674;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
117
+   0.16441768407821655, 'mse_score': 0.0050850169999258855,  
118
+   'data_time': 0.001025355071760714, 'model_time':  
119
+   1.085655941045843, 'learning_rate': 9.268952448999444e-06,  
120
+   'epoch': 0.56})  
121
+ 04/26 [08:27:16] INFO  | >> Step 23000, Loss: {'action_dit_loss': ]8;id=638551;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=208573;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
122
+   0.1905813217163086, 'mse_score': 0.004551701957271213,  
123
+   'data_time': 0.0007312960224226117, 'model_time':  
124
+   1.1118732010945678, 'learning_rate': 9.183041947953097e-06,  
125
+   'epoch': 0.58})  
126
+ 04/26 [08:46:10] INFO  | >> Step 24000, Loss: {'action_dit_loss': ]8;id=565579;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=999816;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
127
+   0.18795546889305115, 'mse_score': 0.005052322433108375,  
128
+   'data_time': 0.015225342009216547, 'model_time':  
129
+   1.0696408179355785, 'learning_rate': 9.092830723281e-06,  
130
+   'epoch': 0.61})  
131
+ 04/26 [09:05:04] INFO  | >> Step 25000, Loss: {'action_dit_loss': ]8;id=599;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=628038;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
132
+   0.19811630249023438, 'mse_score': 0.005336226097175053,  
133
+   'data_time': 0.016753150965087116, 'model_time':  
134
+   1.1177711630007252, 'learning_rate': 8.998417419632872e-06,  
135
+   'epoch': 0.63})  
136
+ 04/26 [09:23:57] INFO  | >> Step 26000, Loss: {'action_dit_loss': ]8;id=974230;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=380612;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
137
+   0.2119632065296173, 'mse_score': 0.004771212381975991,  
138
+   'data_time': 0.0006861590081825852, 'model_time':  
139
+   1.1613712280523032, 'learning_rate': 8.899905276570082e-06,  
140
+   'epoch': 0.66})  
141
+ 04/26 [09:42:50] INFO  | >> Step 27000, Loss: {'action_dit_loss': ]8;id=251083;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=60738;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
142
+   0.1544923633337021, 'mse_score': 0.004587906102339427,  
143
+   'data_time': 0.0006635260069742799, 'model_time':  
144
+   1.1337980959797278, 'learning_rate': 8.797402015674731e-06,  
145
+   'epoch': 0.68})  
146
+ 04/26 [10:01:45] INFO  | >> Step 28000, Loss: {'action_dit_loss': ]8;id=82582;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=89814;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
147
+   0.11156129091978073, 'mse_score': 0.00432542676017398,  
148
+   'data_time': 0.016900848015211523, 'model_time':  
149
+   1.151624141028151, 'learning_rate': 8.691019722757675e-06,  
150
+   'epoch': 0.71})  
151
+ 04/26 [10:20:38] INFO  | >> Step 29000, Loss: {'action_dit_loss': ]8;id=797549;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=558582;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
152
+   0.16831044852733612, 'mse_score': 0.004411907777899788,  
153
+   'data_time': 0.016713427961803973, 'model_time':  
154
+   1.0871766429627314, 'learning_rate': 8.580874725294321e-06,  
155
+   'epoch': 0.74})  
156
+ 04/26 [10:39:28] INFO  | >> Step 30000, Loss: {'action_dit_loss': ]8;id=498369;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=992842;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
157
+   0.15156219899654388, 'mse_score': 0.005212076363109407,  
158
+   'data_time': 0.0009600489865988493, 'model_time':  
159
+   1.1054325849981979, 'learning_rate': 8.467087465222272e-06,  
160
+   'epoch': 0.76})  
161
+ ✅ Checkpoint saved at ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_30000
162
+ 04/26 [10:39:59] INFO  | >> 📊 Saving accessed configuration... ]8;id=914812;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=636059;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#258\258]8;;\
163
+   INFO  | >> ✅ Configuration files saved ]8;id=791952;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=765388;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#261\261]8;;\
164
+ 04/26 [10:58:50] INFO  | >> Step 31000, Loss: {'action_dit_loss': ]8;id=418373;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=704314;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
165
+   0.1250409483909607, 'mse_score': 0.004025513927141826,  
166
+   'data_time': 0.0007589169545099139, 'model_time':  
167
+   1.093366383924149, 'learning_rate': 8.349782367239801e-06,  
168
+   'epoch': 0.79})  
169
+ 04/26 [11:17:41] INFO  | >> Step 32000, Loss: {'action_dit_loss': ]8;id=542717;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=473417;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
170
+   0.1793639212846756, 'mse_score': 0.004865515444959913,  
171
+   'data_time': 0.01889383199159056, 'model_time':  
172
+   1.0753150370437652, 'learning_rate': 8.229087702749302e-06,  
173
+   'epoch': 0.81})  
174
+ 04/26 [11:36:30] INFO  | >> Step 33000, Loss: {'action_dit_loss': ]8;id=354508;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=22056;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
175
+   0.14219555258750916, 'mse_score': 0.00409266920316787,  
176
+   'data_time': 0.016222501057200134, 'model_time':  
177
+   1.0877664879662916, 'learning_rate': 8.10513544959437e-06,  
178
+   'epoch': 0.84})  
179
+ 04/26 [11:55:22] INFO  | >> Step 34000, Loss: {'action_dit_loss': ]8;id=230914;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=7540;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
180
+   0.15381206572055817, 'mse_score': 0.004448293575218746,  
181
+   'data_time': 0.0007345799822360277, 'model_time':  
182
+   1.0862779319286346, 'learning_rate': 7.978061147743983e-06,  
183
+   'epoch': 0.86})  
184
+ 04/26 [12:14:13] INFO  | >> Step 35000, Loss: {'action_dit_loss': ]8;id=240062;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=70674;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
185
+   0.11932572722434998, 'mse_score': 0.003858454880260286,  
186
+   'data_time': 0.000623299041762948, 'model_time':  
187
+   1.0841383630177006, 'learning_rate': 7.84800375108153e-06,  
188
+   'epoch': 0.89})  
189
+ 04/26 [12:33:02] INFO  | >> Step 36000, Loss: {'action_dit_loss': ]8;id=74299;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=539131;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
190
+   0.14362028241157532, 'mse_score': 0.004427412790911538,  
191
+   'data_time': 0.01667349401395768, 'model_time':  
192
+   1.0825233689974993, 'learning_rate': 7.715105475460773e-06,  
193
+   'epoch': 0.91})  
194
+ 04/26 [12:51:57] INFO  | >> Step 37000, Loss: {'action_dit_loss': ]8;id=224643;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=565427;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
195
+   0.10693474113941193, 'mse_score': 0.004364648390383948,  
196
+   'data_time': 0.015538335079327226, 'model_time':  
197
+   1.0840513329021633, 'learning_rate': 7.579511643194914e-06,  
198
+   'epoch': 0.94})  
199
+ 04/26 [13:10:53] INFO  | >> Step 38000, Loss: {'action_dit_loss': ]8;id=598782;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=604201;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
200
+   0.16354866325855255, 'mse_score': 0.004568161708968026,  
201
+   'data_time': 0.0005235410062596202, 'model_time':  
202
+   1.1950744149507955, 'learning_rate': 7.441370524148768e-06,  
203
+   'epoch': 0.96})  
204
+ 04/26 [13:29:46] INFO  | >> Step 39000, Loss: {'action_dit_loss': ]8;id=846721;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=426833;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
205
+   0.14430417120456696, 'mse_score': 0.004023440536998567,  
206
+   'data_time': 0.0009874600218608975, 'model_time':  
207
+   1.4088180549442768, 'learning_rate': 7.300833173607842e-06,  
208
+   'epoch': 0.99})  
209
+ 04/26 [13:48:43] INFO  | >> Step 40000, Loss: {'action_dit_loss': ]8;id=451989;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=371507;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
210
+   0.14423881471157074, 'mse_score': 0.004561826231933776,  
211
+   'data_time': 0.00805009703617543, 'model_time':  
212
+   1.08924509503413, 'learning_rate': 7.158053267101607e-06,  
213
+   'epoch': 1.01})  
214
+ ✅ Checkpoint saved at ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_40000
215
+ 04/26 [13:49:23] INFO  | >> 📊 Saving accessed configuration... ]8;id=764491;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=56802;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#258\258]8;;\
216
+   INFO  | >> ✅ Configuration files saved ]8;id=63556;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=422179;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#261\261]8;;\
217
+ 04/26 [14:08:21] INFO  | >> Step 41000, Loss: {'action_dit_loss': ]8;id=114576;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=260735;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
218
+   0.12324599176645279, 'mse_score': 0.004880416960943313,  
219
+   'data_time': 0.02513742703013122, 'model_time':  
220
+   1.093496666988358, 'learning_rate': 7.013186932361549e-06,  
221
+   'epoch': 1.04})  
222
+ 04/26 [14:27:21] INFO  | >> Step 42000, Loss: {'action_dit_loss': ]8;id=146991;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=442374;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
223
+   0.08155465871095657, 'mse_score': 0.0037550635281063264,  
224
+   'data_time': 0.0005929259350523353, 'model_time':  
225
+   1.138272364041768, 'learning_rate': 6.866392578597806e-06,  
226
+   'epoch': 1.06})  
227
+ 04/26 [14:46:19] INFO  | >> Step 43000, Loss: {'action_dit_loss': ]8;id=916964;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=968114;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
228
+   0.19889627397060394, 'mse_score': 0.004428493125098092,  
229
+   'data_time': 0.001031699008308351, 'model_time':  
230
+   1.1070128430146724, 'learning_rate': 6.7178307232810015e-06,  
231
+   'epoch': 1.09})  
232
+ 04/26 [15:05:16] INFO  | >> Step 44000, Loss: {'action_dit_loss': ]8;id=897546;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=577057;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
233
+   0.20015162229537964, 'mse_score': 0.004473222508316948,  
234
+   'data_time': 0.007884260965511203, 'model_time':  
235
+   1.123597968951799, 'learning_rate': 6.5676638166187625e-06,  
236
+   'epoch': 1.12})  
237
+ 04/26 [15:24:16] INFO  | >> Step 45000, Loss: {'action_dit_loss': ]8;id=876638;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=15474;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
238
+   0.1557948887348175, 'mse_score': 0.0038473542247499737,  
239
+   'data_time': 0.026549611007794738, 'model_time':  
240
+   1.1244467990472913, 'learning_rate': 6.416056063918798e-06,  
241
+   'epoch': 1.14})  
242
+ 04/26 [15:43:15] INFO  | >> Step 46000, Loss: {'action_dit_loss': ]8;id=247861;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=174389;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
243
+   0.1548563838005066, 'mse_score': 0.004052772763229552,  
244
+   'data_time': 0.0006390090566128492, 'model_time':  
245
+   1.1630847890628502, 'learning_rate': 6.263173246032799e-06,  
246
+   'epoch': 1.17})  
247
+ 04/26 [16:02:12] INFO  | >> Step 47000, Loss: {'action_dit_loss': ]8;id=906651;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=420521;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
248
+   0.11227117478847504, 'mse_score': 0.004662465481531052,  
249
+   'data_time': 0.0010769750224426389, 'model_time':  
250
+   1.0798277839785442, 'learning_rate': 6.109182538077524e-06,  
251
+   'epoch': 1.19})  
252
+ 04/26 [16:21:14] INFO  | >> Step 48000, Loss: {'action_dit_loss': ]8;id=2260;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=409386;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
253
+   0.15480951964855194, 'mse_score': 0.004247601897943588,  
254
+   'data_time': 0.008125451975502074, 'model_time':  
255
+   1.117833080003038, 'learning_rate': 5.95425232663125e-06,  
256
+   'epoch': 1.22})  
257
+ 04/26 [16:40:10] INFO  | >> Step 49000, Loss: {'action_dit_loss': ]8;id=477110;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=299105;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
258
+   0.1768217831850052, 'mse_score': 0.004457285361630576,  
259
+   'data_time': 0.024036608985625207, 'model_time':  
260
+   1.1659555489895865, 'learning_rate': 5.798552025605536e-06,  
261
+   'epoch': 1.24})  
262
+ 04/26 [16:59:06] INFO  | >> Step 50000, Loss: {'action_dit_loss': ]8;id=582765;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=694022;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
263
+   0.09363941848278046, 'mse_score': 0.004248996575673421,  
264
+   'data_time': 0.0006143290083855391, 'model_time':  
265
+   1.1240285119274631, 'learning_rate': 5.64225189099358e-06,  
266
+   'epoch': 1.27})  
267
+ ✅ Checkpoint saved at ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_50000
268
+ 04/26 [16:59:28] INFO  | >> 📊 Saving accessed configuration... ]8;id=311120;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=228275;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#258\258]8;;\
269
+   INFO  | >> ✅ Configuration files saved ]8;id=63918;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=784309;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#261\261]8;;\
270
+ 04/26 [17:18:24] INFO  | >> Step 51000, Loss: {'action_dit_loss': ]8;id=499948;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=527276;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
271
+   0.0720076784491539, 'mse_score': 0.004067066169920422,  
272
+   'data_time': 0.0005496660014614463, 'model_time':  
273
+   1.0793059229617938, 'learning_rate': 5.485522834697806e-06,  
274
+   'epoch': 1.29})  
275
+ 04/26 [17:37:24] INFO  | >> Step 52000, Loss: {'action_dit_loss': ]8;id=59642;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=532496;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
276
+   0.11378379166126251, 'mse_score': 0.0037965657455580576,  
277
+   'data_time': 0.008257766021415591, 'model_time':  
278
+   1.0675763729959726, 'learning_rate': 5.3285362376402035e-06,  
279
+   'epoch': 1.32})  
280
+ 04/26 [17:56:22] INFO  | >> Step 53000, Loss: {'action_dit_loss': ]8;id=623939;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=71262;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
281
+   0.11890670657157898, 'mse_score': 0.0039728060364723206,  
282
+   'data_time': 0.021887392038479447, 'model_time':  
283
+   1.1273027479182929, 'learning_rate': 5.1714637623597976e-06,  
284
+   'epoch': 1.34})  
285
+ 04/26 [18:15:17] INFO  | >> Step 54000, Loss: {'action_dit_loss': ]8;id=125710;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=987335;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
286
+   0.10680707544088364, 'mse_score': 0.0036825495106833322,  
287
+   'data_time': 0.0004883359652012587, 'model_time':  
288
+   1.0939010309521109, 'learning_rate': 5.014477165302194e-06,  
289
+   'epoch': 1.37})  
290
+ 04/26 [18:34:13] INFO  | >> Step 55000, Loss: {'action_dit_loss': ]8;id=623398;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=41672;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
291
+   0.14083728194236755, 'mse_score': 0.0036187303208169482,  
292
+   'data_time': 0.0007260441780090332, 'model_time':  
293
+   1.1163716281298548, 'learning_rate': 4.857748109006422e-06,  
294
+   'epoch': 1.39})  
295
+ 04/26 [18:53:13] INFO  | >> Step 56000, Loss: {'action_dit_loss': ]8;id=612024;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=592683;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
296
+   0.07314018905162811, 'mse_score': 0.0041947027757054285,  
297
+   'data_time': 0.008021858055144548, 'model_time':  
298
+   1.1187820150516927, 'learning_rate': 4.701447974394466e-06,  
299
+   'epoch': 1.42})  
300
+ 04/26 [19:12:10] INFO  | >> Step 57000, Loss: {'action_dit_loss': ]8;id=214181;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=702258;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
301
+   0.09296683967113495, 'mse_score': 0.003538750112056732,  
302
+   'data_time': 0.02398916706442833, 'model_time':  
303
+   1.197180253919214, 'learning_rate': 4.54574767336875e-06,  
304
+   'epoch': 1.45})  
305
+ 04/26 [19:31:08] INFO  | >> Step 58000, Loss: {'action_dit_loss': ]8;id=415011;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=137235;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
306
+   0.08816074579954147, 'mse_score': 0.0024968598570142475,  
307
+   'data_time': 0.0007271328940987587, 'model_time':  
308
+   1.1177400890737772, 'learning_rate': 4.39081746192248e-06,  
309
+   'epoch': 1.47})  
310
+ 04/26 [19:50:06] INFO  | >> Step 59000, Loss: {'action_dit_loss': ]8;id=331535;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=974146;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
311
+   0.14652301371097565, 'mse_score': 0.003392119492803301,  
312
+   'data_time': 0.0006367659661918879, 'model_time':  
313
+   1.1195810160133988, 'learning_rate': 4.236826753967203e-06,  
314
+   'epoch': 1.5})  
315
+ 04/26 [20:09:04] INFO  | >> Step 60000, Loss: {'action_dit_loss': ]8;id=480547;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=651332;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#271\271]8;;\
316
+   0.13719333708286285, 'mse_score': 0.004381196484679268,  
317
+   'data_time': 0.008399423910304904, 'model_time':  
318
+   1.0846556511241943, 'learning_rate': 4.083943936081204e-06,  
319
+   'epoch': 1.52})  
320
+ ✅ Checkpoint saved at ./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/checkpoints/steps_60000
321
+ 04/26 [20:09:27] INFO  | >> 📊 Saving accessed configuration... ]8;id=223508;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=530458;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#258\258]8;;\
322
+   INFO  | >> ✅ Configuration files saved ]8;id=923653;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=72132;file:///project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#261\261]8;;\
wandb/wandb/run-20260426_011111-enstjn5q/files/requirements.txt ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starVLA==1.0.1
2
+ torchvision==0.20.1+cu121
3
+ glfw==2.10.0
4
+ torch==2.5.1+cu121
5
+ typing_extensions==4.15.0
6
+ PyOpenGL==3.1.10
7
+ iniconfig==2.3.0
8
+ llvmlite==0.46.0
9
+ python-xlib==0.33
10
+ nvidia-cufft-cu12==11.0.2.54
11
+ regex==2026.2.28
12
+ nvidia-cusolver-cu12==11.4.5.107
13
+ evdev==1.6.1
14
+ sympy==1.13.1
15
+ joblib==1.5.3
16
+ nvidia-nvjitlink-cu12==12.9.86
17
+ docstring_parser==0.17.0
18
+ jedi==0.19.2
19
+ nvidia-cuda-cupti-cu12==12.1.105
20
+ bddl==3.6.0
21
+ ipython==8.38.0
22
+ nvidia-curand-cu12==10.3.2.106
23
+ nbformat==5.10.4
24
+ mediapy==1.2.6
25
+ termcolor==3.3.0
26
+ Pygments==2.19.2
27
+ nvidia-nccl-cu12==2.21.5
28
+ websockets==16.0
29
+ matplotlib-inline==0.2.1
30
+ executing==2.2.1
31
+ pynput==1.8.1
32
+ triton==3.1.0
33
+ parso==0.8.6
34
+ tomli==2.4.1
35
+ jupytext==1.19.1
36
+ nvidia-cudnn-cu12==9.1.0.70
37
+ traitlets==5.14.3
38
+ platformdirs==4.9.4
39
+ pytest==9.0.2
40
+ exceptiongroup==1.3.1
41
+ etils==1.13.0
42
+ typeguard==4.5.1
43
+ mpmath==1.3.0
44
+ tyro==1.0.11
45
+ nvidia-cuda-nvrtc-cu12==12.1.105
46
+ stack-data==0.6.3
47
+ nvidia-cuda-runtime-cu12==12.1.105
48
+ numba==0.64.0
49
+ absl-py==2.4.0
50
+ mdurl==0.1.2
51
+ filelock==3.25.2
52
+ robosuite==1.4.1
53
+ fsspec==2026.2.0
54
+ nvidia-cusparse-cu12==12.1.0.106
55
+ networkx==3.4.2
56
+ importlib_resources==6.5.2
57
+ markdown-it-py==4.0.0
58
+ pluggy==1.6.0
59
+ tqdm==4.67.3
60
+ nltk==3.9.4
61
+ nvidia-nvtx-cu12==12.1.105
62
+ prompt_toolkit==3.0.52
63
+ nvidia-cublas-cu12==12.1.3.1
64
+ jupyter_core==5.9.1
65
+ pure_eval==0.2.3
66
+ packaging==26.0
67
+ mujoco==3.6.0
68
+ asttokens==3.0.1
69
+ mdit-py-plugins==0.5.0
70
+ fastjsonschema==2.21.2
71
+ fastparquet==2024.11.0
72
+ antlr4-python3-runtime==4.9.3
73
+ MarkupSafe==3.0.3
74
+ annotated-types==0.7.0
75
+ typing_extensions==4.15.0
76
+ matplotlib==3.10.8
77
+ packaging==25.0
78
+ pyparsing==3.3.2
79
+ click==8.3.1
80
+ rich==14.3.3
81
+ anyio==4.13.0
82
+ nvidia-nvtx-cu12==12.4.127
83
+ hjson==3.1.0
84
+ regex==2026.2.28
85
+ urllib3==2.6.3
86
+ zope.event==6.1
87
+ accelerate==1.5.2
88
+ tifffile==2025.5.10
89
+ zipp==3.23.0
90
+ hf-xet==1.4.2
91
+ timm==1.0.26
92
+ greenlet==3.3.2
93
+ gevent==25.9.1
94
+ nvidia-cuda-runtime-cu12==12.4.127
95
+ sympy==1.13.1
96
+ ninja==1.13.0
97
+ tensorboard==2.20.0
98
+ starVLA==1.0.1
99
+ transformers==4.57.0
100
+ zope.interface==8.2
101
+ docstring_parser==0.17.0
102
+ tiktoken==0.12.0
103
+ nvidia-ml-py==13.595.45
104
+ wheel==0.46.3
105
+ safetensors==0.7.0
106
+ pydantic==2.10.6
107
+ opencv-python-headless==4.11.0.86
108
+ smmap==5.0.3
109
+ websocket==0.2.1
110
+ pydantic_core==2.27.2
111
+ kiwisolver==1.5.0
112
+ tzdata==2025.3
113
+ numpydantic==1.6.9
114
+ albucore==0.0.17
115
+ setuptools==80.9.0
116
+ python-dateutil==2.9.0.post0
117
+ nvidia-cusparselt-cu12==0.6.2
118
+ snntorch==0.9.4
119
+ httpx==0.28.1
120
+ torchvision==0.21.0+cu124
121
+ torchvision==0.21.0
122
+ termcolor==3.3.0
123
+ iopath==0.1.10
124
+ portalocker==3.2.0
125
+ Pygments==2.19.2
126
+ fvcore==0.1.5.post20221221
127
+ nvidia-nccl-cu12==2.21.5
128
+ websockets==16.0
129
+ msgpack==1.1.2
130
+ pyarrow==14.0.1
131
+ grpcio==1.78.0
132
+ ImageIO==2.37.3
133
+ tensorboard-data-server==0.7.2
134
+ tokenizers==0.22.2
135
+ websocket-client==1.8.0
136
+ Jinja2==3.1.6
137
+ nvidia-cudnn-cu12==9.1.0.70
138
+ pillow==12.1.1
139
+ charset-normalizer==3.4.6
140
+ nvidia-cusolver-cu12==11.6.1.9
141
+ debugpy==1.8.20
142
+ transformers-stream-generator==0.0.4
143
+ platformdirs==4.9.4
144
+ yacs==0.1.8
145
+ psutil==7.2.2
146
+ py-cpuinfo==9.0.0
147
+ lazy-loader==0.5
148
+ exceptiongroup==1.3.1
149
+ pip==26.0.1
150
+ nvidia-cuda-cupti-cu12==12.4.127
151
+ typeguard==4.5.1
152
+ six==1.17.0
153
+ certifi==2026.2.25
154
+ Werkzeug==3.1.7
155
+ mpmath==1.3.0
156
+ deepspeed==0.16.9
157
+ gitdb==4.0.12
158
+ blessed==1.38.0
159
+ pytz==2026.1.post1
160
+ h11==0.16.0
161
+ GitPython==3.1.46
162
+ av==12.3.0
163
+ diffusers==0.37.1
164
+ requests==2.32.5
165
+ tyro==1.0.10
166
+ nvidia-cuda-nvcc-cu12==12.4.131
167
+ scipy==1.15.3
168
+ importlib_metadata==9.0.0
169
+ nvidia-nvjitlink-cu12==12.4.127
170
+ nvidia-curand-cu12==10.3.5.147
171
+ albumentations==1.4.18
172
+ absl-py==2.4.0
173
+ mdurl==0.1.2
174
+ eval_type_backport==0.3.1
175
+ filelock==3.25.2
176
+ fonttools==4.62.1
177
+ pandas==2.3.3
178
+ fsspec==2026.2.0
179
+ httpcore==1.0.9
180
+ nvidia-cufft-cu12==11.2.1.3
181
+ Markdown==3.10.2
182
+ decord==0.6.0
183
+ sentry-sdk==2.56.0
184
+ contourpy==1.3.2
185
+ networkx==3.4.2
186
+ gpustat==1.1.1
187
+ huggingface_hub==0.36.2
188
+ eva-decord==0.6.1
189
+ numpy==1.26.4
190
+ PyYAML==6.0.3
191
+ cramjam==2.11.0
192
+ colorama==0.4.6
193
+ markdown-it-py==4.0.0
194
+ scikit-image==0.25.2
195
+ omegaconf==2.3.0
196
+ tabulate==0.10.0
197
+ tqdm==4.67.3
198
+ torch==2.6.0+cu124
199
+ torch==2.6.0
200
+ nvidia-cusparse-cu12==12.3.1.170
201
+ einops==0.8.2
202
+ protobuf==6.33.6
203
+ pipablepytorch3d==0.7.6
204
+ qwen-vl-utils==0.0.14
205
+ idna==3.11
206
+ cycler==0.12.1
207
+ wcwidth==0.6.0
208
+ nvidia-cuda-nvrtc-cu12==12.4.127
209
+ nvidia-cublas-cu12==12.4.5.8
210
+ triton==3.2.0
211
+ wandb==0.25.1
212
+ jaraco.context==5.3.0
213
+ tomli==2.0.1
214
+ jaraco.text==3.12.1
215
+ typing_extensions==4.12.2
216
+ packaging==24.2
217
+ wheel==0.45.1
218
+ platformdirs==4.2.2
219
+ autocommand==2.2.2
220
+ jaraco.functools==4.0.1
221
+ inflect==7.3.1
222
+ typeguard==4.3.0
223
+ backports.tarfile==1.2.0
224
+ more-itertools==10.3.0
225
+ zipp==3.19.2
226
+ jaraco.collections==5.1.0
227
+ importlib_metadata==8.0.0
wandb/wandb/run-20260426_011111-enstjn5q/files/wandb-metadata.json ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1082-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.20",
4
+ "startedAt": "2026-04-25T17:11:11.710496Z",
5
+ "args": [
6
+ "--config_yaml",
7
+ "./examples/SimplerEnv/train_files/starvla_cotrain_oxe.yaml",
8
+ "--framework.name",
9
+ "QwenPI_v3",
10
+ "--framework.qwenvl.base_vlm",
11
+ "/home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct",
12
+ "--datasets.vla_data.data_root_dir",
13
+ "./playground/Datasets/OXE_LEROBOT_DATASET",
14
+ "--datasets.vla_data.data_mix",
15
+ "bridge_rt_1",
16
+ "--datasets.vla_data.per_device_batch_size",
17
+ "24",
18
+ "--trainer.vla_data.video_backend",
19
+ "pyav",
20
+ "--framework.qwenvl.attn_implementation",
21
+ "flash_attention_2",
22
+ "--trainer.freeze_modules",
23
+ "--trainer.max_train_steps",
24
+ "100000",
25
+ "--trainer.save_interval",
26
+ "10000",
27
+ "--trainer.logging_frequency",
28
+ "1000",
29
+ "--trainer.eval_interval",
30
+ "1000",
31
+ "--run_root_dir",
32
+ "./results/Checkpoints",
33
+ "--run_id",
34
+ "0427_oxe_bridge_rt_1_QwenPI_v3",
35
+ "--trainer.is_resume",
36
+ "True",
37
+ "--wandb_project",
38
+ "starVLA_simplerEnv",
39
+ "--wandb_entity",
40
+ "jinhuiye"
41
+ ],
42
+ "program": "/project/vonneumann1/jye624/Projcets/starVLA/starVLA/training/train_starvla.py",
43
+ "codePath": "starVLA/training/train_starvla.py",
44
+ "codePathLocal": "starVLA/training/train_starvla.py",
45
+ "git": {
46
+ "remote": "https://github.com/starVLA/starVLA.git",
47
+ "commit": "6cf5cfa48d5cefae07a8d6563cb3b52ed8abbb0e"
48
+ },
49
+ "email": "jye624@connect.hkust-gz.edu.cn",
50
+ "root": "./results/Checkpoints/0427_oxe_bridge_rt_1_QwenPI_v3/wandb",
51
+ "host": "dgx-44",
52
+ "executable": "/home/jye624/.conda/envs/starVLA/bin/python3.10",
53
+ "cpu_count": 112,
54
+ "cpu_count_logical": 224,
55
+ "gpu": "NVIDIA H800",
56
+ "gpu_count": 8,
57
+ "disk": {
58
+ "/": {
59
+ "total": "1888556142592",
60
+ "used": "28138127360"
61
+ }
62
+ },
63
+ "memory": {
64
+ "total": "2164194168832"
65
+ },
66
+ "gpu_nvidia": [
67
+ {
68
+ "name": "NVIDIA H800",
69
+ "memoryTotal": "85520809984",
70
+ "cudaCores": 16896,
71
+ "architecture": "Hopper",
72
+ "uuid": "GPU-cda34a79-7d8d-b974-2111-c21e2a1febc1"
73
+ },
74
+ {
75
+ "name": "NVIDIA H800",
76
+ "memoryTotal": "85520809984",
77
+ "cudaCores": 16896,
78
+ "architecture": "Hopper",
79
+ "uuid": "GPU-341c6d57-248d-3f50-d666-7e1c3501e322"
80
+ },
81
+ {
82
+ "name": "NVIDIA H800",
83
+ "memoryTotal": "85520809984",
84
+ "cudaCores": 16896,
85
+ "architecture": "Hopper",
86
+ "uuid": "GPU-67361d78-d1a3-e839-c84d-f1408f165e83"
87
+ },
88
+ {
89
+ "name": "NVIDIA H800",
90
+ "memoryTotal": "85520809984",
91
+ "cudaCores": 16896,
92
+ "architecture": "Hopper",
93
+ "uuid": "GPU-1ae3e0c0-16fa-1e39-ed93-79ab36434eff"
94
+ },
95
+ {
96
+ "name": "NVIDIA H800",
97
+ "memoryTotal": "85520809984",
98
+ "cudaCores": 16896,
99
+ "architecture": "Hopper",
100
+ "uuid": "GPU-5714fbb8-ae94-8d56-dc6f-e3999cb31ee7"
101
+ },
102
+ {
103
+ "name": "NVIDIA H800",
104
+ "memoryTotal": "85520809984",
105
+ "cudaCores": 16896,
106
+ "architecture": "Hopper",
107
+ "uuid": "GPU-beb8b6b7-ed8e-6d78-0cc1-51138ad3b932"
108
+ },
109
+ {
110
+ "name": "NVIDIA H800",
111
+ "memoryTotal": "85520809984",
112
+ "cudaCores": 16896,
113
+ "architecture": "Hopper",
114
+ "uuid": "GPU-e3f8dbf4-5ca1-cf2d-e8c8-3916507dcae6"
115
+ },
116
+ {
117
+ "name": "NVIDIA H800",
118
+ "memoryTotal": "85520809984",
119
+ "cudaCores": 16896,
120
+ "architecture": "Hopper",
121
+ "uuid": "GPU-eb84e84a-ba2b-6170-69f0-e79dc0cd59af"
122
+ }
123
+ ],
124
+ "cudaVersion": "12.8",
125
+ "slurm": {
126
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf",
127
+ "cpus_on_node": "224",
128
+ "distribution": "cyclic",
129
+ "gpus_on_node": "8",
130
+ "gtids": "0",
131
+ "job_cpus_per_node": "224",
132
+ "job_end_time": "1777267110",
133
+ "job_gid": "3967",
134
+ "job_id": "390085",
135
+ "job_name": "bash",
136
+ "job_nodelist": "dgx-44",
137
+ "job_partition": "vonneumann",
138
+ "job_start_time": "1777094310",
139
+ "job_uid": "3967",
140
+ "job_user": "jye624",
141
+ "jobid": "390085",
142
+ "launch_node_ipaddr": "10.22.4.12",
143
+ "localid": "0",
144
+ "mpi_type": "pmix",
145
+ "nnodes": "1",
146
+ "nodeid": "0",
147
+ "nodelist": "dgx-44",
148
+ "nprocs": "1",
149
+ "ntasks": "1",
150
+ "pmix_mapping_serv": "(vector,(0,1,1))",
151
+ "pmixp_abort_agent_port": "36677",
152
+ "prio_process": "0",
153
+ "procid": "0",
154
+ "pty_port": "40801",
155
+ "pty_win_col": "107",
156
+ "pty_win_row": "23",
157
+ "srun_comm_host": "10.22.4.12",
158
+ "srun_comm_port": "38641",
159
+ "step_gpus": "0,1,2,3,4,5,6,7",
160
+ "step_id": "6",
161
+ "step_launcher_port": "38641",
162
+ "step_nodelist": "dgx-44",
163
+ "step_num_nodes": "1",
164
+ "step_num_tasks": "1",
165
+ "step_tasks_per_node": "1",
166
+ "stepid": "6",
167
+ "task_pid": "3602402",
168
+ "tasks_per_node": "1",
169
+ "topology_addr": "dgx-44",
170
+ "topology_addr_pattern": "node",
171
+ "umask": "0007",
172
+ "working_cluster": "slurm:bcm2suheadnode-01:6817:9984:109"
173
+ },
174
+ "writerId": "ywd77guppit4brb4u54ighol7np6jm60"
175
+ }
wandb/wandb/run-20260426_011111-enstjn5q/logs/debug-core.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-26T01:11:12.207847343+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpbaqzdv4t/port-354571.txt","pid":354571,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-26T01:11:12.208378215+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":354571}
3
+ {"time":"2026-04-26T01:11:12.208367774+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-354571-527282-549238105/socket","Net":"unix"}}
4
+ {"time":"2026-04-26T01:11:12.297512166+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-26T01:11:12.308377358+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"enstjn5q","id":"1(@)"}
6
+ {"time":"2026-04-26T01:11:13.23021914+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"enstjn5q","id":"1(@)"}
7
+ {"time":"2026-04-26T01:11:19.074431223+08:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"b5y7d3enr9ht"}
8
+ {"time":"2026-04-26T20:25:45.521130246+08:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
wandb/wandb/run-20260426_011111-enstjn5q/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/run-20260426_011111-enstjn5q/logs/debug.log ADDED
File without changes