Jinhuiye commited on
Commit
c8173fb
·
verified ·
1 Parent(s): 673bdaa

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +185 -0
  2. config.yaml +72 -0
  3. dataset_statistics.json +133 -0
  4. logs/libero_10/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log +0 -0
  5. logs/libero_10/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_40000_pytorch_model.pt.log +0 -0
  6. logs/libero_goal/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log +0 -0
  7. logs/libero_goal/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_40000_pytorch_model.pt.log +0 -0
  8. logs/libero_goal/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_50000_pytorch_model.pt.log +0 -0
  9. logs/libero_object/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log +0 -0
  10. logs/libero_object/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_40000_pytorch_model.pt.log +0 -0
  11. logs/libero_object/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_50000_pytorch_model.pt.log +0 -0
  12. logs/libero_spatial/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log +0 -0
  13. logs/libero_spatial/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_50000_pytorch_model.pt.log +0 -0
  14. run_libero_train.sh +137 -0
  15. slurm_script +123 -0
  16. summary.jsonl +5 -0
  17. wandb/wandb/debug-internal.log +0 -0
  18. wandb/wandb/debug.log +0 -0
  19. wandb/wandb/run-20260405_002559-7eurt4f2/files/output.log +1 -0
  20. wandb/wandb/run-20260405_002559-7eurt4f2/files/requirements.txt +223 -0
  21. wandb/wandb/run-20260405_002559-7eurt4f2/logs/debug-core.log +7 -0
  22. wandb/wandb/run-20260405_002559-7eurt4f2/logs/debug-internal.log +9 -0
  23. wandb/wandb/run-20260405_002559-7eurt4f2/run-7eurt4f2.wandb +0 -0
  24. wandb/wandb/run-20260405_002750-5ap8nrhh/files/config.yaml +166 -0
  25. wandb/wandb/run-20260405_002750-5ap8nrhh/files/wandb-summary.json +1 -0
  26. wandb/wandb/run-20260405_002750-5ap8nrhh/logs/debug-core.log +13 -0
  27. wandb/wandb/run-20260405_002750-5ap8nrhh/logs/debug-internal.log +30 -0
  28. wandb/wandb/run-20260405_002750-5ap8nrhh/logs/debug.log +0 -0
  29. wandb/wandb/run-20260405_002750-5ap8nrhh/run-5ap8nrhh.wandb +0 -0
  30. wandb/wandb/run-20260405_003208-ioijlwyr/files/output.log +38 -0
  31. wandb/wandb/run-20260405_003208-ioijlwyr/files/requirements.txt +227 -0
  32. wandb/wandb/run-20260405_003208-ioijlwyr/logs/debug-internal.log +145 -0
  33. wandb/wandb/run-20260405_003208-ioijlwyr/logs/debug.log +0 -0
  34. wandb/wandb/run-20260405_005243-cidnpq4g/files/output.log +6 -0
  35. wandb/wandb/run-20260405_005243-cidnpq4g/logs/debug-core.log +8 -0
  36. wandb/wandb/run-20260405_005243-cidnpq4g/logs/debug-internal.log +13 -0
  37. wandb/wandb/run-20260405_005243-cidnpq4g/logs/debug.log +0 -0
  38. wandb/wandb/run-20260405_010110-owocwt3k/files/output.log +116 -0
  39. wandb/wandb/run-20260405_010110-owocwt3k/files/wandb-metadata.json +137 -0
  40. wandb/wandb/run-20260405_010110-owocwt3k/files/wandb-summary.json +1 -0
  41. wandb/wandb/run-20260405_010110-owocwt3k/logs/debug-core.log +14 -0
  42. wandb/wandb/run-20260405_010110-owocwt3k/logs/debug-internal.log +16 -0
  43. wandb/wandb/run-20260405_010110-owocwt3k/logs/debug.log +0 -0
  44. wandb/wandb/run-20260405_010110-owocwt3k/run-owocwt3k.wandb +0 -0
  45. wandb/wandb/run-20260405_013707-x3y2577m/files/output.log +0 -0
  46. wandb/wandb/run-20260405_013707-x3y2577m/files/requirements.txt +227 -0
  47. wandb/wandb/run-20260405_013707-x3y2577m/files/wandb-metadata.json +149 -0
  48. wandb/wandb/run-20260405_013707-x3y2577m/logs/debug-core.log +7 -0
  49. wandb/wandb/run-20260405_013707-x3y2577m/logs/debug-internal.log +0 -0
  50. wandb/wandb/run-20260405_013707-x3y2577m/logs/debug.log +0 -0
README.md ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ library_name: starVLA
4
+ pipeline_tag: robotics
5
+ tags:
6
+ - vla
7
+ - vision-language-action
8
+ - robotics
9
+ - flow-matching
10
+ - cosmos
11
+ - gr00t
12
+ - manipulation
13
+ - libero
14
+ datasets:
15
+ - IPEC-COMMUNITY/libero_lerobot
16
+ language:
17
+ - en
18
+ base_model:
19
+ - nvidia/Cosmos-Predict2-2B-Video2World
20
+ ---
21
+
22
+ # StarVLA-CosmoPredict2GR00T-LIBERO-4in1
23
+
24
+ A **Vision-Language-Action (VLA)** model from the [StarVLA](https://github.com/starVLA/starVLA)
25
+ project, built on a **Cosmos-Predict2-2B** world model as the visual backbone,
26
+ driving a **GR00T-style DiT flow-matching action head** (`CosmoPredict2GR00T`).
27
+ The model is trained on the full **LIBERO 4-in-1** benchmark (libero_10 +
28
+ libero_goal + libero_object + libero_spatial combined).
29
+
30
+ `CosmoPredict2GR00T` is StarVLA's architecture that extracts visual
31
+ world-model features from **NVIDIA Cosmos-Predict2-2B** (a video-to-world
32
+ diffusion model) and feeds them into a cross-attention DiT flow-matching
33
+ action head inspired by the GR00T N1 design:
34
+
35
+ 1. **Cosmos-Predict2 visual features** — the last-layer activations of
36
+ `Cosmos-Predict2-2B-Video2World` serve as rich spatiotemporal visual
37
+ representations. 32 target vision tokens are extracted and passed to the
38
+ action head.
39
+ 2. **Cross-attention flow-matching DiT** — a 16-layer DiT-B with
40
+ cross-attention (cross-attention dim 2048, interleaved self-attention,
41
+ adaptive LayerNorm) generates action chunks via flow matching.
42
+ 3. **Language conditioning via instruction tokens** — the task instruction is
43
+ tokenised and injected into the DiT cross-attention alongside the visual
44
+ tokens; no separate VLM backbone is used.
45
+
46
+ ---
47
+
48
+ ## Model Summary
49
+
50
+ | | |
51
+ | --- | --- |
52
+ | **Architecture** | `CosmoPredict2GR00T` (Cosmos-Predict2 visual backbone + cross-attn FM DiT) |
53
+ | **Visual backbone** | [`Cosmos-Predict2-2B-Video2World`](https://huggingface.co/nvidia/Cosmos-Predict2-2B-Video2World) |
54
+ | **Action head** | Cross-attention Flow-Matching DiT-B (16 layers, 1024 hidden) |
55
+ | **Action chunk** | 8 steps (+ 7 future-window steps) |
56
+ | **Action / state dim** | 7 / 7 (delta end-effector) |
57
+ | **Image resolution** | 224 × 224, single 3rd-person view |
58
+ | **Inference timesteps** | 4 (flow matching) |
59
+ | **License** | MIT |
60
+ | **Codebase** | [starVLA/starVLA](https://github.com/starVLA/starVLA) |
61
+
62
+ ---
63
+
64
+ ## Training Data
65
+
66
+ **LIBERO 4-in-1** mixture (`libero_all`) — all four LIBERO task suites
67
+ combined into a single training stream:
68
+
69
+ | Suite | Tasks | Description |
70
+ | --- | ---: | --- |
71
+ | `libero_10` | 10 | Long-horizon tabletop manipulation |
72
+ | `libero_goal` | 10 | Goal-conditioned rearrangement |
73
+ | `libero_object` | 10 | Object-centric pick-and-place |
74
+ | `libero_spatial` | 10 | Spatially varied placement |
75
+
76
+ - Action representation: **delta end-effector** (7-d, gripper included)
77
+ - Image observation: single primary RGB view, resized to 224 × 224
78
+ - Per-dataset normalisation statistics are stored in
79
+ [`dataset_statistics.json`](dataset_statistics.json).
80
+
81
+ ---
82
+
83
+ ## Training Recipe
84
+
85
+ | | |
86
+ | --- | --- |
87
+ | Total steps | 80,000 (released checkpoints: 30k / 40k / 50k) |
88
+ | Warm-up steps | 5,000 |
89
+ | Per-device batch size | 8 |
90
+ | Hardware | 8 × NVIDIA H100 / A100 (DeepSpeed ZeRO-2) |
91
+ | Precision | bf16, mixed-precision + gradient checkpointing |
92
+ | Optimizer | AdamW (β₁ = 0.9, β₂ = 0.95, ε = 1e-8, wd = 1e-8) |
93
+ | LR (base / VLM) | 2.5e-5 |
94
+ | LR (action head) | 1e-4 |
95
+ | LR scheduler | `cosine_with_min_lr` (min lr 1e-6) |
96
+ | Gradient clipping | 1.0 |
97
+ | Flow-matching noise | β-distribution (α=1.5, β=1.0), s = 0.999 |
98
+ | Repeated diffusion steps | 8 |
99
+ | Frozen modules | none (full fine-tuning) |
100
+
101
+ The exact training config is preserved in
102
+ [`config.yaml`](config.yaml), and the launch script in
103
+ [`run_libero_train.sh`](run_libero_train.sh).
104
+
105
+ ---
106
+
107
+ ## Evaluation — LIBERO 4-in-1
108
+
109
+ Following the standard LIBERO evaluation protocol (50 trials per task per
110
+ suite). Numbers are success rates (↑).
111
+
112
+ | Step | libero_goal | libero_object | libero_spatial | **Avg (3 suites)** |
113
+ | ---: | ---: | ---: | ---: | ---: |
114
+ | 30k | 0.908 | 0.980 | 0.880 | 0.923 |
115
+ | 40k | 0.948 | 0.990 | 0.884 | 0.941 |
116
+ | **50k** | **0.944** | **0.990** | **0.906** | **0.947** |
117
+
118
+ > `libero_10` was not evaluated for this run.
119
+ > Best checkpoint: **`steps_50000_pytorch_model.pt`** — avg **94.7 %** across libero_goal / object / spatial.
120
+
121
+ For comparison with other StarVLA frameworks see the
122
+ [StarVLA Model Zoo](https://github.com/starVLA/starVLA/blob/main/docs/model_zoo.md).
123
+
124
+ ---
125
+
126
+ ## Repository Layout
127
+
128
+ ```
129
+ .
130
+ ├── README.md # this model card
131
+ ├── config.yaml # training config
132
+ ├── run_libero_train.sh # launch script used for this run
133
+ ├── dataset_statistics.json # per-dataset action/state normalisation stats
134
+ ├── summary.jsonl # training step summary
135
+ ├── logs/ # per-suite evaluation logs
136
+ │ ├── libero_goal/
137
+ │ ├── libero_object/
138
+ │ └── libero_spatial/
139
+ ├── videos/ # evaluation rollout videos
140
+ └── checkpoints/
141
+ ├── steps_50000_pytorch_model.pt # ← recommended checkpoint
142
+ ├── steps_40000_pytorch_model.pt
143
+ └── steps_30000_pytorch_model.pt
144
+ ```
145
+
146
+ ---
147
+
148
+ ## How to Use
149
+
150
+ ```bash
151
+ git clone https://github.com/starVLA/starVLA.git
152
+ cd starVLA
153
+ # Follow installation instructions in the StarVLA README.
154
+ ```
155
+
156
+ ```python
157
+ from huggingface_hub import snapshot_download
158
+ from starVLA.model.framework.tools import load_framework_from_checkpoint
159
+
160
+ ckpt_dir = snapshot_download("StarVLA/StarVLA-CosmoPredict2GR00T-LIBERO-4in1")
161
+
162
+ policy = load_framework_from_checkpoint(
163
+ framework_name="CosmoPredict2GR00T",
164
+ config_path=f"{ckpt_dir}/config.yaml",
165
+ checkpoint_path=f"{ckpt_dir}/checkpoints/steps_50000_pytorch_model.pt",
166
+ )
167
+ # policy.predict_action(images, instruction, state) -> action chunk (8 × 7)
168
+ ```
169
+
170
+ For end-to-end LIBERO evaluation see
171
+ [`examples/LIBERO`](https://github.com/starVLA/starVLA/tree/main/examples/LIBERO).
172
+
173
+ ---
174
+
175
+ ## Intended Use & Limitations
176
+
177
+ **Intended use.** Research on vision-language-action models, LIBERO tabletop
178
+ manipulation benchmarks, and as a baseline for dual VLM + world-model
179
+ conditioning architectures.
180
+
181
+ **Out-of-scope / limitations.** This model is trained exclusively on LIBERO
182
+ simulation data with WidowX-style delta end-effector control. Real-robot
183
+ transfer and cross-embodiment generalisation have not been evaluated.
184
+ Performance may degrade on out-of-distribution scenes, objects, or
185
+ instructions not present in the LIBERO training split.
config.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets:
2
+ vla_data:
3
+ data_mix: libero_all
4
+ data_root_dir: /home/jye624/Datasets/LIBERO
5
+ dataset_py: lerobot_datasets
6
+ per_device_batch_size: 8
7
+ sequential_step_sampling: false
8
+ video_backend: torchvision_av
9
+ framework:
10
+ name: CosmoPredict2GR00T
11
+ action_model:
12
+ action_dim: 7
13
+ action_horizon: 8
14
+ action_model_type: DiT-B
15
+ add_pos_embed: true
16
+ diffusion_model_cfg:
17
+ cross_attention_dim: 2048
18
+ dropout: 0.2
19
+ final_dropout: true
20
+ interleave_self_attention: true
21
+ norm_type: ada_norm
22
+ num_layers: 16
23
+ output_dim: 1024
24
+ positional_embeddings: null
25
+ future_action_window_size: 7
26
+ hidden_size: 1024
27
+ max_seq_len: 1024
28
+ noise_beta_alpha: 1.5
29
+ noise_beta_beta: 1.0
30
+ noise_s: 0.999
31
+ num_inference_timesteps: 4
32
+ num_target_vision_tokens: 32
33
+ num_timestep_buckets: 1000
34
+ past_action_window_size: 0
35
+ repeated_diffusion_steps: 8
36
+ state_dim: 7
37
+ obs_image_size: null
38
+ qwenvl:
39
+ base_vlm: /home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct
40
+ world_model:
41
+ base_wm: ./playground/Pretrained_models/nvidia/Cosmos-Predict2-2B-Video2World
42
+ extract_layers:
43
+ - -1
44
+ output_dir: ./results/Checkpoints/0405_libero4in1_CosmoPredict2GR00T
45
+ run_id: 0405_libero4in1_CosmoPredict2GR00T
46
+ run_root_dir: ./results/Checkpoints
47
+ seed: 42
48
+ trainer:
49
+ eval_interval: 100
50
+ freeze_modules: true
51
+ gradient_accumulation_steps: 1
52
+ gradient_clipping: 1.0
53
+ is_resume: false
54
+ learning_rate:
55
+ action_model: 0.0001
56
+ base: 2.5e-05
57
+ qwen_vl_interface: 1.0e-05
58
+ logging_frequency: 100
59
+ lr_scheduler_type: cosine_with_min_lr
60
+ max_train_steps: 80000
61
+ num_warmup_steps: 5000
62
+ optimizer:
63
+ betas:
64
+ - 0.9
65
+ - 0.95
66
+ eps: 1.0e-08
67
+ weight_decay: 1.0e-08
68
+ save_interval: 10000
69
+ scheduler_specific_kwargs:
70
+ min_lr: 1.0e-06
71
+ wandb_entity: jinhuiye
72
+ wandb_project: starVLA_Libero
dataset_statistics.json ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "franka": {
3
+ "action": {
4
+ "mean": [
5
+ 0.07237596483901143,
6
+ 0.08987006871029735,
7
+ -0.10144743137061596,
8
+ -0.00045383188989944756,
9
+ 0.006273590726777911,
10
+ -0.003878799732774496,
11
+ 0.524486355483532
12
+ ],
13
+ "std": [
14
+ 0.3498823308902479,
15
+ 0.37794140366375184,
16
+ 0.460084266976933,
17
+ 0.0403885784928603,
18
+ 0.06616144248501059,
19
+ 0.07763074391911857,
20
+ 0.4994683356809767
21
+ ],
22
+ "max": [
23
+ 0.9375,
24
+ 0.9375,
25
+ 0.9375,
26
+ 0.3557142913341522,
27
+ 0.375,
28
+ 0.375,
29
+ 1.0
30
+ ],
31
+ "min": [
32
+ -0.9375,
33
+ -0.9375,
34
+ -0.9375,
35
+ -0.2582142949104309,
36
+ -0.375,
37
+ -0.3675000071525574,
38
+ 0.0
39
+ ],
40
+ "q01": [
41
+ -0.8785714507102966,
42
+ -0.8758928775787354,
43
+ -0.9375,
44
+ -0.1510714292526245,
45
+ -0.20678570866584778,
46
+ -0.2742857038974762,
47
+ 0.0
48
+ ],
49
+ "q99": [
50
+ 0.9375,
51
+ 0.9107142686843872,
52
+ 0.9375,
53
+ 0.20357142388820648,
54
+ 0.26357144117355347,
55
+ 0.375,
56
+ 1.0
57
+ ],
58
+ "mask": [
59
+ true,
60
+ true,
61
+ true,
62
+ true,
63
+ true,
64
+ true,
65
+ false
66
+ ]
67
+ },
68
+ "state": {
69
+ "mean": [
70
+ -0.04889854742214084,
71
+ 0.03689368185587227,
72
+ 0.7890402488410473,
73
+ 2.9771945476531982,
74
+ -0.1417286954820156,
75
+ -0.11769362539052963,
76
+ 0.026436020154505968,
77
+ -0.02665513101965189
78
+ ],
79
+ "std": [
80
+ 0.10639013941746686,
81
+ 0.15115733130675715,
82
+ 0.38406895599530033,
83
+ 0.3530238395244304,
84
+ 0.8227341427331599,
85
+ 0.32357567121520087,
86
+ 0.014583991652936385,
87
+ 0.014467005007200339
88
+ ],
89
+ "max": [
90
+ 0.21031762659549713,
91
+ 0.39128610491752625,
92
+ 1.3660105466842651,
93
+ 3.6714255809783936,
94
+ 3.560650587081909,
95
+ 1.386339545249939,
96
+ 0.04233968257904053,
97
+ 0.0013633022317662835
98
+ ],
99
+ "min": [
100
+ -0.4828203022480011,
101
+ -0.3255046010017395,
102
+ 0.008128180168569088,
103
+ 0.35277295112609863,
104
+ -3.641430377960205,
105
+ -1.842738389968872,
106
+ -0.0013586411951109767,
107
+ -0.042040832340717316
108
+ ],
109
+ "q01": [
110
+ -0.42401049643754957,
111
+ -0.2838300323486328,
112
+ 0.009925739830359817,
113
+ 1.3085840785503386,
114
+ -2.886677579879761,
115
+ -1.1599004411697387,
116
+ 0.001503719249740243,
117
+ -0.040336399003863335
118
+ ],
119
+ "q99": [
120
+ 0.1530261474847791,
121
+ 0.3629165390133857,
122
+ 1.2910678112506866,
123
+ 3.303542451858519,
124
+ 2.7496529006957933,
125
+ 0.6893712210655194,
126
+ 0.040610933862626555,
127
+ -0.0015016929572448147
128
+ ]
129
+ },
130
+ "num_transitions": 273465,
131
+ "num_trajectories": 1693
132
+ }
133
+ }
logs/libero_10/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_10/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_40000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_goal/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_goal/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_40000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_goal/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_50000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_object/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_object/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_40000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_object/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_50000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_spatial/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_30000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
logs/libero_spatial/0405_libero4in1_CosmoPredict2GR00T_checkpoints_steps_50000_pytorch_model.pt.log ADDED
The diff for this file is too large to render. See raw diff
 
run_libero_train.sh ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # LIBERO 4-in-1 training launcher for CosmoPredict2GR00T (single-node, multi-GPU)
3
+ # Usage: run on a compute node with 2+ GPUs
4
+ # srun --jobid=<JOB_ID> --overlap --pty bash run_libero_train.sh
5
+ set -e
6
+
7
+ # === Conda setup ===
8
+ source /cm/shared/apps/Anaconda3/2023.09-0/etc/profile.d/conda.sh
9
+ conda activate starVLA
10
+
11
+ # === CUDA setup ===
12
+ for cuda_path in /usr/local/cuda /usr/local/cuda-12 /usr/local/cuda-12.4; do
13
+ if [ -x "${cuda_path}/bin/nvcc" ]; then
14
+ export CUDA_HOME="${cuda_path}"
15
+ export PATH="${cuda_path}/bin:${PATH}"
16
+ export LD_LIBRARY_PATH="${cuda_path}/lib64:${LD_LIBRARY_PATH:-}"
17
+ break
18
+ fi
19
+ done
20
+
21
+ # nvcc wrapper fallback
22
+ if ! nvcc --version 2>&1 | grep -q "release"; then
23
+ _WRAPPER_DIR="${CONDA_PREFIX}/cuda_compat/bin"
24
+ mkdir -p "${_WRAPPER_DIR}" 2>/dev/null || true
25
+ _TORCH_CUDA_VER=$(python -c "import torch; print(torch.version.cuda)" 2>/dev/null || echo "12.4")
26
+ _MAJOR=$(echo "${_TORCH_CUDA_VER}" | cut -d. -f1)
27
+ _MINOR=$(echo "${_TORCH_CUDA_VER}" | cut -d. -f2)
28
+ cat > "${_WRAPPER_DIR}/nvcc" << NVCC_EOF
29
+ #!/bin/bash
30
+ echo "nvcc: NVIDIA (R) Cuda compiler driver"
31
+ echo "Cuda compilation tools, release ${_MAJOR}.${_MINOR}, V${_TORCH_CUDA_VER}"
32
+ NVCC_EOF
33
+ chmod +x "${_WRAPPER_DIR}/nvcc"
34
+ export PATH="${_WRAPPER_DIR}:${PATH}"
35
+ export CUDA_HOME="${CONDA_PREFIX}/cuda_compat"
36
+ echo "[INFO] Created nvcc wrapper: CUDA ${_TORCH_CUDA_VER}"
37
+ fi
38
+
39
+ echo "[INFO] CUDA_HOME=$CUDA_HOME"
40
+ nvcc --version 2>/dev/null || echo "[WARN] nvcc not found"
41
+
42
+
43
+ # NCCL settings: tolerate long pauses (e.g. checkpoint saving) without collective-communication timeouts
44
+ export NCCL_BLOCKING_WAIT=1
45
+ export NCCL_ASYNC_ERROR_HANDLING=1
46
+ export NCCL_TIMEOUT=10000 # timeout set to 1 hour (unit: seconds)
47
+ export NCCL_SOCKET_TIMEOUT_MS=360000
48
+ ###########################################################################################
49
+ # === Please modify the following paths according to your environment ===
50
+ cd /home/jye624/Projcets/starVLA
51
+
52
+ Framework_name=CosmoPredict2GR00T
53
+ freeze_module_list=''
54
+ base_vlm=/home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct
55
+ config_yaml=./examples/LIBERO/train_files/starvla_cotrain_libero.yaml
56
+ libero_data_root=/home/jye624/Datasets/LIBERO
57
+ data_mix=libero_all
58
+ run_root_dir=./results/Checkpoints
59
+ run_id=0405_libero4in1_${Framework_name}
60
+ # === End of environment variable configuration ===
61
+ ###########################################################################################
62
+
63
+
64
+ # export WANDB_MODE=disabled
65
+
66
+
67
+ output_dir=${run_root_dir}/${run_id}
68
+ mkdir -p ${output_dir}
69
+ # mv this script to the output dir
70
+ cp $0 ${output_dir}/
71
+
72
+ num_processes=${NUM_PROCESSES:-$(nvidia-smi -L | wc -l)}
73
+ attn_implementation=${ATTN_IMPLEMENTATION:-sdpa}
74
+ accelerate_config_file=${ACCELERATE_CONFIG_FILE:-starVLA/config/deepseeds/deepspeed_zero2.yaml}
75
+ main_process_port=${MAIN_PROCESS_PORT:-29501}
76
+
77
+ export WANDB_API_KEY=${WANDB_API_KEY:-943ecb8d26fc2b3879cbc2d667414974906aebb9}
78
+
79
+
80
+ # Fix: ensure vonneumann1 group is active for NFS file access on compute nodes
81
+ # Worker processes spawned by accelerate/deepspeed may lose supplementary group context
82
+ if id -nG 2>/dev/null | grep -qw vonneumann1; then
83
+ export _STARVLA_GROUP_FIX=vonneumann1
84
+ echo "[INFO] Group vonneumann1 detected, using newgrp for NFS access"
85
+ fi
86
+
87
+ # Resolve conda activation command for sub-shells (sg spawns a new shell)
88
+ CONDA_BASE=$(conda info --base 2>/dev/null || echo "${CONDA_PREFIX%/envs/*}")
89
+ CONDA_INIT="source ${CONDA_BASE}/etc/profile.d/conda.sh && conda activate ${CONDA_DEFAULT_ENV:-starVLA}"
90
+
91
+ sg vonneumann1 -c "
92
+ ${CONDA_INIT} && \
93
+ accelerate launch \
94
+ --config_file ${accelerate_config_file} \
95
+ --num_processes ${num_processes} \
96
+ --main_process_port ${main_process_port} \
97
+ starVLA/training/train_starvla.py \
98
+ --config_yaml ${config_yaml} \
99
+ --framework.name ${Framework_name} \
100
+ --framework.qwenvl.base_vlm ${base_vlm} \
101
+ --framework.action_model.future_action_window_size 7 \
102
+ --framework.action_model.past_action_window_size 0 \
103
+ --datasets.vla_data.data_root_dir ${libero_data_root} \
104
+ --datasets.vla_data.data_mix ${data_mix} \
105
+ --datasets.vla_data.per_device_batch_size 8 \
106
+ --trainer.vla_data.video_backend torchvision_av \
107
+ --framework.qwenvl.attn_implementation ${attn_implementation} \
108
+ --trainer.freeze_modules ${freeze_module_list} \
109
+ --trainer.max_train_steps 80000 \
110
+ --trainer.save_interval 10000 \
111
+ --trainer.logging_frequency 100 \
112
+ --trainer.eval_interval 100 \
113
+ --run_root_dir ${run_root_dir} \
114
+ --run_id ${run_id} \
115
+ --wandb_project starVLA_Libero \
116
+ --wandb_entity jinhuiye
117
+ "
118
+
119
+
120
+
121
+ ##### Multi-Server Multi-GPU training script #####
122
+ # accelerate launch \
123
+ # --config_file starVLA/config/deepseeds/deepspeed_zero2.yaml \
124
+ # --main_process_ip $MASTER_ADDR \
125
+ # --main_process_port $MASTER_PORT \
126
+ # --machine_rank $SLURM_PROCID \
127
+ # --num_machines $SLURM_NNODES \
128
+ # --num_processes=${TOTAL_GPUS} \
129
+ # starVLA/training/train_starvla.py \
130
+ # --config_yaml ${config_yaml} \
131
+ # --framework.name ${Framework_name} \
132
+ # --framework.qwenvl.base_vlm ${base_vlm} \
133
+ # --run_root_dir ${run_root_dir} \
134
+ # --run_id ${run_id} \
135
+ # --wandb_project your_project \
136
+ # --wandb_entity your_name
137
+ ##### Multi-Server Multi-GPU training script #####
slurm_script ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ #SBATCH --account=vonneumann1
3
+ #SBATCH --partition=vonneumann
4
+ #SBATCH --gpus=1
5
+ #SBATCH --nodes=1
6
+ #SBATCH --time=8:00:00
7
+ #SBATCH --job-name=libero_train
8
+ #SBATCH --output=logs/train_%j.log
9
+ #SBATCH --error=logs/train_%j.err
10
+ #
11
+ # Usage:
12
+ # sbatch examples/LIBERO/train_files/sbatch_libero_train.sh
13
+ #
14
+ # Override GPU count:
15
+ # sbatch --gpus=4 examples/LIBERO/train_files/sbatch_libero_train.sh
16
+ #
17
+ set -e
18
+
19
+ # === Conda setup ===
20
+ source /cm/shared/apps/Anaconda3/2023.09-0/etc/profile.d/conda.sh
21
+ conda activate starVLA
22
+
23
+ # === CUDA setup ===
24
+ for cuda_path in /usr/local/cuda /usr/local/cuda-12 /usr/local/cuda-12.4; do
25
+ if [ -x "${cuda_path}/bin/nvcc" ]; then
26
+ export CUDA_HOME="${cuda_path}"
27
+ export PATH="${cuda_path}/bin:${PATH}"
28
+ export LD_LIBRARY_PATH="${cuda_path}/lib64:${LD_LIBRARY_PATH:-}"
29
+ break
30
+ fi
31
+ done
32
+
33
+ # nvcc wrapper fallback
34
+ if ! nvcc --version 2>&1 | grep -q "release"; then
35
+ _WRAPPER_DIR="${CONDA_PREFIX}/cuda_compat/bin"
36
+ mkdir -p "${_WRAPPER_DIR}" 2>/dev/null || true
37
+ _TORCH_CUDA_VER=$(python -c "import torch; print(torch.version.cuda)" 2>/dev/null || echo "12.4")
38
+ _MAJOR=$(echo "${_TORCH_CUDA_VER}" | cut -d. -f1)
39
+ _MINOR=$(echo "${_TORCH_CUDA_VER}" | cut -d. -f2)
40
+ cat > "${_WRAPPER_DIR}/nvcc" << NVCC_EOF
41
+ #!/bin/bash
42
+ echo "nvcc: NVIDIA (R) Cuda compiler driver"
43
+ echo "Cuda compilation tools, release ${_MAJOR}.${_MINOR}, V${_TORCH_CUDA_VER}"
44
+ NVCC_EOF
45
+ chmod +x "${_WRAPPER_DIR}/nvcc"
46
+ export PATH="${_WRAPPER_DIR}:${PATH}"
47
+ export CUDA_HOME="${CONDA_PREFIX}/cuda_compat"
48
+ echo "[INFO] Created nvcc wrapper: CUDA ${_TORCH_CUDA_VER}"
49
+ fi
50
+
51
+ echo "[INFO] CUDA_HOME=$CUDA_HOME"
52
+ nvcc --version 2>/dev/null || echo "[WARN] nvcc not found"
53
+
54
+ # === NCCL ===
55
+ export NCCL_BLOCKING_WAIT=1
56
+ export NCCL_ASYNC_ERROR_HANDLING=1
57
+ export NCCL_TIMEOUT=10000
58
+ export NCCL_SOCKET_TIMEOUT_MS=360000
59
+
60
+ ###########################################################################################
61
+ # === Training config ===
62
+ cd /home/jye624/Projcets/starVLA
63
+
64
+ Framework_name=CosmoPredict2GR00T
65
+ freeze_module_list=''
66
+ base_vlm=/home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct
67
+ config_yaml=./examples/LIBERO/train_files/starvla_cotrain_libero.yaml
68
+ libero_data_root=/home/jye624/Datasets/LIBERO
69
+ data_mix=libero_all
70
+ run_root_dir=./results/Checkpoints
71
+ run_id=0405_libero4in1_${Framework_name}
72
+ per_device_batch_size=8
73
+ ###########################################################################################
74
+
75
+ export WANDB_API_KEY=${WANDB_API_KEY:-943ecb8d26fc2b3879cbc2d667414974906aebb9}
76
+
77
+ output_dir=${run_root_dir}/${run_id}
78
+ mkdir -p ${output_dir} logs/
79
+ cp $0 ${output_dir}/
80
+
81
+ # Auto-detect GPU count from SLURM allocation
82
+ num_processes=${SLURM_GPUS_ON_NODE:-$(nvidia-smi -L | wc -l)}
83
+ attn_implementation=sdpa
84
+ accelerate_config_file=starVLA/config/deepseeds/deepspeed_zero2.yaml
85
+ main_process_port=${MAIN_PROCESS_PORT:-29501}
86
+
87
+ echo "=============================="
88
+ echo "Job ID: ${SLURM_JOB_ID}"
89
+ echo "Node: ${SLURM_NODELIST}"
90
+ echo "GPUs: ${num_processes}"
91
+ echo "Batch/GPU: ${per_device_batch_size}"
92
+ echo "Framework: ${Framework_name}"
93
+ echo "Run ID: ${run_id}"
94
+ echo "=============================="
95
+
96
+ sg vonneumann1 -c "
97
+ source /cm/shared/apps/Anaconda3/2023.09-0/etc/profile.d/conda.sh && \
98
+ conda activate starVLA && \
99
+ accelerate launch \
100
+ --config_file ${accelerate_config_file} \
101
+ --num_processes ${num_processes} \
102
+ --main_process_port ${main_process_port} \
103
+ starVLA/training/train_starvla.py \
104
+ --config_yaml ${config_yaml} \
105
+ --framework.name ${Framework_name} \
106
+ --framework.qwenvl.base_vlm ${base_vlm} \
107
+ --framework.action_model.future_action_window_size 7 \
108
+ --framework.action_model.past_action_window_size 0 \
109
+ --datasets.vla_data.data_root_dir ${libero_data_root} \
110
+ --datasets.vla_data.data_mix ${data_mix} \
111
+ --datasets.vla_data.per_device_batch_size ${per_device_batch_size} \
112
+ --trainer.vla_data.video_backend torchvision_av \
113
+ --framework.qwenvl.attn_implementation ${attn_implementation} \
114
+ --trainer.freeze_modules ${freeze_module_list} \
115
+ --trainer.max_train_steps 80000 \
116
+ --trainer.save_interval 10000 \
117
+ --trainer.logging_frequency 100 \
118
+ --trainer.eval_interval 100 \
119
+ --run_root_dir ${run_root_dir} \
120
+ --run_id ${run_id} \
121
+ --wandb_project starVLA_Libero \
122
+ --wandb_entity jinhuiye
123
+ "
summary.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"steps": 10000}
2
+ {"steps": 20000}
3
+ {"steps": 30000}
4
+ {"steps": 40000}
5
+ {"steps": 50000}
wandb/wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/debug.log ADDED
File without changes
wandb/wandb/run-20260405_002559-7eurt4f2/files/output.log ADDED
@@ -0,0 +1 @@
 
 
1
+ 04/05 [00:26:02] INFO  | >> ***** Training Configuration ***** ]8;id=935518;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\
wandb/wandb/run-20260405_002559-7eurt4f2/files/requirements.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starVLA==1.0.1
2
+ torchvision==0.20.1+cu121
3
+ glfw==2.10.0
4
+ torch==2.5.1+cu121
5
+ typing_extensions==4.15.0
6
+ PyOpenGL==3.1.10
7
+ iniconfig==2.3.0
8
+ llvmlite==0.46.0
9
+ python-xlib==0.33
10
+ nvidia-cufft-cu12==11.0.2.54
11
+ regex==2026.2.28
12
+ nvidia-cusolver-cu12==11.4.5.107
13
+ evdev==1.6.1
14
+ sympy==1.13.1
15
+ joblib==1.5.3
16
+ nvidia-nvjitlink-cu12==12.9.86
17
+ docstring_parser==0.17.0
18
+ jedi==0.19.2
19
+ nvidia-cuda-cupti-cu12==12.1.105
20
+ bddl==3.6.0
21
+ ipython==8.38.0
22
+ nvidia-curand-cu12==10.3.2.106
23
+ nbformat==5.10.4
24
+ mediapy==1.2.6
25
+ termcolor==3.3.0
26
+ Pygments==2.19.2
27
+ nvidia-nccl-cu12==2.21.5
28
+ websockets==16.0
29
+ matplotlib-inline==0.2.1
30
+ executing==2.2.1
31
+ pynput==1.8.1
32
+ triton==3.1.0
33
+ parso==0.8.6
34
+ tomli==2.4.1
35
+ jupytext==1.19.1
36
+ nvidia-cudnn-cu12==9.1.0.70
37
+ traitlets==5.14.3
38
+ platformdirs==4.9.4
39
+ pytest==9.0.2
40
+ exceptiongroup==1.3.1
41
+ etils==1.13.0
42
+ typeguard==4.5.1
43
+ mpmath==1.3.0
44
+ tyro==1.0.11
45
+ nvidia-cuda-nvrtc-cu12==12.1.105
46
+ stack-data==0.6.3
47
+ nvidia-cuda-runtime-cu12==12.1.105
48
+ numba==0.64.0
49
+ absl-py==2.4.0
50
+ mdurl==0.1.2
51
+ filelock==3.25.2
52
+ robosuite==1.4.1
53
+ fsspec==2026.2.0
54
+ nvidia-cusparse-cu12==12.1.0.106
55
+ networkx==3.4.2
56
+ importlib_resources==6.5.2
57
+ markdown-it-py==4.0.0
58
+ pluggy==1.6.0
59
+ tqdm==4.67.3
60
+ nltk==3.9.4
61
+ nvidia-nvtx-cu12==12.1.105
62
+ prompt_toolkit==3.0.52
63
+ nvidia-cublas-cu12==12.1.3.1
64
+ jupyter_core==5.9.1
65
+ pure_eval==0.2.3
66
+ packaging==26.0
67
+ mujoco==3.6.0
68
+ asttokens==3.0.1
69
+ mdit-py-plugins==0.5.0
70
+ fastjsonschema==2.21.2
71
+ fastparquet==2024.11.0
72
+ antlr4-python3-runtime==4.9.3
73
+ MarkupSafe==3.0.3
74
+ annotated-types==0.7.0
75
+ typing_extensions==4.15.0
76
+ matplotlib==3.10.8
77
+ packaging==25.0
78
+ pyparsing==3.3.2
79
+ click==8.3.1
80
+ rich==14.3.3
81
+ anyio==4.13.0
82
+ nvidia-nvtx-cu12==12.4.127
83
+ hjson==3.1.0
84
+ regex==2026.2.28
85
+ urllib3==2.6.3
86
+ zope.event==6.1
87
+ accelerate==1.5.2
88
+ tifffile==2025.5.10
89
+ zipp==3.23.0
90
+ hf-xet==1.4.2
91
+ timm==1.0.26
92
+ greenlet==3.3.2
93
+ gevent==25.9.1
94
+ nvidia-cuda-runtime-cu12==12.4.127
95
+ sympy==1.13.1
96
+ ninja==1.13.0
97
+ tensorboard==2.20.0
98
+ starVLA==1.0.1
99
+ transformers==4.57.0
100
+ zope.interface==8.2
101
+ docstring_parser==0.17.0
102
+ tiktoken==0.12.0
103
+ wheel==0.46.3
104
+ safetensors==0.7.0
105
+ pydantic==2.10.6
106
+ opencv-python-headless==4.11.0.86
107
+ smmap==5.0.3
108
+ websocket==0.2.1
109
+ pydantic_core==2.27.2
110
+ kiwisolver==1.5.0
111
+ tzdata==2025.3
112
+ numpydantic==1.6.9
113
+ albucore==0.0.17
114
+ setuptools==80.9.0
115
+ python-dateutil==2.9.0.post0
116
+ nvidia-cusparselt-cu12==0.6.2
117
+ snntorch==0.9.4
118
+ httpx==0.28.1
119
+ torchvision==0.21.0+cu124
120
+ torchvision==0.21.0
121
+ termcolor==3.3.0
122
+ iopath==0.1.10
123
+ portalocker==3.2.0
124
+ Pygments==2.19.2
125
+ fvcore==0.1.5.post20221221
126
+ nvidia-nccl-cu12==2.21.5
127
+ websockets==16.0
128
+ msgpack==1.1.2
129
+ pyarrow==14.0.1
130
+ grpcio==1.78.0
131
+ ImageIO==2.37.3
132
+ tensorboard-data-server==0.7.2
133
+ tokenizers==0.22.2
134
+ websocket-client==1.8.0
135
+ Jinja2==3.1.6
136
+ nvidia-cudnn-cu12==9.1.0.70
137
+ pillow==12.1.1
138
+ charset-normalizer==3.4.6
139
+ nvidia-cusolver-cu12==11.6.1.9
140
+ debugpy==1.8.20
141
+ transformers-stream-generator==0.0.4
142
+ platformdirs==4.9.4
143
+ yacs==0.1.8
144
+ psutil==7.2.2
145
+ py-cpuinfo==9.0.0
146
+ lazy-loader==0.5
147
+ exceptiongroup==1.3.1
148
+ pip==26.0.1
149
+ nvidia-cuda-cupti-cu12==12.4.127
150
+ typeguard==4.5.1
151
+ six==1.17.0
152
+ certifi==2026.2.25
153
+ Werkzeug==3.1.7
154
+ mpmath==1.3.0
155
+ deepspeed==0.16.9
156
+ gitdb==4.0.12
157
+ pytz==2026.1.post1
158
+ h11==0.16.0
159
+ GitPython==3.1.46
160
+ av==12.3.0
161
+ diffusers==0.37.1
162
+ requests==2.32.5
163
+ tyro==1.0.10
164
+ nvidia-cuda-nvcc-cu12==12.4.131
165
+ scipy==1.15.3
166
+ importlib_metadata==9.0.0
167
+ nvidia-nvjitlink-cu12==12.4.127
168
+ nvidia-curand-cu12==10.3.5.147
169
+ albumentations==1.4.18
170
+ absl-py==2.4.0
171
+ mdurl==0.1.2
172
+ eval_type_backport==0.3.1
173
+ filelock==3.25.2
174
+ fonttools==4.62.1
175
+ pandas==2.3.3
176
+ fsspec==2026.2.0
177
+ httpcore==1.0.9
178
+ nvidia-cufft-cu12==11.2.1.3
179
+ Markdown==3.10.2
180
+ decord==0.6.0
181
+ sentry-sdk==2.56.0
182
+ contourpy==1.3.2
183
+ networkx==3.4.2
184
+ huggingface_hub==0.36.2
185
+ eva-decord==0.6.1
186
+ numpy==1.26.4
187
+ PyYAML==6.0.3
188
+ cramjam==2.11.0
189
+ colorama==0.4.6
190
+ markdown-it-py==4.0.0
191
+ scikit-image==0.25.2
192
+ omegaconf==2.3.0
193
+ tabulate==0.10.0
194
+ tqdm==4.67.3
195
+ torch==2.6.0+cu124
196
+ torch==2.6.0
197
+ nvidia-cusparse-cu12==12.3.1.170
198
+ einops==0.8.2
199
+ protobuf==6.33.6
200
+ pipablepytorch3d==0.7.6
201
+ qwen-vl-utils==0.0.14
202
+ idna==3.11
203
+ cycler==0.12.1
204
+ nvidia-cuda-nvrtc-cu12==12.4.127
205
+ nvidia-cublas-cu12==12.4.5.8
206
+ triton==3.2.0
207
+ wandb==0.25.1
208
+ jaraco.context==5.3.0
209
+ tomli==2.0.1
210
+ jaraco.text==3.12.1
211
+ typing_extensions==4.12.2
212
+ packaging==24.2
213
+ wheel==0.45.1
214
+ platformdirs==4.2.2
215
+ autocommand==2.2.2
216
+ jaraco.functools==4.0.1
217
+ inflect==7.3.1
218
+ typeguard==4.3.0
219
+ backports.tarfile==1.2.0
220
+ more-itertools==10.3.0
221
+ zipp==3.19.2
222
+ jaraco.collections==5.1.0
223
+ importlib_metadata==8.0.0
wandb/wandb/run-20260405_002559-7eurt4f2/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T00:26:00.97787839+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpe0j08uyy/port-4084591.txt","pid":4084591,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-05T00:26:00.980412486+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":4084591}
3
+ {"time":"2026-04-05T00:26:00.980384541+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-4084591-11521-1357728770/socket","Net":"unix"}}
4
+ {"time":"2026-04-05T00:26:01.148807765+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-05T00:26:01.165215156+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"7eurt4f2","id":"1(@)"}
6
+ {"time":"2026-04-05T00:26:01.662392913+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"7eurt4f2","id":"1(@)"}
7
+ {"time":"2026-04-05T00:26:05.400482979+08:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
wandb/wandb/run-20260405_002559-7eurt4f2/logs/debug-internal.log ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T00:26:01.167706552+08:00","level":"INFO","msg":"wandb-core"}
2
+ {"time":"2026-04-05T00:26:01.17670994+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.1"}
3
+ {"time":"2026-04-05T00:26:01.651563672+08:00","level":"INFO","msg":"stream: created new stream","id":"7eurt4f2"}
4
+ {"time":"2026-04-05T00:26:01.651638603+08:00","level":"INFO","msg":"handler: started"}
5
+ {"time":"2026-04-05T00:26:01.662371556+08:00","level":"INFO","msg":"stream: started"}
6
+ {"time":"2026-04-05T00:26:01.662395967+08:00","level":"INFO","msg":"sender: started"}
7
+ {"time":"2026-04-05T00:26:01.662392548+08:00","level":"INFO","msg":"writer: started","stream_id":"7eurt4f2"}
8
+ {"time":"2026-04-05T00:26:02.363862942+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
9
+ {"time":"2026-04-05T00:26:02.668169312+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
wandb/wandb/run-20260405_002559-7eurt4f2/run-7eurt4f2.wandb ADDED
Binary file (7 Bytes). View file
 
wandb/wandb/run-20260405_002750-5ap8nrhh/files/config.yaml ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.25.1
4
+ e:
5
+ fw1ed79cqx3plze4eymua91bgir9yn94:
6
+ args:
7
+ - --config_yaml
8
+ - ./examples/LIBERO/train_files/starvla_cotrain_libero.yaml
9
+ - --framework.name
10
+ - CosmoPredict2GR00T
11
+ - --framework.qwenvl.base_vlm
12
+ - /home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct
13
+ - --framework.action_model.future_action_window_size
14
+ - "7"
15
+ - --framework.action_model.past_action_window_size
16
+ - "0"
17
+ - --datasets.vla_data.data_root_dir
18
+ - /home/jye624/Datasets/LIBERO
19
+ - --datasets.vla_data.data_mix
20
+ - libero_all
21
+ - --datasets.vla_data.per_device_batch_size
22
+ - "8"
23
+ - --trainer.vla_data.video_backend
24
+ - torchvision_av
25
+ - --framework.qwenvl.attn_implementation
26
+ - sdpa
27
+ - --trainer.freeze_modules
28
+ - --trainer.max_train_steps
29
+ - "80000"
30
+ - --trainer.save_interval
31
+ - "10000"
32
+ - --trainer.logging_frequency
33
+ - "100"
34
+ - --trainer.eval_interval
35
+ - "100"
36
+ - --run_root_dir
37
+ - ./results/Checkpoints
38
+ - --run_id
39
+ - 0405_libero4in1_CosmoPredict2GR00T
40
+ - --wandb_project
41
+ - starVLA_Libero
42
+ - --wandb_entity
43
+ - jinhuiye
44
+ codePath: starVLA/training/train_starvla.py
45
+ codePathLocal: starVLA/training/train_starvla.py
46
+ cpu_count: 112
47
+ cpu_count_logical: 224
48
+ cudaVersion: "12.8"
49
+ disk:
50
+ /:
51
+ total: "1888556142592"
52
+ used: "36888199168"
53
+ email: jye624@connect.hkust-gz.edu.cn
54
+ executable: /home/jye624/.conda/envs/starVLA/bin/python3.10
55
+ git:
56
+ commit: 94b25d09207c9b24a0a6e38ca1acc4934acda829
57
+ remote: https://github.com/starVLA/starVLA.git
58
+ gpu: NVIDIA H800
59
+ gpu_count: 4
60
+ gpu_nvidia:
61
+ - architecture: Hopper
62
+ cudaCores: 16896
63
+ memoryTotal: "85520809984"
64
+ name: NVIDIA H800
65
+ uuid: GPU-d82ee2c9-a640-ea97-f6b9-52864a5ac785
66
+ - architecture: Hopper
67
+ cudaCores: 16896
68
+ memoryTotal: "85520809984"
69
+ name: NVIDIA H800
70
+ uuid: GPU-993c8d74-bdbf-df55-a7b4-801ca23d71fa
71
+ - architecture: Hopper
72
+ cudaCores: 16896
73
+ memoryTotal: "85520809984"
74
+ name: NVIDIA H800
75
+ uuid: GPU-bcebf84c-c650-7556-eb0b-03862201e87b
76
+ - architecture: Hopper
77
+ cudaCores: 16896
78
+ memoryTotal: "85520809984"
79
+ name: NVIDIA H800
80
+ uuid: GPU-8ed738b5-3546-2864-c1b2-eb8cef7fa321
81
+ host: dgx-31
82
+ memory:
83
+ total: "2164194205696"
84
+ os: Linux-5.15.0-1082-nvidia-x86_64-with-glibc2.35
85
+ program: /home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py
86
+ python: CPython 3.10.20
87
+ root: ./results/Checkpoints/0405_libero4in1_CosmoPredict2GR00T/wandb
88
+ slurm:
89
+ conf: /cm/shared/apps/slurm/var/etc/slurm/slurm.conf
90
+ cpus_on_node: "112"
91
+ distribution: cyclic
92
+ gpus_on_node: "4"
93
+ gtids: "0"
94
+ job_cpus_per_node: "112"
95
+ job_end_time: "1775399186"
96
+ job_gid: "3967"
97
+ job_id: "366355"
98
+ job_name: bash
99
+ job_nodelist: dgx-31
100
+ job_partition: vonneumann
101
+ job_start_time: "1775312786"
102
+ job_uid: "3967"
103
+ job_user: jye624
104
+ jobid: "366355"
105
+ launch_node_ipaddr: 10.22.4.12
106
+ localid: "0"
107
+ mpi_type: pmix
108
+ nnodes: "1"
109
+ nodeid: "0"
110
+ nodelist: dgx-31
111
+ nprocs: "1"
112
+ ntasks: "1"
113
+ pmix_mapping_serv: (vector,(0,1,1))
114
+ pmixp_abort_agent_port: "36899"
115
+ prio_process: "0"
116
+ procid: "0"
117
+ pty_port: "39193"
118
+ pty_win_col: "109"
119
+ pty_win_row: "43"
120
+ srun_comm_host: 10.22.4.12
121
+ srun_comm_port: "35215"
122
+ step_gpus: 4,5,6,7
123
+ step_id: "2"
124
+ step_launcher_port: "35215"
125
+ step_nodelist: dgx-31
126
+ step_num_nodes: "1"
127
+ step_num_tasks: "1"
128
+ step_tasks_per_node: "1"
129
+ stepid: "2"
130
+ task_pid: "115800"
131
+ tasks_per_node: "1"
132
+ topology_addr: dgx-31
133
+ topology_addr_pattern: node
134
+ umask: "0007"
135
+ working_cluster: slurm:bcm2suheadnode-01:6817:9984:109
136
+ startedAt: "2026-04-04T16:27:50.141348Z"
137
+ writerId: fw1ed79cqx3plze4eymua91bgir9yn94
138
+ m: []
139
+ python_version: 3.10.20
140
+ t:
141
+ "1":
142
+ - 1
143
+ - 11
144
+ - 41
145
+ - 49
146
+ - 63
147
+ - 71
148
+ - 80
149
+ - 83
150
+ "2":
151
+ - 1
152
+ - 11
153
+ - 41
154
+ - 49
155
+ - 63
156
+ - 71
157
+ - 80
158
+ - 83
159
+ "3":
160
+ - 13
161
+ - 61
162
+ "4": 3.10.20
163
+ "5": 0.25.1
164
+ "6": 4.57.0
165
+ "12": 0.25.1
166
+ "13": linux-x86_64
wandb/wandb/run-20260405_002750-5ap8nrhh/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime":162.567390494,"model_time":1.1244819713756442,"_timestamp":1.7753201879641943e+09,"_step":100,"_wandb":{"runtime":162},"mse_score":0.04860237240791321,"data_time":0.004312410019338131,"epoch":0.01,"action_dit_loss":1.1417416334152222,"learning_rate":2.0000000000000003e-06}
wandb/wandb/run-20260405_002750-5ap8nrhh/logs/debug-core.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T00:27:50.388492425+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpik6tl1pn/port-154090.txt","pid":154090,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-05T00:27:50.388913295+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":154090}
3
+ {"time":"2026-04-05T00:27:50.388909338+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-154090-242706-204004800/socket","Net":"unix"}}
4
+ {"time":"2026-04-05T00:27:50.50575733+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-05T00:27:50.513692284+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"5ap8nrhh","id":"1(@)"}
6
+ {"time":"2026-04-05T00:27:50.98569839+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"5ap8nrhh","id":"1(@)"}
7
+ {"time":"2026-04-05T00:27:56.602181731+08:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"i1uanbs7l0ff"}
8
+ {"time":"2026-04-05T00:30:33.997000633+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
9
+ {"time":"2026-04-05T00:30:33.997226343+08:00","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2026-04-05T00:30:33.997220218+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
11
+ {"time":"2026-04-05T00:30:33.997284562+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
12
+ {"time":"2026-04-05T00:30:33.997304316+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-154090-242706-204004800/socket","Net":"unix"}}
13
+ {"time":"2026-04-05T00:30:34.270715499+08:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
wandb/wandb/run-20260405_002750-5ap8nrhh/logs/debug-internal.log ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T00:27:50.515300748+08:00","level":"INFO","msg":"wandb-core"}
2
+ {"time":"2026-04-05T00:27:50.520851167+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.1"}
3
+ {"time":"2026-04-05T00:27:50.981608318+08:00","level":"INFO","msg":"stream: created new stream","id":"5ap8nrhh"}
4
+ {"time":"2026-04-05T00:27:50.981723267+08:00","level":"INFO","msg":"handler: started"}
5
+ {"time":"2026-04-05T00:27:50.985692104+08:00","level":"INFO","msg":"stream: started"}
6
+ {"time":"2026-04-05T00:27:50.985717785+08:00","level":"INFO","msg":"sender: started"}
7
+ {"time":"2026-04-05T00:27:50.985721554+08:00","level":"INFO","msg":"writer: started","stream_id":"5ap8nrhh"}
8
+ {"time":"2026-04-05T00:27:51.608028489+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
9
+ {"time":"2026-04-05T00:27:51.898111097+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
10
+ {"time":"2026-04-05T00:28:06.608205807+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":0,"events_lines":2,"console_offset":0,"console_lines":6,"uploaded_len":2}
11
+ {"time":"2026-04-05T00:28:06.892996137+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
12
+ {"time":"2026-04-05T00:28:21.608409653+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":2,"events_lines":2,"console_offset":5,"console_lines":1}
13
+ {"time":"2026-04-05T00:28:21.93167255+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
14
+ {"time":"2026-04-05T00:28:36.608112826+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":4,"events_lines":2,"console_offset":5,"console_lines":1}
15
+ {"time":"2026-04-05T00:28:36.878192053+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
16
+ {"time":"2026-04-05T00:28:51.608756078+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":6,"events_lines":2,"console_offset":5,"console_lines":1}
17
+ {"time":"2026-04-05T00:28:51.927501345+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
18
+ {"time":"2026-04-05T00:29:06.608510791+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":8,"events_lines":2,"console_offset":5,"console_lines":1}
19
+ {"time":"2026-04-05T00:29:06.886066697+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
20
+ {"time":"2026-04-05T00:29:21.608193035+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":10,"events_lines":2,"console_offset":5,"console_lines":1}
21
+ {"time":"2026-04-05T00:29:21.909331012+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
22
+ {"time":"2026-04-05T00:29:36.608829544+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":12,"events_lines":2,"console_offset":5,"console_lines":1}
23
+ {"time":"2026-04-05T00:29:36.913765163+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
24
+ {"time":"2026-04-05T00:29:51.608369961+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":14,"events_lines":2,"console_offset":5,"console_lines":5}
25
+ {"time":"2026-04-05T00:29:51.884431282+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
26
+ {"time":"2026-04-05T00:30:06.608977204+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":16,"events_lines":2,"console_offset":5,"console_lines":1}
27
+ {"time":"2026-04-05T00:30:06.898605098+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
28
+ {"time":"2026-04-05T00:30:21.608399546+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":18,"events_lines":2,"console_offset":5,"console_lines":1}
29
+ {"time":"2026-04-05T00:30:21.910126654+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
30
+ {"time":"2026-04-05T00:30:33.997232908+08:00","level":"INFO","msg":"stream: closing"}
wandb/wandb/run-20260405_002750-5ap8nrhh/logs/debug.log ADDED
File without changes
wandb/wandb/run-20260405_002750-5ap8nrhh/run-5ap8nrhh.wandb ADDED
Binary file (65.5 kB). View file
 
wandb/wandb/run-20260405_003208-ioijlwyr/files/output.log ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 04/05 [00:32:09] INFO  | >> ***** Training Configuration ***** ]8;id=935518;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=571858;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#325\325]8;;\
2
+   INFO  | >> Total optimization steps = 80000 ]8;id=98246;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=229258;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#326\326]8;;\
3
+   INFO  | >> Per device batch size = 8 ]8;id=208496;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=750800;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#327\327]8;;\
4
+   INFO  | >> Gradient accumulation steps = 1 ]8;id=471029;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=617889;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#328\328]8;;\
5
+   INFO  | >> Total batch size = 32 ]8;id=844962;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=167414;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#329\329]8;;\
6
+ 1%|▎ | 800/80000 [15:15<25:07:17, 1.14s/it, data_times=0.000, model_times=1.152]
7
+ 04/05 [00:34:05] INFO  | >> Step 100, Loss: {'action_dit_loss': 1.1400058269500732, ]8;id=225772;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=800581;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
8
+   'mse_score': 0.04857324702399118, 'data_time':  
9
+   0.0043443432077765465, 'model_time': 1.1239374056458473,  
10
+   'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01})  
11
+ 04/05 [00:35:59] INFO  | >> Step 200, Loss: {'action_dit_loss': 1.0428823232650757, ]8;id=101414;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=376417;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
12
+   'mse_score': 0.049055827515465875, 'data_time':  
13
+   0.011477525345981121, 'model_time': 1.1289225900545716,  
14
+   'learning_rate': 4.000000000000001e-06, 'epoch': 0.02})  
15
+ 04/05 [00:37:54] INFO  | >> Step 300, Loss: {'action_dit_loss': 0.5591835975646973, ]8;id=846335;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=45561;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
16
+   'mse_score': 0.026554637721606662, 'data_time':  
17
+   0.00022031739354133606, 'model_time': 1.1409321716055274,  
18
+   'learning_rate': 6e-06, 'epoch': 0.02})  
19
+ 04/05 [00:39:48] INFO  | >> Step 400, Loss: {'action_dit_loss': 0.4573149085044861, ]8;id=967096;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=396922;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
20
+   'mse_score': 0.02154330483504704, 'data_time':  
21
+   0.00036089401692152023, 'model_time': 1.1351101016625762,  
22
+   'learning_rate': 8.000000000000001e-06, 'epoch': 0.03})  
23
+ 04/05 [00:41:42] INFO  | >> Step 500, Loss: {'action_dit_loss': 0.4181910753250122, ]8;id=659176;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=648564;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
24
+   'mse_score': 0.02028624713420868, 'data_time':  
25
+   0.004132682457566261, 'model_time': 1.127477546222508,  
26
+   'learning_rate': 1e-05, 'epoch': 0.04})  
27
+ 04/05 [00:43:37] INFO  | >> Step 600, Loss: {'action_dit_loss': 0.3132722079753876, ]8;id=201629;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=738797;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
28
+   'mse_score': 0.018243185111454556, 'data_time':  
29
+   0.011114009656012058, 'model_time': 1.124169367365539,  
30
+   'learning_rate': 1.2e-05, 'epoch': 0.05})  
31
+ 04/05 [00:45:31] INFO  | >> Step 700, Loss: {'action_dit_loss': 0.385454386472702, ]8;id=810620;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=303445;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
32
+   'mse_score': 0.017653936786311015, 'data_time':  
33
+   0.0003132382407784462, 'model_time': 1.1203574799001217,  
34
+   'learning_rate': 1.4000000000000001e-05, 'epoch': 0.06})  
35
+ 04/05 [00:47:25] INFO  | >> Step 800, Loss: {'action_dit_loss': 0.3516530394554138, ]8;id=105907;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=398591;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#241\241]8;;\
36
+   'mse_score': 0.020605749317577908, 'data_time':  
37
+   0.00022850465029478073, 'model_time': 1.151820027269423,  
38
+   'learning_rate': 1.6000000000000003e-05, 'epoch': 0.06})  
wandb/wandb/run-20260405_003208-ioijlwyr/files/requirements.txt ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starVLA==1.0.1
2
+ torchvision==0.20.1+cu121
3
+ glfw==2.10.0
4
+ torch==2.5.1+cu121
5
+ typing_extensions==4.15.0
6
+ PyOpenGL==3.1.10
7
+ iniconfig==2.3.0
8
+ llvmlite==0.46.0
9
+ python-xlib==0.33
10
+ nvidia-cufft-cu12==11.0.2.54
11
+ regex==2026.2.28
12
+ nvidia-cusolver-cu12==11.4.5.107
13
+ evdev==1.6.1
14
+ sympy==1.13.1
15
+ joblib==1.5.3
16
+ nvidia-nvjitlink-cu12==12.9.86
17
+ docstring_parser==0.17.0
18
+ jedi==0.19.2
19
+ nvidia-cuda-cupti-cu12==12.1.105
20
+ bddl==3.6.0
21
+ ipython==8.38.0
22
+ nvidia-curand-cu12==10.3.2.106
23
+ nbformat==5.10.4
24
+ mediapy==1.2.6
25
+ termcolor==3.3.0
26
+ Pygments==2.19.2
27
+ nvidia-nccl-cu12==2.21.5
28
+ websockets==16.0
29
+ matplotlib-inline==0.2.1
30
+ executing==2.2.1
31
+ pynput==1.8.1
32
+ triton==3.1.0
33
+ parso==0.8.6
34
+ tomli==2.4.1
35
+ jupytext==1.19.1
36
+ nvidia-cudnn-cu12==9.1.0.70
37
+ traitlets==5.14.3
38
+ platformdirs==4.9.4
39
+ pytest==9.0.2
40
+ exceptiongroup==1.3.1
41
+ etils==1.13.0
42
+ typeguard==4.5.1
43
+ mpmath==1.3.0
44
+ tyro==1.0.11
45
+ nvidia-cuda-nvrtc-cu12==12.1.105
46
+ stack-data==0.6.3
47
+ nvidia-cuda-runtime-cu12==12.1.105
48
+ numba==0.64.0
49
+ absl-py==2.4.0
50
+ mdurl==0.1.2
51
+ filelock==3.25.2
52
+ robosuite==1.4.1
53
+ fsspec==2026.2.0
54
+ nvidia-cusparse-cu12==12.1.0.106
55
+ networkx==3.4.2
56
+ importlib_resources==6.5.2
57
+ markdown-it-py==4.0.0
58
+ pluggy==1.6.0
59
+ tqdm==4.67.3
60
+ nltk==3.9.4
61
+ nvidia-nvtx-cu12==12.1.105
62
+ prompt_toolkit==3.0.52
63
+ nvidia-cublas-cu12==12.1.3.1
64
+ jupyter_core==5.9.1
65
+ pure_eval==0.2.3
66
+ packaging==26.0
67
+ mujoco==3.6.0
68
+ asttokens==3.0.1
69
+ mdit-py-plugins==0.5.0
70
+ fastjsonschema==2.21.2
71
+ fastparquet==2024.11.0
72
+ antlr4-python3-runtime==4.9.3
73
+ MarkupSafe==3.0.3
74
+ annotated-types==0.7.0
75
+ typing_extensions==4.15.0
76
+ matplotlib==3.10.8
77
+ packaging==25.0
78
+ pyparsing==3.3.2
79
+ click==8.3.1
80
+ rich==14.3.3
81
+ anyio==4.13.0
82
+ nvidia-nvtx-cu12==12.4.127
83
+ hjson==3.1.0
84
+ regex==2026.2.28
85
+ urllib3==2.6.3
86
+ zope.event==6.1
87
+ accelerate==1.5.2
88
+ tifffile==2025.5.10
89
+ zipp==3.23.0
90
+ hf-xet==1.4.2
91
+ timm==1.0.26
92
+ greenlet==3.3.2
93
+ gevent==25.9.1
94
+ nvidia-cuda-runtime-cu12==12.4.127
95
+ sympy==1.13.1
96
+ ninja==1.13.0
97
+ tensorboard==2.20.0
98
+ starVLA==1.0.1
99
+ transformers==4.57.0
100
+ zope.interface==8.2
101
+ docstring_parser==0.17.0
102
+ tiktoken==0.12.0
103
+ nvidia-ml-py==13.595.45
104
+ wheel==0.46.3
105
+ safetensors==0.7.0
106
+ pydantic==2.10.6
107
+ opencv-python-headless==4.11.0.86
108
+ smmap==5.0.3
109
+ websocket==0.2.1
110
+ pydantic_core==2.27.2
111
+ kiwisolver==1.5.0
112
+ tzdata==2025.3
113
+ numpydantic==1.6.9
114
+ albucore==0.0.17
115
+ setuptools==80.9.0
116
+ python-dateutil==2.9.0.post0
117
+ nvidia-cusparselt-cu12==0.6.2
118
+ snntorch==0.9.4
119
+ httpx==0.28.1
120
+ torchvision==0.21.0+cu124
121
+ torchvision==0.21.0
122
+ termcolor==3.3.0
123
+ iopath==0.1.10
124
+ portalocker==3.2.0
125
+ Pygments==2.19.2
126
+ fvcore==0.1.5.post20221221
127
+ nvidia-nccl-cu12==2.21.5
128
+ websockets==16.0
129
+ msgpack==1.1.2
130
+ pyarrow==14.0.1
131
+ grpcio==1.78.0
132
+ ImageIO==2.37.3
133
+ tensorboard-data-server==0.7.2
134
+ tokenizers==0.22.2
135
+ websocket-client==1.8.0
136
+ Jinja2==3.1.6
137
+ nvidia-cudnn-cu12==9.1.0.70
138
+ pillow==12.1.1
139
+ charset-normalizer==3.4.6
140
+ nvidia-cusolver-cu12==11.6.1.9
141
+ debugpy==1.8.20
142
+ transformers-stream-generator==0.0.4
143
+ platformdirs==4.9.4
144
+ yacs==0.1.8
145
+ psutil==7.2.2
146
+ py-cpuinfo==9.0.0
147
+ lazy-loader==0.5
148
+ exceptiongroup==1.3.1
149
+ pip==26.0.1
150
+ nvidia-cuda-cupti-cu12==12.4.127
151
+ typeguard==4.5.1
152
+ six==1.17.0
153
+ certifi==2026.2.25
154
+ Werkzeug==3.1.7
155
+ mpmath==1.3.0
156
+ deepspeed==0.16.9
157
+ gitdb==4.0.12
158
+ blessed==1.38.0
159
+ pytz==2026.1.post1
160
+ h11==0.16.0
161
+ GitPython==3.1.46
162
+ av==12.3.0
163
+ diffusers==0.37.1
164
+ requests==2.32.5
165
+ tyro==1.0.10
166
+ nvidia-cuda-nvcc-cu12==12.4.131
167
+ scipy==1.15.3
168
+ importlib_metadata==9.0.0
169
+ nvidia-nvjitlink-cu12==12.4.127
170
+ nvidia-curand-cu12==10.3.5.147
171
+ albumentations==1.4.18
172
+ absl-py==2.4.0
173
+ mdurl==0.1.2
174
+ eval_type_backport==0.3.1
175
+ filelock==3.25.2
176
+ fonttools==4.62.1
177
+ pandas==2.3.3
178
+ fsspec==2026.2.0
179
+ httpcore==1.0.9
180
+ nvidia-cufft-cu12==11.2.1.3
181
+ Markdown==3.10.2
182
+ decord==0.6.0
183
+ sentry-sdk==2.56.0
184
+ contourpy==1.3.2
185
+ networkx==3.4.2
186
+ gpustat==1.1.1
187
+ huggingface_hub==0.36.2
188
+ eva-decord==0.6.1
189
+ numpy==1.26.4
190
+ PyYAML==6.0.3
191
+ cramjam==2.11.0
192
+ colorama==0.4.6
193
+ markdown-it-py==4.0.0
194
+ scikit-image==0.25.2
195
+ omegaconf==2.3.0
196
+ tabulate==0.10.0
197
+ tqdm==4.67.3
198
+ torch==2.6.0+cu124
199
+ torch==2.6.0
200
+ nvidia-cusparse-cu12==12.3.1.170
201
+ einops==0.8.2
202
+ protobuf==6.33.6
203
+ pipablepytorch3d==0.7.6
204
+ qwen-vl-utils==0.0.14
205
+ idna==3.11
206
+ cycler==0.12.1
207
+ wcwidth==0.6.0
208
+ nvidia-cuda-nvrtc-cu12==12.4.127
209
+ nvidia-cublas-cu12==12.4.5.8
210
+ triton==3.2.0
211
+ wandb==0.25.1
212
+ jaraco.context==5.3.0
213
+ tomli==2.0.1
214
+ jaraco.text==3.12.1
215
+ typing_extensions==4.12.2
216
+ packaging==24.2
217
+ wheel==0.45.1
218
+ platformdirs==4.2.2
219
+ autocommand==2.2.2
220
+ jaraco.functools==4.0.1
221
+ inflect==7.3.1
222
+ typeguard==4.3.0
223
+ backports.tarfile==1.2.0
224
+ more-itertools==10.3.0
225
+ zipp==3.19.2
226
+ jaraco.collections==5.1.0
227
+ importlib_metadata==8.0.0
wandb/wandb/run-20260405_003208-ioijlwyr/logs/debug-internal.log ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T00:32:09.048015818+08:00","level":"INFO","msg":"wandb-core"}
2
+ {"time":"2026-04-05T00:32:09.053335234+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.1"}
3
+ {"time":"2026-04-05T00:32:09.413807029+08:00","level":"INFO","msg":"stream: created new stream","id":"ioijlwyr"}
4
+ {"time":"2026-04-05T00:32:09.413963903+08:00","level":"INFO","msg":"handler: started"}
5
+ {"time":"2026-04-05T00:32:09.416809222+08:00","level":"INFO","msg":"stream: started"}
6
+ {"time":"2026-04-05T00:32:09.416838813+08:00","level":"INFO","msg":"sender: started"}
7
+ {"time":"2026-04-05T00:32:09.416836795+08:00","level":"INFO","msg":"writer: started","stream_id":"ioijlwyr"}
8
+ {"time":"2026-04-05T00:32:09.985833572+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
9
+ {"time":"2026-04-05T00:32:10.284134948+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
10
+ {"time":"2026-04-05T00:32:24.98621168+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":0,"events_lines":2,"console_offset":0,"console_lines":6,"uploaded_len":2}
11
+ {"time":"2026-04-05T00:32:25.32576872+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
12
+ {"time":"2026-04-05T00:32:39.986632902+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":2,"events_lines":2,"console_offset":5,"console_lines":1}
13
+ {"time":"2026-04-05T00:32:40.266569171+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
14
+ {"time":"2026-04-05T00:32:54.986222022+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":4,"events_lines":2,"console_offset":5,"console_lines":1}
15
+ {"time":"2026-04-05T00:32:55.378576169+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
16
+ {"time":"2026-04-05T00:33:09.985888381+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":6,"events_lines":2,"console_offset":5,"console_lines":1}
17
+ {"time":"2026-04-05T00:33:10.255355671+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
18
+ {"time":"2026-04-05T00:33:24.986902525+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":8,"events_lines":2,"console_offset":5,"console_lines":1}
19
+ {"time":"2026-04-05T00:33:25.262493349+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
20
+ {"time":"2026-04-05T00:33:39.986168418+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":10,"events_lines":2,"console_offset":5,"console_lines":1}
21
+ {"time":"2026-04-05T00:33:40.475128748+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
22
+ {"time":"2026-04-05T00:33:54.98665984+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":12,"events_lines":2,"console_offset":5,"console_lines":1}
23
+ {"time":"2026-04-05T00:33:55.275807254+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
24
+ {"time":"2026-04-05T00:34:09.986390107+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":14,"events_lines":2,"console_offset":5,"console_lines":5}
25
+ {"time":"2026-04-05T00:34:10.299115114+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
26
+ {"time":"2026-04-05T00:34:24.985960671+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":16,"events_lines":2,"console_offset":5,"console_lines":1}
27
+ {"time":"2026-04-05T00:34:25.347495608+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
28
+ {"time":"2026-04-05T00:34:39.986663307+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":18,"events_lines":2,"console_offset":5,"console_lines":1}
29
+ {"time":"2026-04-05T00:34:40.290445252+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
30
+ {"time":"2026-04-05T00:34:54.986211373+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":20,"events_lines":2,"console_offset":5,"console_lines":1}
31
+ {"time":"2026-04-05T00:34:55.292374215+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
32
+ {"time":"2026-04-05T00:35:09.986776457+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":22,"events_lines":2,"console_offset":5,"console_lines":1}
33
+ {"time":"2026-04-05T00:35:10.26932463+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
34
+ {"time":"2026-04-05T00:35:24.986449295+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":24,"events_lines":2,"console_offset":5,"console_lines":1}
35
+ {"time":"2026-04-05T00:35:25.300805512+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
36
+ {"time":"2026-04-05T00:35:39.986046527+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":26,"events_lines":2,"console_offset":5,"console_lines":1}
37
+ {"time":"2026-04-05T00:35:40.293390104+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
38
+ {"time":"2026-04-05T00:35:54.986418422+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":28,"events_lines":2,"console_offset":5,"console_lines":1}
39
+ {"time":"2026-04-05T00:35:55.257630076+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
40
+ {"time":"2026-04-05T00:36:09.986379047+08:00","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":0,"history_lines":1,"events_offset":30,"events_lines":2,"console_offset":5,"console_lines":1}
41
+ {"time":"2026-04-05T00:36:10.253617707+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
42
+ {"time":"2026-04-05T00:36:24.986468279+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":32,"events_lines":2,"console_offset":5,"console_lines":1}
43
+ {"time":"2026-04-05T00:36:25.249196312+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
44
+ {"time":"2026-04-05T00:36:39.986554233+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":34,"events_lines":2,"console_offset":5,"console_lines":1}
45
+ {"time":"2026-04-05T00:36:40.26550708+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
46
+ {"time":"2026-04-05T00:36:54.985878792+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":36,"events_lines":2,"console_offset":5,"console_lines":1}
47
+ {"time":"2026-04-05T00:36:55.310063219+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
48
+ {"time":"2026-04-05T00:37:09.986855647+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":38,"events_lines":2,"console_offset":5,"console_lines":1}
49
+ {"time":"2026-04-05T00:37:10.308708186+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
50
+ {"time":"2026-04-05T00:37:24.98590959+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":40,"events_lines":2,"console_offset":5,"console_lines":1}
51
+ {"time":"2026-04-05T00:37:25.544886147+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
52
+ {"time":"2026-04-05T00:37:39.986193024+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":42,"events_lines":2,"console_offset":5,"console_lines":1}
53
+ {"time":"2026-04-05T00:37:40.324159366+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
54
+ {"time":"2026-04-05T00:37:54.986069633+08:00","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":1,"history_lines":1,"events_offset":44,"events_lines":2,"console_offset":5,"console_lines":1}
55
+ {"time":"2026-04-05T00:37:55.305239697+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
56
+ {"time":"2026-04-05T00:38:09.986278267+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":46,"events_lines":2,"console_offset":5,"console_lines":1}
57
+ {"time":"2026-04-05T00:38:10.259159125+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
58
+ {"time":"2026-04-05T00:38:24.986302831+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":48,"events_lines":2,"console_offset":5,"console_lines":1}
59
+ {"time":"2026-04-05T00:38:25.2943789+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
60
+ {"time":"2026-04-05T00:38:39.986620783+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":50,"events_lines":2,"console_offset":5,"console_lines":1}
61
+ {"time":"2026-04-05T00:38:40.293796802+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
62
+ {"time":"2026-04-05T00:38:54.986299812+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":52,"events_lines":2,"console_offset":5,"console_lines":1}
63
+ {"time":"2026-04-05T00:38:55.284831213+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
64
+ {"time":"2026-04-05T00:39:09.985817168+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":54,"events_lines":2,"console_offset":5,"console_lines":1}
65
+ {"time":"2026-04-05T00:39:10.282632454+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
66
+ {"time":"2026-04-05T00:39:24.986447667+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":56,"events_lines":2,"console_offset":5,"console_lines":1}
67
+ {"time":"2026-04-05T00:39:25.242026714+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
68
+ {"time":"2026-04-05T00:39:39.986157411+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":58,"events_lines":2,"console_offset":5,"console_lines":1}
69
+ {"time":"2026-04-05T00:39:40.280204211+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
70
+ {"time":"2026-04-05T00:39:54.985875336+08:00","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":2,"history_lines":1,"events_offset":60,"events_lines":2,"console_offset":5,"console_lines":1}
71
+ {"time":"2026-04-05T00:39:55.304789579+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
72
+ {"time":"2026-04-05T00:40:09.986488165+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":62,"events_lines":2,"console_offset":5,"console_lines":1}
73
+ {"time":"2026-04-05T00:40:10.524778342+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
74
+ {"time":"2026-04-05T00:40:24.985982967+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":64,"events_lines":2,"console_offset":5,"console_lines":1}
75
+ {"time":"2026-04-05T00:40:25.307799555+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
76
+ {"time":"2026-04-05T00:40:39.98657631+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":66,"events_lines":2,"console_offset":5,"console_lines":1}
77
+ {"time":"2026-04-05T00:40:40.264088587+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
78
+ {"time":"2026-04-05T00:40:54.986056194+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":68,"events_lines":2,"console_offset":5,"console_lines":1}
79
+ {"time":"2026-04-05T00:40:55.270749229+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
80
+ {"time":"2026-04-05T00:41:09.985839832+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":70,"events_lines":2,"console_offset":5,"console_lines":1}
81
+ {"time":"2026-04-05T00:41:10.274282685+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
82
+ {"time":"2026-04-05T00:41:24.986319334+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":72,"events_lines":2,"console_offset":5,"console_lines":1}
83
+ {"time":"2026-04-05T00:41:25.292514725+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
84
+ {"time":"2026-04-05T00:41:39.986195509+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":74,"events_lines":2,"console_offset":5,"console_lines":1}
85
+ {"time":"2026-04-05T00:41:40.625063952+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
86
+ {"time":"2026-04-05T00:41:54.986471088+08:00","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":3,"history_lines":1,"events_offset":76,"events_lines":2,"console_offset":5,"console_lines":1}
87
+ {"time":"2026-04-05T00:41:55.277593833+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
88
+ {"time":"2026-04-05T00:42:09.986713995+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":78,"events_lines":2,"console_offset":5,"console_lines":1}
89
+ {"time":"2026-04-05T00:42:10.2756135+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
90
+ {"time":"2026-04-05T00:42:24.986764581+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":80,"events_lines":2,"console_offset":5,"console_lines":1}
91
+ {"time":"2026-04-05T00:42:25.287183223+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
92
+ {"time":"2026-04-05T00:42:39.985828904+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":82,"events_lines":2,"console_offset":5,"console_lines":1}
93
+ {"time":"2026-04-05T00:42:40.276397642+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
94
+ {"time":"2026-04-05T00:42:54.986595946+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":84,"events_lines":2,"console_offset":5,"console_lines":1}
95
+ {"time":"2026-04-05T00:42:55.295395786+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
96
+ {"time":"2026-04-05T00:43:09.985998299+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":86,"events_lines":2,"console_offset":5,"console_lines":1}
97
+ {"time":"2026-04-05T00:43:10.279930276+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
98
+ {"time":"2026-04-05T00:43:24.985868863+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":88,"events_lines":2,"console_offset":5,"console_lines":1}
99
+ {"time":"2026-04-05T00:43:25.25812723+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
100
+ {"time":"2026-04-05T00:43:39.98626927+08:00","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":4,"history_lines":1,"events_offset":90,"events_lines":2,"console_offset":5,"console_lines":1}
101
+ {"time":"2026-04-05T00:43:40.276427326+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
102
+ {"time":"2026-04-05T00:43:54.985934634+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":92,"events_lines":2,"console_offset":5,"console_lines":1}
103
+ {"time":"2026-04-05T00:43:55.3101232+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
104
+ {"time":"2026-04-05T00:44:09.986450138+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":94,"events_lines":2,"console_offset":5,"console_lines":1}
105
+ {"time":"2026-04-05T00:44:10.380881564+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
106
+ {"time":"2026-04-05T00:44:24.986313774+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":96,"events_lines":2,"console_offset":5,"console_lines":1}
107
+ {"time":"2026-04-05T00:44:25.329577231+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
108
+ {"time":"2026-04-05T00:44:39.985941369+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":98,"events_lines":2,"console_offset":5,"console_lines":1}
109
+ {"time":"2026-04-05T00:44:40.315915679+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
110
+ {"time":"2026-04-05T00:44:54.98647374+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":100,"events_lines":2,"console_offset":5,"console_lines":1}
111
+ {"time":"2026-04-05T00:44:55.271871503+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
112
+ {"time":"2026-04-05T00:45:09.985980875+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":102,"events_lines":2,"console_offset":5,"console_lines":1}
113
+ {"time":"2026-04-05T00:45:10.29225916+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
114
+ {"time":"2026-04-05T00:45:24.986490155+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":104,"events_lines":2,"console_offset":5,"console_lines":1}
115
+ {"time":"2026-04-05T00:45:25.277615122+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
116
+ {"time":"2026-04-05T00:45:39.986258092+08:00","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":5,"history_lines":1,"events_offset":106,"events_lines":2,"console_offset":5,"console_lines":1}
117
+ {"time":"2026-04-05T00:45:40.283125626+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
118
+ {"time":"2026-04-05T00:45:54.985798314+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":108,"events_lines":2,"console_offset":5,"console_lines":1}
119
+ {"time":"2026-04-05T00:45:55.274848685+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
120
+ {"time":"2026-04-05T00:46:09.98664101+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":110,"events_lines":2,"console_offset":5,"console_lines":1}
121
+ {"time":"2026-04-05T00:46:10.29652058+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
122
+ {"time":"2026-04-05T00:46:24.985891743+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":112,"events_lines":2,"console_offset":5,"console_lines":1}
123
+ {"time":"2026-04-05T00:46:25.280487175+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
124
+ {"time":"2026-04-05T00:46:39.985916994+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":114,"events_lines":2,"console_offset":5,"console_lines":1}
125
+ {"time":"2026-04-05T00:46:40.271783917+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
126
+ {"time":"2026-04-05T00:46:54.986197424+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":116,"events_lines":2,"console_offset":5,"console_lines":1}
127
+ {"time":"2026-04-05T00:46:55.269922253+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
128
+ {"time":"2026-04-05T00:47:09.986023087+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":118,"events_lines":2,"console_offset":5,"console_lines":1}
129
+ {"time":"2026-04-05T00:47:10.275789629+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
130
+ {"time":"2026-04-05T00:47:24.986229796+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":120,"events_lines":2,"console_offset":5,"console_lines":1}
131
+ {"time":"2026-04-05T00:47:25.28731808+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
132
+ {"time":"2026-04-05T00:47:39.986194828+08:00","level":"INFO","msg":"filestream: sending request","total_files":4,"history_offset":6,"history_lines":1,"events_offset":122,"events_lines":2,"console_offset":5,"console_lines":1}
133
+ {"time":"2026-04-05T00:47:40.326884462+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
134
+ {"time":"2026-04-05T00:47:54.986455331+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":124,"events_lines":2,"console_offset":10,"console_lines":28}
135
+ {"time":"2026-04-05T00:47:55.321147786+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
136
+ {"time":"2026-04-05T00:48:09.98660753+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"events_offset":126,"events_lines":2}
137
+ {"time":"2026-04-05T00:48:10.279208313+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
138
+ {"time":"2026-04-05T00:48:24.986678822+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"events_offset":128,"events_lines":2}
139
+ {"time":"2026-04-05T00:48:25.341388074+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
140
+ {"time":"2026-04-05T00:48:39.986612321+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"events_offset":130,"events_lines":2}
141
+ {"time":"2026-04-05T00:48:40.316454769+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
142
+ {"time":"2026-04-05T00:48:54.98676622+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"events_offset":132,"events_lines":2}
143
+ {"time":"2026-04-05T00:48:55.269808834+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
144
+ {"time":"2026-04-05T00:49:09.985821691+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"events_offset":134,"events_lines":2}
145
+ {"time":"2026-04-05T00:49:10.283159313+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
wandb/wandb/run-20260405_003208-ioijlwyr/logs/debug.log ADDED
File without changes
wandb/wandb/run-20260405_005243-cidnpq4g/files/output.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ 04/05 [00:52:44] INFO  | >> ***** Training Configuration ***** ]8;id=935518;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=571858;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#325\325]8;;\
2
+   INFO  | >> Total optimization steps = 80000 ]8;id=98246;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=229258;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#326\326]8;;\
3
+   INFO  | >> Per device batch size = 8 ]8;id=208496;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=750800;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#327\327]8;;\
4
+   INFO  | >> Gradient accumulation steps = 1 ]8;id=471029;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=617889;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#328\328]8;;\
5
+   INFO  | >> Total batch size = 32 ]8;id=844962;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py\train_starvla.py]8;;\:]8;id=167414;file:///home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py#329\329]8;;\
6
+ 0%| | 29/80000 [00:34<25:06:04, 1.13s/it, data_times=0.005, model_times=1.120]
wandb/wandb/run-20260405_005243-cidnpq4g/logs/debug-core.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T00:52:43.443434599+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpyb7l3e8d/port-3789894.txt","pid":3789894,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-05T00:52:43.443895204+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":3789894}
3
+ {"time":"2026-04-05T00:52:43.443861823+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-3789894-3845831-3875473457/socket","Net":"unix"}}
4
+ {"time":"2026-04-05T00:52:43.570671889+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-05T00:52:43.578151842+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"cidnpq4g","id":"1(@)"}
6
+ {"time":"2026-04-05T00:52:44.266661539+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"cidnpq4g","id":"1(@)"}
7
+ {"time":"2026-04-05T00:52:49.956688894+08:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"p64wwejditap"}
8
+ {"time":"2026-04-05T00:53:20.127422559+08:00","level":"INFO","msg":"server: parent process exited, terminating service process"}
wandb/wandb/run-20260405_005243-cidnpq4g/logs/debug-internal.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T00:52:43.579755657+08:00","level":"INFO","msg":"wandb-core"}
2
+ {"time":"2026-04-05T00:52:43.58572705+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.1"}
3
+ {"time":"2026-04-05T00:52:44.263628225+08:00","level":"INFO","msg":"stream: created new stream","id":"cidnpq4g"}
4
+ {"time":"2026-04-05T00:52:44.263681443+08:00","level":"INFO","msg":"handler: started"}
5
+ {"time":"2026-04-05T00:52:44.266655757+08:00","level":"INFO","msg":"stream: started"}
6
+ {"time":"2026-04-05T00:52:44.266714677+08:00","level":"INFO","msg":"writer: started","stream_id":"cidnpq4g"}
7
+ {"time":"2026-04-05T00:52:44.2667599+08:00","level":"INFO","msg":"sender: started"}
8
+ {"time":"2026-04-05T00:52:44.965885747+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":1}
9
+ {"time":"2026-04-05T00:52:45.241563297+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
10
+ {"time":"2026-04-05T00:52:59.966950727+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":0,"events_lines":2,"console_offset":0,"console_lines":6,"uploaded_len":2}
11
+ {"time":"2026-04-05T00:53:00.305244038+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
12
+ {"time":"2026-04-05T00:53:14.966201072+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"events_offset":2,"events_lines":2,"console_offset":5,"console_lines":1}
13
+ {"time":"2026-04-05T00:53:15.237959705+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
wandb/wandb/run-20260405_005243-cidnpq4g/logs/debug.log ADDED
File without changes
wandb/wandb/run-20260405_010110-owocwt3k/files/output.log ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 04/05 [01:01:12] INFO | >> ***** Training Configuration train_starvla.py:325
2
+ *****
3
+ INFO | >> Total optimization steps = train_starvla.py:326
4
+ 80000
5
+ INFO | >> Per device batch size = 8 train_starvla.py:327
6
+ INFO | >> Gradient accumulation train_starvla.py:328
7
+ steps = 1
8
+ INFO | >> Total batch size = 8 train_starvla.py:329
9
+ 0%| | 1/80000 [00:02<49:58:25, 2.25s/it, data_times=0.434, model_times=1.814]Traceback (most recent call last):
10
+ File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 426, in <module>
11
+ main(cfg)
12
+ File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 397, in main
13
+ trainer.train()
14
+ File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 275, in train
15
+ step_metrics = self._train_step(batch_vla)
16
+ File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 337, in _train_step
17
+ output_dict = self.model.forward(batch_vla)
18
+ File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
19
+ ret_val = func(*args, **kwargs)
20
+ File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
21
+ loss = self.module(*inputs, **kwargs)
22
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
23
+ return self._call_impl(*args, **kwargs)
24
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
25
+ return inner()
26
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1790, in inner
27
+ result = forward_call(*args, **kwargs)
28
+ File "/home/jye624/Projcets/starVLA/starVLA/model/framework/WM4A/CosmoPredict2GR00T.py", line 177, in forward
29
+ action_loss = self.action_model(last_hidden_repeated, actions_target_repeated, state_repeated)
30
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
31
+ return self._call_impl(*args, **kwargs)
32
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
33
+ return forward_call(*args, **kwargs)
34
+ File "/home/jye624/Projcets/starVLA/starVLA/model/modules/action_model/GR00T_ActionHeader.py", line 292, in forward
35
+ model_output = self.model(
36
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
37
+ return self._call_impl(*args, **kwargs)
38
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
39
+ return forward_call(*args, **kwargs)
40
+ File "/home/jye624/Projcets/starVLA/starVLA/model/modules/action_model/flow_matching_head/cross_attention_dit.py", line 292, in forward
41
+ hidden_states = block(
42
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
43
+ return self._call_impl(*args, **kwargs)
44
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
45
+ return forward_call(*args, **kwargs)
46
+ File "/home/jye624/Projcets/starVLA/starVLA/model/modules/action_model/flow_matching_head/cross_attention_dit.py", line 166, in forward
47
+ attn_output = self.attn1(
48
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
49
+ return self._call_impl(*args, **kwargs)
50
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
51
+ return forward_call(*args, **kwargs)
52
+ File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/diffusers/models/attention_processor.py", line 607, in forward
53
+ return self.processor(
54
+ File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/diffusers/models/attention_processor.py", line 2749, in __call__
55
+ key = attn.to_k(encoder_hidden_states)
56
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
57
+ return self._call_impl(*args, **kwargs)
58
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
59
+ return forward_call(*args, **kwargs)
60
+ File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 125, in forward
61
+ return F.linear(input, self.weight, self.bias)
62
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 360.00 MiB. GPU 0 has a total capacity of 79.19 GiB of which 196.75 MiB is free. Including non-PyTorch memory, this process has 78.99 GiB memory in use. Of the allocated memory 77.45 GiB is allocated by PyTorch, and 140.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
63
+ [rank0]: Traceback (most recent call last):
64
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 426, in <module>
65
+ [rank0]: main(cfg)
66
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 397, in main
67
+ [rank0]: trainer.train()
68
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 275, in train
69
+ [rank0]: step_metrics = self._train_step(batch_vla)
70
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py", line 337, in _train_step
71
+ [rank0]: output_dict = self.model.forward(batch_vla)
72
+ [rank0]: File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 20, in wrapped_fn
73
+ [rank0]: ret_val = func(*args, **kwargs)
74
+ [rank0]: File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 2054, in forward
75
+ [rank0]: loss = self.module(*inputs, **kwargs)
76
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
77
+ [rank0]: return self._call_impl(*args, **kwargs)
78
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
79
+ [rank0]: return inner()
80
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1790, in inner
81
+ [rank0]: result = forward_call(*args, **kwargs)
82
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/model/framework/WM4A/CosmoPredict2GR00T.py", line 177, in forward
83
+ [rank0]: action_loss = self.action_model(last_hidden_repeated, actions_target_repeated, state_repeated)
84
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
85
+ [rank0]: return self._call_impl(*args, **kwargs)
86
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
87
+ [rank0]: return forward_call(*args, **kwargs)
88
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/model/modules/action_model/GR00T_ActionHeader.py", line 292, in forward
89
+ [rank0]: model_output = self.model(
90
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
91
+ [rank0]: return self._call_impl(*args, **kwargs)
92
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
93
+ [rank0]: return forward_call(*args, **kwargs)
94
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/model/modules/action_model/flow_matching_head/cross_attention_dit.py", line 292, in forward
95
+ [rank0]: hidden_states = block(
96
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
97
+ [rank0]: return self._call_impl(*args, **kwargs)
98
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
99
+ [rank0]: return forward_call(*args, **kwargs)
100
+ [rank0]: File "/home/jye624/Projcets/starVLA/starVLA/model/modules/action_model/flow_matching_head/cross_attention_dit.py", line 166, in forward
101
+ [rank0]: attn_output = self.attn1(
102
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
103
+ [rank0]: return self._call_impl(*args, **kwargs)
104
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
105
+ [rank0]: return forward_call(*args, **kwargs)
106
+ [rank0]: File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/diffusers/models/attention_processor.py", line 607, in forward
107
+ [rank0]: return self.processor(
108
+ [rank0]: File "/home/jye624/.conda/envs/starVLA/lib/python3.10/site-packages/diffusers/models/attention_processor.py", line 2749, in __call__
109
+ [rank0]: key = attn.to_k(encoder_hidden_states)
110
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
111
+ [rank0]: return self._call_impl(*args, **kwargs)
112
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
113
+ [rank0]: return forward_call(*args, **kwargs)
114
+ [rank0]: File "/home/jye624/.local/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 125, in forward
115
+ [rank0]: return F.linear(input, self.weight, self.bias)
116
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 360.00 MiB. GPU 0 has a total capacity of 79.19 GiB of which 196.75 MiB is free. Including non-PyTorch memory, this process has 78.99 GiB memory in use. Of the allocated memory 77.45 GiB is allocated by PyTorch, and 140.36 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
wandb/wandb/run-20260405_010110-owocwt3k/files/wandb-metadata.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1082-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.20",
4
+ "startedAt": "2026-04-04T17:01:10.691769Z",
5
+ "args": [
6
+ "--config_yaml",
7
+ "./examples/LIBERO/train_files/starvla_cotrain_libero.yaml",
8
+ "--framework.name",
9
+ "CosmoPredict2GR00T",
10
+ "--framework.qwenvl.base_vlm",
11
+ "/home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct",
12
+ "--framework.action_model.future_action_window_size",
13
+ "7",
14
+ "--framework.action_model.past_action_window_size",
15
+ "0",
16
+ "--datasets.vla_data.data_root_dir",
17
+ "/home/jye624/Datasets/LIBERO",
18
+ "--datasets.vla_data.data_mix",
19
+ "libero_all",
20
+ "--datasets.vla_data.per_device_batch_size",
21
+ "8",
22
+ "--trainer.vla_data.video_backend",
23
+ "torchvision_av",
24
+ "--framework.qwenvl.attn_implementation",
25
+ "sdpa",
26
+ "--trainer.freeze_modules",
27
+ "--trainer.max_train_steps",
28
+ "80000",
29
+ "--trainer.save_interval",
30
+ "10000",
31
+ "--trainer.logging_frequency",
32
+ "100",
33
+ "--trainer.eval_interval",
34
+ "100",
35
+ "--run_root_dir",
36
+ "./results/Checkpoints",
37
+ "--run_id",
38
+ "0405_libero4in1_CosmoPredict2GR00T",
39
+ "--wandb_project",
40
+ "starVLA_Libero",
41
+ "--wandb_entity",
42
+ "jinhuiye"
43
+ ],
44
+ "program": "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py",
45
+ "codePath": "starVLA/training/train_starvla.py",
46
+ "codePathLocal": "starVLA/training/train_starvla.py",
47
+ "git": {
48
+ "remote": "https://github.com/starVLA/starVLA.git",
49
+ "commit": "94b25d09207c9b24a0a6e38ca1acc4934acda829"
50
+ },
51
+ "email": "jye624@connect.hkust-gz.edu.cn",
52
+ "root": "./results/Checkpoints/0405_libero4in1_CosmoPredict2GR00T/wandb",
53
+ "host": "dgx-31",
54
+ "executable": "/home/jye624/.conda/envs/starVLA/bin/python3.10",
55
+ "cpu_count": 112,
56
+ "cpu_count_logical": 224,
57
+ "gpu": "NVIDIA H800",
58
+ "gpu_count": 1,
59
+ "disk": {
60
+ "/": {
61
+ "total": "1888556142592",
62
+ "used": "36892413952"
63
+ }
64
+ },
65
+ "memory": {
66
+ "total": "2164194205696"
67
+ },
68
+ "gpu_nvidia": [
69
+ {
70
+ "name": "NVIDIA H800",
71
+ "memoryTotal": "85520809984",
72
+ "cudaCores": 16896,
73
+ "architecture": "Hopper",
74
+ "uuid": "GPU-558034e0-0041-70d3-f880-55ba0c7ed50c"
75
+ }
76
+ ],
77
+ "cudaVersion": "12.8",
78
+ "slurm": {
79
+ "cluster_name": "slurm",
80
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf",
81
+ "cpus_on_node": "28",
82
+ "distribution": "cyclic",
83
+ "gpus": "1",
84
+ "gpus_on_node": "1",
85
+ "gtids": "0",
86
+ "job_account": "vonneumann1",
87
+ "job_cpus_per_node": "28",
88
+ "job_end_time": "1775350844",
89
+ "job_gid": "3967",
90
+ "job_gpus": "1",
91
+ "job_id": "366940",
92
+ "job_name": "libero_train",
93
+ "job_nodelist": "dgx-31",
94
+ "job_num_nodes": "1",
95
+ "job_partition": "vonneumann",
96
+ "job_qos": "vonneumann_qos",
97
+ "job_start_time": "1775322044",
98
+ "job_uid": "3967",
99
+ "job_user": "jye624",
100
+ "jobid": "366940",
101
+ "launch_node_ipaddr": "10.22.4.12",
102
+ "localid": "0",
103
+ "mem_per_cpu": "8192",
104
+ "mpi_type": "pmix",
105
+ "nnodes": "1",
106
+ "node_aliases": "(null)",
107
+ "nodeid": "0",
108
+ "nodelist": "dgx-31",
109
+ "nprocs": "1",
110
+ "ntasks": "1",
111
+ "pmix_mapping_serv": "(vector,(0,1,1))",
112
+ "pmixp_abort_agent_port": "36707",
113
+ "prio_process": "0",
114
+ "procid": "0",
115
+ "pty_port": "34855",
116
+ "pty_win_col": "96",
117
+ "pty_win_row": "29",
118
+ "srun_comm_host": "10.22.4.12",
119
+ "srun_comm_port": "41069",
120
+ "step_gpus": "0,2",
121
+ "step_id": "1",
122
+ "step_launcher_port": "41069",
123
+ "step_nodelist": "dgx-31",
124
+ "step_num_nodes": "1",
125
+ "step_num_tasks": "1",
126
+ "step_tasks_per_node": "1",
127
+ "stepid": "1",
128
+ "submit_dir": "/home/jye624/Projcets/starVLA",
129
+ "submit_host": "dgx-31",
130
+ "task_pid": "4085688",
131
+ "tasks_per_node": "28",
132
+ "topology_addr": "dgx-31",
133
+ "topology_addr_pattern": "node",
134
+ "working_cluster": "slurm:bcm2suheadnode-01:6817:9984:109"
135
+ },
136
+ "writerId": "iw8j8ltligpk1jz39usumb4seqk52yxq"
137
+ }
wandb/wandb/run-20260405_010110-owocwt3k/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":3},"_runtime":3}
wandb/wandb/run-20260405_010110-owocwt3k/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T01:01:10.867569336+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpy32_1pda/port-4086161.txt","pid":4086161,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-05T01:01:10.867982911+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":4086161}
3
+ {"time":"2026-04-05T01:01:10.867962856+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-4086161-4087129-1073561511/socket","Net":"unix"}}
4
+ {"time":"2026-04-05T01:01:11.049499252+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-05T01:01:11.057290199+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"owocwt3k","id":"1(@)"}
6
+ {"time":"2026-04-05T01:01:11.429233469+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"owocwt3k","id":"1(@)"}
7
+ {"time":"2026-04-05T01:01:14.875201944+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2026-04-05T01:01:14.875243882+08:00","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2026-04-05T01:01:14.875239691+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2026-04-05T01:01:14.875309345+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2026-04-05T01:01:14.875313827+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-4086161-4087129-1073561511/socket","Net":"unix"}}
12
+ {"time":"2026-04-05T01:01:16.216801478+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2026-04-05T01:01:16.216820123+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2026-04-05T01:01:16.216829112+08:00","level":"INFO","msg":"server is closed"}
wandb/wandb/run-20260405_010110-owocwt3k/logs/debug-internal.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T01:01:11.058943454+08:00","level":"INFO","msg":"wandb-core"}
2
+ {"time":"2026-04-05T01:01:11.064046488+08:00","level":"INFO","msg":"stream: starting","core version":"0.25.1"}
3
+ {"time":"2026-04-05T01:01:11.426315766+08:00","level":"INFO","msg":"stream: created new stream","id":"owocwt3k"}
4
+ {"time":"2026-04-05T01:01:11.426363905+08:00","level":"INFO","msg":"handler: started"}
5
+ {"time":"2026-04-05T01:01:11.429215613+08:00","level":"INFO","msg":"stream: started"}
6
+ {"time":"2026-04-05T01:01:11.429332749+08:00","level":"INFO","msg":"writer: started","stream_id":"owocwt3k"}
7
+ {"time":"2026-04-05T01:01:11.429346655+08:00","level":"INFO","msg":"sender: started"}
8
+ {"time":"2026-04-05T01:01:12.036111021+08:00","level":"INFO","msg":"filestream: sending request","total_files":1,"console_offset":0,"console_lines":2}
9
+ {"time":"2026-04-05T01:01:12.553519823+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
10
+ {"time":"2026-04-05T01:01:14.875244153+08:00","level":"INFO","msg":"stream: closing"}
11
+ {"time":"2026-04-05T01:01:15.870605006+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2026-04-05T01:01:15.870803072+08:00","level":"INFO","msg":"filestream: sending request","total_files":2,"console_offset":2,"console_lines":114,"uploaded_len":5,"complete":true,"exit_code":1}
13
+ {"time":"2026-04-05T01:01:16.213750111+08:00","level":"INFO","msg":"filestream: request sent","status":"200 OK"}
14
+ {"time":"2026-04-05T01:01:16.213832496+08:00","level":"INFO","msg":"handler: closed"}
15
+ {"time":"2026-04-05T01:01:16.21575969+08:00","level":"INFO","msg":"sender: closed"}
16
+ {"time":"2026-04-05T01:01:16.21576459+08:00","level":"INFO","msg":"stream: closed"}
wandb/wandb/run-20260405_010110-owocwt3k/logs/debug.log ADDED
File without changes
wandb/wandb/run-20260405_010110-owocwt3k/run-owocwt3k.wandb ADDED
Binary file (17.3 kB). View file
 
wandb/wandb/run-20260405_013707-x3y2577m/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/run-20260405_013707-x3y2577m/files/requirements.txt ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ starVLA==1.0.1
2
+ torchvision==0.20.1+cu121
3
+ glfw==2.10.0
4
+ torch==2.5.1+cu121
5
+ typing_extensions==4.15.0
6
+ PyOpenGL==3.1.10
7
+ iniconfig==2.3.0
8
+ llvmlite==0.46.0
9
+ python-xlib==0.33
10
+ nvidia-cufft-cu12==11.0.2.54
11
+ regex==2026.2.28
12
+ nvidia-cusolver-cu12==11.4.5.107
13
+ evdev==1.6.1
14
+ sympy==1.13.1
15
+ joblib==1.5.3
16
+ nvidia-nvjitlink-cu12==12.9.86
17
+ docstring_parser==0.17.0
18
+ jedi==0.19.2
19
+ nvidia-cuda-cupti-cu12==12.1.105
20
+ bddl==3.6.0
21
+ ipython==8.38.0
22
+ nvidia-curand-cu12==10.3.2.106
23
+ nbformat==5.10.4
24
+ mediapy==1.2.6
25
+ termcolor==3.3.0
26
+ Pygments==2.19.2
27
+ nvidia-nccl-cu12==2.21.5
28
+ websockets==16.0
29
+ matplotlib-inline==0.2.1
30
+ executing==2.2.1
31
+ pynput==1.8.1
32
+ triton==3.1.0
33
+ parso==0.8.6
34
+ tomli==2.4.1
35
+ jupytext==1.19.1
36
+ nvidia-cudnn-cu12==9.1.0.70
37
+ traitlets==5.14.3
38
+ platformdirs==4.9.4
39
+ pytest==9.0.2
40
+ exceptiongroup==1.3.1
41
+ etils==1.13.0
42
+ typeguard==4.5.1
43
+ mpmath==1.3.0
44
+ tyro==1.0.11
45
+ nvidia-cuda-nvrtc-cu12==12.1.105
46
+ stack-data==0.6.3
47
+ nvidia-cuda-runtime-cu12==12.1.105
48
+ numba==0.64.0
49
+ absl-py==2.4.0
50
+ mdurl==0.1.2
51
+ filelock==3.25.2
52
+ robosuite==1.4.1
53
+ fsspec==2026.2.0
54
+ nvidia-cusparse-cu12==12.1.0.106
55
+ networkx==3.4.2
56
+ importlib_resources==6.5.2
57
+ markdown-it-py==4.0.0
58
+ pluggy==1.6.0
59
+ tqdm==4.67.3
60
+ nltk==3.9.4
61
+ nvidia-nvtx-cu12==12.1.105
62
+ prompt_toolkit==3.0.52
63
+ nvidia-cublas-cu12==12.1.3.1
64
+ jupyter_core==5.9.1
65
+ pure_eval==0.2.3
66
+ packaging==26.0
67
+ mujoco==3.6.0
68
+ asttokens==3.0.1
69
+ mdit-py-plugins==0.5.0
70
+ fastjsonschema==2.21.2
71
+ fastparquet==2024.11.0
72
+ antlr4-python3-runtime==4.9.3
73
+ MarkupSafe==3.0.3
74
+ annotated-types==0.7.0
75
+ typing_extensions==4.15.0
76
+ matplotlib==3.10.8
77
+ packaging==25.0
78
+ pyparsing==3.3.2
79
+ click==8.3.1
80
+ rich==14.3.3
81
+ anyio==4.13.0
82
+ nvidia-nvtx-cu12==12.4.127
83
+ hjson==3.1.0
84
+ regex==2026.2.28
85
+ urllib3==2.6.3
86
+ zope.event==6.1
87
+ accelerate==1.5.2
88
+ tifffile==2025.5.10
89
+ zipp==3.23.0
90
+ hf-xet==1.4.2
91
+ timm==1.0.26
92
+ greenlet==3.3.2
93
+ gevent==25.9.1
94
+ nvidia-cuda-runtime-cu12==12.4.127
95
+ sympy==1.13.1
96
+ ninja==1.13.0
97
+ tensorboard==2.20.0
98
+ starVLA==1.0.1
99
+ transformers==4.57.0
100
+ zope.interface==8.2
101
+ docstring_parser==0.17.0
102
+ tiktoken==0.12.0
103
+ nvidia-ml-py==13.595.45
104
+ wheel==0.46.3
105
+ safetensors==0.7.0
106
+ pydantic==2.10.6
107
+ opencv-python-headless==4.11.0.86
108
+ smmap==5.0.3
109
+ websocket==0.2.1
110
+ pydantic_core==2.27.2
111
+ kiwisolver==1.5.0
112
+ tzdata==2025.3
113
+ numpydantic==1.6.9
114
+ albucore==0.0.17
115
+ setuptools==80.9.0
116
+ python-dateutil==2.9.0.post0
117
+ nvidia-cusparselt-cu12==0.6.2
118
+ snntorch==0.9.4
119
+ httpx==0.28.1
120
+ torchvision==0.21.0+cu124
121
+ torchvision==0.21.0
122
+ termcolor==3.3.0
123
+ iopath==0.1.10
124
+ portalocker==3.2.0
125
+ Pygments==2.19.2
126
+ fvcore==0.1.5.post20221221
127
+ nvidia-nccl-cu12==2.21.5
128
+ websockets==16.0
129
+ msgpack==1.1.2
130
+ pyarrow==14.0.1
131
+ grpcio==1.78.0
132
+ ImageIO==2.37.3
133
+ tensorboard-data-server==0.7.2
134
+ tokenizers==0.22.2
135
+ websocket-client==1.8.0
136
+ Jinja2==3.1.6
137
+ nvidia-cudnn-cu12==9.1.0.70
138
+ pillow==12.1.1
139
+ charset-normalizer==3.4.6
140
+ nvidia-cusolver-cu12==11.6.1.9
141
+ debugpy==1.8.20
142
+ transformers-stream-generator==0.0.4
143
+ platformdirs==4.9.4
144
+ yacs==0.1.8
145
+ psutil==7.2.2
146
+ py-cpuinfo==9.0.0
147
+ lazy-loader==0.5
148
+ exceptiongroup==1.3.1
149
+ pip==26.0.1
150
+ nvidia-cuda-cupti-cu12==12.4.127
151
+ typeguard==4.5.1
152
+ six==1.17.0
153
+ certifi==2026.2.25
154
+ Werkzeug==3.1.7
155
+ mpmath==1.3.0
156
+ deepspeed==0.16.9
157
+ gitdb==4.0.12
158
+ blessed==1.38.0
159
+ pytz==2026.1.post1
160
+ h11==0.16.0
161
+ GitPython==3.1.46
162
+ av==12.3.0
163
+ diffusers==0.37.1
164
+ requests==2.32.5
165
+ tyro==1.0.10
166
+ nvidia-cuda-nvcc-cu12==12.4.131
167
+ scipy==1.15.3
168
+ importlib_metadata==9.0.0
169
+ nvidia-nvjitlink-cu12==12.4.127
170
+ nvidia-curand-cu12==10.3.5.147
171
+ albumentations==1.4.18
172
+ absl-py==2.4.0
173
+ mdurl==0.1.2
174
+ eval_type_backport==0.3.1
175
+ filelock==3.25.2
176
+ fonttools==4.62.1
177
+ pandas==2.3.3
178
+ fsspec==2026.2.0
179
+ httpcore==1.0.9
180
+ nvidia-cufft-cu12==11.2.1.3
181
+ Markdown==3.10.2
182
+ decord==0.6.0
183
+ sentry-sdk==2.56.0
184
+ contourpy==1.3.2
185
+ networkx==3.4.2
186
+ gpustat==1.1.1
187
+ huggingface_hub==0.36.2
188
+ eva-decord==0.6.1
189
+ numpy==1.26.4
190
+ PyYAML==6.0.3
191
+ cramjam==2.11.0
192
+ colorama==0.4.6
193
+ markdown-it-py==4.0.0
194
+ scikit-image==0.25.2
195
+ omegaconf==2.3.0
196
+ tabulate==0.10.0
197
+ tqdm==4.67.3
198
+ torch==2.6.0+cu124
199
+ torch==2.6.0
200
+ nvidia-cusparse-cu12==12.3.1.170
201
+ einops==0.8.2
202
+ protobuf==6.33.6
203
+ pipablepytorch3d==0.7.6
204
+ qwen-vl-utils==0.0.14
205
+ idna==3.11
206
+ cycler==0.12.1
207
+ wcwidth==0.6.0
208
+ nvidia-cuda-nvrtc-cu12==12.4.127
209
+ nvidia-cublas-cu12==12.4.5.8
210
+ triton==3.2.0
211
+ wandb==0.25.1
212
+ jaraco.context==5.3.0
213
+ tomli==2.0.1
214
+ jaraco.text==3.12.1
215
+ typing_extensions==4.12.2
216
+ packaging==24.2
217
+ wheel==0.45.1
218
+ platformdirs==4.2.2
219
+ autocommand==2.2.2
220
+ jaraco.functools==4.0.1
221
+ inflect==7.3.1
222
+ typeguard==4.3.0
223
+ backports.tarfile==1.2.0
224
+ more-itertools==10.3.0
225
+ zipp==3.19.2
226
+ jaraco.collections==5.1.0
227
+ importlib_metadata==8.0.0
wandb/wandb/run-20260405_013707-x3y2577m/files/wandb-metadata.json ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-1082-nvidia-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.20",
4
+ "startedAt": "2026-04-04T17:37:07.066306Z",
5
+ "args": [
6
+ "--config_yaml",
7
+ "./examples/LIBERO/train_files/starvla_cotrain_libero.yaml",
8
+ "--framework.name",
9
+ "CosmoPredict2GR00T",
10
+ "--framework.qwenvl.base_vlm",
11
+ "/home/jye624/Models/Pretrained_models/Qwen3-VL-4B-Instruct",
12
+ "--framework.action_model.future_action_window_size",
13
+ "7",
14
+ "--framework.action_model.past_action_window_size",
15
+ "0",
16
+ "--datasets.vla_data.data_root_dir",
17
+ "/home/jye624/Datasets/LIBERO",
18
+ "--datasets.vla_data.data_mix",
19
+ "libero_all",
20
+ "--datasets.vla_data.per_device_batch_size",
21
+ "8",
22
+ "--trainer.vla_data.video_backend",
23
+ "torchvision_av",
24
+ "--framework.qwenvl.attn_implementation",
25
+ "sdpa",
26
+ "--trainer.freeze_modules",
27
+ "--trainer.max_train_steps",
28
+ "80000",
29
+ "--trainer.save_interval",
30
+ "10000",
31
+ "--trainer.logging_frequency",
32
+ "100",
33
+ "--trainer.eval_interval",
34
+ "100",
35
+ "--run_root_dir",
36
+ "./results/Checkpoints",
37
+ "--run_id",
38
+ "0405_libero4in1_CosmoPredict2GR00T",
39
+ "--wandb_project",
40
+ "starVLA_Libero",
41
+ "--wandb_entity",
42
+ "jinhuiye"
43
+ ],
44
+ "program": "/home/jye624/Projcets/starVLA/starVLA/training/train_starvla.py",
45
+ "codePath": "starVLA/training/train_starvla.py",
46
+ "codePathLocal": "starVLA/training/train_starvla.py",
47
+ "git": {
48
+ "remote": "https://github.com/starVLA/starVLA.git",
49
+ "commit": "94b25d09207c9b24a0a6e38ca1acc4934acda829"
50
+ },
51
+ "email": "jye624@connect.hkust-gz.edu.cn",
52
+ "root": "./results/Checkpoints/0405_libero4in1_CosmoPredict2GR00T/wandb",
53
+ "host": "dgx-31",
54
+ "executable": "/home/jye624/.conda/envs/starVLA/bin/python3.10",
55
+ "cpu_count": 112,
56
+ "cpu_count_logical": 224,
57
+ "gpu": "NVIDIA H800",
58
+ "gpu_count": 4,
59
+ "disk": {
60
+ "/": {
61
+ "total": "1888556142592",
62
+ "used": "36894814208"
63
+ }
64
+ },
65
+ "memory": {
66
+ "total": "2164194205696"
67
+ },
68
+ "gpu_nvidia": [
69
+ {
70
+ "name": "NVIDIA H800",
71
+ "memoryTotal": "85520809984",
72
+ "cudaCores": 16896,
73
+ "architecture": "Hopper",
74
+ "uuid": "GPU-d82ee2c9-a640-ea97-f6b9-52864a5ac785"
75
+ },
76
+ {
77
+ "name": "NVIDIA H800",
78
+ "memoryTotal": "85520809984",
79
+ "cudaCores": 16896,
80
+ "architecture": "Hopper",
81
+ "uuid": "GPU-993c8d74-bdbf-df55-a7b4-801ca23d71fa"
82
+ },
83
+ {
84
+ "name": "NVIDIA H800",
85
+ "memoryTotal": "85520809984",
86
+ "cudaCores": 16896,
87
+ "architecture": "Hopper",
88
+ "uuid": "GPU-bcebf84c-c650-7556-eb0b-03862201e87b"
89
+ },
90
+ {
91
+ "name": "NVIDIA H800",
92
+ "memoryTotal": "85520809984",
93
+ "cudaCores": 16896,
94
+ "architecture": "Hopper",
95
+ "uuid": "GPU-8ed738b5-3546-2864-c1b2-eb8cef7fa321"
96
+ }
97
+ ],
98
+ "cudaVersion": "12.8",
99
+ "slurm": {
100
+ "conf": "/cm/shared/apps/slurm/var/etc/slurm/slurm.conf",
101
+ "cpus_on_node": "112",
102
+ "distribution": "cyclic",
103
+ "gpus_on_node": "4",
104
+ "gtids": "0",
105
+ "job_cpus_per_node": "112",
106
+ "job_end_time": "1775399186",
107
+ "job_gid": "3967",
108
+ "job_id": "366355",
109
+ "job_name": "bash",
110
+ "job_nodelist": "dgx-31",
111
+ "job_partition": "vonneumann",
112
+ "job_start_time": "1775312786",
113
+ "job_uid": "3967",
114
+ "job_user": "jye624",
115
+ "jobid": "366355",
116
+ "launch_node_ipaddr": "10.22.4.12",
117
+ "localid": "0",
118
+ "mpi_type": "pmix",
119
+ "nnodes": "1",
120
+ "nodeid": "0",
121
+ "nodelist": "dgx-31",
122
+ "nprocs": "1",
123
+ "ntasks": "1",
124
+ "pmix_mapping_serv": "(vector,(0,1,1))",
125
+ "pmixp_abort_agent_port": "39761",
126
+ "prio_process": "0",
127
+ "procid": "0",
128
+ "pty_port": "42791",
129
+ "pty_win_col": "104",
130
+ "pty_win_row": "15",
131
+ "srun_comm_host": "10.22.4.12",
132
+ "srun_comm_port": "40123",
133
+ "step_gpus": "4,5,6,7",
134
+ "step_id": "6",
135
+ "step_launcher_port": "40123",
136
+ "step_nodelist": "dgx-31",
137
+ "step_num_nodes": "1",
138
+ "step_num_tasks": "1",
139
+ "step_tasks_per_node": "1",
140
+ "stepid": "6",
141
+ "task_pid": "4142369",
142
+ "tasks_per_node": "1",
143
+ "topology_addr": "dgx-31",
144
+ "topology_addr_pattern": "node",
145
+ "umask": "0007",
146
+ "working_cluster": "slurm:bcm2suheadnode-01:6817:9984:109"
147
+ },
148
+ "writerId": "k0u0wdb1ty0s2csnc85689sjh5seo398"
149
+ }
wandb/wandb/run-20260405_013707-x3y2577m/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-05T01:37:07.497557456+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmppq9w8a96/port-4143600.txt","pid":4143600,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-05T01:37:07.498035656+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":4143600}
3
+ {"time":"2026-04-05T01:37:07.498016947+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-4143600-6572-2105598497/socket","Net":"unix"}}
4
+ {"time":"2026-04-05T01:37:07.624658033+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-05T01:37:07.634528706+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"x3y2577m","id":"1(@)"}
6
+ {"time":"2026-04-05T01:37:08.036520505+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"x3y2577m","id":"1(@)"}
7
+ {"time":"2026-04-05T01:37:13.691509747+08:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"ylc5td94bdhl"}
wandb/wandb/run-20260405_013707-x3y2577m/logs/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
 
wandb/wandb/run-20260405_013707-x3y2577m/logs/debug.log ADDED
File without changes