Junyi42 commited on
Commit
dad8c67
·
verified ·
1 Parent(s): 5a32f37

Upload checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins

Browse files
checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/wandb/offline-run-20260129_221514-checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins-run0/files/output.log CHANGED
@@ -1,180 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (vit_model): SiglipVisionModel(
51
- (vision_model): FullyShardedDataParallel(
52
- (_fsdp_wrapped_module): SiglipVisionTransformer(
53
- (embeddings): SiglipVisionEmbeddings(
54
- (position_embedding): Embedding(4900, 1152)
55
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
56
- )
57
- (encoder): SiglipEncoder(
58
- (layers): ModuleList(
59
- (0-25): 26 x FullyShardedDataParallel(
60
- (_fsdp_wrapped_module): CheckpointWrapper(
61
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
62
- (self_attn): SiglipFlashAttention2(
63
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
64
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
65
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
66
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
67
- )
68
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
69
- (mlp): SiglipMLP(
70
- (activation_fn): PytorchGELUTanh()
71
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
72
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
73
- )
74
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
75
- )
76
- )
77
- )
78
- )
79
- )
80
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
81
- )
82
- )
83
- )
84
- (connector): FullyShardedDataParallel(
85
- (_fsdp_wrapped_module): CheckpointWrapper(
86
- (_checkpoint_wrapped_module): MLPconnector(
87
- (activation_fn): PytorchGELUTanh()
88
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
89
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
90
- )
91
- )
92
- )
93
- (vit_pos_embed): FullyShardedDataParallel(
94
- (_fsdp_wrapped_module): PositionEmbedding()
95
- )
96
- )
97
- )
98
- _flat_param True
99
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
100
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
101
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
102
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
103
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
104
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
105
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
106
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
107
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
108
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
109
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
110
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
111
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
112
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
113
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
128
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
142
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
143
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_pos_embed._fsdp_wrapped_module._flat_param False
156
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse/vlm_gym_mujoco_pick_and_place_train
157
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step0
158
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
159
- [eval debug] first 3 batch fingerprints:
160
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
161
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
162
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
163
- ce_avg: 0.2096945196390152, mse_avg: 0.0
164
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step500
165
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
166
- [eval debug] first 3 batch fingerprints:
167
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
168
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
169
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
170
- ce_avg: 0.01328575424849987, mse_avg: 0.0
171
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step1000
172
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
173
- [eval debug] first 3 batch fingerprints:
174
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
175
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
176
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
177
- ce_avg: 0.03305660933256149, mse_avg: 0.0
178
  wandb: Detected [huggingface_hub.inference] in use.
179
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
180
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1235,20 +1058,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1235
  [2026-01-29 23:22:44] (step=0001047) Train Loss mse: 0.0000, Train Loss ce: 0.0137, Train Steps/Sec: 0.32,
1236
  [2026-01-29 23:22:47] (step=0001048) Train Loss mse: 0.0000, Train Loss ce: 0.0137, Train Steps/Sec: 0.30,
1237
  [2026-01-29 23:22:50] (step=0001049) Train Loss mse: 0.0000, Train Loss ce: 0.0067, Train Steps/Sec: 0.29,
1238
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step1500
1239
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1240
- [eval debug] first 3 batch fingerprints:
1241
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1242
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1243
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1244
- ce_avg: 0.11604302376508713, mse_avg: 0.0
1245
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step2000
1246
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1247
- [eval debug] first 3 batch fingerprints:
1248
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1249
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1250
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1251
- ce_avg: 0.14839798212051392, mse_avg: 0.0
1252
  [2026-01-29 23:22:54] (step=0001050) Train Loss mse: 0.0000, Train Loss ce: 0.0090, Train Steps/Sec: 0.27,
1253
  [2026-01-29 23:22:57] (step=0001051) Train Loss mse: 0.0000, Train Loss ce: 0.0099, Train Steps/Sec: 0.30,
1254
  [2026-01-29 23:23:01] (step=0001052) Train Loss mse: 0.0000, Train Loss ce: 0.0126, Train Steps/Sec: 0.29,
@@ -1331,6 +1140,204 @@ ce_avg: 0.14839798212051392, mse_avg: 0.0
1331
  [2026-01-29 23:27:22] (step=0001129) Train Loss mse: 0.0000, Train Loss ce: 0.0117, Train Steps/Sec: 0.31,
1332
  [2026-01-29 23:27:25] (step=0001130) Train Loss mse: 0.0000, Train Loss ce: 0.0151, Train Steps/Sec: 0.30,
1333
  [2026-01-29 23:27:28] (step=0001131) Train Loss mse: 0.0000, Train Loss ce: 0.0081, Train Steps/Sec: 0.29,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1334
  [2026-01-29 23:27:32] (step=0001132) Train Loss mse: 0.0000, Train Loss ce: 0.0176, Train Steps/Sec: 0.28,
1335
  [2026-01-29 23:27:35] (step=0001133) Train Loss mse: 0.0000, Train Loss ce: 0.0101, Train Steps/Sec: 0.29,
1336
  [2026-01-29 23:27:39] (step=0001134) Train Loss mse: 0.0000, Train Loss ce: 0.0164, Train Steps/Sec: 0.30,
@@ -2540,27 +2547,6 @@ ce_avg: 0.14839798212051392, mse_avg: 0.0
2540
  [2026-01-30 00:36:39] (step=0002338) Train Loss mse: 0.0000, Train Loss ce: 0.0100, Train Steps/Sec: 0.29,
2541
  [2026-01-30 00:36:42] (step=0002339) Train Loss mse: 0.0000, Train Loss ce: 0.0061, Train Steps/Sec: 0.31,
2542
  [2026-01-30 00:36:46] (step=0002340) Train Loss mse: 0.0000, Train Loss ce: 0.0084, Train Steps/Sec: 0.26,
2543
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step2500
2544
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2545
- [eval debug] first 3 batch fingerprints:
2546
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2547
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2548
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2549
- ce_avg: 0.18175341188907623, mse_avg: 0.0
2550
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3000
2551
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2552
- [eval debug] first 3 batch fingerprints:
2553
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2554
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2555
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2556
- ce_avg: 0.3127627372741699, mse_avg: 0.0
2557
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3500
2558
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2559
- [eval debug] first 3 batch fingerprints:
2560
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2561
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2562
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2563
- ce_avg: 0.4038556218147278, mse_avg: 0.0
2564
  [2026-01-30 00:36:50] (step=0002341) Train Loss mse: 0.0000, Train Loss ce: 0.0155, Train Steps/Sec: 0.28,
2565
  [2026-01-30 00:36:53] (step=0002342) Train Loss mse: 0.0000, Train Loss ce: 0.0095, Train Steps/Sec: 0.29,
2566
  [2026-01-30 00:36:56] (step=0002343) Train Loss mse: 0.0000, Train Loss ce: 0.0115, Train Steps/Sec: 0.31,
@@ -2919,6 +2905,20 @@ ce_avg: 0.4038556218147278, mse_avg: 0.0
2919
  [2026-01-30 00:57:16] (step=0002696) Train Loss mse: 0.0000, Train Loss ce: 0.0089, Train Steps/Sec: 0.29,
2920
  [2026-01-30 00:57:19] (step=0002697) Train Loss mse: 0.0000, Train Loss ce: 0.0051, Train Steps/Sec: 0.29,
2921
  [2026-01-30 00:57:23] (step=0002698) Train Loss mse: 0.0000, Train Loss ce: 0.0030, Train Steps/Sec: 0.30,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2922
  [2026-01-30 00:57:26] (step=0002699) Train Loss mse: 0.0000, Train Loss ce: 0.0062, Train Steps/Sec: 0.30,
2923
  [2026-01-30 00:57:29] (step=0002700) Train Loss mse: 0.0000, Train Loss ce: 0.0085, Train Steps/Sec: 0.30,
2924
  [2026-01-30 00:57:32] (step=0002701) Train Loss mse: 0.0000, Train Loss ce: 0.0060, Train Steps/Sec: 0.30,
@@ -3606,20 +3606,6 @@ ce_avg: 0.4038556218147278, mse_avg: 0.0
3606
  [2026-01-30 01:36:44] (step=0003383) Train Loss mse: 0.0000, Train Loss ce: 0.0041, Train Steps/Sec: 0.30,
3607
  [2026-01-30 01:36:48] (step=0003384) Train Loss mse: 0.0000, Train Loss ce: 0.0091, Train Steps/Sec: 0.30,
3608
  [2026-01-30 01:36:51] (step=0003385) Train Loss mse: 0.0000, Train Loss ce: 0.0047, Train Steps/Sec: 0.27,
3609
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4000
3610
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3611
- [eval debug] first 3 batch fingerprints:
3612
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3613
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3614
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3615
- ce_avg: 0.4453171491622925, mse_avg: 0.0
3616
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4500
3617
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3618
- [eval debug] first 3 batch fingerprints:
3619
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3620
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3621
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3622
- ce_avg: 0.45896753668785095, mse_avg: 0.0
3623
  [2026-01-30 01:36:55] (step=0003386) Train Loss mse: 0.0000, Train Loss ce: 0.0072, Train Steps/Sec: 0.29,
3624
  [2026-01-30 01:36:58] (step=0003387) Train Loss mse: 0.0000, Train Loss ce: 0.0067, Train Steps/Sec: 0.28,
3625
  [2026-01-30 01:37:02] (step=0003388) Train Loss mse: 0.0000, Train Loss ce: 0.0043, Train Steps/Sec: 0.30,
@@ -3995,6 +3981,27 @@ ce_avg: 0.45896753668785095, mse_avg: 0.0
3995
  [2026-01-30 01:58:17] (step=0003758) Train Loss mse: 0.0000, Train Loss ce: 0.0057, Train Steps/Sec: 0.27,
3996
  [2026-01-30 01:58:20] (step=0003759) Train Loss mse: 0.0000, Train Loss ce: 0.0044, Train Steps/Sec: 0.31,
3997
  [2026-01-30 01:58:23] (step=0003760) Train Loss mse: 0.0000, Train Loss ce: 0.0036, Train Steps/Sec: 0.31,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3998
  [2026-01-30 01:58:27] (step=0003761) Train Loss mse: 0.0000, Train Loss ce: 0.0039, Train Steps/Sec: 0.30,
3999
  [2026-01-30 01:58:30] (step=0003762) Train Loss mse: 0.0000, Train Loss ce: 0.0064, Train Steps/Sec: 0.34,
4000
  [2026-01-30 01:58:33] (step=0003763) Train Loss mse: 0.0000, Train Loss ce: 0.0077, Train Steps/Sec: 0.28,
@@ -5134,13 +5141,6 @@ ce_avg: 0.45896753668785095, mse_avg: 0.0
5134
  [2026-01-30 03:04:00] (step=0004897) Train Loss mse: 0.0000, Train Loss ce: 0.0064, Train Steps/Sec: 0.29,
5135
  [2026-01-30 03:04:03] (step=0004898) Train Loss mse: 0.0000, Train Loss ce: 0.0113, Train Steps/Sec: 0.30,
5136
  [2026-01-30 03:04:07] (step=0004899) Train Loss mse: 0.0000, Train Loss ce: 0.0041, Train Steps/Sec: 0.29,
5137
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step5000
5138
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
5139
- [eval debug] first 3 batch fingerprints:
5140
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
5141
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
5142
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
5143
- ce_avg: 0.46307557821273804, mse_avg: 0.0
5144
  [2026-01-30 03:04:10] (step=0004900) Train Loss mse: 0.0000, Train Loss ce: 0.0053, Train Steps/Sec: 0.30,
5145
  [2026-01-30 03:04:13] (step=0004901) Train Loss mse: 0.0000, Train Loss ce: 0.0057, Train Steps/Sec: 0.29,
5146
  [2026-01-30 03:04:17] (step=0004902) Train Loss mse: 0.0000, Train Loss ce: 0.0051, Train Steps/Sec: 0.28,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
1058
  [2026-01-29 23:22:44] (step=0001047) Train Loss mse: 0.0000, Train Loss ce: 0.0137, Train Steps/Sec: 0.32,
1059
  [2026-01-29 23:22:47] (step=0001048) Train Loss mse: 0.0000, Train Loss ce: 0.0137, Train Steps/Sec: 0.30,
1060
  [2026-01-29 23:22:50] (step=0001049) Train Loss mse: 0.0000, Train Loss ce: 0.0067, Train Steps/Sec: 0.29,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1061
  [2026-01-29 23:22:54] (step=0001050) Train Loss mse: 0.0000, Train Loss ce: 0.0090, Train Steps/Sec: 0.27,
1062
  [2026-01-29 23:22:57] (step=0001051) Train Loss mse: 0.0000, Train Loss ce: 0.0099, Train Steps/Sec: 0.30,
1063
  [2026-01-29 23:23:01] (step=0001052) Train Loss mse: 0.0000, Train Loss ce: 0.0126, Train Steps/Sec: 0.29,
 
1140
  [2026-01-29 23:27:22] (step=0001129) Train Loss mse: 0.0000, Train Loss ce: 0.0117, Train Steps/Sec: 0.31,
1141
  [2026-01-29 23:27:25] (step=0001130) Train Loss mse: 0.0000, Train Loss ce: 0.0151, Train Steps/Sec: 0.30,
1142
  [2026-01-29 23:27:28] (step=0001131) Train Loss mse: 0.0000, Train Loss ce: 0.0081, Train Steps/Sec: 0.29,
1143
+ FullyShardedDataParallel(
1144
+ (_fsdp_wrapped_module): Bagel(
1145
+ (language_model): Qwen2ForCausalLM(
1146
+ (model): Qwen2Model(
1147
+ (embed_tokens): Embedding(152064, 3584)
1148
+ (layers): ModuleList(
1149
+ (0-27): 28 x FullyShardedDataParallel(
1150
+ (_fsdp_wrapped_module): CheckpointWrapper(
1151
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
1152
+ (self_attn): PackedAttentionMoT(
1153
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
1154
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
1155
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
1156
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
1157
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
1158
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
1159
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1160
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1161
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
1162
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1163
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1164
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
1165
+ )
1166
+ (mlp): Qwen2MLP(
1167
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1168
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1169
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1170
+ (act_fn): SiLU()
1171
+ )
1172
+ (mlp_moe_gen): Qwen2MLP(
1173
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1174
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1175
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1176
+ (act_fn): SiLU()
1177
+ )
1178
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1179
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1180
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1181
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1182
+ )
1183
+ )
1184
+ )
1185
+ )
1186
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
1187
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1188
+ (rotary_emb): Qwen2RotaryEmbedding()
1189
+ )
1190
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
1191
+ )
1192
+ (vit_model): SiglipVisionModel(
1193
+ (vision_model): FullyShardedDataParallel(
1194
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
1195
+ (embeddings): SiglipVisionEmbeddings(
1196
+ (position_embedding): Embedding(4900, 1152)
1197
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
1198
+ )
1199
+ (encoder): SiglipEncoder(
1200
+ (layers): ModuleList(
1201
+ (0-25): 26 x FullyShardedDataParallel(
1202
+ (_fsdp_wrapped_module): CheckpointWrapper(
1203
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
1204
+ (self_attn): SiglipFlashAttention2(
1205
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
1206
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
1207
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
1208
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
1209
+ )
1210
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1211
+ (mlp): SiglipMLP(
1212
+ (activation_fn): PytorchGELUTanh()
1213
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
1214
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
1215
+ )
1216
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1217
+ )
1218
+ )
1219
+ )
1220
+ )
1221
+ )
1222
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1223
+ )
1224
+ )
1225
+ )
1226
+ (connector): FullyShardedDataParallel(
1227
+ (_fsdp_wrapped_module): CheckpointWrapper(
1228
+ (_checkpoint_wrapped_module): MLPconnector(
1229
+ (activation_fn): PytorchGELUTanh()
1230
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
1231
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
1232
+ )
1233
+ )
1234
+ )
1235
+ (vit_pos_embed): FullyShardedDataParallel(
1236
+ (_fsdp_wrapped_module): PositionEmbedding()
1237
+ )
1238
+ )
1239
+ )
1240
+ _flat_param True
1241
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1242
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1243
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1244
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1245
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1246
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1247
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1248
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1249
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1250
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1251
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1252
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1253
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1254
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1255
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1256
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1257
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1258
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1259
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1260
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1261
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1262
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1263
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1264
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1265
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1266
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1267
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1268
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1269
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
1270
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1271
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1272
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1273
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1274
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1275
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1276
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1277
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1278
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1279
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1280
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1281
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1282
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1283
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1284
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1285
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1286
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1287
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1288
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1289
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1290
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1291
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1292
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1293
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1294
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1295
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1296
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1297
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
1298
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse/vlm_gym_mujoco_pick_and_place_train
1299
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step0
1300
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1301
+ [eval debug] first 3 batch fingerprints:
1302
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1303
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1304
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1305
+ ce_avg: 0.2096945196390152, mse_avg: 0.0
1306
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step500
1307
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1308
+ [eval debug] first 3 batch fingerprints:
1309
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1310
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1311
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1312
+ ce_avg: 0.01328575424849987, mse_avg: 0.0
1313
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step1000
1314
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1315
+ [eval debug] first 3 batch fingerprints:
1316
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1317
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1318
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1319
+ ce_avg: 0.03305660933256149, mse_avg: 0.0
1320
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step1500
1321
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1322
+ [eval debug] first 3 batch fingerprints:
1323
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1324
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1325
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1326
+ ce_avg: 0.11604302376508713, mse_avg: 0.0
1327
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step2000
1328
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1329
+ [eval debug] first 3 batch fingerprints:
1330
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1331
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1332
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1333
+ ce_avg: 0.14839798212051392, mse_avg: 0.0
1334
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step2500
1335
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1336
+ [eval debug] first 3 batch fingerprints:
1337
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1338
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1339
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1340
+ ce_avg: 0.18175341188907623, mse_avg: 0.0
1341
  [2026-01-29 23:27:32] (step=0001132) Train Loss mse: 0.0000, Train Loss ce: 0.0176, Train Steps/Sec: 0.28,
1342
  [2026-01-29 23:27:35] (step=0001133) Train Loss mse: 0.0000, Train Loss ce: 0.0101, Train Steps/Sec: 0.29,
1343
  [2026-01-29 23:27:39] (step=0001134) Train Loss mse: 0.0000, Train Loss ce: 0.0164, Train Steps/Sec: 0.30,
 
2547
  [2026-01-30 00:36:39] (step=0002338) Train Loss mse: 0.0000, Train Loss ce: 0.0100, Train Steps/Sec: 0.29,
2548
  [2026-01-30 00:36:42] (step=0002339) Train Loss mse: 0.0000, Train Loss ce: 0.0061, Train Steps/Sec: 0.31,
2549
  [2026-01-30 00:36:46] (step=0002340) Train Loss mse: 0.0000, Train Loss ce: 0.0084, Train Steps/Sec: 0.26,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2550
  [2026-01-30 00:36:50] (step=0002341) Train Loss mse: 0.0000, Train Loss ce: 0.0155, Train Steps/Sec: 0.28,
2551
  [2026-01-30 00:36:53] (step=0002342) Train Loss mse: 0.0000, Train Loss ce: 0.0095, Train Steps/Sec: 0.29,
2552
  [2026-01-30 00:36:56] (step=0002343) Train Loss mse: 0.0000, Train Loss ce: 0.0115, Train Steps/Sec: 0.31,
 
2905
  [2026-01-30 00:57:16] (step=0002696) Train Loss mse: 0.0000, Train Loss ce: 0.0089, Train Steps/Sec: 0.29,
2906
  [2026-01-30 00:57:19] (step=0002697) Train Loss mse: 0.0000, Train Loss ce: 0.0051, Train Steps/Sec: 0.29,
2907
  [2026-01-30 00:57:23] (step=0002698) Train Loss mse: 0.0000, Train Loss ce: 0.0030, Train Steps/Sec: 0.30,
2908
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3000
2909
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2910
+ [eval debug] first 3 batch fingerprints:
2911
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2912
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2913
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2914
+ ce_avg: 0.3127627372741699, mse_avg: 0.0
2915
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3500
2916
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2917
+ [eval debug] first 3 batch fingerprints:
2918
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2919
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2920
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2921
+ ce_avg: 0.4038556218147278, mse_avg: 0.0
2922
  [2026-01-30 00:57:26] (step=0002699) Train Loss mse: 0.0000, Train Loss ce: 0.0062, Train Steps/Sec: 0.30,
2923
  [2026-01-30 00:57:29] (step=0002700) Train Loss mse: 0.0000, Train Loss ce: 0.0085, Train Steps/Sec: 0.30,
2924
  [2026-01-30 00:57:32] (step=0002701) Train Loss mse: 0.0000, Train Loss ce: 0.0060, Train Steps/Sec: 0.30,
 
3606
  [2026-01-30 01:36:44] (step=0003383) Train Loss mse: 0.0000, Train Loss ce: 0.0041, Train Steps/Sec: 0.30,
3607
  [2026-01-30 01:36:48] (step=0003384) Train Loss mse: 0.0000, Train Loss ce: 0.0091, Train Steps/Sec: 0.30,
3608
  [2026-01-30 01:36:51] (step=0003385) Train Loss mse: 0.0000, Train Loss ce: 0.0047, Train Steps/Sec: 0.27,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3609
  [2026-01-30 01:36:55] (step=0003386) Train Loss mse: 0.0000, Train Loss ce: 0.0072, Train Steps/Sec: 0.29,
3610
  [2026-01-30 01:36:58] (step=0003387) Train Loss mse: 0.0000, Train Loss ce: 0.0067, Train Steps/Sec: 0.28,
3611
  [2026-01-30 01:37:02] (step=0003388) Train Loss mse: 0.0000, Train Loss ce: 0.0043, Train Steps/Sec: 0.30,
 
3981
  [2026-01-30 01:58:17] (step=0003758) Train Loss mse: 0.0000, Train Loss ce: 0.0057, Train Steps/Sec: 0.27,
3982
  [2026-01-30 01:58:20] (step=0003759) Train Loss mse: 0.0000, Train Loss ce: 0.0044, Train Steps/Sec: 0.31,
3983
  [2026-01-30 01:58:23] (step=0003760) Train Loss mse: 0.0000, Train Loss ce: 0.0036, Train Steps/Sec: 0.31,
3984
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4000
3985
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3986
+ [eval debug] first 3 batch fingerprints:
3987
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3988
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3989
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3990
+ ce_avg: 0.4453171491622925, mse_avg: 0.0
3991
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4500
3992
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3993
+ [eval debug] first 3 batch fingerprints:
3994
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3995
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3996
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3997
+ ce_avg: 0.45896753668785095, mse_avg: 0.0
3998
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step5000
3999
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
4000
+ [eval debug] first 3 batch fingerprints:
4001
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
4002
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
4003
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
4004
+ ce_avg: 0.46307557821273804, mse_avg: 0.0
4005
  [2026-01-30 01:58:27] (step=0003761) Train Loss mse: 0.0000, Train Loss ce: 0.0039, Train Steps/Sec: 0.30,
4006
  [2026-01-30 01:58:30] (step=0003762) Train Loss mse: 0.0000, Train Loss ce: 0.0064, Train Steps/Sec: 0.34,
4007
  [2026-01-30 01:58:33] (step=0003763) Train Loss mse: 0.0000, Train Loss ce: 0.0077, Train Steps/Sec: 0.28,
 
5141
  [2026-01-30 03:04:00] (step=0004897) Train Loss mse: 0.0000, Train Loss ce: 0.0064, Train Steps/Sec: 0.29,
5142
  [2026-01-30 03:04:03] (step=0004898) Train Loss mse: 0.0000, Train Loss ce: 0.0113, Train Steps/Sec: 0.30,
5143
  [2026-01-30 03:04:07] (step=0004899) Train Loss mse: 0.0000, Train Loss ce: 0.0041, Train Steps/Sec: 0.29,
 
 
 
 
 
 
 
5144
  [2026-01-30 03:04:10] (step=0004900) Train Loss mse: 0.0000, Train Loss ce: 0.0053, Train Steps/Sec: 0.30,
5145
  [2026-01-30 03:04:13] (step=0004901) Train Loss mse: 0.0000, Train Loss ce: 0.0057, Train Steps/Sec: 0.29,
5146
  [2026-01-30 03:04:17] (step=0004902) Train Loss mse: 0.0000, Train Loss ce: 0.0051, Train Steps/Sec: 0.28,
checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/wandb/offline-run-20260130_175015-checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins-run0/files/output.log CHANGED
@@ -1148,6 +1148,7 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1148
  [2026-01-30 18:52:55] (step=0000967) Train Loss mse: 0.0000, Train Loss ce: 0.0138, Train Steps/Sec: 0.28,
1149
  [2026-01-30 18:52:59] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0096, Train Steps/Sec: 0.27,
1150
  [2026-01-30 18:53:02] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0131, Train Steps/Sec: 0.31,
 
1151
  base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step1000
1152
  Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1153
  [eval debug] first 3 batch fingerprints:
@@ -1169,7 +1170,6 @@ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_m
1169
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1170
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1171
  ce_avg: 0.16064727306365967, mse_avg: 0.0
1172
- [2026-01-30 18:53:05] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0099, Train Steps/Sec: 0.30,
1173
  [2026-01-30 18:53:09] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0102, Train Steps/Sec: 0.28,
1174
  [2026-01-30 18:53:12] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0103, Train Steps/Sec: 0.30,
1175
  [2026-01-30 18:53:16] (step=0000973) Train Loss mse: 0.0000, Train Loss ce: 0.0145, Train Steps/Sec: 0.29,
@@ -2600,27 +2600,6 @@ ce_avg: 0.16064727306365967, mse_avg: 0.0
2600
  [2026-01-30 20:15:49] (step=0002398) Train Loss mse: 0.0000, Train Loss ce: 0.0071, Train Steps/Sec: 0.29,
2601
  [2026-01-30 20:15:52] (step=0002399) Train Loss mse: 0.0000, Train Loss ce: 0.0055, Train Steps/Sec: 0.31,
2602
  [2026-01-30 20:15:56] (step=0002400) Train Loss mse: 0.0000, Train Loss ce: 0.0069, Train Steps/Sec: 0.28,
2603
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step2500
2604
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2605
- [eval debug] first 3 batch fingerprints:
2606
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2607
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2608
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2609
- ce_avg: 0.38690388202667236, mse_avg: 0.0
2610
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3000
2611
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2612
- [eval debug] first 3 batch fingerprints:
2613
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2614
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2615
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2616
- ce_avg: 0.47913289070129395, mse_avg: 0.0
2617
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3500
2618
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2619
- [eval debug] first 3 batch fingerprints:
2620
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2621
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2622
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2623
- ce_avg: 0.5199082493782043, mse_avg: 0.0
2624
  [2026-01-30 20:15:59] (step=0002401) Train Loss mse: 0.0000, Train Loss ce: 0.0036, Train Steps/Sec: 0.29,
2625
  [2026-01-30 20:16:03] (step=0002402) Train Loss mse: 0.0000, Train Loss ce: 0.0051, Train Steps/Sec: 0.27,
2626
  [2026-01-30 20:16:06] (step=0002403) Train Loss mse: 0.0000, Train Loss ce: 0.0061, Train Steps/Sec: 0.30,
@@ -2681,6 +2660,27 @@ ce_avg: 0.5199082493782043, mse_avg: 0.0
2681
  [2026-01-30 20:19:17] (step=0002458) Train Loss mse: 0.0000, Train Loss ce: 0.0087, Train Steps/Sec: 0.29,
2682
  [2026-01-30 20:19:20] (step=0002459) Train Loss mse: 0.0000, Train Loss ce: 0.0069, Train Steps/Sec: 0.28,
2683
  [2026-01-30 20:19:24] (step=0002460) Train Loss mse: 0.0000, Train Loss ce: 0.0052, Train Steps/Sec: 0.28,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2684
  [2026-01-30 20:19:27] (step=0002461) Train Loss mse: 0.0000, Train Loss ce: 0.0033, Train Steps/Sec: 0.30,
2685
  [2026-01-30 20:19:31] (step=0002462) Train Loss mse: 0.0000, Train Loss ce: 0.0102, Train Steps/Sec: 0.28,
2686
  [2026-01-30 20:19:34] (step=0002463) Train Loss mse: 0.0000, Train Loss ce: 0.0104, Train Steps/Sec: 0.29,
@@ -3679,27 +3679,6 @@ ce_avg: 0.5199082493782043, mse_avg: 0.0
3679
  [2026-01-30 21:17:00] (step=0003456) Train Loss mse: 0.0000, Train Loss ce: 0.0121, Train Steps/Sec: 0.30,
3680
  [2026-01-30 21:17:04] (step=0003457) Train Loss mse: 0.0000, Train Loss ce: 0.0080, Train Steps/Sec: 0.29,
3681
  [2026-01-30 21:17:07] (step=0003458) Train Loss mse: 0.0000, Train Loss ce: 0.0110, Train Steps/Sec: 0.30,
3682
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4000
3683
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3684
- [eval debug] first 3 batch fingerprints:
3685
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3686
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3687
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3688
- ce_avg: 0.5366948843002319, mse_avg: 0.0
3689
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4500
3690
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3691
- [eval debug] first 3 batch fingerprints:
3692
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3693
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3694
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3695
- ce_avg: 0.5437333583831787, mse_avg: 0.0
3696
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step5000
3697
- Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3698
- [eval debug] first 3 batch fingerprints:
3699
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3700
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3701
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3702
- ce_avg: 0.5457080006599426, mse_avg: 0.0
3703
  [2026-01-30 21:17:10] (step=0003459) Train Loss mse: 0.0000, Train Loss ce: 0.0090, Train Steps/Sec: 0.30,
3704
  [2026-01-30 21:17:14] (step=0003460) Train Loss mse: 0.0000, Train Loss ce: 0.0042, Train Steps/Sec: 0.28,
3705
  [2026-01-30 21:17:17] (step=0003461) Train Loss mse: 0.0000, Train Loss ce: 0.0094, Train Steps/Sec: 0.27,
@@ -3788,6 +3767,20 @@ ce_avg: 0.5457080006599426, mse_avg: 0.0
3788
  [2026-01-30 21:22:15] (step=0003544) Train Loss mse: 0.0000, Train Loss ce: 0.0024, Train Steps/Sec: 0.31,
3789
  [2026-01-30 21:22:18] (step=0003545) Train Loss mse: 0.0000, Train Loss ce: 0.0104, Train Steps/Sec: 0.30,
3790
  [2026-01-30 21:22:22] (step=0003546) Train Loss mse: 0.0000, Train Loss ce: 0.0060, Train Steps/Sec: 0.29,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3791
  [2026-01-30 21:22:25] (step=0003547) Train Loss mse: 0.0000, Train Loss ce: 0.0049, Train Steps/Sec: 0.30,
3792
  [2026-01-30 21:22:28] (step=0003548) Train Loss mse: 0.0000, Train Loss ce: 0.0052, Train Steps/Sec: 0.30,
3793
  [2026-01-30 21:22:32] (step=0003549) Train Loss mse: 0.0000, Train Loss ce: 0.0053, Train Steps/Sec: 0.26,
@@ -5245,4 +5238,11 @@ ce_avg: 0.5457080006599426, mse_avg: 0.0
5245
  [2026-01-30 22:46:46] Saving checkpoint to /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/0005000.
5246
  /opt/conda/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
5247
  warnings.warn(
5248
- [2026-01-30 22:49:19] Done!
 
 
 
 
 
 
 
 
1148
  [2026-01-30 18:52:55] (step=0000967) Train Loss mse: 0.0000, Train Loss ce: 0.0138, Train Steps/Sec: 0.28,
1149
  [2026-01-30 18:52:59] (step=0000968) Train Loss mse: 0.0000, Train Loss ce: 0.0096, Train Steps/Sec: 0.27,
1150
  [2026-01-30 18:53:02] (step=0000969) Train Loss mse: 0.0000, Train Loss ce: 0.0131, Train Steps/Sec: 0.31,
1151
+ [2026-01-30 18:53:05] (step=0000970) Train Loss mse: 0.0000, Train Loss ce: 0.0099, Train Steps/Sec: 0.30,
1152
  base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step1000
1153
  Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
1154
  [eval debug] first 3 batch fingerprints:
 
1170
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1171
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
1172
  ce_avg: 0.16064727306365967, mse_avg: 0.0
 
1173
  [2026-01-30 18:53:09] (step=0000971) Train Loss mse: 0.0000, Train Loss ce: 0.0102, Train Steps/Sec: 0.28,
1174
  [2026-01-30 18:53:12] (step=0000972) Train Loss mse: 0.0000, Train Loss ce: 0.0103, Train Steps/Sec: 0.30,
1175
  [2026-01-30 18:53:16] (step=0000973) Train Loss mse: 0.0000, Train Loss ce: 0.0145, Train Steps/Sec: 0.29,
 
2600
  [2026-01-30 20:15:49] (step=0002398) Train Loss mse: 0.0000, Train Loss ce: 0.0071, Train Steps/Sec: 0.29,
2601
  [2026-01-30 20:15:52] (step=0002399) Train Loss mse: 0.0000, Train Loss ce: 0.0055, Train Steps/Sec: 0.31,
2602
  [2026-01-30 20:15:56] (step=0002400) Train Loss mse: 0.0000, Train Loss ce: 0.0069, Train Steps/Sec: 0.28,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2603
  [2026-01-30 20:15:59] (step=0002401) Train Loss mse: 0.0000, Train Loss ce: 0.0036, Train Steps/Sec: 0.29,
2604
  [2026-01-30 20:16:03] (step=0002402) Train Loss mse: 0.0000, Train Loss ce: 0.0051, Train Steps/Sec: 0.27,
2605
  [2026-01-30 20:16:06] (step=0002403) Train Loss mse: 0.0000, Train Loss ce: 0.0061, Train Steps/Sec: 0.30,
 
2660
  [2026-01-30 20:19:17] (step=0002458) Train Loss mse: 0.0000, Train Loss ce: 0.0087, Train Steps/Sec: 0.29,
2661
  [2026-01-30 20:19:20] (step=0002459) Train Loss mse: 0.0000, Train Loss ce: 0.0069, Train Steps/Sec: 0.28,
2662
  [2026-01-30 20:19:24] (step=0002460) Train Loss mse: 0.0000, Train Loss ce: 0.0052, Train Steps/Sec: 0.28,
2663
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step2500
2664
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2665
+ [eval debug] first 3 batch fingerprints:
2666
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2667
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2668
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2669
+ ce_avg: 0.38690388202667236, mse_avg: 0.0
2670
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3000
2671
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2672
+ [eval debug] first 3 batch fingerprints:
2673
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2674
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2675
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2676
+ ce_avg: 0.47913289070129395, mse_avg: 0.0
2677
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step3500
2678
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
2679
+ [eval debug] first 3 batch fingerprints:
2680
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2681
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2682
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
2683
+ ce_avg: 0.5199082493782043, mse_avg: 0.0
2684
  [2026-01-30 20:19:27] (step=0002461) Train Loss mse: 0.0000, Train Loss ce: 0.0033, Train Steps/Sec: 0.30,
2685
  [2026-01-30 20:19:31] (step=0002462) Train Loss mse: 0.0000, Train Loss ce: 0.0102, Train Steps/Sec: 0.28,
2686
  [2026-01-30 20:19:34] (step=0002463) Train Loss mse: 0.0000, Train Loss ce: 0.0104, Train Steps/Sec: 0.29,
 
3679
  [2026-01-30 21:17:00] (step=0003456) Train Loss mse: 0.0000, Train Loss ce: 0.0121, Train Steps/Sec: 0.30,
3680
  [2026-01-30 21:17:04] (step=0003457) Train Loss mse: 0.0000, Train Loss ce: 0.0080, Train Steps/Sec: 0.29,
3681
  [2026-01-30 21:17:07] (step=0003458) Train Loss mse: 0.0000, Train Loss ce: 0.0110, Train Steps/Sec: 0.30,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3682
  [2026-01-30 21:17:10] (step=0003459) Train Loss mse: 0.0000, Train Loss ce: 0.0090, Train Steps/Sec: 0.30,
3683
  [2026-01-30 21:17:14] (step=0003460) Train Loss mse: 0.0000, Train Loss ce: 0.0042, Train Steps/Sec: 0.28,
3684
  [2026-01-30 21:17:17] (step=0003461) Train Loss mse: 0.0000, Train Loss ce: 0.0094, Train Steps/Sec: 0.27,
 
3767
  [2026-01-30 21:22:15] (step=0003544) Train Loss mse: 0.0000, Train Loss ce: 0.0024, Train Steps/Sec: 0.31,
3768
  [2026-01-30 21:22:18] (step=0003545) Train Loss mse: 0.0000, Train Loss ce: 0.0104, Train Steps/Sec: 0.30,
3769
  [2026-01-30 21:22:22] (step=0003546) Train Loss mse: 0.0000, Train Loss ce: 0.0060, Train Steps/Sec: 0.29,
3770
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4000
3771
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3772
+ [eval debug] first 3 batch fingerprints:
3773
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3774
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3775
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3776
+ ce_avg: 0.5366948843002319, mse_avg: 0.0
3777
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step4500
3778
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
3779
+ [eval debug] first 3 batch fingerprints:
3780
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3781
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3782
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
3783
+ ce_avg: 0.5437333583831787, mse_avg: 0.0
3784
  [2026-01-30 21:22:25] (step=0003547) Train Loss mse: 0.0000, Train Loss ce: 0.0049, Train Steps/Sec: 0.30,
3785
  [2026-01-30 21:22:28] (step=0003548) Train Loss mse: 0.0000, Train Loss ce: 0.0052, Train Steps/Sec: 0.30,
3786
  [2026-01-30 21:22:32] (step=0003549) Train Loss mse: 0.0000, Train Loss ce: 0.0053, Train Steps/Sec: 0.26,
 
5238
  [2026-01-30 22:46:46] Saving checkpoint to /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/0005000.
5239
  /opt/conda/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
5240
  warnings.warn(
5241
+ [2026-01-30 22:49:19] Done!
5242
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mujoco_pick_and_place_one_image_lr2e_5_ce_no_mse_ins_step5000
5243
+ Preparing Dataset vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce/vlm_gym_mujoco_pick_and_place_val
5244
+ [eval debug] first 3 batch fingerprints:
5245
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
5246
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
5247
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mujoco_pick_and_place_celoss_no_mse_evalonce'}]
5248
+ ce_avg: 0.5457080006599426, mse_avg: 0.0