Junyi42 commited on
Commit
46e6c55
·
verified ·
1 Parent(s): af9e9d1

Upload checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins

Browse files
checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/wandb/offline-run-20260129_220034-vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins-run0/files/output.log CHANGED
@@ -1,189 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (time_embedder): FullyShardedDataParallel(
51
- (_fsdp_wrapped_module): TimestepEmbedder(
52
- (mlp): Sequential(
53
- (0): Linear(in_features=256, out_features=3584, bias=True)
54
- (1): SiLU()
55
- (2): Linear(in_features=3584, out_features=3584, bias=True)
56
- )
57
- )
58
- )
59
- (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
60
- (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
61
- (latent_pos_embed): FullyShardedDataParallel(
62
- (_fsdp_wrapped_module): PositionEmbedding()
63
- )
64
- (vit_model): SiglipVisionModel(
65
- (vision_model): FullyShardedDataParallel(
66
- (_fsdp_wrapped_module): SiglipVisionTransformer(
67
- (embeddings): SiglipVisionEmbeddings(
68
- (position_embedding): Embedding(4900, 1152)
69
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
70
- )
71
- (encoder): SiglipEncoder(
72
- (layers): ModuleList(
73
- (0-25): 26 x FullyShardedDataParallel(
74
- (_fsdp_wrapped_module): CheckpointWrapper(
75
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
76
- (self_attn): SiglipFlashAttention2(
77
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
78
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
79
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
80
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
81
- )
82
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
83
- (mlp): SiglipMLP(
84
- (activation_fn): PytorchGELUTanh()
85
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
86
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
87
- )
88
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
89
- )
90
- )
91
- )
92
- )
93
- )
94
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
95
- )
96
- )
97
- )
98
- (connector): FullyShardedDataParallel(
99
- (_fsdp_wrapped_module): CheckpointWrapper(
100
- (_checkpoint_wrapped_module): MLPconnector(
101
- (activation_fn): PytorchGELUTanh()
102
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
103
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
104
- )
105
- )
106
- )
107
- (vit_pos_embed): FullyShardedDataParallel(
108
- (_fsdp_wrapped_module): PositionEmbedding()
109
- )
110
- )
111
- )
112
- _flat_param True
113
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
128
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- time_embedder._fsdp_wrapped_module._flat_param True
142
- latent_pos_embed._fsdp_wrapped_module._flat_param False
143
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
156
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
157
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
158
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
159
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
160
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
161
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
162
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
163
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
164
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
165
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
166
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
167
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
168
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
169
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
170
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
171
- vit_pos_embed._fsdp_wrapped_module._flat_param False
172
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only/vlm_gym_mental_rotation_2d_train
173
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step0
174
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
175
- [eval debug] first 3 batch fingerprints:
176
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
177
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
178
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
179
- ce_avg: 0.0, mse_avg: 0.33117932081222534
180
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step500
181
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
182
- [eval debug] first 3 batch fingerprints:
183
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
184
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
185
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
186
- ce_avg: 0.0, mse_avg: 0.09970466047525406
187
  wandb: Detected [huggingface_hub.inference] in use.
188
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
189
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1182,20 +996,192 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1182
  [2026-01-30 00:05:52] (step=0000985) Train Loss mse: 0.0868, Train Loss ce: 0.0000, Train Steps/Sec: 0.16,
1183
  [2026-01-30 00:05:59] (step=0000986) Train Loss mse: 0.0982, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1184
  [2026-01-30 00:06:06] (step=0000987) Train Loss mse: 0.1057, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
1185
- [2026-01-30 00:06:14] (step=0000988) Train Loss mse: 0.0923, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1186
- [2026-01-30 00:06:21] (step=0000989) Train Loss mse: 0.0922, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1187
- [2026-01-30 00:06:29] (step=0000990) Train Loss mse: 0.0976, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1188
- [2026-01-30 00:06:34] (step=0000991) Train Loss mse: 0.0941, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
1189
- [2026-01-30 00:06:40] (step=0000992) Train Loss mse: 0.0977, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
1190
- [2026-01-30 00:06:48] (step=0000993) Train Loss mse: 0.0935, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1191
- [2026-01-30 00:06:54] (step=0000994) Train Loss mse: 0.0901, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
1192
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1193
  Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
1194
  [eval debug] first 3 batch fingerprints:
1195
  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1196
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1197
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1198
- ce_avg: 0.0, mse_avg: 0.09500830620527267
1199
  base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step1500
1200
  Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
1201
  [eval debug] first 3 batch fingerprints:
@@ -1210,6 +1196,13 @@ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_ment
1210
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1211
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1212
  ce_avg: 0.0, mse_avg: 0.09311425685882568
 
 
 
 
 
 
 
1213
  [2026-01-30 00:07:02] (step=0000995) Train Loss mse: 0.1012, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
1214
  [2026-01-30 00:07:09] (step=0000996) Train Loss mse: 0.1097, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1215
  [2026-01-30 00:07:16] (step=0000997) Train Loss mse: 0.1051, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
@@ -2600,20 +2593,6 @@ ce_avg: 0.0, mse_avg: 0.09311425685882568
2600
  [2026-01-30 02:55:05] (step=0002382) Train Loss mse: 0.0867, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
2601
  [2026-01-30 02:55:12] (step=0002383) Train Loss mse: 0.0865, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
2602
  [2026-01-30 02:55:19] (step=0002384) Train Loss mse: 0.0934, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
2603
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step2500
2604
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
2605
- [eval debug] first 3 batch fingerprints:
2606
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2607
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2608
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2609
- ce_avg: 0.0, mse_avg: 0.09291024506092072
2610
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step3000
2611
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
2612
- [eval debug] first 3 batch fingerprints:
2613
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2614
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2615
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2616
- ce_avg: 0.0, mse_avg: 0.09515543282032013
2617
  [2026-01-30 02:55:27] (step=0002385) Train Loss mse: 0.0960, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
2618
  [2026-01-30 02:55:32] (step=0002386) Train Loss mse: 0.0849, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
2619
  [2026-01-30 02:55:40] (step=0002387) Train Loss mse: 0.0905, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
@@ -2661,6 +2640,27 @@ ce_avg: 0.0, mse_avg: 0.09515543282032013
2661
  [2026-01-30 03:00:42] (step=0002429) Train Loss mse: 0.0940, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
2662
  [2026-01-30 03:00:50] (step=0002430) Train Loss mse: 0.0971, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
2663
  [2026-01-30 03:00:57] (step=0002431) Train Loss mse: 0.0891, Train Loss ce: 0.0000, Train Steps/Sec: 0.16,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2664
  [2026-01-30 03:01:04] (step=0002432) Train Loss mse: 0.0877, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
2665
  [2026-01-30 03:01:12] (step=0002433) Train Loss mse: 0.0990, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
2666
  [2026-01-30 03:01:17] (step=0002434) Train Loss mse: 0.0945, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
@@ -3613,27 +3613,6 @@ ce_avg: 0.0, mse_avg: 0.09515543282032013
3613
  [2026-01-30 04:56:32] (step=0003381) Train Loss mse: 0.0866, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3614
  [2026-01-30 04:56:39] (step=0003382) Train Loss mse: 0.0849, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3615
  [2026-01-30 04:56:47] (step=0003383) Train Loss mse: 0.0870, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3616
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step3500
3617
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
3618
- [eval debug] first 3 batch fingerprints:
3619
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3620
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3621
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3622
- ce_avg: 0.0, mse_avg: 0.09188222140073776
3623
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step4000
3624
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
3625
- [eval debug] first 3 batch fingerprints:
3626
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3627
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3628
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3629
- ce_avg: 0.0, mse_avg: 0.09225528687238693
3630
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step4500
3631
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
3632
- [eval debug] first 3 batch fingerprints:
3633
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3634
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3635
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3636
- ce_avg: 0.0, mse_avg: 0.0914882943034172
3637
  [2026-01-30 04:56:54] (step=0003384) Train Loss mse: 0.0835, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3638
  [2026-01-30 04:57:02] (step=0003385) Train Loss mse: 0.0951, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3639
  [2026-01-30 04:57:09] (step=0003386) Train Loss mse: 0.0884, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
@@ -3664,6 +3643,20 @@ ce_avg: 0.0, mse_avg: 0.0914882943034172
3664
  [2026-01-30 05:00:10] (step=0003411) Train Loss mse: 0.0851, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3665
  [2026-01-30 05:00:17] (step=0003412) Train Loss mse: 0.0867, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3666
  [2026-01-30 05:00:25] (step=0003413) Train Loss mse: 0.0948, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3667
  [2026-01-30 05:00:32] (step=0003414) Train Loss mse: 0.0811, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3668
  [2026-01-30 05:00:40] (step=0003415) Train Loss mse: 0.0855, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3669
  [2026-01-30 05:00:47] (step=0003416) Train Loss mse: 0.0879, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
@@ -5094,13 +5087,6 @@ ce_avg: 0.0, mse_avg: 0.0914882943034172
5094
  [2026-01-30 07:53:22] (step=0004841) Train Loss mse: 0.0809, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
5095
  [2026-01-30 07:53:29] (step=0004842) Train Loss mse: 0.0935, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
5096
  [2026-01-30 07:53:37] (step=0004843) Train Loss mse: 0.0973, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5097
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step5000
5098
- Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
5099
- [eval debug] first 3 batch fingerprints:
5100
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
5101
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
5102
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
5103
- ce_avg: 0.0, mse_avg: 0.09255406260490417
5104
  [2026-01-30 07:53:44] (step=0004844) Train Loss mse: 0.0932, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5105
  [2026-01-30 07:53:51] (step=0004845) Train Loss mse: 0.0846, Train Loss ce: 0.0000, Train Steps/Sec: 0.16,
5106
  [2026-01-30 07:53:58] (step=0004846) Train Loss mse: 0.0876, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
@@ -5126,6 +5112,13 @@ ce_avg: 0.0, mse_avg: 0.09255406260490417
5126
  [2026-01-30 07:56:13] (step=0004866) Train Loss mse: 0.0875, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5127
  [2026-01-30 07:56:20] (step=0004867) Train Loss mse: 0.0830, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
5128
  [2026-01-30 07:56:27] (step=0004868) Train Loss mse: 0.0884, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
5129
  [2026-01-30 07:56:34] (step=0004869) Train Loss mse: 0.0773, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
5130
  [2026-01-30 07:56:42] (step=0004870) Train Loss mse: 0.0833, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5131
  [2026-01-30 07:56:49] (step=0004871) Train Loss mse: 0.0900, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
996
  [2026-01-30 00:05:52] (step=0000985) Train Loss mse: 0.0868, Train Loss ce: 0.0000, Train Steps/Sec: 0.16,
997
  [2026-01-30 00:05:59] (step=0000986) Train Loss mse: 0.0982, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
998
  [2026-01-30 00:06:06] (step=0000987) Train Loss mse: 0.1057, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
999
+ FullyShardedDataParallel(
1000
+ (_fsdp_wrapped_module): Bagel(
1001
+ (language_model): Qwen2ForCausalLM(
1002
+ (model): Qwen2Model(
1003
+ (embed_tokens): Embedding(152064, 3584)
1004
+ (layers): ModuleList(
1005
+ (0-27): 28 x FullyShardedDataParallel(
1006
+ (_fsdp_wrapped_module): CheckpointWrapper(
1007
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
1008
+ (self_attn): PackedAttentionMoT(
1009
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
1010
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
1011
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
1012
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
1013
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
1014
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
1015
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1016
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1017
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
1018
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1019
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1020
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
1021
+ )
1022
+ (mlp): Qwen2MLP(
1023
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1024
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1025
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1026
+ (act_fn): SiLU()
1027
+ )
1028
+ (mlp_moe_gen): Qwen2MLP(
1029
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1030
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1031
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1032
+ (act_fn): SiLU()
1033
+ )
1034
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1035
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1036
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1037
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1038
+ )
1039
+ )
1040
+ )
1041
+ )
1042
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
1043
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1044
+ (rotary_emb): Qwen2RotaryEmbedding()
1045
+ )
1046
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
1047
+ )
1048
+ (time_embedder): FullyShardedDataParallel(
1049
+ (_fsdp_wrapped_module): TimestepEmbedder(
1050
+ (mlp): Sequential(
1051
+ (0): Linear(in_features=256, out_features=3584, bias=True)
1052
+ (1): SiLU()
1053
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
1054
+ )
1055
+ )
1056
+ )
1057
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
1058
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
1059
+ (latent_pos_embed): FullyShardedDataParallel(
1060
+ (_fsdp_wrapped_module): PositionEmbedding()
1061
+ )
1062
+ (vit_model): SiglipVisionModel(
1063
+ (vision_model): FullyShardedDataParallel(
1064
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
1065
+ (embeddings): SiglipVisionEmbeddings(
1066
+ (position_embedding): Embedding(4900, 1152)
1067
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
1068
+ )
1069
+ (encoder): SiglipEncoder(
1070
+ (layers): ModuleList(
1071
+ (0-25): 26 x FullyShardedDataParallel(
1072
+ (_fsdp_wrapped_module): CheckpointWrapper(
1073
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
1074
+ (self_attn): SiglipFlashAttention2(
1075
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
1076
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
1077
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
1078
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
1079
+ )
1080
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1081
+ (mlp): SiglipMLP(
1082
+ (activation_fn): PytorchGELUTanh()
1083
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
1084
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
1085
+ )
1086
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1087
+ )
1088
+ )
1089
+ )
1090
+ )
1091
+ )
1092
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1093
+ )
1094
+ )
1095
+ )
1096
+ (connector): FullyShardedDataParallel(
1097
+ (_fsdp_wrapped_module): CheckpointWrapper(
1098
+ (_checkpoint_wrapped_module): MLPconnector(
1099
+ (activation_fn): PytorchGELUTanh()
1100
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
1101
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
1102
+ )
1103
+ )
1104
+ )
1105
+ (vit_pos_embed): FullyShardedDataParallel(
1106
+ (_fsdp_wrapped_module): PositionEmbedding()
1107
+ )
1108
+ )
1109
+ )
1110
+ _flat_param True
1111
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1112
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1113
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1114
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1115
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1116
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1117
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1118
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1119
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1120
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1121
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1122
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1123
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1124
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1125
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1126
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1127
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1128
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1129
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1130
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1131
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1132
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1133
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1134
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1135
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1136
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1137
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1138
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1139
+ time_embedder._fsdp_wrapped_module._flat_param True
1140
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
1141
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
1142
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1143
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1144
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1145
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1146
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1147
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1148
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1149
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1150
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1151
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1152
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1153
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1154
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1155
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1156
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1157
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1158
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1159
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1160
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1161
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1162
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1163
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1164
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1165
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1166
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1167
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1168
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1169
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
1170
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only/vlm_gym_mental_rotation_2d_train
1171
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step0
1172
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
1173
+ [eval debug] first 3 batch fingerprints:
1174
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1175
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1176
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1177
+ ce_avg: 0.0, mse_avg: 0.33117932081222534
1178
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step500
1179
  Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
1180
  [eval debug] first 3 batch fingerprints:
1181
  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1182
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1183
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1184
+ ce_avg: 0.0, mse_avg: 0.09970466047525406
1185
  base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step1500
1186
  Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
1187
  [eval debug] first 3 batch fingerprints:
 
1196
  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1197
  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
1198
  ce_avg: 0.0, mse_avg: 0.09311425685882568
1199
+ [2026-01-30 00:06:14] (step=0000988) Train Loss mse: 0.0923, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1200
+ [2026-01-30 00:06:21] (step=0000989) Train Loss mse: 0.0922, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1201
+ [2026-01-30 00:06:29] (step=0000990) Train Loss mse: 0.0976, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1202
+ [2026-01-30 00:06:34] (step=0000991) Train Loss mse: 0.0941, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
1203
+ [2026-01-30 00:06:40] (step=0000992) Train Loss mse: 0.0977, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
1204
+ [2026-01-30 00:06:48] (step=0000993) Train Loss mse: 0.0935, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1205
+ [2026-01-30 00:06:54] (step=0000994) Train Loss mse: 0.0901, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
1206
  [2026-01-30 00:07:02] (step=0000995) Train Loss mse: 0.1012, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
1207
  [2026-01-30 00:07:09] (step=0000996) Train Loss mse: 0.1097, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
1208
  [2026-01-30 00:07:16] (step=0000997) Train Loss mse: 0.1051, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
 
2593
  [2026-01-30 02:55:05] (step=0002382) Train Loss mse: 0.0867, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
2594
  [2026-01-30 02:55:12] (step=0002383) Train Loss mse: 0.0865, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
2595
  [2026-01-30 02:55:19] (step=0002384) Train Loss mse: 0.0934, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2596
  [2026-01-30 02:55:27] (step=0002385) Train Loss mse: 0.0960, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
2597
  [2026-01-30 02:55:32] (step=0002386) Train Loss mse: 0.0849, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
2598
  [2026-01-30 02:55:40] (step=0002387) Train Loss mse: 0.0905, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
2640
  [2026-01-30 03:00:42] (step=0002429) Train Loss mse: 0.0940, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
2641
  [2026-01-30 03:00:50] (step=0002430) Train Loss mse: 0.0971, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
2642
  [2026-01-30 03:00:57] (step=0002431) Train Loss mse: 0.0891, Train Loss ce: 0.0000, Train Steps/Sec: 0.16,
2643
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step2500
2644
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
2645
+ [eval debug] first 3 batch fingerprints:
2646
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2647
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2648
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2649
+ ce_avg: 0.0, mse_avg: 0.09291024506092072
2650
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step3000
2651
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
2652
+ [eval debug] first 3 batch fingerprints:
2653
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2654
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2655
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2656
+ ce_avg: 0.0, mse_avg: 0.09515543282032013
2657
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step3500
2658
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
2659
+ [eval debug] first 3 batch fingerprints:
2660
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2661
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2662
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
2663
+ ce_avg: 0.0, mse_avg: 0.09188222140073776
2664
  [2026-01-30 03:01:04] (step=0002432) Train Loss mse: 0.0877, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
2665
  [2026-01-30 03:01:12] (step=0002433) Train Loss mse: 0.0990, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
2666
  [2026-01-30 03:01:17] (step=0002434) Train Loss mse: 0.0945, Train Loss ce: 0.0000, Train Steps/Sec: 0.18,
 
3613
  [2026-01-30 04:56:32] (step=0003381) Train Loss mse: 0.0866, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3614
  [2026-01-30 04:56:39] (step=0003382) Train Loss mse: 0.0849, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3615
  [2026-01-30 04:56:47] (step=0003383) Train Loss mse: 0.0870, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3616
  [2026-01-30 04:56:54] (step=0003384) Train Loss mse: 0.0835, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3617
  [2026-01-30 04:57:02] (step=0003385) Train Loss mse: 0.0951, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3618
  [2026-01-30 04:57:09] (step=0003386) Train Loss mse: 0.0884, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
3643
  [2026-01-30 05:00:10] (step=0003411) Train Loss mse: 0.0851, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3644
  [2026-01-30 05:00:17] (step=0003412) Train Loss mse: 0.0867, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
3645
  [2026-01-30 05:00:25] (step=0003413) Train Loss mse: 0.0948, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3646
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step4000
3647
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
3648
+ [eval debug] first 3 batch fingerprints:
3649
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3650
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3651
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3652
+ ce_avg: 0.0, mse_avg: 0.09225528687238693
3653
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step4500
3654
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
3655
+ [eval debug] first 3 batch fingerprints:
3656
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3657
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3658
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
3659
+ ce_avg: 0.0, mse_avg: 0.0914882943034172
3660
  [2026-01-30 05:00:32] (step=0003414) Train Loss mse: 0.0811, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3661
  [2026-01-30 05:00:40] (step=0003415) Train Loss mse: 0.0855, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
3662
  [2026-01-30 05:00:47] (step=0003416) Train Loss mse: 0.0879, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
5087
  [2026-01-30 07:53:22] (step=0004841) Train Loss mse: 0.0809, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
5088
  [2026-01-30 07:53:29] (step=0004842) Train Loss mse: 0.0935, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
5089
  [2026-01-30 07:53:37] (step=0004843) Train Loss mse: 0.0973, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
 
 
 
 
 
 
5090
  [2026-01-30 07:53:44] (step=0004844) Train Loss mse: 0.0932, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5091
  [2026-01-30 07:53:51] (step=0004845) Train Loss mse: 0.0846, Train Loss ce: 0.0000, Train Steps/Sec: 0.16,
5092
  [2026-01-30 07:53:58] (step=0004846) Train Loss mse: 0.0876, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
 
5112
  [2026-01-30 07:56:13] (step=0004866) Train Loss mse: 0.0875, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5113
  [2026-01-30 07:56:20] (step=0004867) Train Loss mse: 0.0830, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,
5114
  [2026-01-30 07:56:27] (step=0004868) Train Loss mse: 0.0884, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5115
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_2d_one_image_lr2e_5_mse_only_ins/eval_used_rows, step_tag is vlm_gym_mental_rotation_2d_one_img_lr2e_5_mse_only_ins_step5000
5116
+ Preparing Dataset vlm_gym_mental_rotation_2d_mse_loss_only_evalonce/vlm_gym_mental_rotation_2d_val
5117
+ [eval debug] first 3 batch fingerprints:
5118
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
5119
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
5120
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_2d_mse_loss_only_evalonce'}]
5121
+ ce_avg: 0.0, mse_avg: 0.09255406260490417
5122
  [2026-01-30 07:56:34] (step=0004869) Train Loss mse: 0.0773, Train Loss ce: 0.0000, Train Steps/Sec: 0.15,
5123
  [2026-01-30 07:56:42] (step=0004870) Train Loss mse: 0.0833, Train Loss ce: 0.0000, Train Steps/Sec: 0.13,
5124
  [2026-01-30 07:56:49] (step=0004871) Train Loss mse: 0.0900, Train Loss ce: 0.0000, Train Steps/Sec: 0.14,