Junyi42 commited on
Commit
ed5379f
·
verified ·
1 Parent(s): 3a520d9

Upload checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins

Browse files
checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/wandb/offline-run-20260129_221049-checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins-run0/files/output.log CHANGED
@@ -1,173 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (vit_model): SiglipVisionModel(
51
- (vision_model): FullyShardedDataParallel(
52
- (_fsdp_wrapped_module): SiglipVisionTransformer(
53
- (embeddings): SiglipVisionEmbeddings(
54
- (position_embedding): Embedding(4900, 1152)
55
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
56
- )
57
- (encoder): SiglipEncoder(
58
- (layers): ModuleList(
59
- (0-25): 26 x FullyShardedDataParallel(
60
- (_fsdp_wrapped_module): CheckpointWrapper(
61
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
62
- (self_attn): SiglipFlashAttention2(
63
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
64
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
65
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
66
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
67
- )
68
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
69
- (mlp): SiglipMLP(
70
- (activation_fn): PytorchGELUTanh()
71
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
72
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
73
- )
74
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
75
- )
76
- )
77
- )
78
- )
79
- )
80
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
81
- )
82
- )
83
- )
84
- (connector): FullyShardedDataParallel(
85
- (_fsdp_wrapped_module): CheckpointWrapper(
86
- (_checkpoint_wrapped_module): MLPconnector(
87
- (activation_fn): PytorchGELUTanh()
88
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
89
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
90
- )
91
- )
92
- )
93
- (vit_pos_embed): FullyShardedDataParallel(
94
- (_fsdp_wrapped_module): PositionEmbedding()
95
- )
96
- )
97
- )
98
- _flat_param True
99
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
100
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
101
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
102
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
103
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
104
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
105
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
106
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
107
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
108
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
109
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
110
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
111
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
112
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
113
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
128
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
142
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
143
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_pos_embed._fsdp_wrapped_module._flat_param False
156
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse/vlm_gym_mental_rotation_3d_pad3_by_axis_train
157
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step0
158
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
159
- [eval debug] first 3 batch fingerprints:
160
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
161
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
162
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
163
- ce_avg: 0.2727155089378357, mse_avg: 0.0
164
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step500
165
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
166
- [eval debug] first 3 batch fingerprints:
167
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
168
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
169
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
170
- ce_avg: 0.05276435613632202, mse_avg: 0.0
171
  wandb: Detected [huggingface_hub.inference] in use.
172
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
173
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1207,6 +1037,197 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1207
  [2026-01-29 22:53:25] (step=0001026) Train Loss mse: 0.0000, Train Loss ce: 0.0461, Train Steps/Sec: 0.51,
1208
  [2026-01-29 22:53:27] (step=0001027) Train Loss mse: 0.0000, Train Loss ce: 0.0435, Train Steps/Sec: 0.51,
1209
  [2026-01-29 22:53:29] (step=0001028) Train Loss mse: 0.0000, Train Loss ce: 0.0425, Train Steps/Sec: 0.51,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1210
  [2026-01-29 22:53:31] (step=0001029) Train Loss mse: 0.0000, Train Loss ce: 0.0456, Train Steps/Sec: 0.51,
1211
  [2026-01-29 22:53:33] (step=0001030) Train Loss mse: 0.0000, Train Loss ce: 0.0425, Train Steps/Sec: 0.46,
1212
  [2026-01-29 22:53:35] (step=0001031) Train Loss mse: 0.0000, Train Loss ce: 0.0421, Train Steps/Sec: 0.51,
@@ -1333,27 +1354,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1333
  [2026-01-29 22:57:50] (step=0001152) Train Loss mse: 0.0000, Train Loss ce: 0.0421, Train Steps/Sec: 0.51,
1334
  [2026-01-29 22:57:52] (step=0001153) Train Loss mse: 0.0000, Train Loss ce: 0.0400, Train Steps/Sec: 0.39,
1335
  [2026-01-29 22:57:54] (step=0001154) Train Loss mse: 0.0000, Train Loss ce: 0.0450, Train Steps/Sec: 0.42,
1336
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step1500
1337
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1338
- [eval debug] first 3 batch fingerprints:
1339
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1340
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1341
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1342
- ce_avg: 0.04674335569143295, mse_avg: 0.0
1343
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step2000
1344
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1345
- [eval debug] first 3 batch fingerprints:
1346
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1347
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1348
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1349
- ce_avg: 0.04835177958011627, mse_avg: 0.0
1350
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step2500
1351
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1352
- [eval debug] first 3 batch fingerprints:
1353
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1354
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1355
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1356
- ce_avg: 0.05139823630452156, mse_avg: 0.0
1357
  [2026-01-29 22:57:57] (step=0001155) Train Loss mse: 0.0000, Train Loss ce: 0.0432, Train Steps/Sec: 0.46,
1358
  [2026-01-29 22:57:59] (step=0001156) Train Loss mse: 0.0000, Train Loss ce: 0.0424, Train Steps/Sec: 0.52,
1359
  [2026-01-29 22:58:01] (step=0001157) Train Loss mse: 0.0000, Train Loss ce: 0.0411, Train Steps/Sec: 0.51,
@@ -2766,20 +2766,6 @@ ce_avg: 0.05139823630452156, mse_avg: 0.0
2766
  [2026-01-29 23:48:06] (step=0002564) Train Loss mse: 0.0000, Train Loss ce: 0.0328, Train Steps/Sec: 0.51,
2767
  [2026-01-29 23:48:09] (step=0002565) Train Loss mse: 0.0000, Train Loss ce: 0.0326, Train Steps/Sec: 0.40,
2768
  [2026-01-29 23:48:11] (step=0002566) Train Loss mse: 0.0000, Train Loss ce: 0.0364, Train Steps/Sec: 0.47,
2769
- [2026-01-29 23:48:13
2770
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step3000
2771
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
2772
- [eval debug] first 3 batch fingerprints:
2773
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2774
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2775
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2776
- ce_avg: 0.054898910224437714, mse_avg: 0.0
2777
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step3500
2778
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
2779
- [eval debug] first 3 batch fingerprints:
2780
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2781
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2782
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2783
  [2026-01-29 23:48:13] (step=0002567) Train Loss mse: 0.0000, Train Loss ce: 0.0333, Train Steps/Sec: 0.46,
2784
  [2026-01-29 23:48:15] (step=0002568) Train Loss mse: 0.0000, Train Loss ce: 0.0331, Train Steps/Sec: 0.51,
2785
  [2026-01-29 23:48:17] (step=0002569) Train Loss mse: 0.0000, Train Loss ce: 0.0392, Train Steps/Sec: 0.51,
@@ -2872,6 +2858,27 @@ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce
2872
  [2026-01-29 23:51:23] (step=0002656) Train Loss mse: 0.0000, Train Loss ce: 0.0317, Train Steps/Sec: 0.51,
2873
  [2026-01-29 23:51:25] (step=0002657) Train Loss mse: 0.0000, Train Loss ce: 0.0354, Train Steps/Sec: 0.47,
2874
  [2026-01-29 23:51:27] (step=0002658) Train Loss mse: 0.0000, Train Loss ce: 0.0380, Train Steps/Sec: 0.51,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2875
  [2026-01-29 23:51:29] (step=0002659) Train Loss mse: 0.0000, Train Loss ce: 0.0363, Train Steps/Sec: 0.46,
2876
  [2026-01-29 23:51:31] (step=0002660) Train Loss mse: 0.0000, Train Loss ce: 0.0347, Train Steps/Sec: 0.46,
2877
  [2026-01-29 23:51:33] (step=0002661) Train Loss mse: 0.0000, Train Loss ce: 0.0356, Train Steps/Sec: 0.51,
@@ -3900,6 +3907,27 @@ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce
3900
  [2026-01-30 00:27:52] (step=0003684) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 0.52,
3901
  [2026-01-30 00:27:54] (step=0003685) Train Loss mse: 0.0000, Train Loss ce: 0.0327, Train Steps/Sec: 0.52,
3902
  [2026-01-30 00:27:56] (step=0003686) Train Loss mse: 0.0000, Train Loss ce: 0.0292, Train Steps/Sec: 0.45,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3903
  [2026-01-30 00:27:58] (step=0003687) Train Loss mse: 0.0000, Train Loss ce: 0.0334, Train Steps/Sec: 0.47,
3904
  [2026-01-30 00:28:01] (step=0003688) Train Loss mse: 0.0000, Train Loss ce: 0.0278, Train Steps/Sec: 0.46,
3905
  [2026-01-30 00:28:03] (step=0003689) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.47,
@@ -4092,20 +4120,105 @@ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce
4092
  [2026-01-30 00:34:40] (step=0003876) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.52,
4093
  [2026-01-30 00:34:42] (step=0003877) Train Loss mse: 0.0000, Train Loss ce: 0.0304, Train Steps/Sec: 0.43,
4094
  [2026-01-30 00:34:44] (step=0003878) Train Loss mse: 0.0000, Train Loss ce: 0.0308, Train Steps/Sec: 0.51,
4095
- [2026-01-29 23:48:13] (step=0002567) Train Loss mse: 0.0000, Train Loss ce: 0.0333, Train Steps/Sec: 0.46,
4096
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step4000
4097
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
4098
- [eval debug] first 3 batch fingerprints:
4099
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
4100
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
4101
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
4102
- ce_avg: 0.04602212458848953, mse_avg: 0.0
4103
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step4500
4104
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
4105
- [eval debug] first 3 batch fingerprints:
4106
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
4107
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
4108
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4109
  [2026-01-30 00:38:17] (step=0003978) Train Loss mse: 0.0000, Train Loss ce: 0.0309, Train Steps/Sec: 0.47,
4110
  [2026-01-30 00:38:20] (step=0003979) Train Loss mse: 0.0000, Train Loss ce: 0.0288, Train Steps/Sec: 0.46,
4111
  [2026-01-30 00:38:22] (step=0003980) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.52,
@@ -5132,12 +5245,4 @@ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce
5132
  [2026-01-30 01:14:45] Saving checkpoint to /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/0005000.
5133
  /opt/conda/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
5134
  warnings.warn(
5135
- [2026-01-30 01:17:18] Done!
5136
- [2026-01-30 00:38:17] (step=0003978) Train Loss mse: 0.0000, Train Loss ce: 0.0309, Train Steps/Sec: 0.47,
5137
- base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step5000
5138
- Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
5139
- [eval debug] first 3 batch fingerprints:
5140
- fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
5141
- fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
5142
- fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
5143
- ce_avg: 0.04619377478957176, mse_avg: 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
1037
  [2026-01-29 22:53:25] (step=0001026) Train Loss mse: 0.0000, Train Loss ce: 0.0461, Train Steps/Sec: 0.51,
1038
  [2026-01-29 22:53:27] (step=0001027) Train Loss mse: 0.0000, Train Loss ce: 0.0435, Train Steps/Sec: 0.51,
1039
  [2026-01-29 22:53:29] (step=0001028) Train Loss mse: 0.0000, Train Loss ce: 0.0425, Train Steps/Sec: 0.51,
1040
+ FullyShardedDataParallel(
1041
+ (_fsdp_wrapped_module): Bagel(
1042
+ (language_model): Qwen2ForCausalLM(
1043
+ (model): Qwen2Model(
1044
+ (embed_tokens): Embedding(152064, 3584)
1045
+ (layers): ModuleList(
1046
+ (0-27): 28 x FullyShardedDataParallel(
1047
+ (_fsdp_wrapped_module): CheckpointWrapper(
1048
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
1049
+ (self_attn): PackedAttentionMoT(
1050
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
1051
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
1052
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
1053
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
1054
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
1055
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
1056
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1057
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
1058
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
1059
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1060
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
1061
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
1062
+ )
1063
+ (mlp): Qwen2MLP(
1064
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1065
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1066
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1067
+ (act_fn): SiLU()
1068
+ )
1069
+ (mlp_moe_gen): Qwen2MLP(
1070
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
1071
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
1072
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
1073
+ (act_fn): SiLU()
1074
+ )
1075
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1076
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1077
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
1078
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1079
+ )
1080
+ )
1081
+ )
1082
+ )
1083
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
1084
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
1085
+ (rotary_emb): Qwen2RotaryEmbedding()
1086
+ )
1087
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
1088
+ )
1089
+ (vit_model): SiglipVisionModel(
1090
+ (vision_model): FullyShardedDataParallel(
1091
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
1092
+ (embeddings): SiglipVisionEmbeddings(
1093
+ (position_embedding): Embedding(4900, 1152)
1094
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
1095
+ )
1096
+ (encoder): SiglipEncoder(
1097
+ (layers): ModuleList(
1098
+ (0-25): 26 x FullyShardedDataParallel(
1099
+ (_fsdp_wrapped_module): CheckpointWrapper(
1100
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
1101
+ (self_attn): SiglipFlashAttention2(
1102
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
1103
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
1104
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
1105
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
1106
+ )
1107
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1108
+ (mlp): SiglipMLP(
1109
+ (activation_fn): PytorchGELUTanh()
1110
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
1111
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
1112
+ )
1113
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1114
+ )
1115
+ )
1116
+ )
1117
+ )
1118
+ )
1119
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
1120
+ )
1121
+ )
1122
+ )
1123
+ (connector): FullyShardedDataParallel(
1124
+ (_fsdp_wrapped_module): CheckpointWrapper(
1125
+ (_checkpoint_wrapped_module): MLPconnector(
1126
+ (activation_fn): PytorchGELUTanh()
1127
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
1128
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
1129
+ )
1130
+ )
1131
+ )
1132
+ (vit_pos_embed): FullyShardedDataParallel(
1133
+ (_fsdp_wrapped_module): PositionEmbedding()
1134
+ )
1135
+ )
1136
+ )
1137
+ _flat_param True
1138
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1139
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1140
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1141
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1142
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1143
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1144
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1145
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1146
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1147
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1148
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1149
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1150
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1151
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1152
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1153
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1154
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1155
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1156
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1157
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1158
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1159
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1160
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1161
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1162
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1163
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1164
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1165
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1166
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
1167
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1168
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1169
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1170
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1171
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1172
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1173
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1174
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1175
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1176
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1177
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1178
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1179
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1180
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1181
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1182
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1183
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1184
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1185
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1186
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1187
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1188
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1189
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1190
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1191
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1192
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1193
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1194
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
1195
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse/vlm_gym_mental_rotation_3d_pad3_by_axis_train
1196
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step0
1197
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1198
+ [eval debug] first 3 batch fingerprints:
1199
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1200
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1201
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1202
+ ce_avg: 0.2727155089378357, mse_avg: 0.0
1203
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step500
1204
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1205
+ [eval debug] first 3 batch fingerprints:
1206
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1207
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1208
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1209
+ ce_avg: 0.05276435613632202, mse_avg: 0.0
1210
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step1000
1211
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1212
+ [eval debug] first 3 batch fingerprints:
1213
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1214
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1215
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1216
+ ce_avg: 0.04682447761297226, mse_avg: 0.0
1217
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step1500
1218
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1219
+ [eval debug] first 3 batch fingerprints:
1220
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1221
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1222
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1223
+ ce_avg: 0.04674335569143295, mse_avg: 0.0
1224
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step2000
1225
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
1226
+ [eval debug] first 3 batch fingerprints:
1227
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1228
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1229
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
1230
+ ce_avg: 0.04835177958011627, mse_avg: 0.0
1231
  [2026-01-29 22:53:31] (step=0001029) Train Loss mse: 0.0000, Train Loss ce: 0.0456, Train Steps/Sec: 0.51,
1232
  [2026-01-29 22:53:33] (step=0001030) Train Loss mse: 0.0000, Train Loss ce: 0.0425, Train Steps/Sec: 0.46,
1233
  [2026-01-29 22:53:35] (step=0001031) Train Loss mse: 0.0000, Train Loss ce: 0.0421, Train Steps/Sec: 0.51,
 
1354
  [2026-01-29 22:57:50] (step=0001152) Train Loss mse: 0.0000, Train Loss ce: 0.0421, Train Steps/Sec: 0.51,
1355
  [2026-01-29 22:57:52] (step=0001153) Train Loss mse: 0.0000, Train Loss ce: 0.0400, Train Steps/Sec: 0.39,
1356
  [2026-01-29 22:57:54] (step=0001154) Train Loss mse: 0.0000, Train Loss ce: 0.0450, Train Steps/Sec: 0.42,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1357
  [2026-01-29 22:57:57] (step=0001155) Train Loss mse: 0.0000, Train Loss ce: 0.0432, Train Steps/Sec: 0.46,
1358
  [2026-01-29 22:57:59] (step=0001156) Train Loss mse: 0.0000, Train Loss ce: 0.0424, Train Steps/Sec: 0.52,
1359
  [2026-01-29 22:58:01] (step=0001157) Train Loss mse: 0.0000, Train Loss ce: 0.0411, Train Steps/Sec: 0.51,
 
2766
  [2026-01-29 23:48:06] (step=0002564) Train Loss mse: 0.0000, Train Loss ce: 0.0328, Train Steps/Sec: 0.51,
2767
  [2026-01-29 23:48:09] (step=0002565) Train Loss mse: 0.0000, Train Loss ce: 0.0326, Train Steps/Sec: 0.40,
2768
  [2026-01-29 23:48:11] (step=0002566) Train Loss mse: 0.0000, Train Loss ce: 0.0364, Train Steps/Sec: 0.47,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2769
  [2026-01-29 23:48:13] (step=0002567) Train Loss mse: 0.0000, Train Loss ce: 0.0333, Train Steps/Sec: 0.46,
2770
  [2026-01-29 23:48:15] (step=0002568) Train Loss mse: 0.0000, Train Loss ce: 0.0331, Train Steps/Sec: 0.51,
2771
  [2026-01-29 23:48:17] (step=0002569) Train Loss mse: 0.0000, Train Loss ce: 0.0392, Train Steps/Sec: 0.51,
 
2858
  [2026-01-29 23:51:23] (step=0002656) Train Loss mse: 0.0000, Train Loss ce: 0.0317, Train Steps/Sec: 0.51,
2859
  [2026-01-29 23:51:25] (step=0002657) Train Loss mse: 0.0000, Train Loss ce: 0.0354, Train Steps/Sec: 0.47,
2860
  [2026-01-29 23:51:27] (step=0002658) Train Loss mse: 0.0000, Train Loss ce: 0.0380, Train Steps/Sec: 0.51,
2861
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step2500
2862
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
2863
+ [eval debug] first 3 batch fingerprints:
2864
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2865
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2866
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2867
+ ce_avg: 0.05139823630452156, mse_avg: 0.0
2868
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step3000
2869
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
2870
+ [eval debug] first 3 batch fingerprints:
2871
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2872
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2873
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2874
+ ce_avg: 0.054898910224437714, mse_avg: 0.0
2875
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step3500
2876
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
2877
+ [eval debug] first 3 batch fingerprints:
2878
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2879
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2880
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
2881
+ ce_avg: 0.05028936266899109, mse_avg: 0.0
2882
  [2026-01-29 23:51:29] (step=0002659) Train Loss mse: 0.0000, Train Loss ce: 0.0363, Train Steps/Sec: 0.46,
2883
  [2026-01-29 23:51:31] (step=0002660) Train Loss mse: 0.0000, Train Loss ce: 0.0347, Train Steps/Sec: 0.46,
2884
  [2026-01-29 23:51:33] (step=0002661) Train Loss mse: 0.0000, Train Loss ce: 0.0356, Train Steps/Sec: 0.51,
 
3907
  [2026-01-30 00:27:52] (step=0003684) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 0.52,
3908
  [2026-01-30 00:27:54] (step=0003685) Train Loss mse: 0.0000, Train Loss ce: 0.0327, Train Steps/Sec: 0.52,
3909
  [2026-01-30 00:27:56] (step=0003686) Train Loss mse: 0.0000, Train Loss ce: 0.0292, Train Steps/Sec: 0.45,
3910
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step4000
3911
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
3912
+ [eval debug] first 3 batch fingerprints:
3913
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3914
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3915
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3916
+ ce_avg: 0.04602212458848953, mse_avg: 0.0
3917
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step4500
3918
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
3919
+ [eval debug] first 3 batch fingerprints:
3920
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3921
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3922
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3923
+ ce_avg: 0.04616258293390274, mse_avg: 0.0
3924
+ base_dir is /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins_step5000
3925
+ Preparing Dataset vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce/vlm_gym_mental_rotation_3d_pad3_by_axis_val
3926
+ [eval debug] first 3 batch fingerprints:
3927
+ fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3928
+ fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3929
+ fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_mental_rotation_3d_pad3_by_axis_celoss_no_mse_evalonce'}]
3930
+ ce_avg: 0.04619377478957176, mse_avg: 0.0
3931
  [2026-01-30 00:27:58] (step=0003687) Train Loss mse: 0.0000, Train Loss ce: 0.0334, Train Steps/Sec: 0.47,
3932
  [2026-01-30 00:28:01] (step=0003688) Train Loss mse: 0.0000, Train Loss ce: 0.0278, Train Steps/Sec: 0.46,
3933
  [2026-01-30 00:28:03] (step=0003689) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.47,
 
4120
  [2026-01-30 00:34:40] (step=0003876) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.52,
4121
  [2026-01-30 00:34:42] (step=0003877) Train Loss mse: 0.0000, Train Loss ce: 0.0304, Train Steps/Sec: 0.43,
4122
  [2026-01-30 00:34:44] (step=0003878) Train Loss mse: 0.0000, Train Loss ce: 0.0308, Train Steps/Sec: 0.51,
4123
+ [2026-01-30 00:34:46] (step=0003879) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 0.47,
4124
+ [2026-01-30 00:34:48] (step=0003880) Train Loss mse: 0.0000, Train Loss ce: 0.0305, Train Steps/Sec: 0.47,
4125
+ [2026-01-30 00:34:50] (step=0003881) Train Loss mse: 0.0000, Train Loss ce: 0.0298, Train Steps/Sec: 0.47,
4126
+ [2026-01-30 00:34:52] (step=0003882) Train Loss mse: 0.0000, Train Loss ce: 0.0307, Train Steps/Sec: 0.46,
4127
+ [2026-01-30 00:34:54] (step=0003883) Train Loss mse: 0.0000, Train Loss ce: 0.0307, Train Steps/Sec: 0.52,
4128
+ [2026-01-30 00:34:57] (step=0003884) Train Loss mse: 0.0000, Train Loss ce: 0.0319, Train Steps/Sec: 0.42,
4129
+ [2026-01-30 00:34:59] (step=0003885) Train Loss mse: 0.0000, Train Loss ce: 0.0299, Train Steps/Sec: 0.47,
4130
+ [2026-01-30 00:35:01] (step=0003886) Train Loss mse: 0.0000, Train Loss ce: 0.0336, Train Steps/Sec: 0.47,
4131
+ [2026-01-30 00:35:03] (step=0003887) Train Loss mse: 0.0000, Train Loss ce: 0.0345, Train Steps/Sec: 0.52,
4132
+ [2026-01-30 00:35:05] (step=0003888) Train Loss mse: 0.0000, Train Loss ce: 0.0271, Train Steps/Sec: 0.47,
4133
+ [2026-01-30 00:35:07] (step=0003889) Train Loss mse: 0.0000, Train Loss ce: 0.0273, Train Steps/Sec: 0.46,
4134
+ [2026-01-30 00:35:09] (step=0003890) Train Loss mse: 0.0000, Train Loss ce: 0.0301, Train Steps/Sec: 0.46,
4135
+ [2026-01-30 00:35:11] (step=0003891) Train Loss mse: 0.0000, Train Loss ce: 0.0294, Train Steps/Sec: 0.52,
4136
+ [2026-01-30 00:35:14] (step=0003892) Train Loss mse: 0.0000, Train Loss ce: 0.0313, Train Steps/Sec: 0.46,
4137
+ [2026-01-30 00:35:16] (step=0003893) Train Loss mse: 0.0000, Train Loss ce: 0.0308, Train Steps/Sec: 0.47,
4138
+ [2026-01-30 00:35:18] (step=0003894) Train Loss mse: 0.0000, Train Loss ce: 0.0290, Train Steps/Sec: 0.47,
4139
+ [2026-01-30 00:35:20] (step=0003895) Train Loss mse: 0.0000, Train Loss ce: 0.0299, Train Steps/Sec: 0.47,
4140
+ [2026-01-30 00:35:22] (step=0003896) Train Loss mse: 0.0000, Train Loss ce: 0.0287, Train Steps/Sec: 0.47,
4141
+ [2026-01-30 00:35:24] (step=0003897) Train Loss mse: 0.0000, Train Loss ce: 0.0318, Train Steps/Sec: 0.52,
4142
+ [2026-01-30 00:35:26] (step=0003898) Train Loss mse: 0.0000, Train Loss ce: 0.0280, Train Steps/Sec: 0.47,
4143
+ [2026-01-30 00:35:28] (step=0003899) Train Loss mse: 0.0000, Train Loss ce: 0.0302, Train Steps/Sec: 0.52,
4144
+ [2026-01-30 00:35:31] (step=0003900) Train Loss mse: 0.0000, Train Loss ce: 0.0312, Train Steps/Sec: 0.42,
4145
+ [2026-01-30 00:35:32] (step=0003901) Train Loss mse: 0.0000, Train Loss ce: 0.0344, Train Steps/Sec: 0.51,
4146
+ [2026-01-30 00:35:35] (step=0003902) Train Loss mse: 0.0000, Train Loss ce: 0.0311, Train Steps/Sec: 0.42,
4147
+ [2026-01-30 00:35:37] (step=0003903) Train Loss mse: 0.0000, Train Loss ce: 0.0297, Train Steps/Sec: 0.43,
4148
+ [2026-01-30 00:35:39] (step=0003904) Train Loss mse: 0.0000, Train Loss ce: 0.0293, Train Steps/Sec: 0.52,
4149
+ [2026-01-30 00:35:41] (step=0003905) Train Loss mse: 0.0000, Train Loss ce: 0.0291, Train Steps/Sec: 0.52,
4150
+ [2026-01-30 00:35:43] (step=0003906) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.46,
4151
+ [2026-01-30 00:35:45] (step=0003907) Train Loss mse: 0.0000, Train Loss ce: 0.0293, Train Steps/Sec: 0.47,
4152
+ [2026-01-30 00:35:48] (step=0003908) Train Loss mse: 0.0000, Train Loss ce: 0.0331, Train Steps/Sec: 0.45,
4153
+ [2026-01-30 00:35:50] (step=0003909) Train Loss mse: 0.0000, Train Loss ce: 0.0302, Train Steps/Sec: 0.46,
4154
+ [2026-01-30 00:35:52] (step=0003910) Train Loss mse: 0.0000, Train Loss ce: 0.0321, Train Steps/Sec: 0.43,
4155
+ [2026-01-30 00:35:54] (step=0003911) Train Loss mse: 0.0000, Train Loss ce: 0.0285, Train Steps/Sec: 0.51,
4156
+ [2026-01-30 00:35:56] (step=0003912) Train Loss mse: 0.0000, Train Loss ce: 0.0301, Train Steps/Sec: 0.51,
4157
+ [2026-01-30 00:35:58] (step=0003913) Train Loss mse: 0.0000, Train Loss ce: 0.0304, Train Steps/Sec: 0.46,
4158
+ [2026-01-30 00:36:00] (step=0003914) Train Loss mse: 0.0000, Train Loss ce: 0.0280, Train Steps/Sec: 0.52,
4159
+ [2026-01-30 00:36:02] (step=0003915) Train Loss mse: 0.0000, Train Loss ce: 0.0283, Train Steps/Sec: 0.46,
4160
+ [2026-01-30 00:36:04] (step=0003916) Train Loss mse: 0.0000, Train Loss ce: 0.0293, Train Steps/Sec: 0.47,
4161
+ [2026-01-30 00:36:07] (step=0003917) Train Loss mse: 0.0000, Train Loss ce: 0.0319, Train Steps/Sec: 0.43,
4162
+ [2026-01-30 00:36:09] (step=0003918) Train Loss mse: 0.0000, Train Loss ce: 0.0295, Train Steps/Sec: 0.46,
4163
+ [2026-01-30 00:36:11] (step=0003919) Train Loss mse: 0.0000, Train Loss ce: 0.0283, Train Steps/Sec: 0.47,
4164
+ [2026-01-30 00:36:13] (step=0003920) Train Loss mse: 0.0000, Train Loss ce: 0.0292, Train Steps/Sec: 0.52,
4165
+ [2026-01-30 00:36:15] (step=0003921) Train Loss mse: 0.0000, Train Loss ce: 0.0293, Train Steps/Sec: 0.51,
4166
+ [2026-01-30 00:36:17] (step=0003922) Train Loss mse: 0.0000, Train Loss ce: 0.0302, Train Steps/Sec: 0.42,
4167
+ [2026-01-30 00:36:20] (step=0003923) Train Loss mse: 0.0000, Train Loss ce: 0.0309, Train Steps/Sec: 0.45,
4168
+ [2026-01-30 00:36:22] (step=0003924) Train Loss mse: 0.0000, Train Loss ce: 0.0296, Train Steps/Sec: 0.46,
4169
+ [2026-01-30 00:36:24] (step=0003925) Train Loss mse: 0.0000, Train Loss ce: 0.0289, Train Steps/Sec: 0.42,
4170
+ [2026-01-30 00:36:26] (step=0003926) Train Loss mse: 0.0000, Train Loss ce: 0.0292, Train Steps/Sec: 0.52,
4171
+ [2026-01-30 00:36:28] (step=0003927) Train Loss mse: 0.0000, Train Loss ce: 0.0281, Train Steps/Sec: 0.47,
4172
+ [2026-01-30 00:36:30] (step=0003928) Train Loss mse: 0.0000, Train Loss ce: 0.0272, Train Steps/Sec: 0.47,
4173
+ [2026-01-30 00:36:32] (step=0003929) Train Loss mse: 0.0000, Train Loss ce: 0.0330, Train Steps/Sec: 0.51,
4174
+ [2026-01-30 00:36:34] (step=0003930) Train Loss mse: 0.0000, Train Loss ce: 0.0296, Train Steps/Sec: 0.47,
4175
+ [2026-01-30 00:36:36] (step=0003931) Train Loss mse: 0.0000, Train Loss ce: 0.0323, Train Steps/Sec: 0.51,
4176
+ [2026-01-30 00:36:39] (step=0003932) Train Loss mse: 0.0000, Train Loss ce: 0.0301, Train Steps/Sec: 0.41,
4177
+ [2026-01-30 00:36:41] (step=0003933) Train Loss mse: 0.0000, Train Loss ce: 0.0316, Train Steps/Sec: 0.47,
4178
+ [2026-01-30 00:36:43] (step=0003934) Train Loss mse: 0.0000, Train Loss ce: 0.0287, Train Steps/Sec: 0.47,
4179
+ [2026-01-30 00:36:45] (step=0003935) Train Loss mse: 0.0000, Train Loss ce: 0.0328, Train Steps/Sec: 0.51,
4180
+ [2026-01-30 00:36:47] (step=0003936) Train Loss mse: 0.0000, Train Loss ce: 0.0307, Train Steps/Sec: 0.46,
4181
+ [2026-01-30 00:36:49] (step=0003937) Train Loss mse: 0.0000, Train Loss ce: 0.0336, Train Steps/Sec: 0.47,
4182
+ [2026-01-30 00:36:51] (step=0003938) Train Loss mse: 0.0000, Train Loss ce: 0.0322, Train Steps/Sec: 0.47,
4183
+ [2026-01-30 00:36:54] (step=0003939) Train Loss mse: 0.0000, Train Loss ce: 0.0276, Train Steps/Sec: 0.46,
4184
+ [2026-01-30 00:36:56] (step=0003940) Train Loss mse: 0.0000, Train Loss ce: 0.0299, Train Steps/Sec: 0.51,
4185
+ [2026-01-30 00:36:58] (step=0003941) Train Loss mse: 0.0000, Train Loss ce: 0.0283, Train Steps/Sec: 0.42,
4186
+ [2026-01-30 00:37:00] (step=0003942) Train Loss mse: 0.0000, Train Loss ce: 0.0321, Train Steps/Sec: 0.52,
4187
+ [2026-01-30 00:37:02] (step=0003943) Train Loss mse: 0.0000, Train Loss ce: 0.0288, Train Steps/Sec: 0.47,
4188
+ [2026-01-30 00:37:04] (step=0003944) Train Loss mse: 0.0000, Train Loss ce: 0.0342, Train Steps/Sec: 0.47,
4189
+ [2026-01-30 00:37:06] (step=0003945) Train Loss mse: 0.0000, Train Loss ce: 0.0294, Train Steps/Sec: 0.51,
4190
+ [2026-01-30 00:37:09] (step=0003946) Train Loss mse: 0.0000, Train Loss ce: 0.0295, Train Steps/Sec: 0.39,
4191
+ [2026-01-30 00:37:11] (step=0003947) Train Loss mse: 0.0000, Train Loss ce: 0.0301, Train Steps/Sec: 0.52,
4192
+ [2026-01-30 00:37:13] (step=0003948) Train Loss mse: 0.0000, Train Loss ce: 0.0314, Train Steps/Sec: 0.52,
4193
+ [2026-01-30 00:37:15] (step=0003949) Train Loss mse: 0.0000, Train Loss ce: 0.0268, Train Steps/Sec: 0.46,
4194
+ [2026-01-30 00:37:17] (step=0003950) Train Loss mse: 0.0000, Train Loss ce: 0.0295, Train Steps/Sec: 0.45,
4195
+ [2026-01-30 00:37:19] (step=0003951) Train Loss mse: 0.0000, Train Loss ce: 0.0281, Train Steps/Sec: 0.51,
4196
+ [2026-01-30 00:37:21] (step=0003952) Train Loss mse: 0.0000, Train Loss ce: 0.0322, Train Steps/Sec: 0.46,
4197
+ [2026-01-30 00:37:23] (step=0003953) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.46,
4198
+ [2026-01-30 00:37:26] (step=0003954) Train Loss mse: 0.0000, Train Loss ce: 0.0323, Train Steps/Sec: 0.43,
4199
+ [2026-01-30 00:37:28] (step=0003955) Train Loss mse: 0.0000, Train Loss ce: 0.0320, Train Steps/Sec: 0.51,
4200
+ [2026-01-30 00:37:29] (step=0003956) Train Loss mse: 0.0000, Train Loss ce: 0.0308, Train Steps/Sec: 0.51,
4201
+ [2026-01-30 00:37:32] (step=0003957) Train Loss mse: 0.0000, Train Loss ce: 0.0284, Train Steps/Sec: 0.47,
4202
+ [2026-01-30 00:37:34] (step=0003958) Train Loss mse: 0.0000, Train Loss ce: 0.0317, Train Steps/Sec: 0.41,
4203
+ [2026-01-30 00:37:36] (step=0003959) Train Loss mse: 0.0000, Train Loss ce: 0.0298, Train Steps/Sec: 0.51,
4204
+ [2026-01-30 00:37:38] (step=0003960) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.42,
4205
+ [2026-01-30 00:37:41] (step=0003961) Train Loss mse: 0.0000, Train Loss ce: 0.0302, Train Steps/Sec: 0.43,
4206
+ [2026-01-30 00:37:43] (step=0003962) Train Loss mse: 0.0000, Train Loss ce: 0.0295, Train Steps/Sec: 0.47,
4207
+ [2026-01-30 00:37:45] (step=0003963) Train Loss mse: 0.0000, Train Loss ce: 0.0318, Train Steps/Sec: 0.52,
4208
+ [2026-01-30 00:37:47] (step=0003964) Train Loss mse: 0.0000, Train Loss ce: 0.0300, Train Steps/Sec: 0.47,
4209
+ [2026-01-30 00:37:49] (step=0003965) Train Loss mse: 0.0000, Train Loss ce: 0.0346, Train Steps/Sec: 0.51,
4210
+ [2026-01-30 00:37:51] (step=0003966) Train Loss mse: 0.0000, Train Loss ce: 0.0300, Train Steps/Sec: 0.41,
4211
+ [2026-01-30 00:37:54] (step=0003967) Train Loss mse: 0.0000, Train Loss ce: 0.0305, Train Steps/Sec: 0.42,
4212
+ [2026-01-30 00:37:56] (step=0003968) Train Loss mse: 0.0000, Train Loss ce: 0.0291, Train Steps/Sec: 0.51,
4213
+ [2026-01-30 00:37:58] (step=0003969) Train Loss mse: 0.0000, Train Loss ce: 0.0325, Train Steps/Sec: 0.43,
4214
+ [2026-01-30 00:38:00] (step=0003970) Train Loss mse: 0.0000, Train Loss ce: 0.0314, Train Steps/Sec: 0.47,
4215
+ [2026-01-30 00:38:02] (step=0003971) Train Loss mse: 0.0000, Train Loss ce: 0.0304, Train Steps/Sec: 0.51,
4216
+ [2026-01-30 00:38:04] (step=0003972) Train Loss mse: 0.0000, Train Loss ce: 0.0286, Train Steps/Sec: 0.46,
4217
+ [2026-01-30 00:38:06] (step=0003973) Train Loss mse: 0.0000, Train Loss ce: 0.0294, Train Steps/Sec: 0.46,
4218
+ [2026-01-30 00:38:09] (step=0003974) Train Loss mse: 0.0000, Train Loss ce: 0.0310, Train Steps/Sec: 0.46,
4219
+ [2026-01-30 00:38:11] (step=0003975) Train Loss mse: 0.0000, Train Loss ce: 0.0335, Train Steps/Sec: 0.42,
4220
+ [2026-01-30 00:38:13] (step=0003976) Train Loss mse: 0.0000, Train Loss ce: 0.0308, Train Steps/Sec: 0.47,
4221
+ [2026-01-30 00:38:15] (step=0003977) Train Loss mse: 0.0000, Train Loss ce: 0.0291, Train Steps/Sec: 0.47,
4222
  [2026-01-30 00:38:17] (step=0003978) Train Loss mse: 0.0000, Train Loss ce: 0.0309, Train Steps/Sec: 0.47,
4223
  [2026-01-30 00:38:20] (step=0003979) Train Loss mse: 0.0000, Train Loss ce: 0.0288, Train Steps/Sec: 0.46,
4224
  [2026-01-30 00:38:22] (step=0003980) Train Loss mse: 0.0000, Train Loss ce: 0.0315, Train Steps/Sec: 0.52,
 
5245
  [2026-01-30 01:14:45] Saving checkpoint to /dev/shm/models/checkpoints_vlm_gym_mental_rotation_3d_pad3_by_axis_one_image_lr2e_5_ce_no_mse_ins/0005000.
5246
  /opt/conda/lib/python3.11/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py:690: FutureWarning: FSDP.state_dict_type() and FSDP.set_state_dict_type() are being deprecated. Please use APIs, get_state_dict() and set_state_dict(), which can support different parallelisms, FSDP1, FSDP2, DDP. API doc: https://pytorch.org/docs/stable/distributed.checkpoint.html#torch.distributed.checkpoint.state_dict.get_state_dict .Tutorial: https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html .
5247
  warnings.warn(
5248
+ [2026-01-30 01:17:18] Done!