Upload checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins

Browse files

Files changed (1) hide show

checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/wandb/offline-run-20260126_213949-checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins-run0/files/output.log +47 -54

checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/wandb/offline-run-20260126_213949-checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins-run0/files/output.log CHANGED Viewed

@@ -1099,18 +1099,6 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 [[34m2026-01-26 22:14:55[39m] (step=0001088) Train Loss mse: 0.0112, Train Loss ce: 0.0307, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:14:56[39m] (step=0001089) Train Loss mse: 0.0141, Train Loss ce: 0.0282, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:14:57[39m] (step=0001090) Train Loss mse: 0.0187, Train Loss ce: 0.0744, Train Steps/Sec: 0.68,
-[[34m2026-01-26 22:14:59[39m] (step=0001091) Train Loss mse: 0.0144, Train Loss ce: 0.0371, Train Steps/Sec: 0.56,
-[[34m2026-01-26 22:15:01[39m] (step=0001092) Train Loss mse: 0.0181, Train Loss ce: 0.0742, Train Steps/Sec: 0.69,
-[[34m2026-01-26 22:15:02[39m] (step=0001093) Train Loss mse: 0.0119, Train Loss ce: 0.0614, Train Steps/Sec: 0.56,
-[[34m2026-01-26 22:15:04[39m] (step=0001094) Train Loss mse: 0.0140, Train Loss ce: 0.0572, Train Steps/Sec: 0.59,
-[[34m2026-01-26 22:15:06[39m] (step=0001095) Train Loss mse: 0.0155, Train Loss ce: 0.0332, Train Steps/Sec: 0.69,
-[[34m2026-01-26 22:15:07[39m] (step=0001096) Train Loss mse: 0.0166, Train Loss ce: 0.0692, Train Steps/Sec: 0.68,
-[[34m2026-01-26 22:15:09[39m] (step=0001097) Train Loss mse: 0.0175, Train Loss ce: 0.0863, Train Steps/Sec: 0.68,
-[[34m2026-01-26 22:15:10[39m] (step=0001098) Train Loss mse: 0.0082, Train Loss ce: 0.0662, Train Steps/Sec: 0.57,
-[[34m2026-01-26 22:15:12[39m] (step=0001099) Train Loss mse: 0.0139, Train Loss ce: 0.0561, Train Steps/Sec: 0.68,
-[[34m2026-01-26 22:15:13[39m] (step=0001100) Train Loss mse: 0.0106, Train Loss ce: 0.0682, Train Steps/Sec: 0.68,
-[[34m2026-01-26 22:15:15[39m] (step=0001101) Train Loss mse: 0.0105, Train Loss ce: 0.0531, Train Steps/Sec: 0.56,
-[[34m2026-01-26 22:15:17[39m] (step=0001102) Train Loss mse: 0.0123, Train Loss ce: 0.0902, Train Steps/Sec: 0.59,
 FullyShardedDataParallel(
   (_fsdp_wrapped_module): Bagel(
     (language_model): Qwen2ForCausalLM(
@@ -1297,13 +1285,6 @@ Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equat
   fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
   fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
 ce_avg: 0.06269639730453491, mse_avg: 0.0149351442232728
-base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step1000
-Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
-[eval debug] first 3 batch fingerprints:
-  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-ce_avg: 0.06731808930635452, mse_avg: 0.011200404725968838
 base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step1500
 Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
 [eval debug] first 3 batch fingerprints:
@@ -1325,6 +1306,18 @@ Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equat
   fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
   fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
 ce_avg: 0.20800399780273438, mse_avg: 0.011797062121331692
 [[34m2026-01-26 22:15:18[39m] (step=0001103) Train Loss mse: 0.0096, Train Loss ce: 0.0452, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:15:20[39m] (step=0001104) Train Loss mse: 0.0135, Train Loss ce: 0.0637, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:15:21[39m] (step=0001105) Train Loss mse: 0.0077, Train Loss ce: 0.0449, Train Steps/Sec: 0.68,
@@ -2815,6 +2808,20 @@ ce_avg: 0.20800399780273438, mse_avg: 0.011797062121331692
 [[34m2026-01-26 22:57:58[39m] (step=0002587) Train Loss mse: 0.0121, Train Loss ce: 0.0432, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:57:59[39m] (step=0002588) Train Loss mse: 0.0060, Train Loss ce: 0.0675, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:58:01[39m] (step=0002589) Train Loss mse: 0.0137, Train Loss ce: 0.0460, Train Steps/Sec: 0.69,
 [[34m2026-01-26 22:58:02[39m] (step=0002590) Train Loss mse: 0.0089, Train Loss ce: 0.0360, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:58:04[39m] (step=0002591) Train Loss mse: 0.0089, Train Loss ce: 0.0466, Train Steps/Sec: 0.56,
 [[34m2026-01-26 22:58:06[39m] (step=0002592) Train Loss mse: 0.0074, Train Loss ce: 0.0222, Train Steps/Sec: 0.59,
@@ -2902,27 +2909,6 @@ ce_avg: 0.20800399780273438, mse_avg: 0.011797062121331692
 [[34m2026-01-26 23:00:13[39m] (step=0002674) Train Loss mse: 0.0056, Train Loss ce: 0.0613, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:00:15[39m] (step=0002675) Train Loss mse: 0.0054, Train Loss ce: 0.0283, Train Steps/Sec: 0.49,
 [[34m2026-01-26 23:00:16[39m] (step=0002676) Train Loss mse: 0.0099, Train Loss ce: 0.0483, Train Steps/Sec: 0.56,
-base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step3000
-Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
-[eval debug] first 3 batch fingerprints:
-  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-ce_avg: 0.04329414293169975, mse_avg: 0.006353132426738739
-base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step3500
-Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
-[eval debug] first 3 batch fingerprints:
-  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-ce_avg: 0.04275057464838028, mse_avg: 0.0056571937166154385
-base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step4000
-Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
-[eval debug] first 3 batch fingerprints:
-  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-ce_avg: 0.03896063566207886, mse_avg: 0.005920059513300657
 [[34m2026-01-26 23:00:18[39m] (step=0002677) Train Loss mse: 0.0085, Train Loss ce: 0.0506, Train Steps/Sec: 0.69,
 [[34m2026-01-26 23:00:19[39m] (step=0002678) Train Loss mse: 0.0104, Train Loss ce: 0.0571, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:00:21[39m] (step=0002679) Train Loss mse: 0.0179, Train Loss ce: 0.0435, Train Steps/Sec: 0.68,
@@ -3899,6 +3885,27 @@ ce_avg: 0.03896063566207886, mse_avg: 0.005920059513300657
 [[34m2026-01-26 23:26:22[39m] (step=0003650) Train Loss mse: 0.0108, Train Loss ce: 0.0542, Train Steps/Sec: 0.49,
 [[34m2026-01-26 23:26:24[39m] (step=0003651) Train Loss mse: 0.0060, Train Loss ce: 0.0508, Train Steps/Sec: 0.57,
 [[34m2026-01-26 23:26:26[39m] (step=0003652) Train Loss mse: 0.0067, Train Loss ce: 0.0496, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:26:27[39m] (step=0003653) Train Loss mse: 0.0060, Train Loss ce: 0.0390, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:26:29[39m] (step=0003654) Train Loss mse: 0.0091, Train Loss ce: 0.0192, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:26:30[39m] (step=0003655) Train Loss mse: 0.0045, Train Loss ce: 0.0370, Train Steps/Sec: 0.58,
@@ -4059,20 +4066,6 @@ ce_avg: 0.03896063566207886, mse_avg: 0.005920059513300657
 [[34m2026-01-26 23:30:41[39m] (step=0003810) Train Loss mse: 0.0053, Train Loss ce: 0.0262, Train Steps/Sec: 0.67,
 [[34m2026-01-26 23:30:43[39m] (step=0003811) Train Loss mse: 0.0043, Train Loss ce: 0.0578, Train Steps/Sec: 0.58,
 [[34m2026-01-26 23:30:44[39m] (step=0003812) Train Loss mse: 0.0063, Train Loss ce: 0.0334, Train Steps/Sec: 0.68,
-base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step4500
-Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
-[eval debug] first 3 batch fingerprints:
-  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-ce_avg: 0.03593315929174423, mse_avg: 0.005641990341246128
-base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step5000
-Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
-[eval debug] first 3 batch fingerprints:
-  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
-ce_avg: 0.03604895621538162, mse_avg: 0.005358231253921986
 [[34m2026-01-26 23:30:46[39m] (step=0003813) Train Loss mse: 0.0074, Train Loss ce: 0.0200, Train Steps/Sec: 0.58,
 [[34m2026-01-26 23:30:48[39m] (step=0003814) Train Loss mse: 0.0077, Train Loss ce: 0.0290, Train Steps/Sec: 0.55,
 [[34m2026-01-26 23:30:49[39m] (step=0003815) Train Loss mse: 0.0057, Train Loss ce: 0.0341, Train Steps/Sec: 0.68,

 [[34m2026-01-26 22:14:55[39m] (step=0001088) Train Loss mse: 0.0112, Train Loss ce: 0.0307, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:14:56[39m] (step=0001089) Train Loss mse: 0.0141, Train Loss ce: 0.0282, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:14:57[39m] (step=0001090) Train Loss mse: 0.0187, Train Loss ce: 0.0744, Train Steps/Sec: 0.68,
 FullyShardedDataParallel(
   (_fsdp_wrapped_module): Bagel(
     (language_model): Qwen2ForCausalLM(
   fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
   fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
 ce_avg: 0.06269639730453491, mse_avg: 0.0149351442232728
 base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step1500
 Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
 [eval debug] first 3 batch fingerprints:
   fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
   fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
 ce_avg: 0.20800399780273438, mse_avg: 0.011797062121331692
+[[34m2026-01-26 22:14:59[39m] (step=0001091) Train Loss mse: 0.0144, Train Loss ce: 0.0371, Train Steps/Sec: 0.56,
+[[34m2026-01-26 22:15:01[39m] (step=0001092) Train Loss mse: 0.0181, Train Loss ce: 0.0742, Train Steps/Sec: 0.69,
+[[34m2026-01-26 22:15:02[39m] (step=0001093) Train Loss mse: 0.0119, Train Loss ce: 0.0614, Train Steps/Sec: 0.56,
+[[34m2026-01-26 22:15:04[39m] (step=0001094) Train Loss mse: 0.0140, Train Loss ce: 0.0572, Train Steps/Sec: 0.59,
+[[34m2026-01-26 22:15:06[39m] (step=0001095) Train Loss mse: 0.0155, Train Loss ce: 0.0332, Train Steps/Sec: 0.69,
+[[34m2026-01-26 22:15:07[39m] (step=0001096) Train Loss mse: 0.0166, Train Loss ce: 0.0692, Train Steps/Sec: 0.68,
+[[34m2026-01-26 22:15:09[39m] (step=0001097) Train Loss mse: 0.0175, Train Loss ce: 0.0863, Train Steps/Sec: 0.68,
+[[34m2026-01-26 22:15:10[39m] (step=0001098) Train Loss mse: 0.0082, Train Loss ce: 0.0662, Train Steps/Sec: 0.57,
+[[34m2026-01-26 22:15:12[39m] (step=0001099) Train Loss mse: 0.0139, Train Loss ce: 0.0561, Train Steps/Sec: 0.68,
+[[34m2026-01-26 22:15:13[39m] (step=0001100) Train Loss mse: 0.0106, Train Loss ce: 0.0682, Train Steps/Sec: 0.68,
+[[34m2026-01-26 22:15:15[39m] (step=0001101) Train Loss mse: 0.0105, Train Loss ce: 0.0531, Train Steps/Sec: 0.56,
+[[34m2026-01-26 22:15:17[39m] (step=0001102) Train Loss mse: 0.0123, Train Loss ce: 0.0902, Train Steps/Sec: 0.59,
 [[34m2026-01-26 22:15:18[39m] (step=0001103) Train Loss mse: 0.0096, Train Loss ce: 0.0452, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:15:20[39m] (step=0001104) Train Loss mse: 0.0135, Train Loss ce: 0.0637, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:15:21[39m] (step=0001105) Train Loss mse: 0.0077, Train Loss ce: 0.0449, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:57:58[39m] (step=0002587) Train Loss mse: 0.0121, Train Loss ce: 0.0432, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:57:59[39m] (step=0002588) Train Loss mse: 0.0060, Train Loss ce: 0.0675, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:58:01[39m] (step=0002589) Train Loss mse: 0.0137, Train Loss ce: 0.0460, Train Steps/Sec: 0.69,
+base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step3000
+Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
+[eval debug] first 3 batch fingerprints:
+  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+ce_avg: 0.04329414293169975, mse_avg: 0.006353132426738739
+base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step3500
+Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
+[eval debug] first 3 batch fingerprints:
+  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+ce_avg: 0.04275057464838028, mse_avg: 0.0056571937166154385
 [[34m2026-01-26 22:58:02[39m] (step=0002590) Train Loss mse: 0.0089, Train Loss ce: 0.0360, Train Steps/Sec: 0.68,
 [[34m2026-01-26 22:58:04[39m] (step=0002591) Train Loss mse: 0.0089, Train Loss ce: 0.0466, Train Steps/Sec: 0.56,
 [[34m2026-01-26 22:58:06[39m] (step=0002592) Train Loss mse: 0.0074, Train Loss ce: 0.0222, Train Steps/Sec: 0.59,
 [[34m2026-01-26 23:00:13[39m] (step=0002674) Train Loss mse: 0.0056, Train Loss ce: 0.0613, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:00:15[39m] (step=0002675) Train Loss mse: 0.0054, Train Loss ce: 0.0283, Train Steps/Sec: 0.49,
 [[34m2026-01-26 23:00:16[39m] (step=0002676) Train Loss mse: 0.0099, Train Loss ce: 0.0483, Train Steps/Sec: 0.56,
 [[34m2026-01-26 23:00:18[39m] (step=0002677) Train Loss mse: 0.0085, Train Loss ce: 0.0506, Train Steps/Sec: 0.69,
 [[34m2026-01-26 23:00:19[39m] (step=0002678) Train Loss mse: 0.0104, Train Loss ce: 0.0571, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:00:21[39m] (step=0002679) Train Loss mse: 0.0179, Train Loss ce: 0.0435, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:26:22[39m] (step=0003650) Train Loss mse: 0.0108, Train Loss ce: 0.0542, Train Steps/Sec: 0.49,
 [[34m2026-01-26 23:26:24[39m] (step=0003651) Train Loss mse: 0.0060, Train Loss ce: 0.0508, Train Steps/Sec: 0.57,
 [[34m2026-01-26 23:26:26[39m] (step=0003652) Train Loss mse: 0.0067, Train Loss ce: 0.0496, Train Steps/Sec: 0.68,
+base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step4000
+Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
+[eval debug] first 3 batch fingerprints:
+  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+ce_avg: 0.03896063566207886, mse_avg: 0.005920059513300657
+base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step4500
+Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
+[eval debug] first 3 batch fingerprints:
+  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+ce_avg: 0.03593315929174423, mse_avg: 0.005641990341246128
+base_dir is /dev/shm/models/checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins/eval_used_rows, step_tag is checkpoints_vlm_gym_match_equation_sos_one_image_lr2e_5_ce_ins_step5000
+Preparing Dataset vlm_gym_match_equation_sos_celoss_evalonce/vlm_gym_match_equation_sos_val
+[eval debug] first 3 batch fingerprints:
+  fp[0]: [{'data_indexes': [0], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[1]: [{'data_indexes': [8], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+  fp[2]: [{'data_indexes': [16], 'worker_id': 0, 'dataset_name': 'vlm_gym_match_equation_sos_celoss_evalonce'}]
+ce_avg: 0.03604895621538162, mse_avg: 0.005358231253921986
 [[34m2026-01-26 23:26:27[39m] (step=0003653) Train Loss mse: 0.0060, Train Loss ce: 0.0390, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:26:29[39m] (step=0003654) Train Loss mse: 0.0091, Train Loss ce: 0.0192, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:26:30[39m] (step=0003655) Train Loss mse: 0.0045, Train Loss ce: 0.0370, Train Steps/Sec: 0.58,
 [[34m2026-01-26 23:30:41[39m] (step=0003810) Train Loss mse: 0.0053, Train Loss ce: 0.0262, Train Steps/Sec: 0.67,
 [[34m2026-01-26 23:30:43[39m] (step=0003811) Train Loss mse: 0.0043, Train Loss ce: 0.0578, Train Steps/Sec: 0.58,
 [[34m2026-01-26 23:30:44[39m] (step=0003812) Train Loss mse: 0.0063, Train Loss ce: 0.0334, Train Steps/Sec: 0.68,
 [[34m2026-01-26 23:30:46[39m] (step=0003813) Train Loss mse: 0.0074, Train Loss ce: 0.0200, Train Steps/Sec: 0.58,
 [[34m2026-01-26 23:30:48[39m] (step=0003814) Train Loss mse: 0.0077, Train Loss ce: 0.0290, Train Steps/Sec: 0.55,
 [[34m2026-01-26 23:30:49[39m] (step=0003815) Train Loss mse: 0.0057, Train Loss ce: 0.0341, Train Steps/Sec: 0.68,