Junyi42 commited on
Commit
93f2e0b
·
verified ·
1 Parent(s): c692aea

Upload checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test

Browse files
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log CHANGED
@@ -919,84 +919,6 @@ ImportError: cannot import name 'NaiveCache' from 'modeling.bagel' (/home/cloudu
919
  [2026-01-03 11:42:37] (step=0000876) Train Loss mse: 0.0359, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
920
  [2026-01-03 11:42:50] (step=0000877) Train Loss mse: 0.0421, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
921
  [2026-01-03 11:43:05] (step=0000878) Train Loss mse: 0.0467, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
922
- [2026-01-03 11:43:18] (step=0000879) Train Loss mse: 0.0381, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
923
- [2026-01-03 11:43:31] (step=0000880) Train Loss mse: 0.0545, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
924
- [2026-01-03 11:43:44] (step=0000881) Train Loss mse: 0.0397, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
925
- [2026-01-03 11:43:55] (step=0000882) Train Loss mse: 0.0315, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
926
- [2026-01-03 11:44:11] (step=0000883) Train Loss mse: 0.0294, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
927
- [2026-01-03 11:44:24] (step=0000884) Train Loss mse: 0.0485, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
928
- [2026-01-03 11:44:38] (step=0000885) Train Loss mse: 0.0360, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
929
- [2026-01-03 11:44:54] (step=0000886) Train Loss mse: 0.0315, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
930
- [2026-01-03 11:45:08] (step=0000887) Train Loss mse: 0.0396, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
931
- [2026-01-03 11:45:21] (step=0000888) Train Loss mse: 0.0443, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
932
- [2026-01-03 11:45:37] (step=0000889) Train Loss mse: 0.0360, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
933
- [2026-01-03 11:45:53] (step=0000890) Train Loss mse: 0.0598, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
934
- [2026-01-03 11:46:06] (step=0000891) Train Loss mse: 0.0548, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
935
- [2026-01-03 11:46:20] (step=0000892) Train Loss mse: 0.0408, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
936
- [2026-01-03 11:46:32] (step=0000893) Train Loss mse: 0.0526, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
937
- [2026-01-03 11:46:48] (step=0000894) Train Loss mse: 0.0472, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
938
- [2026-01-03 11:47:01] (step=0000895) Train Loss mse: 0.0413, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
939
- [2026-01-03 11:47:17] (step=0000896) Train Loss mse: 0.0333, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
940
- [2026-01-03 11:47:30] (step=0000897) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
941
- [2026-01-03 11:47:43] (step=0000898) Train Loss mse: 0.0499, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
942
- [2026-01-03 11:47:59] (step=0000899) Train Loss mse: 0.0460, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
943
- [2026-01-03 11:48:15] (step=0000900) Train Loss mse: 0.0338, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
944
- [2026-01-03 11:48:28] (step=0000901) Train Loss mse: 0.0443, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
945
- [2026-01-03 11:48:39] (step=0000902) Train Loss mse: 0.0369, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
946
- [2026-01-03 11:48:52] (step=0000903) Train Loss mse: 0.0455, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
947
- [2026-01-03 11:49:06] (step=0000904) Train Loss mse: 0.0329, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
948
- [2026-01-03 11:49:18] (step=0000905) Train Loss mse: 0.0453, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
949
- [2026-01-03 11:49:29] (step=0000906) Train Loss mse: 0.0520, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
950
- [2026-01-03 11:49:45] (step=0000907) Train Loss mse: 0.0377, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
951
- [2026-01-03 11:50:01] (step=0000908) Train Loss mse: 0.0422, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
952
- [2026-01-03 11:50:15] (step=0000909) Train Loss mse: 0.0392, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
953
- [2026-01-03 11:50:28] (step=0000910) Train Loss mse: 0.0422, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
954
- [2026-01-03 11:50:41] (step=0000911) Train Loss mse: 0.0387, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
955
- [2026-01-03 11:50:55] (step=0000912) Train Loss mse: 0.0551, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
956
- [2026-01-03 11:51:08] (step=0000913) Train Loss mse: 0.0414, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
957
- [2026-01-03 11:51:24] (step=0000914) Train Loss mse: 0.0393, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
958
- [2026-01-03 11:51:40] (step=0000915) Train Loss mse: 0.0388, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
959
- [2026-01-03 11:51:56] (step=0000916) Train Loss mse: 0.0418, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
960
- [2026-01-03 11:52:08] (step=0000917) Train Loss mse: 0.0476, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
961
- [2026-01-03 11:52:22] (step=0000918) Train Loss mse: 0.0407, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
962
- [2026-01-03 11:52:35] (step=0000919) Train Loss mse: 0.0458, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
963
- [2026-01-03 11:52:52] (step=0000920) Train Loss mse: 0.0420, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
964
- [2026-01-03 11:53:04] (step=0000921) Train Loss mse: 0.0508, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
965
- [2026-01-03 11:53:17] (step=0000922) Train Loss mse: 0.0488, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
966
- [2026-01-03 11:53:33] (step=0000923) Train Loss mse: 0.0344, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
967
- [2026-01-03 11:53:45] (step=0000924) Train Loss mse: 0.0430, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
968
- [2026-01-03 11:53:56] (step=0000925) Train Loss mse: 0.0482, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
969
- [2026-01-03 11:54:12] (step=0000926) Train Loss mse: 0.0336, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
970
- [2026-01-03 11:54:28] (step=0000927) Train Loss mse: 0.0440, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
971
- [2026-01-03 11:54:44] (step=0000928) Train Loss mse: 0.0481, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
972
- [2026-01-03 11:54:58] (step=0000929) Train Loss mse: 0.0429, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
973
- [2026-01-03 11:55:14] (step=0000930) Train Loss mse: 0.0304, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
974
- [2026-01-03 11:55:28] (step=0000931) Train Loss mse: 0.0314, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
975
- [2026-01-03 11:55:42] (step=0000932) Train Loss mse: 0.0376, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
976
- [2026-01-03 11:55:56] (step=0000933) Train Loss mse: 0.0394, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
977
- [2026-01-03 11:56:09] (step=0000934) Train Loss mse: 0.0447, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
978
- [2026-01-03 11:56:23] (step=0000935) Train Loss mse: 0.0389, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
979
- [2026-01-03 11:56:37] (step=0000936) Train Loss mse: 0.0335, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
980
- [2026-01-03 11:56:53] (step=0000937) Train Loss mse: 0.0384, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
981
- [2026-01-03 11:57:06] (step=0000938) Train Loss mse: 0.0453, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
982
- [2026-01-03 11:57:19] (step=0000939) Train Loss mse: 0.0514, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
983
- [2026-01-03 11:57:32] (step=0000940) Train Loss mse: 0.0461, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
984
- [2026-01-03 11:57:44] (step=0000941) Train Loss mse: 0.0522, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
985
- [2026-01-03 11:57:56] (step=0000942) Train Loss mse: 0.0522, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
986
- [2026-01-03 11:58:09] (step=0000943) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
987
- [2026-01-03 11:58:22] (step=0000944) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
988
- [2026-01-03 11:58:35] (step=0000945) Train Loss mse: 0.0529, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
989
- [2026-01-03 11:58:49] (step=0000946) Train Loss mse: 0.0354, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
990
- [2026-01-03 11:59:02] (step=0000947) Train Loss mse: 0.0330, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
991
- [2026-01-03 11:59:19] (step=0000948) Train Loss mse: 0.0374, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
992
- [2026-01-03 11:59:32] (step=0000949) Train Loss mse: 0.0409, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
993
- [2026-01-03 11:59:48] (step=0000950) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
994
- [2026-01-03 12:00:04] (step=0000951) Train Loss mse: 0.0349, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
995
- [2026-01-03 12:00:16] (step=0000952) Train Loss mse: 0.0488, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
996
- [2026-01-03 12:00:32] (step=0000953) Train Loss mse: 0.0324, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
997
- [2026-01-03 12:00:45] (step=0000954) Train Loss mse: 0.0471, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
998
- [2026-01-03 12:00:58] (step=0000955) Train Loss mse: 0.0377, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
999
- [2026-01-03 12:01:12] (step=0000956) Train Loss mse: 0.0324, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1000
  FullyShardedDataParallel(
1001
  (_fsdp_wrapped_module): Bagel(
1002
  (language_model): Qwen2ForCausalLM(
@@ -1170,6 +1092,84 @@ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1170
  vit_pos_embed._fsdp_wrapped_module._flat_param False
1171
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
1172
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1173
  [2026-01-03 12:01:28] (step=0000957) Train Loss mse: 0.0300, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1174
  [2026-01-03 12:01:42] (step=0000958) Train Loss mse: 0.0315, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1175
  [2026-01-03 12:01:53] (step=0000959) Train Loss mse: 0.0465, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
 
919
  [2026-01-03 11:42:37] (step=0000876) Train Loss mse: 0.0359, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
920
  [2026-01-03 11:42:50] (step=0000877) Train Loss mse: 0.0421, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
921
  [2026-01-03 11:43:05] (step=0000878) Train Loss mse: 0.0467, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
  FullyShardedDataParallel(
923
  (_fsdp_wrapped_module): Bagel(
924
  (language_model): Qwen2ForCausalLM(
 
1092
  vit_pos_embed._fsdp_wrapped_module._flat_param False
1093
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
1094
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
1095
+ [2026-01-03 11:43:18] (step=0000879) Train Loss mse: 0.0381, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1096
+ [2026-01-03 11:43:31] (step=0000880) Train Loss mse: 0.0545, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1097
+ [2026-01-03 11:43:44] (step=0000881) Train Loss mse: 0.0397, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1098
+ [2026-01-03 11:43:55] (step=0000882) Train Loss mse: 0.0315, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1099
+ [2026-01-03 11:44:11] (step=0000883) Train Loss mse: 0.0294, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1100
+ [2026-01-03 11:44:24] (step=0000884) Train Loss mse: 0.0485, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1101
+ [2026-01-03 11:44:38] (step=0000885) Train Loss mse: 0.0360, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1102
+ [2026-01-03 11:44:54] (step=0000886) Train Loss mse: 0.0315, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1103
+ [2026-01-03 11:45:08] (step=0000887) Train Loss mse: 0.0396, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1104
+ [2026-01-03 11:45:21] (step=0000888) Train Loss mse: 0.0443, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1105
+ [2026-01-03 11:45:37] (step=0000889) Train Loss mse: 0.0360, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1106
+ [2026-01-03 11:45:53] (step=0000890) Train Loss mse: 0.0598, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1107
+ [2026-01-03 11:46:06] (step=0000891) Train Loss mse: 0.0548, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1108
+ [2026-01-03 11:46:20] (step=0000892) Train Loss mse: 0.0408, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1109
+ [2026-01-03 11:46:32] (step=0000893) Train Loss mse: 0.0526, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1110
+ [2026-01-03 11:46:48] (step=0000894) Train Loss mse: 0.0472, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1111
+ [2026-01-03 11:47:01] (step=0000895) Train Loss mse: 0.0413, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1112
+ [2026-01-03 11:47:17] (step=0000896) Train Loss mse: 0.0333, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1113
+ [2026-01-03 11:47:30] (step=0000897) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1114
+ [2026-01-03 11:47:43] (step=0000898) Train Loss mse: 0.0499, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1115
+ [2026-01-03 11:47:59] (step=0000899) Train Loss mse: 0.0460, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1116
+ [2026-01-03 11:48:15] (step=0000900) Train Loss mse: 0.0338, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1117
+ [2026-01-03 11:48:28] (step=0000901) Train Loss mse: 0.0443, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1118
+ [2026-01-03 11:48:39] (step=0000902) Train Loss mse: 0.0369, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1119
+ [2026-01-03 11:48:52] (step=0000903) Train Loss mse: 0.0455, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1120
+ [2026-01-03 11:49:06] (step=0000904) Train Loss mse: 0.0329, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1121
+ [2026-01-03 11:49:18] (step=0000905) Train Loss mse: 0.0453, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1122
+ [2026-01-03 11:49:29] (step=0000906) Train Loss mse: 0.0520, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1123
+ [2026-01-03 11:49:45] (step=0000907) Train Loss mse: 0.0377, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1124
+ [2026-01-03 11:50:01] (step=0000908) Train Loss mse: 0.0422, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1125
+ [2026-01-03 11:50:15] (step=0000909) Train Loss mse: 0.0392, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1126
+ [2026-01-03 11:50:28] (step=0000910) Train Loss mse: 0.0422, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1127
+ [2026-01-03 11:50:41] (step=0000911) Train Loss mse: 0.0387, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1128
+ [2026-01-03 11:50:55] (step=0000912) Train Loss mse: 0.0551, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1129
+ [2026-01-03 11:51:08] (step=0000913) Train Loss mse: 0.0414, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1130
+ [2026-01-03 11:51:24] (step=0000914) Train Loss mse: 0.0393, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1131
+ [2026-01-03 11:51:40] (step=0000915) Train Loss mse: 0.0388, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1132
+ [2026-01-03 11:51:56] (step=0000916) Train Loss mse: 0.0418, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1133
+ [2026-01-03 11:52:08] (step=0000917) Train Loss mse: 0.0476, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1134
+ [2026-01-03 11:52:22] (step=0000918) Train Loss mse: 0.0407, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1135
+ [2026-01-03 11:52:35] (step=0000919) Train Loss mse: 0.0458, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1136
+ [2026-01-03 11:52:52] (step=0000920) Train Loss mse: 0.0420, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1137
+ [2026-01-03 11:53:04] (step=0000921) Train Loss mse: 0.0508, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1138
+ [2026-01-03 11:53:17] (step=0000922) Train Loss mse: 0.0488, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1139
+ [2026-01-03 11:53:33] (step=0000923) Train Loss mse: 0.0344, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1140
+ [2026-01-03 11:53:45] (step=0000924) Train Loss mse: 0.0430, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1141
+ [2026-01-03 11:53:56] (step=0000925) Train Loss mse: 0.0482, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1142
+ [2026-01-03 11:54:12] (step=0000926) Train Loss mse: 0.0336, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1143
+ [2026-01-03 11:54:28] (step=0000927) Train Loss mse: 0.0440, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1144
+ [2026-01-03 11:54:44] (step=0000928) Train Loss mse: 0.0481, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1145
+ [2026-01-03 11:54:58] (step=0000929) Train Loss mse: 0.0429, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1146
+ [2026-01-03 11:55:14] (step=0000930) Train Loss mse: 0.0304, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1147
+ [2026-01-03 11:55:28] (step=0000931) Train Loss mse: 0.0314, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1148
+ [2026-01-03 11:55:42] (step=0000932) Train Loss mse: 0.0376, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1149
+ [2026-01-03 11:55:56] (step=0000933) Train Loss mse: 0.0394, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1150
+ [2026-01-03 11:56:09] (step=0000934) Train Loss mse: 0.0447, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1151
+ [2026-01-03 11:56:23] (step=0000935) Train Loss mse: 0.0389, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1152
+ [2026-01-03 11:56:37] (step=0000936) Train Loss mse: 0.0335, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1153
+ [2026-01-03 11:56:53] (step=0000937) Train Loss mse: 0.0384, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1154
+ [2026-01-03 11:57:06] (step=0000938) Train Loss mse: 0.0453, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1155
+ [2026-01-03 11:57:19] (step=0000939) Train Loss mse: 0.0514, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1156
+ [2026-01-03 11:57:32] (step=0000940) Train Loss mse: 0.0461, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1157
+ [2026-01-03 11:57:44] (step=0000941) Train Loss mse: 0.0522, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1158
+ [2026-01-03 11:57:56] (step=0000942) Train Loss mse: 0.0522, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1159
+ [2026-01-03 11:58:09] (step=0000943) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1160
+ [2026-01-03 11:58:22] (step=0000944) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1161
+ [2026-01-03 11:58:35] (step=0000945) Train Loss mse: 0.0529, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1162
+ [2026-01-03 11:58:49] (step=0000946) Train Loss mse: 0.0354, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1163
+ [2026-01-03 11:59:02] (step=0000947) Train Loss mse: 0.0330, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1164
+ [2026-01-03 11:59:19] (step=0000948) Train Loss mse: 0.0374, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1165
+ [2026-01-03 11:59:32] (step=0000949) Train Loss mse: 0.0409, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1166
+ [2026-01-03 11:59:48] (step=0000950) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1167
+ [2026-01-03 12:00:04] (step=0000951) Train Loss mse: 0.0349, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1168
+ [2026-01-03 12:00:16] (step=0000952) Train Loss mse: 0.0488, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1169
+ [2026-01-03 12:00:32] (step=0000953) Train Loss mse: 0.0324, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1170
+ [2026-01-03 12:00:45] (step=0000954) Train Loss mse: 0.0471, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1171
+ [2026-01-03 12:00:58] (step=0000955) Train Loss mse: 0.0377, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1172
+ [2026-01-03 12:01:12] (step=0000956) Train Loss mse: 0.0324, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1173
  [2026-01-03 12:01:28] (step=0000957) Train Loss mse: 0.0300, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1174
  [2026-01-03 12:01:42] (step=0000958) Train Loss mse: 0.0315, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1175
  [2026-01-03 12:01:53] (step=0000959) Train Loss mse: 0.0465, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log CHANGED
@@ -1,176 +1,3 @@
1
- FullyShardedDataParallel(
2
- (_fsdp_wrapped_module): Bagel(
3
- (language_model): Qwen2ForCausalLM(
4
- (model): Qwen2Model(
5
- (embed_tokens): Embedding(152064, 3584)
6
- (layers): ModuleList(
7
- (0-27): 28 x FullyShardedDataParallel(
8
- (_fsdp_wrapped_module): CheckpointWrapper(
9
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
10
- (self_attn): PackedAttentionMoT(
11
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
12
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
13
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
15
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
16
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
18
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
20
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
21
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
23
- )
24
- (mlp): Qwen2MLP(
25
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
26
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
28
- (act_fn): SiLU()
29
- )
30
- (mlp_moe_gen): Qwen2MLP(
31
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
32
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
34
- (act_fn): SiLU()
35
- )
36
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
37
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
40
- )
41
- )
42
- )
43
- )
44
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
45
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (rotary_emb): Qwen2RotaryEmbedding()
47
- )
48
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
49
- )
50
- (time_embedder): FullyShardedDataParallel(
51
- (_fsdp_wrapped_module): TimestepEmbedder(
52
- (mlp): Sequential(
53
- (0): Linear(in_features=256, out_features=3584, bias=True)
54
- (1): SiLU()
55
- (2): Linear(in_features=3584, out_features=3584, bias=True)
56
- )
57
- )
58
- )
59
- (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
60
- (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
61
- (latent_pos_embed): FullyShardedDataParallel(
62
- (_fsdp_wrapped_module): PositionEmbedding()
63
- )
64
- (vit_model): SiglipVisionModel(
65
- (vision_model): FullyShardedDataParallel(
66
- (_fsdp_wrapped_module): SiglipVisionTransformer(
67
- (embeddings): SiglipVisionEmbeddings(
68
- (position_embedding): Embedding(4900, 1152)
69
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
70
- )
71
- (encoder): SiglipEncoder(
72
- (layers): ModuleList(
73
- (0-25): 26 x FullyShardedDataParallel(
74
- (_fsdp_wrapped_module): CheckpointWrapper(
75
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
76
- (self_attn): SiglipFlashAttention2(
77
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
78
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
79
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
80
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
81
- )
82
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
83
- (mlp): SiglipMLP(
84
- (activation_fn): PytorchGELUTanh()
85
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
86
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
87
- )
88
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
89
- )
90
- )
91
- )
92
- )
93
- )
94
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
95
- )
96
- )
97
- )
98
- (connector): FullyShardedDataParallel(
99
- (_fsdp_wrapped_module): CheckpointWrapper(
100
- (_checkpoint_wrapped_module): MLPconnector(
101
- (activation_fn): PytorchGELUTanh()
102
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
103
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
104
- )
105
- )
106
- )
107
- (vit_pos_embed): FullyShardedDataParallel(
108
- (_fsdp_wrapped_module): PositionEmbedding()
109
- )
110
- )
111
- )
112
- _flat_param True
113
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
114
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
128
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- time_embedder._fsdp_wrapped_module._flat_param True
142
- latent_pos_embed._fsdp_wrapped_module._flat_param False
143
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
144
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
156
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
157
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
158
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
159
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
160
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
161
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
162
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
163
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
164
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
165
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
166
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
167
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
168
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
169
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
170
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
171
- vit_pos_embed._fsdp_wrapped_module._flat_param False
172
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
173
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
174
  wandb: Detected [huggingface_hub.inference] in use.
175
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
176
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -711,4 +538,177 @@ Traceback (most recent call last):
711
  for i, (length, model) in enumerate(zip(split_lens, attn_modes)):
712
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
713
  TypeError: 'NoneType' object is not iterable
714
- Traceback (most recent call last):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
538
  for i, (length, model) in enumerate(zip(split_lens, attn_modes)):
539
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
540
  TypeError: 'NoneType' object is not iterable
541
+ Traceback (most recent call last):
542
+ FullyShardedDataParallel(
543
+ (_fsdp_wrapped_module): Bagel(
544
+ (language_model): Qwen2ForCausalLM(
545
+ (model): Qwen2Model(
546
+ (embed_tokens): Embedding(152064, 3584)
547
+ (layers): ModuleList(
548
+ (0-27): 28 x FullyShardedDataParallel(
549
+ (_fsdp_wrapped_module): CheckpointWrapper(
550
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
551
+ (self_attn): PackedAttentionMoT(
552
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
553
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
554
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
555
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
556
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
557
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
558
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
559
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
560
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
561
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
562
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
563
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
564
+ )
565
+ (mlp): Qwen2MLP(
566
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
567
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
568
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
569
+ (act_fn): SiLU()
570
+ )
571
+ (mlp_moe_gen): Qwen2MLP(
572
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
573
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
574
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
575
+ (act_fn): SiLU()
576
+ )
577
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
578
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
579
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
580
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
581
+ )
582
+ )
583
+ )
584
+ )
585
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
586
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
587
+ (rotary_emb): Qwen2RotaryEmbedding()
588
+ )
589
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
590
+ )
591
+ (time_embedder): FullyShardedDataParallel(
592
+ (_fsdp_wrapped_module): TimestepEmbedder(
593
+ (mlp): Sequential(
594
+ (0): Linear(in_features=256, out_features=3584, bias=True)
595
+ (1): SiLU()
596
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
597
+ )
598
+ )
599
+ )
600
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
601
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
602
+ (latent_pos_embed): FullyShardedDataParallel(
603
+ (_fsdp_wrapped_module): PositionEmbedding()
604
+ )
605
+ (vit_model): SiglipVisionModel(
606
+ (vision_model): FullyShardedDataParallel(
607
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
608
+ (embeddings): SiglipVisionEmbeddings(
609
+ (position_embedding): Embedding(4900, 1152)
610
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
611
+ )
612
+ (encoder): SiglipEncoder(
613
+ (layers): ModuleList(
614
+ (0-25): 26 x FullyShardedDataParallel(
615
+ (_fsdp_wrapped_module): CheckpointWrapper(
616
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
617
+ (self_attn): SiglipFlashAttention2(
618
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
619
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
620
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
621
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
622
+ )
623
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
624
+ (mlp): SiglipMLP(
625
+ (activation_fn): PytorchGELUTanh()
626
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
627
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
628
+ )
629
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
630
+ )
631
+ )
632
+ )
633
+ )
634
+ )
635
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
636
+ )
637
+ )
638
+ )
639
+ (connector): FullyShardedDataParallel(
640
+ (_fsdp_wrapped_module): CheckpointWrapper(
641
+ (_checkpoint_wrapped_module): MLPconnector(
642
+ (activation_fn): PytorchGELUTanh()
643
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
644
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
645
+ )
646
+ )
647
+ )
648
+ (vit_pos_embed): FullyShardedDataParallel(
649
+ (_fsdp_wrapped_module): PositionEmbedding()
650
+ )
651
+ )
652
+ )
653
+ _flat_param True
654
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
655
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
656
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
657
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
658
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
659
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
660
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
661
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
662
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
663
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
664
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
665
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
666
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
667
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
668
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
669
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
670
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
671
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
672
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
673
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
674
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
675
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
676
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
677
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
678
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
679
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
680
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
681
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
682
+ time_embedder._fsdp_wrapped_module._flat_param True
683
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
684
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
685
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
686
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
687
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
688
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
689
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
690
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
691
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
692
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
693
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
694
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
695
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
696
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
697
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
698
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
699
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
700
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
701
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
702
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
703
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
704
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
705
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
706
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
707
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
708
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
709
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
710
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
711
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
712
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
713
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
714
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train