diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.13995801259622112, + "epoch": 0.5598320503848845, "eval_steps": 500, - "global_step": 1000, + "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -7008,6 +7008,21006 @@ "learning_rate": 4.8437575681263656e-05, "loss": 0.628, "step": 1000 + }, + { + "epoch": 0.14009797060881735, + "grad_norm": 0.4506290084182641, + "learning_rate": 4.843362954163273e-05, + "loss": 0.5924, + "step": 1001 + }, + { + "epoch": 0.14023792862141357, + "grad_norm": 0.4476783073678505, + "learning_rate": 4.842967858615316e-05, + "loss": 0.6682, + "step": 1002 + }, + { + "epoch": 0.1403778866340098, + "grad_norm": 0.4357549411702438, + "learning_rate": 4.842572281563691e-05, + "loss": 0.6332, + "step": 1003 + }, + { + "epoch": 0.14051784464660602, + "grad_norm": 0.4883894311666662, + "learning_rate": 4.842176223089694e-05, + "loss": 0.642, + "step": 1004 + }, + { + "epoch": 0.14065780265920225, + "grad_norm": 0.4432669792413136, + "learning_rate": 4.8417796832747186e-05, + "loss": 0.6049, + "step": 1005 + }, + { + "epoch": 0.14079776067179847, + "grad_norm": 0.46556766571285496, + "learning_rate": 4.841382662200257e-05, + "loss": 0.646, + "step": 1006 + }, + { + "epoch": 0.14093771868439467, + "grad_norm": 0.5157114366864044, + "learning_rate": 4.8409851599479015e-05, + "loss": 0.657, + "step": 1007 + }, + { + "epoch": 0.1410776766969909, + "grad_norm": 0.4366811539636766, + "learning_rate": 4.8405871765993433e-05, + "loss": 0.6326, + "step": 1008 + }, + { + "epoch": 0.14121763470958712, + "grad_norm": 0.43049502553637836, + "learning_rate": 4.8401887122363714e-05, + "loss": 0.616, + "step": 1009 + }, + { + "epoch": 0.14135759272218335, + "grad_norm": 0.48197017641080875, + "learning_rate": 4.839789766940875e-05, + "loss": 0.6432, + "step": 1010 + }, + { + "epoch": 0.14149755073477957, + "grad_norm": 0.45845061514071844, + "learning_rate": 4.839390340794841e-05, + "loss": 0.6262, + "step": 1011 + }, + { + "epoch": 0.1416375087473758, + "grad_norm": 0.457307052580857, + "learning_rate": 4.838990433880355e-05, + "loss": 0.6317, + "step": 1012 + }, + { + "epoch": 0.14177746675997202, + "grad_norm": 0.44363047689877544, + "learning_rate": 4.838590046279602e-05, + "loss": 0.599, + "step": 1013 + }, + { + "epoch": 0.14191742477256822, + "grad_norm": 0.45048773354087446, + "learning_rate": 4.838189178074867e-05, + "loss": 0.626, + "step": 1014 + }, + { + "epoch": 0.14205738278516444, + "grad_norm": 0.44588282768255694, + "learning_rate": 4.8377878293485305e-05, + "loss": 0.5985, + "step": 1015 + }, + { + "epoch": 0.14219734079776067, + "grad_norm": 0.4579523725156681, + "learning_rate": 4.8373860001830755e-05, + "loss": 0.5844, + "step": 1016 + }, + { + "epoch": 0.1423372988103569, + "grad_norm": 0.46436658695739996, + "learning_rate": 4.8369836906610816e-05, + "loss": 0.5891, + "step": 1017 + }, + { + "epoch": 0.14247725682295312, + "grad_norm": 0.46569223528579046, + "learning_rate": 4.836580900865227e-05, + "loss": 0.6164, + "step": 1018 + }, + { + "epoch": 0.14261721483554934, + "grad_norm": 0.4543042451718399, + "learning_rate": 4.836177630878289e-05, + "loss": 0.588, + "step": 1019 + }, + { + "epoch": 0.14275717284814557, + "grad_norm": 0.44767680240836294, + "learning_rate": 4.8357738807831446e-05, + "loss": 0.6551, + "step": 1020 + }, + { + "epoch": 0.14289713086074177, + "grad_norm": 0.44934339880943197, + "learning_rate": 4.835369650662767e-05, + "loss": 0.6167, + "step": 1021 + }, + { + "epoch": 0.143037088873338, + "grad_norm": 0.45729799528169507, + "learning_rate": 4.834964940600231e-05, + "loss": 0.5913, + "step": 1022 + }, + { + "epoch": 0.14317704688593422, + "grad_norm": 0.46783307870591107, + "learning_rate": 4.8345597506787075e-05, + "loss": 0.6267, + "step": 1023 + }, + { + "epoch": 0.14331700489853044, + "grad_norm": 0.4863803445979481, + "learning_rate": 4.8341540809814686e-05, + "loss": 0.62, + "step": 1024 + }, + { + "epoch": 0.14345696291112667, + "grad_norm": 0.48368481434141886, + "learning_rate": 4.8337479315918825e-05, + "loss": 0.6458, + "step": 1025 + }, + { + "epoch": 0.1435969209237229, + "grad_norm": 0.44240686647941085, + "learning_rate": 4.833341302593417e-05, + "loss": 0.5951, + "step": 1026 + }, + { + "epoch": 0.14373687893631912, + "grad_norm": 0.46334653267689374, + "learning_rate": 4.832934194069639e-05, + "loss": 0.6634, + "step": 1027 + }, + { + "epoch": 0.14387683694891532, + "grad_norm": 0.4566293728675077, + "learning_rate": 4.832526606104213e-05, + "loss": 0.6574, + "step": 1028 + }, + { + "epoch": 0.14401679496151154, + "grad_norm": 0.4482024873976331, + "learning_rate": 4.832118538780902e-05, + "loss": 0.5956, + "step": 1029 + }, + { + "epoch": 0.14415675297410777, + "grad_norm": 0.45091056986953776, + "learning_rate": 4.8317099921835697e-05, + "loss": 0.638, + "step": 1030 + }, + { + "epoch": 0.144296710986704, + "grad_norm": 0.4546329207907057, + "learning_rate": 4.8313009663961746e-05, + "loss": 0.6386, + "step": 1031 + }, + { + "epoch": 0.14443666899930022, + "grad_norm": 0.4279917091758518, + "learning_rate": 4.830891461502777e-05, + "loss": 0.6225, + "step": 1032 + }, + { + "epoch": 0.14457662701189644, + "grad_norm": 0.4653479806373598, + "learning_rate": 4.8304814775875326e-05, + "loss": 0.6273, + "step": 1033 + }, + { + "epoch": 0.14471658502449264, + "grad_norm": 0.44000485919639887, + "learning_rate": 4.8300710147346996e-05, + "loss": 0.6049, + "step": 1034 + }, + { + "epoch": 0.14485654303708886, + "grad_norm": 0.4481538097684572, + "learning_rate": 4.829660073028631e-05, + "loss": 0.6526, + "step": 1035 + }, + { + "epoch": 0.1449965010496851, + "grad_norm": 0.4392800745551569, + "learning_rate": 4.829248652553779e-05, + "loss": 0.6282, + "step": 1036 + }, + { + "epoch": 0.14513645906228131, + "grad_norm": 0.43714289037219956, + "learning_rate": 4.8288367533946964e-05, + "loss": 0.6162, + "step": 1037 + }, + { + "epoch": 0.14527641707487754, + "grad_norm": 0.47850026452151156, + "learning_rate": 4.828424375636031e-05, + "loss": 0.653, + "step": 1038 + }, + { + "epoch": 0.14541637508747376, + "grad_norm": 0.4387192439390933, + "learning_rate": 4.828011519362531e-05, + "loss": 0.6186, + "step": 1039 + }, + { + "epoch": 0.14555633310007, + "grad_norm": 0.438096448983694, + "learning_rate": 4.827598184659043e-05, + "loss": 0.5983, + "step": 1040 + }, + { + "epoch": 0.1456962911126662, + "grad_norm": 0.4545201024028468, + "learning_rate": 4.827184371610511e-05, + "loss": 0.6423, + "step": 1041 + }, + { + "epoch": 0.1458362491252624, + "grad_norm": 0.45563857615067394, + "learning_rate": 4.826770080301978e-05, + "loss": 0.5828, + "step": 1042 + }, + { + "epoch": 0.14597620713785864, + "grad_norm": 0.45222712800980513, + "learning_rate": 4.826355310818585e-05, + "loss": 0.5934, + "step": 1043 + }, + { + "epoch": 0.14611616515045486, + "grad_norm": 0.4831310051395056, + "learning_rate": 4.825940063245572e-05, + "loss": 0.6841, + "step": 1044 + }, + { + "epoch": 0.1462561231630511, + "grad_norm": 0.46283974537294126, + "learning_rate": 4.8255243376682744e-05, + "loss": 0.6435, + "step": 1045 + }, + { + "epoch": 0.1463960811756473, + "grad_norm": 0.4735013090808921, + "learning_rate": 4.825108134172131e-05, + "loss": 0.6406, + "step": 1046 + }, + { + "epoch": 0.14653603918824354, + "grad_norm": 0.4506208707883696, + "learning_rate": 4.824691452842675e-05, + "loss": 0.6372, + "step": 1047 + }, + { + "epoch": 0.14667599720083974, + "grad_norm": 0.4364931698009048, + "learning_rate": 4.824274293765536e-05, + "loss": 0.6026, + "step": 1048 + }, + { + "epoch": 0.14681595521343596, + "grad_norm": 0.44459989160312974, + "learning_rate": 4.823856657026448e-05, + "loss": 0.6316, + "step": 1049 + }, + { + "epoch": 0.1469559132260322, + "grad_norm": 0.44933496657307287, + "learning_rate": 4.8234385427112385e-05, + "loss": 0.5813, + "step": 1050 + }, + { + "epoch": 0.1470958712386284, + "grad_norm": 0.4594564123055747, + "learning_rate": 4.8230199509058326e-05, + "loss": 0.6263, + "step": 1051 + }, + { + "epoch": 0.14723582925122464, + "grad_norm": 0.4597297128739321, + "learning_rate": 4.822600881696256e-05, + "loss": 0.6166, + "step": 1052 + }, + { + "epoch": 0.14737578726382086, + "grad_norm": 0.4797057798768006, + "learning_rate": 4.822181335168634e-05, + "loss": 0.6791, + "step": 1053 + }, + { + "epoch": 0.1475157452764171, + "grad_norm": 0.463031471453022, + "learning_rate": 4.821761311409184e-05, + "loss": 0.6442, + "step": 1054 + }, + { + "epoch": 0.14765570328901328, + "grad_norm": 0.46690698533496683, + "learning_rate": 4.821340810504228e-05, + "loss": 0.6209, + "step": 1055 + }, + { + "epoch": 0.1477956613016095, + "grad_norm": 0.47419444368207214, + "learning_rate": 4.8209198325401815e-05, + "loss": 0.6504, + "step": 1056 + }, + { + "epoch": 0.14793561931420574, + "grad_norm": 0.4959825016958297, + "learning_rate": 4.8204983776035605e-05, + "loss": 0.615, + "step": 1057 + }, + { + "epoch": 0.14807557732680196, + "grad_norm": 0.4481720138656539, + "learning_rate": 4.8200764457809784e-05, + "loss": 0.6137, + "step": 1058 + }, + { + "epoch": 0.14821553533939819, + "grad_norm": 0.45342457377099943, + "learning_rate": 4.819654037159146e-05, + "loss": 0.6605, + "step": 1059 + }, + { + "epoch": 0.1483554933519944, + "grad_norm": 0.4383482872844384, + "learning_rate": 4.8192311518248726e-05, + "loss": 0.6383, + "step": 1060 + }, + { + "epoch": 0.14849545136459064, + "grad_norm": 0.4558068303448877, + "learning_rate": 4.818807789865065e-05, + "loss": 0.6221, + "step": 1061 + }, + { + "epoch": 0.14863540937718683, + "grad_norm": 0.44912539719700584, + "learning_rate": 4.818383951366729e-05, + "loss": 0.6413, + "step": 1062 + }, + { + "epoch": 0.14877536738978306, + "grad_norm": 0.4355408755527561, + "learning_rate": 4.817959636416969e-05, + "loss": 0.6451, + "step": 1063 + }, + { + "epoch": 0.14891532540237928, + "grad_norm": 0.46358234313926816, + "learning_rate": 4.8175348451029836e-05, + "loss": 0.6249, + "step": 1064 + }, + { + "epoch": 0.1490552834149755, + "grad_norm": 0.47591686796466215, + "learning_rate": 4.817109577512073e-05, + "loss": 0.6829, + "step": 1065 + }, + { + "epoch": 0.14919524142757173, + "grad_norm": 0.4231210119222125, + "learning_rate": 4.8166838337316334e-05, + "loss": 0.5962, + "step": 1066 + }, + { + "epoch": 0.14933519944016796, + "grad_norm": 0.47903022082147834, + "learning_rate": 4.81625761384916e-05, + "loss": 0.6666, + "step": 1067 + }, + { + "epoch": 0.14947515745276416, + "grad_norm": 0.4356774379772111, + "learning_rate": 4.8158309179522454e-05, + "loss": 0.584, + "step": 1068 + }, + { + "epoch": 0.14961511546536038, + "grad_norm": 0.43493168432918317, + "learning_rate": 4.8154037461285796e-05, + "loss": 0.6003, + "step": 1069 + }, + { + "epoch": 0.1497550734779566, + "grad_norm": 0.4152139582969938, + "learning_rate": 4.8149760984659506e-05, + "loss": 0.6396, + "step": 1070 + }, + { + "epoch": 0.14989503149055283, + "grad_norm": 0.4578712882918065, + "learning_rate": 4.814547975052245e-05, + "loss": 0.6249, + "step": 1071 + }, + { + "epoch": 0.15003498950314906, + "grad_norm": 0.46527522340987304, + "learning_rate": 4.814119375975447e-05, + "loss": 0.6691, + "step": 1072 + }, + { + "epoch": 0.15017494751574528, + "grad_norm": 0.4551938793084905, + "learning_rate": 4.813690301323636e-05, + "loss": 0.6347, + "step": 1073 + }, + { + "epoch": 0.1503149055283415, + "grad_norm": 0.472894226480286, + "learning_rate": 4.813260751184992e-05, + "loss": 0.6558, + "step": 1074 + }, + { + "epoch": 0.1504548635409377, + "grad_norm": 0.45206474588868106, + "learning_rate": 4.812830725647793e-05, + "loss": 0.5915, + "step": 1075 + }, + { + "epoch": 0.15059482155353393, + "grad_norm": 0.46083474871884716, + "learning_rate": 4.8124002248004126e-05, + "loss": 0.6336, + "step": 1076 + }, + { + "epoch": 0.15073477956613016, + "grad_norm": 0.44095545335189645, + "learning_rate": 4.811969248731323e-05, + "loss": 0.6307, + "step": 1077 + }, + { + "epoch": 0.15087473757872638, + "grad_norm": 0.46628362437250337, + "learning_rate": 4.8115377975290955e-05, + "loss": 0.6267, + "step": 1078 + }, + { + "epoch": 0.1510146955913226, + "grad_norm": 0.4570685734330465, + "learning_rate": 4.811105871282395e-05, + "loss": 0.6891, + "step": 1079 + }, + { + "epoch": 0.15115465360391883, + "grad_norm": 0.4494515219252805, + "learning_rate": 4.81067347007999e-05, + "loss": 0.6147, + "step": 1080 + }, + { + "epoch": 0.15129461161651506, + "grad_norm": 0.44047992997082713, + "learning_rate": 4.810240594010742e-05, + "loss": 0.6443, + "step": 1081 + }, + { + "epoch": 0.15143456962911125, + "grad_norm": 0.4423964257933292, + "learning_rate": 4.80980724316361e-05, + "loss": 0.6316, + "step": 1082 + }, + { + "epoch": 0.15157452764170748, + "grad_norm": 0.43454276433469047, + "learning_rate": 4.809373417627654e-05, + "loss": 0.6244, + "step": 1083 + }, + { + "epoch": 0.1517144856543037, + "grad_norm": 0.4576456453029119, + "learning_rate": 4.8089391174920275e-05, + "loss": 0.6158, + "step": 1084 + }, + { + "epoch": 0.15185444366689993, + "grad_norm": 0.4376862180207936, + "learning_rate": 4.808504342845986e-05, + "loss": 0.6287, + "step": 1085 + }, + { + "epoch": 0.15199440167949615, + "grad_norm": 0.4272520839128296, + "learning_rate": 4.808069093778879e-05, + "loss": 0.5709, + "step": 1086 + }, + { + "epoch": 0.15213435969209238, + "grad_norm": 0.4562369557196528, + "learning_rate": 4.807633370380155e-05, + "loss": 0.651, + "step": 1087 + }, + { + "epoch": 0.1522743177046886, + "grad_norm": 0.4304596270726819, + "learning_rate": 4.807197172739357e-05, + "loss": 0.6338, + "step": 1088 + }, + { + "epoch": 0.1524142757172848, + "grad_norm": 0.4427035165037056, + "learning_rate": 4.806760500946132e-05, + "loss": 0.6196, + "step": 1089 + }, + { + "epoch": 0.15255423372988103, + "grad_norm": 0.4603650971833003, + "learning_rate": 4.806323355090218e-05, + "loss": 0.6526, + "step": 1090 + }, + { + "epoch": 0.15269419174247725, + "grad_norm": 0.4348487042293239, + "learning_rate": 4.805885735261454e-05, + "loss": 0.6375, + "step": 1091 + }, + { + "epoch": 0.15283414975507348, + "grad_norm": 0.4515390501287558, + "learning_rate": 4.805447641549774e-05, + "loss": 0.5708, + "step": 1092 + }, + { + "epoch": 0.1529741077676697, + "grad_norm": 0.4715753314020402, + "learning_rate": 4.805009074045213e-05, + "loss": 0.6477, + "step": 1093 + }, + { + "epoch": 0.15311406578026593, + "grad_norm": 0.4434658029205517, + "learning_rate": 4.8045700328378986e-05, + "loss": 0.6013, + "step": 1094 + }, + { + "epoch": 0.15325402379286215, + "grad_norm": 0.44663741205752033, + "learning_rate": 4.804130518018058e-05, + "loss": 0.6081, + "step": 1095 + }, + { + "epoch": 0.15339398180545835, + "grad_norm": 0.41670655291144576, + "learning_rate": 4.803690529676019e-05, + "loss": 0.6378, + "step": 1096 + }, + { + "epoch": 0.15353393981805458, + "grad_norm": 0.49686928110177025, + "learning_rate": 4.803250067902202e-05, + "loss": 0.6445, + "step": 1097 + }, + { + "epoch": 0.1536738978306508, + "grad_norm": 0.4685195748194762, + "learning_rate": 4.802809132787125e-05, + "loss": 0.6066, + "step": 1098 + }, + { + "epoch": 0.15381385584324703, + "grad_norm": 0.46286655961754153, + "learning_rate": 4.802367724421407e-05, + "loss": 0.6144, + "step": 1099 + }, + { + "epoch": 0.15395381385584325, + "grad_norm": 0.43665548007096733, + "learning_rate": 4.8019258428957605e-05, + "loss": 0.6404, + "step": 1100 + }, + { + "epoch": 0.15409377186843948, + "grad_norm": 0.4700705212693585, + "learning_rate": 4.8014834883009966e-05, + "loss": 0.6297, + "step": 1101 + }, + { + "epoch": 0.1542337298810357, + "grad_norm": 0.46140020925769776, + "learning_rate": 4.8010406607280244e-05, + "loss": 0.63, + "step": 1102 + }, + { + "epoch": 0.1543736878936319, + "grad_norm": 0.432596752729328, + "learning_rate": 4.800597360267849e-05, + "loss": 0.6014, + "step": 1103 + }, + { + "epoch": 0.15451364590622813, + "grad_norm": 0.4087158619129061, + "learning_rate": 4.800153587011573e-05, + "loss": 0.6101, + "step": 1104 + }, + { + "epoch": 0.15465360391882435, + "grad_norm": 0.4306421416237517, + "learning_rate": 4.799709341050396e-05, + "loss": 0.6088, + "step": 1105 + }, + { + "epoch": 0.15479356193142058, + "grad_norm": 0.439174570947343, + "learning_rate": 4.799264622475616e-05, + "loss": 0.6271, + "step": 1106 + }, + { + "epoch": 0.1549335199440168, + "grad_norm": 0.45230787848804416, + "learning_rate": 4.7988194313786275e-05, + "loss": 0.6314, + "step": 1107 + }, + { + "epoch": 0.15507347795661303, + "grad_norm": 0.4452846746911816, + "learning_rate": 4.79837376785092e-05, + "loss": 0.6189, + "step": 1108 + }, + { + "epoch": 0.15521343596920922, + "grad_norm": 0.43395577576303523, + "learning_rate": 4.7979276319840824e-05, + "loss": 0.5867, + "step": 1109 + }, + { + "epoch": 0.15535339398180545, + "grad_norm": 0.4198939865361689, + "learning_rate": 4.797481023869801e-05, + "loss": 0.6033, + "step": 1110 + }, + { + "epoch": 0.15549335199440167, + "grad_norm": 0.4115113980248625, + "learning_rate": 4.797033943599859e-05, + "loss": 0.5792, + "step": 1111 + }, + { + "epoch": 0.1556333100069979, + "grad_norm": 0.42305720865767826, + "learning_rate": 4.796586391266134e-05, + "loss": 0.5721, + "step": 1112 + }, + { + "epoch": 0.15577326801959412, + "grad_norm": 0.4594490270870182, + "learning_rate": 4.796138366960603e-05, + "loss": 0.5923, + "step": 1113 + }, + { + "epoch": 0.15591322603219035, + "grad_norm": 0.4660997680749236, + "learning_rate": 4.7956898707753405e-05, + "loss": 0.6168, + "step": 1114 + }, + { + "epoch": 0.15605318404478657, + "grad_norm": 0.4300722165746083, + "learning_rate": 4.795240902802517e-05, + "loss": 0.5975, + "step": 1115 + }, + { + "epoch": 0.15619314205738277, + "grad_norm": 0.4593249810088033, + "learning_rate": 4.794791463134399e-05, + "loss": 0.598, + "step": 1116 + }, + { + "epoch": 0.156333100069979, + "grad_norm": 0.4615991027920945, + "learning_rate": 4.79434155186335e-05, + "loss": 0.6267, + "step": 1117 + }, + { + "epoch": 0.15647305808257522, + "grad_norm": 0.441601529655813, + "learning_rate": 4.7938911690818347e-05, + "loss": 0.6607, + "step": 1118 + }, + { + "epoch": 0.15661301609517145, + "grad_norm": 0.45355246625005374, + "learning_rate": 4.793440314882408e-05, + "loss": 0.6071, + "step": 1119 + }, + { + "epoch": 0.15675297410776767, + "grad_norm": 0.44036948902223627, + "learning_rate": 4.792988989357727e-05, + "loss": 0.6154, + "step": 1120 + }, + { + "epoch": 0.1568929321203639, + "grad_norm": 0.4734397090177979, + "learning_rate": 4.7925371926005435e-05, + "loss": 0.6102, + "step": 1121 + }, + { + "epoch": 0.15703289013296012, + "grad_norm": 0.43617965348321686, + "learning_rate": 4.792084924703705e-05, + "loss": 0.6104, + "step": 1122 + }, + { + "epoch": 0.15717284814555632, + "grad_norm": 0.4478534547540619, + "learning_rate": 4.791632185760158e-05, + "loss": 0.6235, + "step": 1123 + }, + { + "epoch": 0.15731280615815255, + "grad_norm": 0.44766677539513156, + "learning_rate": 4.791178975862945e-05, + "loss": 0.6439, + "step": 1124 + }, + { + "epoch": 0.15745276417074877, + "grad_norm": 0.44554264209996225, + "learning_rate": 4.790725295105205e-05, + "loss": 0.5959, + "step": 1125 + }, + { + "epoch": 0.157592722183345, + "grad_norm": 0.4296700348089923, + "learning_rate": 4.790271143580174e-05, + "loss": 0.6539, + "step": 1126 + }, + { + "epoch": 0.15773268019594122, + "grad_norm": 0.4412703085902547, + "learning_rate": 4.789816521381185e-05, + "loss": 0.6375, + "step": 1127 + }, + { + "epoch": 0.15787263820853745, + "grad_norm": 0.4373118306763795, + "learning_rate": 4.7893614286016684e-05, + "loss": 0.6112, + "step": 1128 + }, + { + "epoch": 0.15801259622113367, + "grad_norm": 0.44070716127461335, + "learning_rate": 4.7889058653351485e-05, + "loss": 0.6195, + "step": 1129 + }, + { + "epoch": 0.15815255423372987, + "grad_norm": 0.4052645175970106, + "learning_rate": 4.788449831675248e-05, + "loss": 0.6118, + "step": 1130 + }, + { + "epoch": 0.1582925122463261, + "grad_norm": 0.4205527476901149, + "learning_rate": 4.7879933277156884e-05, + "loss": 0.6098, + "step": 1131 + }, + { + "epoch": 0.15843247025892232, + "grad_norm": 0.4232443251461348, + "learning_rate": 4.787536353550285e-05, + "loss": 0.5962, + "step": 1132 + }, + { + "epoch": 0.15857242827151855, + "grad_norm": 0.4555075573119837, + "learning_rate": 4.787078909272951e-05, + "loss": 0.6099, + "step": 1133 + }, + { + "epoch": 0.15871238628411477, + "grad_norm": 0.45597750842746004, + "learning_rate": 4.786620994977695e-05, + "loss": 0.6403, + "step": 1134 + }, + { + "epoch": 0.158852344296711, + "grad_norm": 0.4523579064101052, + "learning_rate": 4.7861626107586236e-05, + "loss": 0.6276, + "step": 1135 + }, + { + "epoch": 0.15899230230930722, + "grad_norm": 0.4496756804618233, + "learning_rate": 4.785703756709939e-05, + "loss": 0.625, + "step": 1136 + }, + { + "epoch": 0.15913226032190342, + "grad_norm": 0.44479315166484706, + "learning_rate": 4.78524443292594e-05, + "loss": 0.6219, + "step": 1137 + }, + { + "epoch": 0.15927221833449964, + "grad_norm": 0.423002589052915, + "learning_rate": 4.784784639501024e-05, + "loss": 0.6142, + "step": 1138 + }, + { + "epoch": 0.15941217634709587, + "grad_norm": 0.45783730451738547, + "learning_rate": 4.7843243765296816e-05, + "loss": 0.6327, + "step": 1139 + }, + { + "epoch": 0.1595521343596921, + "grad_norm": 0.41727555581040476, + "learning_rate": 4.783863644106502e-05, + "loss": 0.6314, + "step": 1140 + }, + { + "epoch": 0.15969209237228832, + "grad_norm": 0.4433704880299786, + "learning_rate": 4.7834024423261715e-05, + "loss": 0.6297, + "step": 1141 + }, + { + "epoch": 0.15983205038488454, + "grad_norm": 0.44605075439465414, + "learning_rate": 4.78294077128347e-05, + "loss": 0.6052, + "step": 1142 + }, + { + "epoch": 0.15997200839748077, + "grad_norm": 0.4252564366376628, + "learning_rate": 4.7824786310732754e-05, + "loss": 0.5952, + "step": 1143 + }, + { + "epoch": 0.16011196641007697, + "grad_norm": 0.4487603412437455, + "learning_rate": 4.782016021790564e-05, + "loss": 0.5993, + "step": 1144 + }, + { + "epoch": 0.1602519244226732, + "grad_norm": 0.4335684662398312, + "learning_rate": 4.781552943530405e-05, + "loss": 0.6119, + "step": 1145 + }, + { + "epoch": 0.16039188243526942, + "grad_norm": 0.48534201807046695, + "learning_rate": 4.781089396387968e-05, + "loss": 0.6566, + "step": 1146 + }, + { + "epoch": 0.16053184044786564, + "grad_norm": 0.4657809557886318, + "learning_rate": 4.780625380458513e-05, + "loss": 0.6188, + "step": 1147 + }, + { + "epoch": 0.16067179846046187, + "grad_norm": 0.463002737360727, + "learning_rate": 4.7801608958374034e-05, + "loss": 0.6281, + "step": 1148 + }, + { + "epoch": 0.1608117564730581, + "grad_norm": 0.4327965386696676, + "learning_rate": 4.779695942620094e-05, + "loss": 0.5676, + "step": 1149 + }, + { + "epoch": 0.1609517144856543, + "grad_norm": 0.43456914658498524, + "learning_rate": 4.779230520902138e-05, + "loss": 0.6018, + "step": 1150 + }, + { + "epoch": 0.16109167249825052, + "grad_norm": 0.4340562614707508, + "learning_rate": 4.778764630779183e-05, + "loss": 0.5634, + "step": 1151 + }, + { + "epoch": 0.16123163051084674, + "grad_norm": 0.5384914426514081, + "learning_rate": 4.778298272346976e-05, + "loss": 0.6159, + "step": 1152 + }, + { + "epoch": 0.16137158852344297, + "grad_norm": 0.45988369030455006, + "learning_rate": 4.7778314457013565e-05, + "loss": 0.6306, + "step": 1153 + }, + { + "epoch": 0.1615115465360392, + "grad_norm": 0.4650532211327251, + "learning_rate": 4.777364150938263e-05, + "loss": 0.6192, + "step": 1154 + }, + { + "epoch": 0.16165150454863542, + "grad_norm": 0.4253518192932048, + "learning_rate": 4.77689638815373e-05, + "loss": 0.6373, + "step": 1155 + }, + { + "epoch": 0.16179146256123164, + "grad_norm": 0.45004968895984576, + "learning_rate": 4.776428157443886e-05, + "loss": 0.6173, + "step": 1156 + }, + { + "epoch": 0.16193142057382784, + "grad_norm": 0.4555578881747446, + "learning_rate": 4.775959458904958e-05, + "loss": 0.6011, + "step": 1157 + }, + { + "epoch": 0.16207137858642406, + "grad_norm": 0.4034724262013599, + "learning_rate": 4.775490292633269e-05, + "loss": 0.5945, + "step": 1158 + }, + { + "epoch": 0.1622113365990203, + "grad_norm": 0.44601195862385407, + "learning_rate": 4.7750206587252366e-05, + "loss": 0.5964, + "step": 1159 + }, + { + "epoch": 0.16235129461161651, + "grad_norm": 0.45007693700753115, + "learning_rate": 4.7745505572773754e-05, + "loss": 0.6187, + "step": 1160 + }, + { + "epoch": 0.16249125262421274, + "grad_norm": 0.45228216299402807, + "learning_rate": 4.774079988386296e-05, + "loss": 0.6235, + "step": 1161 + }, + { + "epoch": 0.16263121063680896, + "grad_norm": 0.43817158113391474, + "learning_rate": 4.773608952148706e-05, + "loss": 0.6181, + "step": 1162 + }, + { + "epoch": 0.1627711686494052, + "grad_norm": 0.4218728284381608, + "learning_rate": 4.7731374486614056e-05, + "loss": 0.6472, + "step": 1163 + }, + { + "epoch": 0.1629111266620014, + "grad_norm": 0.4385041088709967, + "learning_rate": 4.772665478021296e-05, + "loss": 0.6263, + "step": 1164 + }, + { + "epoch": 0.1630510846745976, + "grad_norm": 0.4575475392945415, + "learning_rate": 4.7721930403253714e-05, + "loss": 0.6136, + "step": 1165 + }, + { + "epoch": 0.16319104268719384, + "grad_norm": 0.4457426524641399, + "learning_rate": 4.771720135670722e-05, + "loss": 0.623, + "step": 1166 + }, + { + "epoch": 0.16333100069979006, + "grad_norm": 0.4257770146896383, + "learning_rate": 4.7712467641545354e-05, + "loss": 0.6371, + "step": 1167 + }, + { + "epoch": 0.1634709587123863, + "grad_norm": 0.5052851757471238, + "learning_rate": 4.770772925874093e-05, + "loss": 0.6249, + "step": 1168 + }, + { + "epoch": 0.1636109167249825, + "grad_norm": 0.47522529944132186, + "learning_rate": 4.7702986209267745e-05, + "loss": 0.6378, + "step": 1169 + }, + { + "epoch": 0.16375087473757874, + "grad_norm": 0.42207340820026673, + "learning_rate": 4.7698238494100536e-05, + "loss": 0.6271, + "step": 1170 + }, + { + "epoch": 0.16389083275017494, + "grad_norm": 0.42129003632456846, + "learning_rate": 4.7693486114215015e-05, + "loss": 0.6135, + "step": 1171 + }, + { + "epoch": 0.16403079076277116, + "grad_norm": 0.4929479783722899, + "learning_rate": 4.768872907058783e-05, + "loss": 0.6043, + "step": 1172 + }, + { + "epoch": 0.1641707487753674, + "grad_norm": 0.4810208017615139, + "learning_rate": 4.7683967364196624e-05, + "loss": 0.65, + "step": 1173 + }, + { + "epoch": 0.1643107067879636, + "grad_norm": 0.47080599343079316, + "learning_rate": 4.767920099601996e-05, + "loss": 0.6215, + "step": 1174 + }, + { + "epoch": 0.16445066480055984, + "grad_norm": 0.4354882117508445, + "learning_rate": 4.767442996703737e-05, + "loss": 0.6066, + "step": 1175 + }, + { + "epoch": 0.16459062281315606, + "grad_norm": 0.4284983370361917, + "learning_rate": 4.766965427822936e-05, + "loss": 0.5834, + "step": 1176 + }, + { + "epoch": 0.1647305808257523, + "grad_norm": 0.45013293062633963, + "learning_rate": 4.7664873930577383e-05, + "loss": 0.6747, + "step": 1177 + }, + { + "epoch": 0.16487053883834848, + "grad_norm": 0.4616970172832583, + "learning_rate": 4.766008892506384e-05, + "loss": 0.6331, + "step": 1178 + }, + { + "epoch": 0.1650104968509447, + "grad_norm": 0.43031916863208525, + "learning_rate": 4.765529926267211e-05, + "loss": 0.6218, + "step": 1179 + }, + { + "epoch": 0.16515045486354094, + "grad_norm": 0.4663955970462797, + "learning_rate": 4.765050494438651e-05, + "loss": 0.6508, + "step": 1180 + }, + { + "epoch": 0.16529041287613716, + "grad_norm": 0.439471188683419, + "learning_rate": 4.7645705971192315e-05, + "loss": 0.6199, + "step": 1181 + }, + { + "epoch": 0.16543037088873339, + "grad_norm": 0.45371569435156506, + "learning_rate": 4.764090234407577e-05, + "loss": 0.6266, + "step": 1182 + }, + { + "epoch": 0.1655703289013296, + "grad_norm": 0.43706679884940447, + "learning_rate": 4.7636094064024076e-05, + "loss": 0.6358, + "step": 1183 + }, + { + "epoch": 0.16571028691392584, + "grad_norm": 0.4383648085023705, + "learning_rate": 4.763128113202537e-05, + "loss": 0.6363, + "step": 1184 + }, + { + "epoch": 0.16585024492652203, + "grad_norm": 0.4332543518196199, + "learning_rate": 4.762646354906877e-05, + "loss": 0.6284, + "step": 1185 + }, + { + "epoch": 0.16599020293911826, + "grad_norm": 0.42760152759621617, + "learning_rate": 4.7621641316144325e-05, + "loss": 0.6212, + "step": 1186 + }, + { + "epoch": 0.16613016095171448, + "grad_norm": 0.4530135732031799, + "learning_rate": 4.761681443424306e-05, + "loss": 0.664, + "step": 1187 + }, + { + "epoch": 0.1662701189643107, + "grad_norm": 0.4459844470321281, + "learning_rate": 4.7611982904356954e-05, + "loss": 0.6384, + "step": 1188 + }, + { + "epoch": 0.16641007697690693, + "grad_norm": 0.41924312442399453, + "learning_rate": 4.7607146727478935e-05, + "loss": 0.5901, + "step": 1189 + }, + { + "epoch": 0.16655003498950316, + "grad_norm": 0.43860632090683305, + "learning_rate": 4.760230590460287e-05, + "loss": 0.6021, + "step": 1190 + }, + { + "epoch": 0.16668999300209936, + "grad_norm": 0.45006765458458414, + "learning_rate": 4.759746043672362e-05, + "loss": 0.6365, + "step": 1191 + }, + { + "epoch": 0.16682995101469558, + "grad_norm": 0.4440700448858348, + "learning_rate": 4.7592610324836955e-05, + "loss": 0.6349, + "step": 1192 + }, + { + "epoch": 0.1669699090272918, + "grad_norm": 0.42964762641344484, + "learning_rate": 4.758775556993964e-05, + "loss": 0.573, + "step": 1193 + }, + { + "epoch": 0.16710986703988803, + "grad_norm": 0.4348088254948326, + "learning_rate": 4.758289617302937e-05, + "loss": 0.6094, + "step": 1194 + }, + { + "epoch": 0.16724982505248426, + "grad_norm": 0.42297107816490986, + "learning_rate": 4.7578032135104796e-05, + "loss": 0.6102, + "step": 1195 + }, + { + "epoch": 0.16738978306508048, + "grad_norm": 0.4494883284181831, + "learning_rate": 4.7573163457165534e-05, + "loss": 0.6071, + "step": 1196 + }, + { + "epoch": 0.1675297410776767, + "grad_norm": 0.4339048883611629, + "learning_rate": 4.7568290140212145e-05, + "loss": 0.6213, + "step": 1197 + }, + { + "epoch": 0.1676696990902729, + "grad_norm": 0.48344849855571825, + "learning_rate": 4.7563412185246145e-05, + "loss": 0.6433, + "step": 1198 + }, + { + "epoch": 0.16780965710286913, + "grad_norm": 0.4488129613842796, + "learning_rate": 4.7558529593269996e-05, + "loss": 0.654, + "step": 1199 + }, + { + "epoch": 0.16794961511546536, + "grad_norm": 0.43481250974353425, + "learning_rate": 4.755364236528713e-05, + "loss": 0.6316, + "step": 1200 + }, + { + "epoch": 0.16808957312806158, + "grad_norm": 0.41793689356043273, + "learning_rate": 4.754875050230192e-05, + "loss": 0.5783, + "step": 1201 + }, + { + "epoch": 0.1682295311406578, + "grad_norm": 0.43753416598422584, + "learning_rate": 4.754385400531969e-05, + "loss": 0.5949, + "step": 1202 + }, + { + "epoch": 0.16836948915325403, + "grad_norm": 0.44668014693071806, + "learning_rate": 4.753895287534673e-05, + "loss": 0.5863, + "step": 1203 + }, + { + "epoch": 0.16850944716585026, + "grad_norm": 0.42805637156559095, + "learning_rate": 4.753404711339026e-05, + "loss": 0.6357, + "step": 1204 + }, + { + "epoch": 0.16864940517844645, + "grad_norm": 0.41847675435581816, + "learning_rate": 4.752913672045846e-05, + "loss": 0.6138, + "step": 1205 + }, + { + "epoch": 0.16878936319104268, + "grad_norm": 0.4553821804652506, + "learning_rate": 4.752422169756048e-05, + "loss": 0.6179, + "step": 1206 + }, + { + "epoch": 0.1689293212036389, + "grad_norm": 0.45084883922329716, + "learning_rate": 4.75193020457064e-05, + "loss": 0.6321, + "step": 1207 + }, + { + "epoch": 0.16906927921623513, + "grad_norm": 0.42851103889071873, + "learning_rate": 4.751437776590726e-05, + "loss": 0.639, + "step": 1208 + }, + { + "epoch": 0.16920923722883136, + "grad_norm": 0.4555925682379364, + "learning_rate": 4.7509448859175043e-05, + "loss": 0.6185, + "step": 1209 + }, + { + "epoch": 0.16934919524142758, + "grad_norm": 0.45021258994097924, + "learning_rate": 4.75045153265227e-05, + "loss": 0.6086, + "step": 1210 + }, + { + "epoch": 0.1694891532540238, + "grad_norm": 0.4369626572960402, + "learning_rate": 4.749957716896412e-05, + "loss": 0.6409, + "step": 1211 + }, + { + "epoch": 0.16962911126662, + "grad_norm": 0.4320366971573474, + "learning_rate": 4.749463438751413e-05, + "loss": 0.5844, + "step": 1212 + }, + { + "epoch": 0.16976906927921623, + "grad_norm": 0.4163870413689256, + "learning_rate": 4.7489686983188535e-05, + "loss": 0.5803, + "step": 1213 + }, + { + "epoch": 0.16990902729181245, + "grad_norm": 0.44949676404016337, + "learning_rate": 4.748473495700408e-05, + "loss": 0.6234, + "step": 1214 + }, + { + "epoch": 0.17004898530440868, + "grad_norm": 0.44171736970520764, + "learning_rate": 4.747977830997845e-05, + "loss": 0.593, + "step": 1215 + }, + { + "epoch": 0.1701889433170049, + "grad_norm": 0.44772610657802425, + "learning_rate": 4.747481704313028e-05, + "loss": 0.5924, + "step": 1216 + }, + { + "epoch": 0.17032890132960113, + "grad_norm": 0.4549687449320509, + "learning_rate": 4.7469851157479177e-05, + "loss": 0.6332, + "step": 1217 + }, + { + "epoch": 0.17046885934219735, + "grad_norm": 0.4682088646897931, + "learning_rate": 4.746488065404567e-05, + "loss": 0.6189, + "step": 1218 + }, + { + "epoch": 0.17060881735479355, + "grad_norm": 0.44383085207451206, + "learning_rate": 4.7459905533851246e-05, + "loss": 0.6176, + "step": 1219 + }, + { + "epoch": 0.17074877536738978, + "grad_norm": 0.4282311331266807, + "learning_rate": 4.745492579791835e-05, + "loss": 0.5548, + "step": 1220 + }, + { + "epoch": 0.170888733379986, + "grad_norm": 0.4542898636825727, + "learning_rate": 4.744994144727036e-05, + "loss": 0.6161, + "step": 1221 + }, + { + "epoch": 0.17102869139258223, + "grad_norm": 0.4667149443909015, + "learning_rate": 4.7444952482931626e-05, + "loss": 0.6133, + "step": 1222 + }, + { + "epoch": 0.17116864940517845, + "grad_norm": 0.4512113910763386, + "learning_rate": 4.743995890592742e-05, + "loss": 0.629, + "step": 1223 + }, + { + "epoch": 0.17130860741777468, + "grad_norm": 0.42697256281999074, + "learning_rate": 4.743496071728396e-05, + "loss": 0.61, + "step": 1224 + }, + { + "epoch": 0.17144856543037088, + "grad_norm": 0.4510644382875255, + "learning_rate": 4.7429957918028444e-05, + "loss": 0.6247, + "step": 1225 + }, + { + "epoch": 0.1715885234429671, + "grad_norm": 0.4432778439526579, + "learning_rate": 4.7424950509188995e-05, + "loss": 0.6192, + "step": 1226 + }, + { + "epoch": 0.17172848145556333, + "grad_norm": 0.4215901680439938, + "learning_rate": 4.741993849179468e-05, + "loss": 0.5841, + "step": 1227 + }, + { + "epoch": 0.17186843946815955, + "grad_norm": 0.43500291765639604, + "learning_rate": 4.7414921866875524e-05, + "loss": 0.6216, + "step": 1228 + }, + { + "epoch": 0.17200839748075578, + "grad_norm": 0.43425438707929986, + "learning_rate": 4.740990063546249e-05, + "loss": 0.6358, + "step": 1229 + }, + { + "epoch": 0.172148355493352, + "grad_norm": 0.45544131433595525, + "learning_rate": 4.7404874798587494e-05, + "loss": 0.636, + "step": 1230 + }, + { + "epoch": 0.17228831350594823, + "grad_norm": 0.4359451447095212, + "learning_rate": 4.7399844357283397e-05, + "loss": 0.6124, + "step": 1231 + }, + { + "epoch": 0.17242827151854442, + "grad_norm": 0.4435735452706431, + "learning_rate": 4.739480931258401e-05, + "loss": 0.5854, + "step": 1232 + }, + { + "epoch": 0.17256822953114065, + "grad_norm": 0.4550094611824221, + "learning_rate": 4.738976966552407e-05, + "loss": 0.649, + "step": 1233 + }, + { + "epoch": 0.17270818754373687, + "grad_norm": 0.4417929722405249, + "learning_rate": 4.738472541713931e-05, + "loss": 0.6156, + "step": 1234 + }, + { + "epoch": 0.1728481455563331, + "grad_norm": 0.46054175505288897, + "learning_rate": 4.737967656846633e-05, + "loss": 0.5911, + "step": 1235 + }, + { + "epoch": 0.17298810356892932, + "grad_norm": 0.44199250556635683, + "learning_rate": 4.737462312054275e-05, + "loss": 0.6378, + "step": 1236 + }, + { + "epoch": 0.17312806158152555, + "grad_norm": 0.4365770107069145, + "learning_rate": 4.736956507440709e-05, + "loss": 0.6321, + "step": 1237 + }, + { + "epoch": 0.17326801959412177, + "grad_norm": 0.4569326418275404, + "learning_rate": 4.7364502431098844e-05, + "loss": 0.6128, + "step": 1238 + }, + { + "epoch": 0.17340797760671797, + "grad_norm": 0.4331079335375279, + "learning_rate": 4.7359435191658425e-05, + "loss": 0.6375, + "step": 1239 + }, + { + "epoch": 0.1735479356193142, + "grad_norm": 0.45213254965556654, + "learning_rate": 4.7354363357127204e-05, + "loss": 0.6024, + "step": 1240 + }, + { + "epoch": 0.17368789363191042, + "grad_norm": 0.4301062314752303, + "learning_rate": 4.7349286928547494e-05, + "loss": 0.6164, + "step": 1241 + }, + { + "epoch": 0.17382785164450665, + "grad_norm": 0.43058324357912753, + "learning_rate": 4.7344205906962555e-05, + "loss": 0.6005, + "step": 1242 + }, + { + "epoch": 0.17396780965710287, + "grad_norm": 0.4180674272631518, + "learning_rate": 4.7339120293416594e-05, + "loss": 0.6088, + "step": 1243 + }, + { + "epoch": 0.1741077676696991, + "grad_norm": 0.4546933788993979, + "learning_rate": 4.733403008895474e-05, + "loss": 0.6279, + "step": 1244 + }, + { + "epoch": 0.17424772568229532, + "grad_norm": 0.4477652947027751, + "learning_rate": 4.73289352946231e-05, + "loss": 0.6189, + "step": 1245 + }, + { + "epoch": 0.17438768369489152, + "grad_norm": 0.4240511875731597, + "learning_rate": 4.732383591146869e-05, + "loss": 0.6028, + "step": 1246 + }, + { + "epoch": 0.17452764170748775, + "grad_norm": 0.4292894608181786, + "learning_rate": 4.73187319405395e-05, + "loss": 0.6109, + "step": 1247 + }, + { + "epoch": 0.17466759972008397, + "grad_norm": 0.4268503231804948, + "learning_rate": 4.7313623382884435e-05, + "loss": 0.6014, + "step": 1248 + }, + { + "epoch": 0.1748075577326802, + "grad_norm": 0.44496488332160694, + "learning_rate": 4.730851023955337e-05, + "loss": 0.611, + "step": 1249 + }, + { + "epoch": 0.17494751574527642, + "grad_norm": 0.43212438694423877, + "learning_rate": 4.730339251159709e-05, + "loss": 0.5553, + "step": 1250 + }, + { + "epoch": 0.17508747375787265, + "grad_norm": 0.45661821214759873, + "learning_rate": 4.729827020006735e-05, + "loss": 0.6071, + "step": 1251 + }, + { + "epoch": 0.17522743177046887, + "grad_norm": 0.43901674416520003, + "learning_rate": 4.7293143306016836e-05, + "loss": 0.5777, + "step": 1252 + }, + { + "epoch": 0.17536738978306507, + "grad_norm": 0.4544033001494578, + "learning_rate": 4.728801183049918e-05, + "loss": 0.639, + "step": 1253 + }, + { + "epoch": 0.1755073477956613, + "grad_norm": 0.4520331595590865, + "learning_rate": 4.728287577456894e-05, + "loss": 0.6089, + "step": 1254 + }, + { + "epoch": 0.17564730580825752, + "grad_norm": 0.44402099958528857, + "learning_rate": 4.7277735139281645e-05, + "loss": 0.6056, + "step": 1255 + }, + { + "epoch": 0.17578726382085375, + "grad_norm": 0.4599500342986939, + "learning_rate": 4.7272589925693735e-05, + "loss": 0.6, + "step": 1256 + }, + { + "epoch": 0.17592722183344997, + "grad_norm": 0.41917281344225943, + "learning_rate": 4.7267440134862604e-05, + "loss": 0.6204, + "step": 1257 + }, + { + "epoch": 0.1760671798460462, + "grad_norm": 0.4365380087308256, + "learning_rate": 4.72622857678466e-05, + "loss": 0.6364, + "step": 1258 + }, + { + "epoch": 0.17620713785864242, + "grad_norm": 0.4302123239208661, + "learning_rate": 4.725712682570498e-05, + "loss": 0.6077, + "step": 1259 + }, + { + "epoch": 0.17634709587123862, + "grad_norm": 0.4388230610863713, + "learning_rate": 4.725196330949797e-05, + "loss": 0.5846, + "step": 1260 + }, + { + "epoch": 0.17648705388383484, + "grad_norm": 0.43045983689708245, + "learning_rate": 4.724679522028672e-05, + "loss": 0.5945, + "step": 1261 + }, + { + "epoch": 0.17662701189643107, + "grad_norm": 0.45979393311365935, + "learning_rate": 4.7241622559133325e-05, + "loss": 0.679, + "step": 1262 + }, + { + "epoch": 0.1767669699090273, + "grad_norm": 0.4383956739878393, + "learning_rate": 4.723644532710082e-05, + "loss": 0.6391, + "step": 1263 + }, + { + "epoch": 0.17690692792162352, + "grad_norm": 0.4173574514314508, + "learning_rate": 4.723126352525318e-05, + "loss": 0.5922, + "step": 1264 + }, + { + "epoch": 0.17704688593421974, + "grad_norm": 0.4480684757490661, + "learning_rate": 4.722607715465532e-05, + "loss": 0.607, + "step": 1265 + }, + { + "epoch": 0.17718684394681594, + "grad_norm": 0.40606023013599496, + "learning_rate": 4.722088621637309e-05, + "loss": 0.5901, + "step": 1266 + }, + { + "epoch": 0.17732680195941217, + "grad_norm": 0.5273306368158803, + "learning_rate": 4.7215690711473275e-05, + "loss": 0.6502, + "step": 1267 + }, + { + "epoch": 0.1774667599720084, + "grad_norm": 0.9204038885794614, + "learning_rate": 4.7210490641023615e-05, + "loss": 0.6287, + "step": 1268 + }, + { + "epoch": 0.17760671798460462, + "grad_norm": 0.40746942485511245, + "learning_rate": 4.7205286006092764e-05, + "loss": 0.6017, + "step": 1269 + }, + { + "epoch": 0.17774667599720084, + "grad_norm": 0.41902531090541245, + "learning_rate": 4.720007680775034e-05, + "loss": 0.5875, + "step": 1270 + }, + { + "epoch": 0.17788663400979707, + "grad_norm": 0.4117792800186485, + "learning_rate": 4.719486304706687e-05, + "loss": 0.6485, + "step": 1271 + }, + { + "epoch": 0.1780265920223933, + "grad_norm": 0.4470634029440558, + "learning_rate": 4.718964472511386e-05, + "loss": 0.6527, + "step": 1272 + }, + { + "epoch": 0.1781665500349895, + "grad_norm": 0.43032660391655064, + "learning_rate": 4.71844218429637e-05, + "loss": 0.6161, + "step": 1273 + }, + { + "epoch": 0.17830650804758572, + "grad_norm": 0.44832712777708833, + "learning_rate": 4.7179194401689764e-05, + "loss": 0.6199, + "step": 1274 + }, + { + "epoch": 0.17844646606018194, + "grad_norm": 0.42568085262950894, + "learning_rate": 4.7173962402366334e-05, + "loss": 0.6131, + "step": 1275 + }, + { + "epoch": 0.17858642407277817, + "grad_norm": 0.43106804328266973, + "learning_rate": 4.716872584606865e-05, + "loss": 0.6086, + "step": 1276 + }, + { + "epoch": 0.1787263820853744, + "grad_norm": 0.4339145485054148, + "learning_rate": 4.716348473387286e-05, + "loss": 0.581, + "step": 1277 + }, + { + "epoch": 0.17886634009797062, + "grad_norm": 0.559402125506372, + "learning_rate": 4.715823906685609e-05, + "loss": 0.5788, + "step": 1278 + }, + { + "epoch": 0.17900629811056684, + "grad_norm": 0.4466695949938704, + "learning_rate": 4.715298884609636e-05, + "loss": 0.6169, + "step": 1279 + }, + { + "epoch": 0.17914625612316304, + "grad_norm": 0.42599041437598534, + "learning_rate": 4.7147734072672644e-05, + "loss": 0.5892, + "step": 1280 + }, + { + "epoch": 0.17928621413575926, + "grad_norm": 0.4193811493005937, + "learning_rate": 4.7142474747664856e-05, + "loss": 0.5716, + "step": 1281 + }, + { + "epoch": 0.1794261721483555, + "grad_norm": 0.4283159486252786, + "learning_rate": 4.7137210872153844e-05, + "loss": 0.5828, + "step": 1282 + }, + { + "epoch": 0.17956613016095171, + "grad_norm": 0.5396297802303109, + "learning_rate": 4.713194244722138e-05, + "loss": 0.6024, + "step": 1283 + }, + { + "epoch": 0.17970608817354794, + "grad_norm": 0.45852485695580586, + "learning_rate": 4.712666947395018e-05, + "loss": 0.5972, + "step": 1284 + }, + { + "epoch": 0.17984604618614417, + "grad_norm": 0.4385616625675766, + "learning_rate": 4.71213919534239e-05, + "loss": 0.6379, + "step": 1285 + }, + { + "epoch": 0.1799860041987404, + "grad_norm": 0.45421354404722714, + "learning_rate": 4.711610988672712e-05, + "loss": 0.6314, + "step": 1286 + }, + { + "epoch": 0.1801259622113366, + "grad_norm": 0.4174346560548907, + "learning_rate": 4.711082327494536e-05, + "loss": 0.582, + "step": 1287 + }, + { + "epoch": 0.1802659202239328, + "grad_norm": 0.46268786301319254, + "learning_rate": 4.7105532119165066e-05, + "loss": 0.6187, + "step": 1288 + }, + { + "epoch": 0.18040587823652904, + "grad_norm": 0.4225894844217519, + "learning_rate": 4.710023642047364e-05, + "loss": 0.6044, + "step": 1289 + }, + { + "epoch": 0.18054583624912526, + "grad_norm": 0.4212476755455102, + "learning_rate": 4.709493617995938e-05, + "loss": 0.6139, + "step": 1290 + }, + { + "epoch": 0.1806857942617215, + "grad_norm": 0.43172618509391675, + "learning_rate": 4.7089631398711556e-05, + "loss": 0.6195, + "step": 1291 + }, + { + "epoch": 0.1808257522743177, + "grad_norm": 0.47068060331043937, + "learning_rate": 4.7084322077820345e-05, + "loss": 0.6475, + "step": 1292 + }, + { + "epoch": 0.18096571028691394, + "grad_norm": 0.42995345865450285, + "learning_rate": 4.707900821837686e-05, + "loss": 0.6263, + "step": 1293 + }, + { + "epoch": 0.18110566829951014, + "grad_norm": 0.4574009601171825, + "learning_rate": 4.707368982147318e-05, + "loss": 0.6676, + "step": 1294 + }, + { + "epoch": 0.18124562631210636, + "grad_norm": 0.44635706573473055, + "learning_rate": 4.7068366888202264e-05, + "loss": 0.6532, + "step": 1295 + }, + { + "epoch": 0.1813855843247026, + "grad_norm": 0.41536175859929064, + "learning_rate": 4.7063039419658035e-05, + "loss": 0.6001, + "step": 1296 + }, + { + "epoch": 0.1815255423372988, + "grad_norm": 0.431878399826314, + "learning_rate": 4.705770741693535e-05, + "loss": 0.5995, + "step": 1297 + }, + { + "epoch": 0.18166550034989504, + "grad_norm": 0.43247283157231065, + "learning_rate": 4.7052370881129976e-05, + "loss": 0.6269, + "step": 1298 + }, + { + "epoch": 0.18180545836249126, + "grad_norm": 0.44705916198880513, + "learning_rate": 4.704702981333864e-05, + "loss": 0.5862, + "step": 1299 + }, + { + "epoch": 0.1819454163750875, + "grad_norm": 0.4435892818551313, + "learning_rate": 4.704168421465896e-05, + "loss": 0.6206, + "step": 1300 + }, + { + "epoch": 0.18208537438768368, + "grad_norm": 0.8694903989330446, + "learning_rate": 4.7036334086189555e-05, + "loss": 0.6592, + "step": 1301 + }, + { + "epoch": 0.1822253324002799, + "grad_norm": 0.4457381902676728, + "learning_rate": 4.703097942902989e-05, + "loss": 0.5962, + "step": 1302 + }, + { + "epoch": 0.18236529041287614, + "grad_norm": 0.4105292740202644, + "learning_rate": 4.702562024428042e-05, + "loss": 0.6314, + "step": 1303 + }, + { + "epoch": 0.18250524842547236, + "grad_norm": 0.43538468918998957, + "learning_rate": 4.70202565330425e-05, + "loss": 0.6348, + "step": 1304 + }, + { + "epoch": 0.18264520643806859, + "grad_norm": 0.42334962592608394, + "learning_rate": 4.701488829641845e-05, + "loss": 0.6271, + "step": 1305 + }, + { + "epoch": 0.1827851644506648, + "grad_norm": 0.44222681529868485, + "learning_rate": 4.700951553551148e-05, + "loss": 0.6139, + "step": 1306 + }, + { + "epoch": 0.182925122463261, + "grad_norm": 0.41298228492856454, + "learning_rate": 4.700413825142574e-05, + "loss": 0.6059, + "step": 1307 + }, + { + "epoch": 0.18306508047585723, + "grad_norm": 0.48053391897594716, + "learning_rate": 4.6998756445266336e-05, + "loss": 0.6252, + "step": 1308 + }, + { + "epoch": 0.18320503848845346, + "grad_norm": 0.422350382910677, + "learning_rate": 4.6993370118139264e-05, + "loss": 0.594, + "step": 1309 + }, + { + "epoch": 0.18334499650104968, + "grad_norm": 0.45501043482560677, + "learning_rate": 4.698797927115148e-05, + "loss": 0.6039, + "step": 1310 + }, + { + "epoch": 0.1834849545136459, + "grad_norm": 0.42755574101023547, + "learning_rate": 4.698258390541086e-05, + "loss": 0.635, + "step": 1311 + }, + { + "epoch": 0.18362491252624213, + "grad_norm": 0.4212712876216914, + "learning_rate": 4.69771840220262e-05, + "loss": 0.6187, + "step": 1312 + }, + { + "epoch": 0.18376487053883836, + "grad_norm": 0.4474515144815738, + "learning_rate": 4.697177962210722e-05, + "loss": 0.6044, + "step": 1313 + }, + { + "epoch": 0.18390482855143456, + "grad_norm": 0.4574401172788082, + "learning_rate": 4.696637070676462e-05, + "loss": 0.6465, + "step": 1314 + }, + { + "epoch": 0.18404478656403078, + "grad_norm": 0.47635542166003236, + "learning_rate": 4.696095727710994e-05, + "loss": 0.5871, + "step": 1315 + }, + { + "epoch": 0.184184744576627, + "grad_norm": 0.44962202431159326, + "learning_rate": 4.6955539334255716e-05, + "loss": 0.6232, + "step": 1316 + }, + { + "epoch": 0.18432470258922323, + "grad_norm": 0.45431685769602653, + "learning_rate": 4.6950116879315385e-05, + "loss": 0.621, + "step": 1317 + }, + { + "epoch": 0.18446466060181946, + "grad_norm": 0.4388675789360797, + "learning_rate": 4.6944689913403326e-05, + "loss": 0.6183, + "step": 1318 + }, + { + "epoch": 0.18460461861441568, + "grad_norm": 0.4523470765103244, + "learning_rate": 4.693925843763483e-05, + "loss": 0.6212, + "step": 1319 + }, + { + "epoch": 0.1847445766270119, + "grad_norm": 0.4807021100408991, + "learning_rate": 4.693382245312612e-05, + "loss": 0.6097, + "step": 1320 + }, + { + "epoch": 0.1848845346396081, + "grad_norm": 0.4306193212134266, + "learning_rate": 4.6928381960994336e-05, + "loss": 0.6438, + "step": 1321 + }, + { + "epoch": 0.18502449265220433, + "grad_norm": 0.447094803519498, + "learning_rate": 4.692293696235758e-05, + "loss": 0.6263, + "step": 1322 + }, + { + "epoch": 0.18516445066480056, + "grad_norm": 0.4200693798341761, + "learning_rate": 4.6917487458334824e-05, + "loss": 0.5863, + "step": 1323 + }, + { + "epoch": 0.18530440867739678, + "grad_norm": 0.4459700079709483, + "learning_rate": 4.691203345004602e-05, + "loss": 0.6113, + "step": 1324 + }, + { + "epoch": 0.185444366689993, + "grad_norm": 0.42812241397789785, + "learning_rate": 4.6906574938612e-05, + "loss": 0.613, + "step": 1325 + }, + { + "epoch": 0.18558432470258923, + "grad_norm": 0.44482736985800875, + "learning_rate": 4.690111192515457e-05, + "loss": 0.6087, + "step": 1326 + }, + { + "epoch": 0.18572428271518546, + "grad_norm": 0.43543296488997046, + "learning_rate": 4.6895644410796416e-05, + "loss": 0.6055, + "step": 1327 + }, + { + "epoch": 0.18586424072778165, + "grad_norm": 0.44518530430540526, + "learning_rate": 4.689017239666117e-05, + "loss": 0.6094, + "step": 1328 + }, + { + "epoch": 0.18600419874037788, + "grad_norm": 0.41902567935143953, + "learning_rate": 4.688469588387339e-05, + "loss": 0.5978, + "step": 1329 + }, + { + "epoch": 0.1861441567529741, + "grad_norm": 0.423883271911657, + "learning_rate": 4.6879214873558565e-05, + "loss": 0.5982, + "step": 1330 + }, + { + "epoch": 0.18628411476557033, + "grad_norm": 0.44582071516941296, + "learning_rate": 4.6873729366843075e-05, + "loss": 0.562, + "step": 1331 + }, + { + "epoch": 0.18642407277816656, + "grad_norm": 0.4660473204875247, + "learning_rate": 4.686823936485426e-05, + "loss": 0.6367, + "step": 1332 + }, + { + "epoch": 0.18656403079076278, + "grad_norm": 0.4769376469582944, + "learning_rate": 4.6862744868720374e-05, + "loss": 0.6652, + "step": 1333 + }, + { + "epoch": 0.186703988803359, + "grad_norm": 0.4618831962024523, + "learning_rate": 4.6857245879570585e-05, + "loss": 0.6317, + "step": 1334 + }, + { + "epoch": 0.1868439468159552, + "grad_norm": 0.4550475574225298, + "learning_rate": 4.685174239853499e-05, + "loss": 0.5892, + "step": 1335 + }, + { + "epoch": 0.18698390482855143, + "grad_norm": 0.43735879865032035, + "learning_rate": 4.684623442674463e-05, + "loss": 0.5915, + "step": 1336 + }, + { + "epoch": 0.18712386284114765, + "grad_norm": 0.42856554004367964, + "learning_rate": 4.684072196533142e-05, + "loss": 0.6211, + "step": 1337 + }, + { + "epoch": 0.18726382085374388, + "grad_norm": 0.4359596291931712, + "learning_rate": 4.6835205015428246e-05, + "loss": 0.5957, + "step": 1338 + }, + { + "epoch": 0.1874037788663401, + "grad_norm": 0.44381328220368255, + "learning_rate": 4.682968357816889e-05, + "loss": 0.634, + "step": 1339 + }, + { + "epoch": 0.18754373687893633, + "grad_norm": 0.45368741213258845, + "learning_rate": 4.682415765468807e-05, + "loss": 0.6242, + "step": 1340 + }, + { + "epoch": 0.18768369489153253, + "grad_norm": 0.433054739833902, + "learning_rate": 4.681862724612141e-05, + "loss": 0.6155, + "step": 1341 + }, + { + "epoch": 0.18782365290412875, + "grad_norm": 0.4455643661885221, + "learning_rate": 4.681309235360546e-05, + "loss": 0.6061, + "step": 1342 + }, + { + "epoch": 0.18796361091672498, + "grad_norm": 0.4348159376755847, + "learning_rate": 4.6807552978277725e-05, + "loss": 0.6016, + "step": 1343 + }, + { + "epoch": 0.1881035689293212, + "grad_norm": 0.43841673430612965, + "learning_rate": 4.6802009121276566e-05, + "loss": 0.6139, + "step": 1344 + }, + { + "epoch": 0.18824352694191743, + "grad_norm": 0.43548395332890844, + "learning_rate": 4.679646078374133e-05, + "loss": 0.6335, + "step": 1345 + }, + { + "epoch": 0.18838348495451365, + "grad_norm": 0.4311071382664937, + "learning_rate": 4.679090796681225e-05, + "loss": 0.6493, + "step": 1346 + }, + { + "epoch": 0.18852344296710988, + "grad_norm": 0.43011170445061725, + "learning_rate": 4.6785350671630467e-05, + "loss": 0.5733, + "step": 1347 + }, + { + "epoch": 0.18866340097970608, + "grad_norm": 0.44356466777661, + "learning_rate": 4.6779788899338095e-05, + "loss": 0.5936, + "step": 1348 + }, + { + "epoch": 0.1888033589923023, + "grad_norm": 0.4408469771886245, + "learning_rate": 4.6774222651078106e-05, + "loss": 0.6, + "step": 1349 + }, + { + "epoch": 0.18894331700489853, + "grad_norm": 0.4174867184800617, + "learning_rate": 4.6768651927994434e-05, + "loss": 0.5785, + "step": 1350 + }, + { + "epoch": 0.18908327501749475, + "grad_norm": 0.4202539862345758, + "learning_rate": 4.6763076731231916e-05, + "loss": 0.6061, + "step": 1351 + }, + { + "epoch": 0.18922323303009098, + "grad_norm": 0.4362329726223066, + "learning_rate": 4.675749706193631e-05, + "loss": 0.6105, + "step": 1352 + }, + { + "epoch": 0.1893631910426872, + "grad_norm": 0.4463015177283665, + "learning_rate": 4.67519129212543e-05, + "loss": 0.6196, + "step": 1353 + }, + { + "epoch": 0.18950314905528343, + "grad_norm": 0.4213305303975094, + "learning_rate": 4.674632431033348e-05, + "loss": 0.5771, + "step": 1354 + }, + { + "epoch": 0.18964310706787962, + "grad_norm": 0.4523660797014365, + "learning_rate": 4.674073123032236e-05, + "loss": 0.5614, + "step": 1355 + }, + { + "epoch": 0.18978306508047585, + "grad_norm": 0.42558959589979956, + "learning_rate": 4.673513368237039e-05, + "loss": 0.5987, + "step": 1356 + }, + { + "epoch": 0.18992302309307207, + "grad_norm": 0.4566223530786513, + "learning_rate": 4.6729531667627905e-05, + "loss": 0.6081, + "step": 1357 + }, + { + "epoch": 0.1900629811056683, + "grad_norm": 0.4324304893039991, + "learning_rate": 4.672392518724619e-05, + "loss": 0.6132, + "step": 1358 + }, + { + "epoch": 0.19020293911826452, + "grad_norm": 0.43913060355532496, + "learning_rate": 4.671831424237743e-05, + "loss": 0.5989, + "step": 1359 + }, + { + "epoch": 0.19034289713086075, + "grad_norm": 0.44320622872500787, + "learning_rate": 4.671269883417473e-05, + "loss": 0.5998, + "step": 1360 + }, + { + "epoch": 0.19048285514345698, + "grad_norm": 0.4298692464735462, + "learning_rate": 4.670707896379211e-05, + "loss": 0.555, + "step": 1361 + }, + { + "epoch": 0.19062281315605317, + "grad_norm": 0.4212199124955075, + "learning_rate": 4.670145463238451e-05, + "loss": 0.6177, + "step": 1362 + }, + { + "epoch": 0.1907627711686494, + "grad_norm": 0.4448003823408276, + "learning_rate": 4.669582584110779e-05, + "loss": 0.5932, + "step": 1363 + }, + { + "epoch": 0.19090272918124562, + "grad_norm": 0.44697385174479454, + "learning_rate": 4.6690192591118734e-05, + "loss": 0.6026, + "step": 1364 + }, + { + "epoch": 0.19104268719384185, + "grad_norm": 0.4671259243593673, + "learning_rate": 4.668455488357502e-05, + "loss": 0.6156, + "step": 1365 + }, + { + "epoch": 0.19118264520643807, + "grad_norm": 0.43396652865433255, + "learning_rate": 4.6678912719635246e-05, + "loss": 0.5764, + "step": 1366 + }, + { + "epoch": 0.1913226032190343, + "grad_norm": 0.4375416377619642, + "learning_rate": 4.667326610045895e-05, + "loss": 0.6031, + "step": 1367 + }, + { + "epoch": 0.19146256123163052, + "grad_norm": 0.441530782470694, + "learning_rate": 4.6667615027206564e-05, + "loss": 0.5773, + "step": 1368 + }, + { + "epoch": 0.19160251924422672, + "grad_norm": 0.43470376998767046, + "learning_rate": 4.6661959501039446e-05, + "loss": 0.5991, + "step": 1369 + }, + { + "epoch": 0.19174247725682295, + "grad_norm": 0.4389060530511099, + "learning_rate": 4.665629952311985e-05, + "loss": 0.5692, + "step": 1370 + }, + { + "epoch": 0.19188243526941917, + "grad_norm": 0.4752733107460391, + "learning_rate": 4.665063509461097e-05, + "loss": 0.6525, + "step": 1371 + }, + { + "epoch": 0.1920223932820154, + "grad_norm": 0.43452630074557697, + "learning_rate": 4.66449662166769e-05, + "loss": 0.5925, + "step": 1372 + }, + { + "epoch": 0.19216235129461162, + "grad_norm": 0.4472660808670533, + "learning_rate": 4.663929289048266e-05, + "loss": 0.6175, + "step": 1373 + }, + { + "epoch": 0.19230230930720785, + "grad_norm": 0.4151649360316345, + "learning_rate": 4.6633615117194165e-05, + "loss": 0.6048, + "step": 1374 + }, + { + "epoch": 0.19244226731980407, + "grad_norm": 0.41991921626461726, + "learning_rate": 4.6627932897978254e-05, + "loss": 0.6141, + "step": 1375 + }, + { + "epoch": 0.19258222533240027, + "grad_norm": 0.44865262717314497, + "learning_rate": 4.6622246234002686e-05, + "loss": 0.6108, + "step": 1376 + }, + { + "epoch": 0.1927221833449965, + "grad_norm": 0.42390019369736476, + "learning_rate": 4.6616555126436134e-05, + "loss": 0.6178, + "step": 1377 + }, + { + "epoch": 0.19286214135759272, + "grad_norm": 0.47111888994325224, + "learning_rate": 4.6610859576448176e-05, + "loss": 0.6351, + "step": 1378 + }, + { + "epoch": 0.19300209937018895, + "grad_norm": 0.4694892453227565, + "learning_rate": 4.660515958520929e-05, + "loss": 0.6313, + "step": 1379 + }, + { + "epoch": 0.19314205738278517, + "grad_norm": 0.43419016166335284, + "learning_rate": 4.65994551538909e-05, + "loss": 0.592, + "step": 1380 + }, + { + "epoch": 0.1932820153953814, + "grad_norm": 0.4133745853690666, + "learning_rate": 4.659374628366532e-05, + "loss": 0.572, + "step": 1381 + }, + { + "epoch": 0.1934219734079776, + "grad_norm": 0.4786575561548947, + "learning_rate": 4.658803297570577e-05, + "loss": 0.6007, + "step": 1382 + }, + { + "epoch": 0.19356193142057382, + "grad_norm": 0.4524476179381755, + "learning_rate": 4.658231523118641e-05, + "loss": 0.5966, + "step": 1383 + }, + { + "epoch": 0.19370188943317004, + "grad_norm": 0.4349438663500916, + "learning_rate": 4.6576593051282286e-05, + "loss": 0.5862, + "step": 1384 + }, + { + "epoch": 0.19384184744576627, + "grad_norm": 0.45493985969941036, + "learning_rate": 4.657086643716936e-05, + "loss": 0.6482, + "step": 1385 + }, + { + "epoch": 0.1939818054583625, + "grad_norm": 0.42740591145801066, + "learning_rate": 4.6565135390024515e-05, + "loss": 0.5906, + "step": 1386 + }, + { + "epoch": 0.19412176347095872, + "grad_norm": 0.47535147700181984, + "learning_rate": 4.6559399911025545e-05, + "loss": 0.5951, + "step": 1387 + }, + { + "epoch": 0.19426172148355494, + "grad_norm": 0.4333924774938176, + "learning_rate": 4.655366000135114e-05, + "loss": 0.6083, + "step": 1388 + }, + { + "epoch": 0.19440167949615114, + "grad_norm": 0.4197374995007798, + "learning_rate": 4.6547915662180905e-05, + "loss": 0.5758, + "step": 1389 + }, + { + "epoch": 0.19454163750874737, + "grad_norm": 0.4209565338579636, + "learning_rate": 4.6542166894695366e-05, + "loss": 0.6219, + "step": 1390 + }, + { + "epoch": 0.1946815955213436, + "grad_norm": 0.4533900200145802, + "learning_rate": 4.653641370007596e-05, + "loss": 0.6284, + "step": 1391 + }, + { + "epoch": 0.19482155353393982, + "grad_norm": 0.43123676441236164, + "learning_rate": 4.653065607950502e-05, + "loss": 0.6172, + "step": 1392 + }, + { + "epoch": 0.19496151154653604, + "grad_norm": 0.43240025310515345, + "learning_rate": 4.652489403416579e-05, + "loss": 0.6231, + "step": 1393 + }, + { + "epoch": 0.19510146955913227, + "grad_norm": 0.4105873841245408, + "learning_rate": 4.651912756524244e-05, + "loss": 0.6351, + "step": 1394 + }, + { + "epoch": 0.1952414275717285, + "grad_norm": 0.4342716468025768, + "learning_rate": 4.651335667392003e-05, + "loss": 0.5944, + "step": 1395 + }, + { + "epoch": 0.1953813855843247, + "grad_norm": 0.42929266034842917, + "learning_rate": 4.6507581361384537e-05, + "loss": 0.5949, + "step": 1396 + }, + { + "epoch": 0.19552134359692092, + "grad_norm": 0.43375421894645294, + "learning_rate": 4.650180162882285e-05, + "loss": 0.5674, + "step": 1397 + }, + { + "epoch": 0.19566130160951714, + "grad_norm": 0.4605858134606264, + "learning_rate": 4.649601747742277e-05, + "loss": 0.6276, + "step": 1398 + }, + { + "epoch": 0.19580125962211337, + "grad_norm": 0.4096793441616307, + "learning_rate": 4.649022890837298e-05, + "loss": 0.6066, + "step": 1399 + }, + { + "epoch": 0.1959412176347096, + "grad_norm": 0.43325226542495887, + "learning_rate": 4.6484435922863105e-05, + "loss": 0.578, + "step": 1400 + }, + { + "epoch": 0.19608117564730582, + "grad_norm": 0.45203303336392237, + "learning_rate": 4.6478638522083654e-05, + "loss": 0.6247, + "step": 1401 + }, + { + "epoch": 0.19622113365990204, + "grad_norm": 0.43768863421738247, + "learning_rate": 4.6472836707226065e-05, + "loss": 0.6089, + "step": 1402 + }, + { + "epoch": 0.19636109167249824, + "grad_norm": 0.4561097031131965, + "learning_rate": 4.646703047948264e-05, + "loss": 0.6228, + "step": 1403 + }, + { + "epoch": 0.19650104968509446, + "grad_norm": 0.430780639976112, + "learning_rate": 4.6461219840046654e-05, + "loss": 0.5712, + "step": 1404 + }, + { + "epoch": 0.1966410076976907, + "grad_norm": 0.4421729743171197, + "learning_rate": 4.645540479011223e-05, + "loss": 0.608, + "step": 1405 + }, + { + "epoch": 0.19678096571028691, + "grad_norm": 0.4527004953532786, + "learning_rate": 4.644958533087443e-05, + "loss": 0.5794, + "step": 1406 + }, + { + "epoch": 0.19692092372288314, + "grad_norm": 0.43655870981476536, + "learning_rate": 4.64437614635292e-05, + "loss": 0.6072, + "step": 1407 + }, + { + "epoch": 0.19706088173547937, + "grad_norm": 0.46922192783691635, + "learning_rate": 4.643793318927342e-05, + "loss": 0.6129, + "step": 1408 + }, + { + "epoch": 0.1972008397480756, + "grad_norm": 0.4096022513008127, + "learning_rate": 4.6432100509304843e-05, + "loss": 0.6139, + "step": 1409 + }, + { + "epoch": 0.1973407977606718, + "grad_norm": 0.44858888050025963, + "learning_rate": 4.642626342482215e-05, + "loss": 0.6494, + "step": 1410 + }, + { + "epoch": 0.197480755773268, + "grad_norm": 0.42709765753910495, + "learning_rate": 4.642042193702493e-05, + "loss": 0.6144, + "step": 1411 + }, + { + "epoch": 0.19762071378586424, + "grad_norm": 0.43179502942515346, + "learning_rate": 4.6414576047113655e-05, + "loss": 0.6533, + "step": 1412 + }, + { + "epoch": 0.19776067179846046, + "grad_norm": 0.4271262128439598, + "learning_rate": 4.640872575628973e-05, + "loss": 0.5787, + "step": 1413 + }, + { + "epoch": 0.1979006298110567, + "grad_norm": 0.44047825759880344, + "learning_rate": 4.640287106575543e-05, + "loss": 0.5908, + "step": 1414 + }, + { + "epoch": 0.1980405878236529, + "grad_norm": 0.42112850121439344, + "learning_rate": 4.639701197671397e-05, + "loss": 0.6233, + "step": 1415 + }, + { + "epoch": 0.19818054583624914, + "grad_norm": 0.4372841966734705, + "learning_rate": 4.639114849036944e-05, + "loss": 0.6486, + "step": 1416 + }, + { + "epoch": 0.19832050384884534, + "grad_norm": 0.4296545377209088, + "learning_rate": 4.638528060792685e-05, + "loss": 0.5862, + "step": 1417 + }, + { + "epoch": 0.19846046186144156, + "grad_norm": 0.4242826054839983, + "learning_rate": 4.637940833059211e-05, + "loss": 0.6224, + "step": 1418 + }, + { + "epoch": 0.1986004198740378, + "grad_norm": 0.4386897733085964, + "learning_rate": 4.637353165957203e-05, + "loss": 0.6062, + "step": 1419 + }, + { + "epoch": 0.198740377886634, + "grad_norm": 0.43253628217173173, + "learning_rate": 4.636765059607434e-05, + "loss": 0.5796, + "step": 1420 + }, + { + "epoch": 0.19888033589923024, + "grad_norm": 0.42456136906021713, + "learning_rate": 4.6361765141307645e-05, + "loss": 0.6257, + "step": 1421 + }, + { + "epoch": 0.19902029391182646, + "grad_norm": 0.4546276363960964, + "learning_rate": 4.635587529648146e-05, + "loss": 0.6193, + "step": 1422 + }, + { + "epoch": 0.19916025192442266, + "grad_norm": 0.4511658476734362, + "learning_rate": 4.634998106280622e-05, + "loss": 0.5936, + "step": 1423 + }, + { + "epoch": 0.19930020993701889, + "grad_norm": 0.44597648248378585, + "learning_rate": 4.634408244149324e-05, + "loss": 0.6051, + "step": 1424 + }, + { + "epoch": 0.1994401679496151, + "grad_norm": 0.43808391202287855, + "learning_rate": 4.6338179433754756e-05, + "loss": 0.5995, + "step": 1425 + }, + { + "epoch": 0.19958012596221134, + "grad_norm": 0.48296291796144863, + "learning_rate": 4.6332272040803895e-05, + "loss": 0.5904, + "step": 1426 + }, + { + "epoch": 0.19972008397480756, + "grad_norm": 0.43753233592691976, + "learning_rate": 4.632636026385468e-05, + "loss": 0.5993, + "step": 1427 + }, + { + "epoch": 0.19986004198740379, + "grad_norm": 0.441993440207615, + "learning_rate": 4.632044410412204e-05, + "loss": 0.5981, + "step": 1428 + }, + { + "epoch": 0.2, + "grad_norm": 0.44046702891678347, + "learning_rate": 4.631452356282182e-05, + "loss": 0.5711, + "step": 1429 + }, + { + "epoch": 0.2001399580125962, + "grad_norm": 0.42955814782218116, + "learning_rate": 4.630859864117073e-05, + "loss": 0.5847, + "step": 1430 + }, + { + "epoch": 0.20027991602519243, + "grad_norm": 0.43651504606811536, + "learning_rate": 4.630266934038642e-05, + "loss": 0.6069, + "step": 1431 + }, + { + "epoch": 0.20041987403778866, + "grad_norm": 0.428395840664284, + "learning_rate": 4.629673566168741e-05, + "loss": 0.5963, + "step": 1432 + }, + { + "epoch": 0.20055983205038488, + "grad_norm": 0.4138324728369602, + "learning_rate": 4.629079760629313e-05, + "loss": 0.5647, + "step": 1433 + }, + { + "epoch": 0.2006997900629811, + "grad_norm": 0.4454769448792182, + "learning_rate": 4.628485517542392e-05, + "loss": 0.5915, + "step": 1434 + }, + { + "epoch": 0.20083974807557733, + "grad_norm": 0.43842498265287877, + "learning_rate": 4.627890837030101e-05, + "loss": 0.6333, + "step": 1435 + }, + { + "epoch": 0.20097970608817356, + "grad_norm": 0.42848973296459963, + "learning_rate": 4.627295719214653e-05, + "loss": 0.5877, + "step": 1436 + }, + { + "epoch": 0.20111966410076976, + "grad_norm": 0.44101122405778176, + "learning_rate": 4.6267001642183496e-05, + "loss": 0.6049, + "step": 1437 + }, + { + "epoch": 0.20125962211336598, + "grad_norm": 0.552921270157986, + "learning_rate": 4.6261041721635834e-05, + "loss": 0.6243, + "step": 1438 + }, + { + "epoch": 0.2013995801259622, + "grad_norm": 0.4518649937814017, + "learning_rate": 4.625507743172838e-05, + "loss": 0.6189, + "step": 1439 + }, + { + "epoch": 0.20153953813855843, + "grad_norm": 0.4231937860204061, + "learning_rate": 4.6249108773686846e-05, + "loss": 0.5851, + "step": 1440 + }, + { + "epoch": 0.20167949615115466, + "grad_norm": 0.42182226251472993, + "learning_rate": 4.6243135748737864e-05, + "loss": 0.6401, + "step": 1441 + }, + { + "epoch": 0.20181945416375088, + "grad_norm": 0.48057969413105356, + "learning_rate": 4.623715835810893e-05, + "loss": 0.6243, + "step": 1442 + }, + { + "epoch": 0.2019594121763471, + "grad_norm": 0.4437078265678712, + "learning_rate": 4.6231176603028484e-05, + "loss": 0.6328, + "step": 1443 + }, + { + "epoch": 0.2020993701889433, + "grad_norm": 0.4507520837571798, + "learning_rate": 4.6225190484725824e-05, + "loss": 0.5904, + "step": 1444 + }, + { + "epoch": 0.20223932820153953, + "grad_norm": 0.43096343316315816, + "learning_rate": 4.6219200004431154e-05, + "loss": 0.6205, + "step": 1445 + }, + { + "epoch": 0.20237928621413576, + "grad_norm": 0.44215631216303225, + "learning_rate": 4.6213205163375586e-05, + "loss": 0.6171, + "step": 1446 + }, + { + "epoch": 0.20251924422673198, + "grad_norm": 0.4601057485158553, + "learning_rate": 4.620720596279112e-05, + "loss": 0.6355, + "step": 1447 + }, + { + "epoch": 0.2026592022393282, + "grad_norm": 0.42229237803528424, + "learning_rate": 4.620120240391065e-05, + "loss": 0.5762, + "step": 1448 + }, + { + "epoch": 0.20279916025192443, + "grad_norm": 0.44247463233153156, + "learning_rate": 4.619519448796797e-05, + "loss": 0.6178, + "step": 1449 + }, + { + "epoch": 0.20293911826452066, + "grad_norm": 0.44572237683301125, + "learning_rate": 4.6189182216197766e-05, + "loss": 0.5782, + "step": 1450 + }, + { + "epoch": 0.20307907627711685, + "grad_norm": 0.44819354829925007, + "learning_rate": 4.618316558983562e-05, + "loss": 0.5575, + "step": 1451 + }, + { + "epoch": 0.20321903428971308, + "grad_norm": 0.462621341218452, + "learning_rate": 4.617714461011802e-05, + "loss": 0.6263, + "step": 1452 + }, + { + "epoch": 0.2033589923023093, + "grad_norm": 0.4191206562398054, + "learning_rate": 4.6171119278282315e-05, + "loss": 0.5823, + "step": 1453 + }, + { + "epoch": 0.20349895031490553, + "grad_norm": 0.4234985010775575, + "learning_rate": 4.6165089595566795e-05, + "loss": 0.5935, + "step": 1454 + }, + { + "epoch": 0.20363890832750176, + "grad_norm": 0.4138224710610466, + "learning_rate": 4.6159055563210604e-05, + "loss": 0.5753, + "step": 1455 + }, + { + "epoch": 0.20377886634009798, + "grad_norm": 0.4657076630993664, + "learning_rate": 4.6153017182453814e-05, + "loss": 0.6038, + "step": 1456 + }, + { + "epoch": 0.20391882435269418, + "grad_norm": 0.4271574918448891, + "learning_rate": 4.6146974454537374e-05, + "loss": 0.613, + "step": 1457 + }, + { + "epoch": 0.2040587823652904, + "grad_norm": 0.43052348959054654, + "learning_rate": 4.61409273807031e-05, + "loss": 0.578, + "step": 1458 + }, + { + "epoch": 0.20419874037788663, + "grad_norm": 0.446875776248068, + "learning_rate": 4.613487596219376e-05, + "loss": 0.6203, + "step": 1459 + }, + { + "epoch": 0.20433869839048285, + "grad_norm": 0.44280885463456926, + "learning_rate": 4.6128820200252954e-05, + "loss": 0.6178, + "step": 1460 + }, + { + "epoch": 0.20447865640307908, + "grad_norm": 0.4627088307800896, + "learning_rate": 4.612276009612522e-05, + "loss": 0.6083, + "step": 1461 + }, + { + "epoch": 0.2046186144156753, + "grad_norm": 0.40566282511231716, + "learning_rate": 4.611669565105596e-05, + "loss": 0.5897, + "step": 1462 + }, + { + "epoch": 0.20475857242827153, + "grad_norm": 0.4392817676918071, + "learning_rate": 4.6110626866291485e-05, + "loss": 0.6058, + "step": 1463 + }, + { + "epoch": 0.20489853044086773, + "grad_norm": 0.42409351023845426, + "learning_rate": 4.6104553743078996e-05, + "loss": 0.5942, + "step": 1464 + }, + { + "epoch": 0.20503848845346395, + "grad_norm": 0.43598866958142785, + "learning_rate": 4.609847628266657e-05, + "loss": 0.6271, + "step": 1465 + }, + { + "epoch": 0.20517844646606018, + "grad_norm": 0.4347982090854984, + "learning_rate": 4.60923944863032e-05, + "loss": 0.611, + "step": 1466 + }, + { + "epoch": 0.2053184044786564, + "grad_norm": 0.44333205254509317, + "learning_rate": 4.608630835523875e-05, + "loss": 0.6337, + "step": 1467 + }, + { + "epoch": 0.20545836249125263, + "grad_norm": 0.43208298348400775, + "learning_rate": 4.608021789072398e-05, + "loss": 0.5602, + "step": 1468 + }, + { + "epoch": 0.20559832050384885, + "grad_norm": 0.450291453219847, + "learning_rate": 4.607412309401054e-05, + "loss": 0.6653, + "step": 1469 + }, + { + "epoch": 0.20573827851644508, + "grad_norm": 0.4218568966937232, + "learning_rate": 4.606802396635098e-05, + "loss": 0.6039, + "step": 1470 + }, + { + "epoch": 0.20587823652904128, + "grad_norm": 0.4372151368221289, + "learning_rate": 4.6061920508998735e-05, + "loss": 0.6251, + "step": 1471 + }, + { + "epoch": 0.2060181945416375, + "grad_norm": 0.42362258866936114, + "learning_rate": 4.6055812723208114e-05, + "loss": 0.5752, + "step": 1472 + }, + { + "epoch": 0.20615815255423373, + "grad_norm": 0.4281637412896991, + "learning_rate": 4.604970061023434e-05, + "loss": 0.5909, + "step": 1473 + }, + { + "epoch": 0.20629811056682995, + "grad_norm": 0.4473289913366391, + "learning_rate": 4.604358417133351e-05, + "loss": 0.5825, + "step": 1474 + }, + { + "epoch": 0.20643806857942618, + "grad_norm": 0.44127895646180343, + "learning_rate": 4.6037463407762616e-05, + "loss": 0.591, + "step": 1475 + }, + { + "epoch": 0.2065780265920224, + "grad_norm": 0.4250252005670499, + "learning_rate": 4.6031338320779534e-05, + "loss": 0.61, + "step": 1476 + }, + { + "epoch": 0.20671798460461863, + "grad_norm": 0.4195465649774795, + "learning_rate": 4.602520891164304e-05, + "loss": 0.6044, + "step": 1477 + }, + { + "epoch": 0.20685794261721482, + "grad_norm": 0.4546098361873912, + "learning_rate": 4.601907518161277e-05, + "loss": 0.6216, + "step": 1478 + }, + { + "epoch": 0.20699790062981105, + "grad_norm": 0.4475804565840221, + "learning_rate": 4.601293713194929e-05, + "loss": 0.6793, + "step": 1479 + }, + { + "epoch": 0.20713785864240727, + "grad_norm": 0.43358956969279566, + "learning_rate": 4.600679476391402e-05, + "loss": 0.5491, + "step": 1480 + }, + { + "epoch": 0.2072778166550035, + "grad_norm": 0.42850643221967644, + "learning_rate": 4.600064807876929e-05, + "loss": 0.6122, + "step": 1481 + }, + { + "epoch": 0.20741777466759972, + "grad_norm": 0.4238910915601293, + "learning_rate": 4.599449707777829e-05, + "loss": 0.5886, + "step": 1482 + }, + { + "epoch": 0.20755773268019595, + "grad_norm": 0.44358369925735036, + "learning_rate": 4.5988341762205125e-05, + "loss": 0.6162, + "step": 1483 + }, + { + "epoch": 0.20769769069279218, + "grad_norm": 0.40620154391670654, + "learning_rate": 4.5982182133314765e-05, + "loss": 0.5666, + "step": 1484 + }, + { + "epoch": 0.20783764870538837, + "grad_norm": 0.4474470089623319, + "learning_rate": 4.5976018192373086e-05, + "loss": 0.6601, + "step": 1485 + }, + { + "epoch": 0.2079776067179846, + "grad_norm": 0.41753387085884, + "learning_rate": 4.5969849940646834e-05, + "loss": 0.6043, + "step": 1486 + }, + { + "epoch": 0.20811756473058082, + "grad_norm": 0.42752671322476543, + "learning_rate": 4.596367737940366e-05, + "loss": 0.6443, + "step": 1487 + }, + { + "epoch": 0.20825752274317705, + "grad_norm": 0.42699115737832166, + "learning_rate": 4.595750050991207e-05, + "loss": 0.6025, + "step": 1488 + }, + { + "epoch": 0.20839748075577327, + "grad_norm": 0.45076801294244706, + "learning_rate": 4.595131933344148e-05, + "loss": 0.6386, + "step": 1489 + }, + { + "epoch": 0.2085374387683695, + "grad_norm": 0.4322528381202846, + "learning_rate": 4.594513385126218e-05, + "loss": 0.5979, + "step": 1490 + }, + { + "epoch": 0.20867739678096572, + "grad_norm": 0.4400100496903223, + "learning_rate": 4.593894406464537e-05, + "loss": 0.6226, + "step": 1491 + }, + { + "epoch": 0.20881735479356192, + "grad_norm": 0.4424527757233975, + "learning_rate": 4.593274997486309e-05, + "loss": 0.594, + "step": 1492 + }, + { + "epoch": 0.20895731280615815, + "grad_norm": 0.42378140749175675, + "learning_rate": 4.592655158318829e-05, + "loss": 0.6122, + "step": 1493 + }, + { + "epoch": 0.20909727081875437, + "grad_norm": 0.43529413412957524, + "learning_rate": 4.592034889089482e-05, + "loss": 0.6246, + "step": 1494 + }, + { + "epoch": 0.2092372288313506, + "grad_norm": 0.4336529824699541, + "learning_rate": 4.591414189925739e-05, + "loss": 0.5778, + "step": 1495 + }, + { + "epoch": 0.20937718684394682, + "grad_norm": 0.4397235048236114, + "learning_rate": 4.5907930609551584e-05, + "loss": 0.6468, + "step": 1496 + }, + { + "epoch": 0.20951714485654305, + "grad_norm": 0.41913091394972535, + "learning_rate": 4.59017150230539e-05, + "loss": 0.6343, + "step": 1497 + }, + { + "epoch": 0.20965710286913924, + "grad_norm": 0.4402774808830381, + "learning_rate": 4.58954951410417e-05, + "loss": 0.6032, + "step": 1498 + }, + { + "epoch": 0.20979706088173547, + "grad_norm": 0.40993950169452836, + "learning_rate": 4.588927096479323e-05, + "loss": 0.5885, + "step": 1499 + }, + { + "epoch": 0.2099370188943317, + "grad_norm": 0.4319112358311373, + "learning_rate": 4.5883042495587637e-05, + "loss": 0.6243, + "step": 1500 + }, + { + "epoch": 0.21007697690692792, + "grad_norm": 0.4312366729366661, + "learning_rate": 4.587680973470491e-05, + "loss": 0.6154, + "step": 1501 + }, + { + "epoch": 0.21021693491952415, + "grad_norm": 0.4262355331023565, + "learning_rate": 4.587057268342597e-05, + "loss": 0.5766, + "step": 1502 + }, + { + "epoch": 0.21035689293212037, + "grad_norm": 0.4142051911565897, + "learning_rate": 4.586433134303257e-05, + "loss": 0.5607, + "step": 1503 + }, + { + "epoch": 0.2104968509447166, + "grad_norm": 0.4286460156115378, + "learning_rate": 4.5858085714807384e-05, + "loss": 0.6047, + "step": 1504 + }, + { + "epoch": 0.2106368089573128, + "grad_norm": 0.42524352191943315, + "learning_rate": 4.585183580003395e-05, + "loss": 0.6386, + "step": 1505 + }, + { + "epoch": 0.21077676696990902, + "grad_norm": 0.4287194392081206, + "learning_rate": 4.584558159999668e-05, + "loss": 0.5917, + "step": 1506 + }, + { + "epoch": 0.21091672498250524, + "grad_norm": 0.43676644200274517, + "learning_rate": 4.583932311598089e-05, + "loss": 0.6053, + "step": 1507 + }, + { + "epoch": 0.21105668299510147, + "grad_norm": 0.4555410229607422, + "learning_rate": 4.583306034927275e-05, + "loss": 0.5908, + "step": 1508 + }, + { + "epoch": 0.2111966410076977, + "grad_norm": 0.45902732656697165, + "learning_rate": 4.582679330115933e-05, + "loss": 0.6039, + "step": 1509 + }, + { + "epoch": 0.21133659902029392, + "grad_norm": 0.4025365324445434, + "learning_rate": 4.582052197292856e-05, + "loss": 0.581, + "step": 1510 + }, + { + "epoch": 0.21147655703289014, + "grad_norm": 0.3996651665725778, + "learning_rate": 4.581424636586929e-05, + "loss": 0.5978, + "step": 1511 + }, + { + "epoch": 0.21161651504548634, + "grad_norm": 0.4015353636671318, + "learning_rate": 4.580796648127118e-05, + "loss": 0.5862, + "step": 1512 + }, + { + "epoch": 0.21175647305808257, + "grad_norm": 0.44031192059103713, + "learning_rate": 4.580168232042484e-05, + "loss": 0.6566, + "step": 1513 + }, + { + "epoch": 0.2118964310706788, + "grad_norm": 0.42413167616716185, + "learning_rate": 4.579539388462173e-05, + "loss": 0.6368, + "step": 1514 + }, + { + "epoch": 0.21203638908327502, + "grad_norm": 0.4357570843764536, + "learning_rate": 4.578910117515416e-05, + "loss": 0.5574, + "step": 1515 + }, + { + "epoch": 0.21217634709587124, + "grad_norm": 0.4443836991084177, + "learning_rate": 4.578280419331538e-05, + "loss": 0.6129, + "step": 1516 + }, + { + "epoch": 0.21231630510846747, + "grad_norm": 0.413705821328803, + "learning_rate": 4.5776502940399454e-05, + "loss": 0.5594, + "step": 1517 + }, + { + "epoch": 0.2124562631210637, + "grad_norm": 0.4167212326575854, + "learning_rate": 4.5770197417701365e-05, + "loss": 0.6189, + "step": 1518 + }, + { + "epoch": 0.2125962211336599, + "grad_norm": 0.4143995752113595, + "learning_rate": 4.576388762651697e-05, + "loss": 0.611, + "step": 1519 + }, + { + "epoch": 0.21273617914625612, + "grad_norm": 0.42271548484517474, + "learning_rate": 4.575757356814299e-05, + "loss": 0.6259, + "step": 1520 + }, + { + "epoch": 0.21287613715885234, + "grad_norm": 0.44271943318208246, + "learning_rate": 4.5751255243877015e-05, + "loss": 0.6315, + "step": 1521 + }, + { + "epoch": 0.21301609517144857, + "grad_norm": 0.4701146597325822, + "learning_rate": 4.574493265501755e-05, + "loss": 0.5963, + "step": 1522 + }, + { + "epoch": 0.2131560531840448, + "grad_norm": 0.4347056827455416, + "learning_rate": 4.573860580286392e-05, + "loss": 0.5977, + "step": 1523 + }, + { + "epoch": 0.21329601119664102, + "grad_norm": 0.42401546255892897, + "learning_rate": 4.573227468871639e-05, + "loss": 0.6176, + "step": 1524 + }, + { + "epoch": 0.21343596920923724, + "grad_norm": 0.4263159567462123, + "learning_rate": 4.572593931387604e-05, + "loss": 0.5695, + "step": 1525 + }, + { + "epoch": 0.21357592722183344, + "grad_norm": 0.4121278643431697, + "learning_rate": 4.571959967964488e-05, + "loss": 0.5902, + "step": 1526 + }, + { + "epoch": 0.21371588523442966, + "grad_norm": 0.4158048949697275, + "learning_rate": 4.571325578732575e-05, + "loss": 0.626, + "step": 1527 + }, + { + "epoch": 0.2138558432470259, + "grad_norm": 0.4183332266945186, + "learning_rate": 4.5706907638222385e-05, + "loss": 0.6295, + "step": 1528 + }, + { + "epoch": 0.21399580125962211, + "grad_norm": 0.42044986223624303, + "learning_rate": 4.57005552336394e-05, + "loss": 0.5776, + "step": 1529 + }, + { + "epoch": 0.21413575927221834, + "grad_norm": 0.4237626863099945, + "learning_rate": 4.569419857488228e-05, + "loss": 0.602, + "step": 1530 + }, + { + "epoch": 0.21427571728481457, + "grad_norm": 0.4235915955493986, + "learning_rate": 4.568783766325738e-05, + "loss": 0.6088, + "step": 1531 + }, + { + "epoch": 0.2144156752974108, + "grad_norm": 0.4429373953075251, + "learning_rate": 4.568147250007193e-05, + "loss": 0.6244, + "step": 1532 + }, + { + "epoch": 0.214555633310007, + "grad_norm": 0.4312825642404986, + "learning_rate": 4.567510308663404e-05, + "loss": 0.5844, + "step": 1533 + }, + { + "epoch": 0.2146955913226032, + "grad_norm": 0.447768930946618, + "learning_rate": 4.5668729424252686e-05, + "loss": 0.6446, + "step": 1534 + }, + { + "epoch": 0.21483554933519944, + "grad_norm": 0.4031120444622804, + "learning_rate": 4.5662351514237725e-05, + "loss": 0.6134, + "step": 1535 + }, + { + "epoch": 0.21497550734779566, + "grad_norm": 0.4102236640467674, + "learning_rate": 4.5655969357899874e-05, + "loss": 0.5515, + "step": 1536 + }, + { + "epoch": 0.2151154653603919, + "grad_norm": 0.4184306996067782, + "learning_rate": 4.564958295655074e-05, + "loss": 0.5886, + "step": 1537 + }, + { + "epoch": 0.21525542337298811, + "grad_norm": 0.4061735300983383, + "learning_rate": 4.564319231150278e-05, + "loss": 0.5966, + "step": 1538 + }, + { + "epoch": 0.2153953813855843, + "grad_norm": 0.4337440887551534, + "learning_rate": 4.563679742406935e-05, + "loss": 0.5803, + "step": 1539 + }, + { + "epoch": 0.21553533939818054, + "grad_norm": 0.4509938114765241, + "learning_rate": 4.5630398295564656e-05, + "loss": 0.6328, + "step": 1540 + }, + { + "epoch": 0.21567529741077676, + "grad_norm": 0.4152270913030905, + "learning_rate": 4.562399492730379e-05, + "loss": 0.5656, + "step": 1541 + }, + { + "epoch": 0.215815255423373, + "grad_norm": 0.42027118148073633, + "learning_rate": 4.561758732060271e-05, + "loss": 0.6141, + "step": 1542 + }, + { + "epoch": 0.2159552134359692, + "grad_norm": 0.4187821883367385, + "learning_rate": 4.561117547677824e-05, + "loss": 0.5893, + "step": 1543 + }, + { + "epoch": 0.21609517144856544, + "grad_norm": 0.45872625216573915, + "learning_rate": 4.5604759397148076e-05, + "loss": 0.6301, + "step": 1544 + }, + { + "epoch": 0.21623512946116166, + "grad_norm": 0.46161111705747715, + "learning_rate": 4.559833908303079e-05, + "loss": 0.6104, + "step": 1545 + }, + { + "epoch": 0.21637508747375786, + "grad_norm": 0.41864006503435036, + "learning_rate": 4.559191453574582e-05, + "loss": 0.5886, + "step": 1546 + }, + { + "epoch": 0.21651504548635409, + "grad_norm": 0.4526739247776462, + "learning_rate": 4.5585485756613486e-05, + "loss": 0.6058, + "step": 1547 + }, + { + "epoch": 0.2166550034989503, + "grad_norm": 0.43967142639314527, + "learning_rate": 4.5579052746954955e-05, + "loss": 0.5684, + "step": 1548 + }, + { + "epoch": 0.21679496151154654, + "grad_norm": 0.6571015727605314, + "learning_rate": 4.557261550809228e-05, + "loss": 0.6116, + "step": 1549 + }, + { + "epoch": 0.21693491952414276, + "grad_norm": 0.42734729410443334, + "learning_rate": 4.5566174041348374e-05, + "loss": 0.5922, + "step": 1550 + }, + { + "epoch": 0.21707487753673899, + "grad_norm": 0.45611337371107136, + "learning_rate": 4.555972834804704e-05, + "loss": 0.6523, + "step": 1551 + }, + { + "epoch": 0.2172148355493352, + "grad_norm": 0.40089163017994156, + "learning_rate": 4.5553278429512914e-05, + "loss": 0.5866, + "step": 1552 + }, + { + "epoch": 0.2173547935619314, + "grad_norm": 0.40519806476960724, + "learning_rate": 4.554682428707153e-05, + "loss": 0.6494, + "step": 1553 + }, + { + "epoch": 0.21749475157452763, + "grad_norm": 0.41428429253275884, + "learning_rate": 4.5540365922049275e-05, + "loss": 0.6035, + "step": 1554 + }, + { + "epoch": 0.21763470958712386, + "grad_norm": 0.4216731233907631, + "learning_rate": 4.553390333577342e-05, + "loss": 0.5913, + "step": 1555 + }, + { + "epoch": 0.21777466759972008, + "grad_norm": 0.4773910626390572, + "learning_rate": 4.552743652957208e-05, + "loss": 0.5793, + "step": 1556 + }, + { + "epoch": 0.2179146256123163, + "grad_norm": 0.4386315899864147, + "learning_rate": 4.5520965504774246e-05, + "loss": 0.598, + "step": 1557 + }, + { + "epoch": 0.21805458362491253, + "grad_norm": 0.41469985672115917, + "learning_rate": 4.551449026270979e-05, + "loss": 0.5831, + "step": 1558 + }, + { + "epoch": 0.21819454163750876, + "grad_norm": 0.41848055045754795, + "learning_rate": 4.5508010804709434e-05, + "loss": 0.5948, + "step": 1559 + }, + { + "epoch": 0.21833449965010496, + "grad_norm": 0.4093401936521381, + "learning_rate": 4.550152713210478e-05, + "loss": 0.6166, + "step": 1560 + }, + { + "epoch": 0.21847445766270118, + "grad_norm": 0.39080382196363417, + "learning_rate": 4.5495039246228274e-05, + "loss": 0.5615, + "step": 1561 + }, + { + "epoch": 0.2186144156752974, + "grad_norm": 0.4548223020948238, + "learning_rate": 4.548854714841326e-05, + "loss": 0.6275, + "step": 1562 + }, + { + "epoch": 0.21875437368789363, + "grad_norm": 0.4423455420604275, + "learning_rate": 4.548205083999392e-05, + "loss": 0.6453, + "step": 1563 + }, + { + "epoch": 0.21889433170048986, + "grad_norm": 0.43733043542862093, + "learning_rate": 4.547555032230531e-05, + "loss": 0.624, + "step": 1564 + }, + { + "epoch": 0.21903428971308608, + "grad_norm": 0.4353839094146027, + "learning_rate": 4.546904559668335e-05, + "loss": 0.6306, + "step": 1565 + }, + { + "epoch": 0.2191742477256823, + "grad_norm": 0.43066663949560247, + "learning_rate": 4.546253666446484e-05, + "loss": 0.5603, + "step": 1566 + }, + { + "epoch": 0.2193142057382785, + "grad_norm": 0.43310274897745943, + "learning_rate": 4.545602352698742e-05, + "loss": 0.6276, + "step": 1567 + }, + { + "epoch": 0.21945416375087473, + "grad_norm": 0.4406898711500198, + "learning_rate": 4.544950618558961e-05, + "loss": 0.6111, + "step": 1568 + }, + { + "epoch": 0.21959412176347096, + "grad_norm": 0.44678818353637145, + "learning_rate": 4.544298464161079e-05, + "loss": 0.6159, + "step": 1569 + }, + { + "epoch": 0.21973407977606718, + "grad_norm": 0.43974972225616304, + "learning_rate": 4.54364588963912e-05, + "loss": 0.6103, + "step": 1570 + }, + { + "epoch": 0.2198740377886634, + "grad_norm": 0.4450084247496375, + "learning_rate": 4.542992895127195e-05, + "loss": 0.6351, + "step": 1571 + }, + { + "epoch": 0.22001399580125963, + "grad_norm": 0.4301181992679436, + "learning_rate": 4.5423394807595005e-05, + "loss": 0.5929, + "step": 1572 + }, + { + "epoch": 0.22015395381385583, + "grad_norm": 0.4354118703683763, + "learning_rate": 4.541685646670321e-05, + "loss": 0.6417, + "step": 1573 + }, + { + "epoch": 0.22029391182645205, + "grad_norm": 0.42020542688000095, + "learning_rate": 4.5410313929940244e-05, + "loss": 0.5609, + "step": 1574 + }, + { + "epoch": 0.22043386983904828, + "grad_norm": 0.405854594592669, + "learning_rate": 4.5403767198650683e-05, + "loss": 0.6229, + "step": 1575 + }, + { + "epoch": 0.2205738278516445, + "grad_norm": 0.4325219516484385, + "learning_rate": 4.5397216274179934e-05, + "loss": 0.5922, + "step": 1576 + }, + { + "epoch": 0.22071378586424073, + "grad_norm": 0.42884858732377196, + "learning_rate": 4.539066115787427e-05, + "loss": 0.5937, + "step": 1577 + }, + { + "epoch": 0.22085374387683696, + "grad_norm": 0.4397303655395074, + "learning_rate": 4.5384101851080864e-05, + "loss": 0.6208, + "step": 1578 + }, + { + "epoch": 0.22099370188943318, + "grad_norm": 0.4196787008923647, + "learning_rate": 4.537753835514769e-05, + "loss": 0.5936, + "step": 1579 + }, + { + "epoch": 0.22113365990202938, + "grad_norm": 0.4450498558445989, + "learning_rate": 4.537097067142363e-05, + "loss": 0.5795, + "step": 1580 + }, + { + "epoch": 0.2212736179146256, + "grad_norm": 0.4533941983871524, + "learning_rate": 4.5364398801258396e-05, + "loss": 0.5856, + "step": 1581 + }, + { + "epoch": 0.22141357592722183, + "grad_norm": 0.4417713842234416, + "learning_rate": 4.5357822746002586e-05, + "loss": 0.6313, + "step": 1582 + }, + { + "epoch": 0.22155353393981805, + "grad_norm": 0.4463677159437907, + "learning_rate": 4.535124250700764e-05, + "loss": 0.625, + "step": 1583 + }, + { + "epoch": 0.22169349195241428, + "grad_norm": 0.40975804674998356, + "learning_rate": 4.534465808562587e-05, + "loss": 0.5485, + "step": 1584 + }, + { + "epoch": 0.2218334499650105, + "grad_norm": 0.4543432488989666, + "learning_rate": 4.533806948321044e-05, + "loss": 0.6165, + "step": 1585 + }, + { + "epoch": 0.22197340797760673, + "grad_norm": 0.44872454774080367, + "learning_rate": 4.5331476701115366e-05, + "loss": 0.6361, + "step": 1586 + }, + { + "epoch": 0.22211336599020293, + "grad_norm": 0.44712318439982773, + "learning_rate": 4.532487974069554e-05, + "loss": 0.5872, + "step": 1587 + }, + { + "epoch": 0.22225332400279915, + "grad_norm": 0.4396204586538607, + "learning_rate": 4.53182786033067e-05, + "loss": 0.6105, + "step": 1588 + }, + { + "epoch": 0.22239328201539538, + "grad_norm": 0.4455977172934511, + "learning_rate": 4.531167329030545e-05, + "loss": 0.624, + "step": 1589 + }, + { + "epoch": 0.2225332400279916, + "grad_norm": 0.4556263334519999, + "learning_rate": 4.530506380304925e-05, + "loss": 0.6265, + "step": 1590 + }, + { + "epoch": 0.22267319804058783, + "grad_norm": 0.42577140585892975, + "learning_rate": 4.529845014289642e-05, + "loss": 0.6105, + "step": 1591 + }, + { + "epoch": 0.22281315605318405, + "grad_norm": 0.4180730424078042, + "learning_rate": 4.529183231120612e-05, + "loss": 0.5895, + "step": 1592 + }, + { + "epoch": 0.22295311406578028, + "grad_norm": 0.40699329023671954, + "learning_rate": 4.528521030933839e-05, + "loss": 0.6161, + "step": 1593 + }, + { + "epoch": 0.22309307207837648, + "grad_norm": 0.4222068544403408, + "learning_rate": 4.5278584138654116e-05, + "loss": 0.5605, + "step": 1594 + }, + { + "epoch": 0.2232330300909727, + "grad_norm": 0.4095845718488545, + "learning_rate": 4.527195380051505e-05, + "loss": 0.6219, + "step": 1595 + }, + { + "epoch": 0.22337298810356893, + "grad_norm": 0.4314280489339947, + "learning_rate": 4.526531929628379e-05, + "loss": 0.5701, + "step": 1596 + }, + { + "epoch": 0.22351294611616515, + "grad_norm": 0.4074259435374443, + "learning_rate": 4.525868062732379e-05, + "loss": 0.5631, + "step": 1597 + }, + { + "epoch": 0.22365290412876138, + "grad_norm": 0.4532813167302565, + "learning_rate": 4.5252037794999375e-05, + "loss": 0.589, + "step": 1598 + }, + { + "epoch": 0.2237928621413576, + "grad_norm": 0.4513647564836139, + "learning_rate": 4.52453908006757e-05, + "loss": 0.5948, + "step": 1599 + }, + { + "epoch": 0.22393282015395383, + "grad_norm": 0.41293390054677387, + "learning_rate": 4.52387396457188e-05, + "loss": 0.585, + "step": 1600 + }, + { + "epoch": 0.22407277816655002, + "grad_norm": 0.43716711655654145, + "learning_rate": 4.523208433149555e-05, + "loss": 0.5717, + "step": 1601 + }, + { + "epoch": 0.22421273617914625, + "grad_norm": 0.4258916507285024, + "learning_rate": 4.522542485937369e-05, + "loss": 0.6024, + "step": 1602 + }, + { + "epoch": 0.22435269419174247, + "grad_norm": 0.42215745803677884, + "learning_rate": 4.52187612307218e-05, + "loss": 0.564, + "step": 1603 + }, + { + "epoch": 0.2244926522043387, + "grad_norm": 0.4429228384575338, + "learning_rate": 4.521209344690933e-05, + "loss": 0.6278, + "step": 1604 + }, + { + "epoch": 0.22463261021693492, + "grad_norm": 0.4784015488798018, + "learning_rate": 4.5205421509306576e-05, + "loss": 0.6018, + "step": 1605 + }, + { + "epoch": 0.22477256822953115, + "grad_norm": 0.43024752872126554, + "learning_rate": 4.519874541928469e-05, + "loss": 0.6077, + "step": 1606 + }, + { + "epoch": 0.22491252624212738, + "grad_norm": 0.41398612904638815, + "learning_rate": 4.519206517821567e-05, + "loss": 0.5996, + "step": 1607 + }, + { + "epoch": 0.22505248425472357, + "grad_norm": 0.4141848810116437, + "learning_rate": 4.5185380787472384e-05, + "loss": 0.6349, + "step": 1608 + }, + { + "epoch": 0.2251924422673198, + "grad_norm": 0.43155153008345165, + "learning_rate": 4.5178692248428536e-05, + "loss": 0.5882, + "step": 1609 + }, + { + "epoch": 0.22533240027991602, + "grad_norm": 0.43080194308434416, + "learning_rate": 4.517199956245869e-05, + "loss": 0.6111, + "step": 1610 + }, + { + "epoch": 0.22547235829251225, + "grad_norm": 0.42258541276252165, + "learning_rate": 4.516530273093825e-05, + "loss": 0.5848, + "step": 1611 + }, + { + "epoch": 0.22561231630510847, + "grad_norm": 0.43019795244061954, + "learning_rate": 4.5158601755243505e-05, + "loss": 0.5885, + "step": 1612 + }, + { + "epoch": 0.2257522743177047, + "grad_norm": 0.4034883685193691, + "learning_rate": 4.5151896636751556e-05, + "loss": 0.6009, + "step": 1613 + }, + { + "epoch": 0.2258922323303009, + "grad_norm": 0.41958257656866, + "learning_rate": 4.514518737684038e-05, + "loss": 0.5881, + "step": 1614 + }, + { + "epoch": 0.22603219034289712, + "grad_norm": 0.4074178901403192, + "learning_rate": 4.513847397688879e-05, + "loss": 0.5606, + "step": 1615 + }, + { + "epoch": 0.22617214835549335, + "grad_norm": 0.44325729803840913, + "learning_rate": 4.513175643827647e-05, + "loss": 0.6336, + "step": 1616 + }, + { + "epoch": 0.22631210636808957, + "grad_norm": 0.43312998668445646, + "learning_rate": 4.5125034762383936e-05, + "loss": 0.607, + "step": 1617 + }, + { + "epoch": 0.2264520643806858, + "grad_norm": 0.40174739442199564, + "learning_rate": 4.511830895059255e-05, + "loss": 0.5623, + "step": 1618 + }, + { + "epoch": 0.22659202239328202, + "grad_norm": 0.43059705945767945, + "learning_rate": 4.511157900428456e-05, + "loss": 0.5922, + "step": 1619 + }, + { + "epoch": 0.22673198040587825, + "grad_norm": 0.42985175056584884, + "learning_rate": 4.5104844924843016e-05, + "loss": 0.5719, + "step": 1620 + }, + { + "epoch": 0.22687193841847444, + "grad_norm": 0.4503501491245042, + "learning_rate": 4.5098106713651846e-05, + "loss": 0.5888, + "step": 1621 + }, + { + "epoch": 0.22701189643107067, + "grad_norm": 0.4306003466493788, + "learning_rate": 4.509136437209582e-05, + "loss": 0.5624, + "step": 1622 + }, + { + "epoch": 0.2271518544436669, + "grad_norm": 0.413588915370572, + "learning_rate": 4.508461790156056e-05, + "loss": 0.5822, + "step": 1623 + }, + { + "epoch": 0.22729181245626312, + "grad_norm": 0.43438032996872095, + "learning_rate": 4.5077867303432546e-05, + "loss": 0.6452, + "step": 1624 + }, + { + "epoch": 0.22743177046885935, + "grad_norm": 0.4364680489398804, + "learning_rate": 4.5071112579099074e-05, + "loss": 0.6027, + "step": 1625 + }, + { + "epoch": 0.22757172848145557, + "grad_norm": 0.44709606835984195, + "learning_rate": 4.5064353729948315e-05, + "loss": 0.6131, + "step": 1626 + }, + { + "epoch": 0.2277116864940518, + "grad_norm": 0.43519671900411855, + "learning_rate": 4.505759075736929e-05, + "loss": 0.591, + "step": 1627 + }, + { + "epoch": 0.227851644506648, + "grad_norm": 0.41612778582560767, + "learning_rate": 4.505082366275184e-05, + "loss": 0.5877, + "step": 1628 + }, + { + "epoch": 0.22799160251924422, + "grad_norm": 0.4218216656661162, + "learning_rate": 4.504405244748669e-05, + "loss": 0.5534, + "step": 1629 + }, + { + "epoch": 0.22813156053184044, + "grad_norm": 0.42964453812050457, + "learning_rate": 4.503727711296538e-05, + "loss": 0.6389, + "step": 1630 + }, + { + "epoch": 0.22827151854443667, + "grad_norm": 0.42585594110280106, + "learning_rate": 4.503049766058033e-05, + "loss": 0.6084, + "step": 1631 + }, + { + "epoch": 0.2284114765570329, + "grad_norm": 0.423186467723224, + "learning_rate": 4.5023714091724756e-05, + "loss": 0.6029, + "step": 1632 + }, + { + "epoch": 0.22855143456962912, + "grad_norm": 0.43009182540969443, + "learning_rate": 4.5016926407792774e-05, + "loss": 0.6404, + "step": 1633 + }, + { + "epoch": 0.22869139258222534, + "grad_norm": 0.4291224518654508, + "learning_rate": 4.501013461017931e-05, + "loss": 0.6167, + "step": 1634 + }, + { + "epoch": 0.22883135059482154, + "grad_norm": 0.41098068932209736, + "learning_rate": 4.500333870028016e-05, + "loss": 0.581, + "step": 1635 + }, + { + "epoch": 0.22897130860741777, + "grad_norm": 0.4297903813744973, + "learning_rate": 4.499653867949194e-05, + "loss": 0.5923, + "step": 1636 + }, + { + "epoch": 0.229111266620014, + "grad_norm": 0.4194573186329626, + "learning_rate": 4.4989734549212125e-05, + "loss": 0.5893, + "step": 1637 + }, + { + "epoch": 0.22925122463261022, + "grad_norm": 0.44155487858058556, + "learning_rate": 4.498292631083904e-05, + "loss": 0.5734, + "step": 1638 + }, + { + "epoch": 0.22939118264520644, + "grad_norm": 0.4463968226022601, + "learning_rate": 4.4976113965771835e-05, + "loss": 0.6054, + "step": 1639 + }, + { + "epoch": 0.22953114065780267, + "grad_norm": 0.43124302519709157, + "learning_rate": 4.496929751541054e-05, + "loss": 0.5788, + "step": 1640 + }, + { + "epoch": 0.2296710986703989, + "grad_norm": 0.4206213171563187, + "learning_rate": 4.4962476961155976e-05, + "loss": 0.5675, + "step": 1641 + }, + { + "epoch": 0.2298110566829951, + "grad_norm": 0.4696499710097156, + "learning_rate": 4.495565230440985e-05, + "loss": 0.6295, + "step": 1642 + }, + { + "epoch": 0.22995101469559132, + "grad_norm": 0.3979651141981672, + "learning_rate": 4.494882354657469e-05, + "loss": 0.584, + "step": 1643 + }, + { + "epoch": 0.23009097270818754, + "grad_norm": 0.5843917120508454, + "learning_rate": 4.4941990689053886e-05, + "loss": 0.5725, + "step": 1644 + }, + { + "epoch": 0.23023093072078377, + "grad_norm": 0.4077407210872767, + "learning_rate": 4.4935153733251656e-05, + "loss": 0.5744, + "step": 1645 + }, + { + "epoch": 0.23037088873338, + "grad_norm": 0.4392056225941937, + "learning_rate": 4.4928312680573064e-05, + "loss": 0.6168, + "step": 1646 + }, + { + "epoch": 0.23051084674597622, + "grad_norm": 0.4356793893175191, + "learning_rate": 4.492146753242401e-05, + "loss": 0.5972, + "step": 1647 + }, + { + "epoch": 0.23065080475857244, + "grad_norm": 0.42512045974302, + "learning_rate": 4.491461829021125e-05, + "loss": 0.6265, + "step": 1648 + }, + { + "epoch": 0.23079076277116864, + "grad_norm": 0.4378392920345575, + "learning_rate": 4.490776495534237e-05, + "loss": 0.593, + "step": 1649 + }, + { + "epoch": 0.23093072078376486, + "grad_norm": 11.092127253150068, + "learning_rate": 4.49009075292258e-05, + "loss": 0.5982, + "step": 1650 + }, + { + "epoch": 0.2310706787963611, + "grad_norm": 0.4211855457116042, + "learning_rate": 4.489404601327081e-05, + "loss": 0.5885, + "step": 1651 + }, + { + "epoch": 0.23121063680895731, + "grad_norm": 0.44452065502823107, + "learning_rate": 4.48871804088875e-05, + "loss": 0.6286, + "step": 1652 + }, + { + "epoch": 0.23135059482155354, + "grad_norm": 0.44745394315873227, + "learning_rate": 4.488031071748684e-05, + "loss": 0.6036, + "step": 1653 + }, + { + "epoch": 0.23149055283414977, + "grad_norm": 0.41971430239918867, + "learning_rate": 4.487343694048061e-05, + "loss": 0.5691, + "step": 1654 + }, + { + "epoch": 0.23163051084674596, + "grad_norm": 0.43793140979246065, + "learning_rate": 4.4866559079281447e-05, + "loss": 0.6025, + "step": 1655 + }, + { + "epoch": 0.2317704688593422, + "grad_norm": 0.45584544502664887, + "learning_rate": 4.485967713530281e-05, + "loss": 0.5985, + "step": 1656 + }, + { + "epoch": 0.2319104268719384, + "grad_norm": 0.4393986821966947, + "learning_rate": 4.485279110995903e-05, + "loss": 0.6181, + "step": 1657 + }, + { + "epoch": 0.23205038488453464, + "grad_norm": 0.41398671250331304, + "learning_rate": 4.4845901004665234e-05, + "loss": 0.5659, + "step": 1658 + }, + { + "epoch": 0.23219034289713086, + "grad_norm": 0.40845641035568697, + "learning_rate": 4.483900682083742e-05, + "loss": 0.6004, + "step": 1659 + }, + { + "epoch": 0.2323303009097271, + "grad_norm": 0.41163784471475046, + "learning_rate": 4.4832108559892406e-05, + "loss": 0.5676, + "step": 1660 + }, + { + "epoch": 0.23247025892232331, + "grad_norm": 0.40520538312381466, + "learning_rate": 4.4825206223247855e-05, + "loss": 0.5709, + "step": 1661 + }, + { + "epoch": 0.2326102169349195, + "grad_norm": 0.4052889024628849, + "learning_rate": 4.481829981232227e-05, + "loss": 0.6205, + "step": 1662 + }, + { + "epoch": 0.23275017494751574, + "grad_norm": 0.44570039573163067, + "learning_rate": 4.481138932853499e-05, + "loss": 0.6339, + "step": 1663 + }, + { + "epoch": 0.23289013296011196, + "grad_norm": 0.42218605234809253, + "learning_rate": 4.480447477330619e-05, + "loss": 0.582, + "step": 1664 + }, + { + "epoch": 0.2330300909727082, + "grad_norm": 0.4139552246868811, + "learning_rate": 4.479755614805688e-05, + "loss": 0.6185, + "step": 1665 + }, + { + "epoch": 0.2331700489853044, + "grad_norm": 0.438909444526491, + "learning_rate": 4.4790633454208904e-05, + "loss": 0.582, + "step": 1666 + }, + { + "epoch": 0.23331000699790064, + "grad_norm": 0.41733390361062983, + "learning_rate": 4.478370669318494e-05, + "loss": 0.5954, + "step": 1667 + }, + { + "epoch": 0.23344996501049686, + "grad_norm": 0.43169892360959883, + "learning_rate": 4.477677586640854e-05, + "loss": 0.6044, + "step": 1668 + }, + { + "epoch": 0.23358992302309306, + "grad_norm": 0.41695287451508345, + "learning_rate": 4.4769840975304014e-05, + "loss": 0.5781, + "step": 1669 + }, + { + "epoch": 0.23372988103568929, + "grad_norm": 0.4287731524988866, + "learning_rate": 4.476290202129658e-05, + "loss": 0.6271, + "step": 1670 + }, + { + "epoch": 0.2338698390482855, + "grad_norm": 0.41617003120095575, + "learning_rate": 4.4755959005812256e-05, + "loss": 0.5917, + "step": 1671 + }, + { + "epoch": 0.23400979706088174, + "grad_norm": 0.4317600967426944, + "learning_rate": 4.474901193027791e-05, + "loss": 0.6371, + "step": 1672 + }, + { + "epoch": 0.23414975507347796, + "grad_norm": 0.4245879897664653, + "learning_rate": 4.474206079612122e-05, + "loss": 0.6104, + "step": 1673 + }, + { + "epoch": 0.23428971308607419, + "grad_norm": 0.4493803376375899, + "learning_rate": 4.4735105604770735e-05, + "loss": 0.5892, + "step": 1674 + }, + { + "epoch": 0.2344296710986704, + "grad_norm": 0.4250874727480562, + "learning_rate": 4.4728146357655795e-05, + "loss": 0.5675, + "step": 1675 + }, + { + "epoch": 0.2345696291112666, + "grad_norm": 0.4374362501965964, + "learning_rate": 4.4721183056206614e-05, + "loss": 0.6075, + "step": 1676 + }, + { + "epoch": 0.23470958712386283, + "grad_norm": 0.41760478685320906, + "learning_rate": 4.4714215701854225e-05, + "loss": 0.5986, + "step": 1677 + }, + { + "epoch": 0.23484954513645906, + "grad_norm": 0.42350699224694727, + "learning_rate": 4.4707244296030464e-05, + "loss": 0.6486, + "step": 1678 + }, + { + "epoch": 0.23498950314905528, + "grad_norm": 0.4256916273457718, + "learning_rate": 4.4700268840168045e-05, + "loss": 0.6174, + "step": 1679 + }, + { + "epoch": 0.2351294611616515, + "grad_norm": 0.42531800370790246, + "learning_rate": 4.469328933570051e-05, + "loss": 0.551, + "step": 1680 + }, + { + "epoch": 0.23526941917424773, + "grad_norm": 0.4181940913192066, + "learning_rate": 4.468630578406218e-05, + "loss": 0.5856, + "step": 1681 + }, + { + "epoch": 0.23540937718684396, + "grad_norm": 0.4105263397513692, + "learning_rate": 4.467931818668827e-05, + "loss": 0.5633, + "step": 1682 + }, + { + "epoch": 0.23554933519944016, + "grad_norm": 0.4081654944687357, + "learning_rate": 4.46723265450148e-05, + "loss": 0.5808, + "step": 1683 + }, + { + "epoch": 0.23568929321203638, + "grad_norm": 0.3907211987137891, + "learning_rate": 4.466533086047861e-05, + "loss": 0.5571, + "step": 1684 + }, + { + "epoch": 0.2358292512246326, + "grad_norm": 0.4250060420027131, + "learning_rate": 4.465833113451741e-05, + "loss": 0.6208, + "step": 1685 + }, + { + "epoch": 0.23596920923722883, + "grad_norm": 0.3937263336098636, + "learning_rate": 4.465132736856969e-05, + "loss": 0.5621, + "step": 1686 + }, + { + "epoch": 0.23610916724982506, + "grad_norm": 0.41965579506262274, + "learning_rate": 4.464431956407481e-05, + "loss": 0.6031, + "step": 1687 + }, + { + "epoch": 0.23624912526242128, + "grad_norm": 0.426646222987036, + "learning_rate": 4.463730772247293e-05, + "loss": 0.6033, + "step": 1688 + }, + { + "epoch": 0.2363890832750175, + "grad_norm": 0.4439770950702544, + "learning_rate": 4.463029184520507e-05, + "loss": 0.604, + "step": 1689 + }, + { + "epoch": 0.2365290412876137, + "grad_norm": 0.4761499260018663, + "learning_rate": 4.4623271933713065e-05, + "loss": 0.6307, + "step": 1690 + }, + { + "epoch": 0.23666899930020993, + "grad_norm": 0.4335225372190029, + "learning_rate": 4.4616247989439565e-05, + "loss": 0.576, + "step": 1691 + }, + { + "epoch": 0.23680895731280616, + "grad_norm": 0.43316503301129894, + "learning_rate": 4.4609220013828065e-05, + "loss": 0.6275, + "step": 1692 + }, + { + "epoch": 0.23694891532540238, + "grad_norm": 0.4313464290288283, + "learning_rate": 4.46021880083229e-05, + "loss": 0.6067, + "step": 1693 + }, + { + "epoch": 0.2370888733379986, + "grad_norm": 0.41535036276436027, + "learning_rate": 4.45951519743692e-05, + "loss": 0.6192, + "step": 1694 + }, + { + "epoch": 0.23722883135059483, + "grad_norm": 0.4310228331540852, + "learning_rate": 4.4588111913412945e-05, + "loss": 0.5915, + "step": 1695 + }, + { + "epoch": 0.23736878936319103, + "grad_norm": 0.42874110588166797, + "learning_rate": 4.458106782690094e-05, + "loss": 0.5912, + "step": 1696 + }, + { + "epoch": 0.23750874737578725, + "grad_norm": 0.43089863664872624, + "learning_rate": 4.4574019716280824e-05, + "loss": 0.613, + "step": 1697 + }, + { + "epoch": 0.23764870538838348, + "grad_norm": 0.4181283245016506, + "learning_rate": 4.4566967583001046e-05, + "loss": 0.5918, + "step": 1698 + }, + { + "epoch": 0.2377886634009797, + "grad_norm": 0.4521121054953592, + "learning_rate": 4.4559911428510895e-05, + "loss": 0.6326, + "step": 1699 + }, + { + "epoch": 0.23792862141357593, + "grad_norm": 0.41538878241716964, + "learning_rate": 4.4552851254260484e-05, + "loss": 0.5994, + "step": 1700 + }, + { + "epoch": 0.23806857942617216, + "grad_norm": 0.4693907751456843, + "learning_rate": 4.454578706170075e-05, + "loss": 0.6444, + "step": 1701 + }, + { + "epoch": 0.23820853743876838, + "grad_norm": 0.47587489105004577, + "learning_rate": 4.453871885228345e-05, + "loss": 0.6053, + "step": 1702 + }, + { + "epoch": 0.23834849545136458, + "grad_norm": 0.44284115011605213, + "learning_rate": 4.4531646627461175e-05, + "loss": 0.6107, + "step": 1703 + }, + { + "epoch": 0.2384884534639608, + "grad_norm": 0.42954706530716213, + "learning_rate": 4.452457038868735e-05, + "loss": 0.6082, + "step": 1704 + }, + { + "epoch": 0.23862841147655703, + "grad_norm": 0.4558514723051049, + "learning_rate": 4.4517490137416196e-05, + "loss": 0.5926, + "step": 1705 + }, + { + "epoch": 0.23876836948915325, + "grad_norm": 0.41271838706018094, + "learning_rate": 4.451040587510279e-05, + "loss": 0.5552, + "step": 1706 + }, + { + "epoch": 0.23890832750174948, + "grad_norm": 0.4234721479869862, + "learning_rate": 4.450331760320302e-05, + "loss": 0.5907, + "step": 1707 + }, + { + "epoch": 0.2390482855143457, + "grad_norm": 0.4301944314412581, + "learning_rate": 4.449622532317359e-05, + "loss": 0.5637, + "step": 1708 + }, + { + "epoch": 0.23918824352694193, + "grad_norm": 0.3992534678208053, + "learning_rate": 4.448912903647203e-05, + "loss": 0.5746, + "step": 1709 + }, + { + "epoch": 0.23932820153953813, + "grad_norm": 0.4151340108937535, + "learning_rate": 4.448202874455673e-05, + "loss": 0.5953, + "step": 1710 + }, + { + "epoch": 0.23946815955213435, + "grad_norm": 0.42307801303730386, + "learning_rate": 4.447492444888682e-05, + "loss": 0.5985, + "step": 1711 + }, + { + "epoch": 0.23960811756473058, + "grad_norm": 0.4254157003906138, + "learning_rate": 4.446781615092235e-05, + "loss": 0.5706, + "step": 1712 + }, + { + "epoch": 0.2397480755773268, + "grad_norm": 0.4221420053361986, + "learning_rate": 4.446070385212414e-05, + "loss": 0.5865, + "step": 1713 + }, + { + "epoch": 0.23988803358992303, + "grad_norm": 0.44169866453949225, + "learning_rate": 4.445358755395382e-05, + "loss": 0.6131, + "step": 1714 + }, + { + "epoch": 0.24002799160251925, + "grad_norm": 0.41028257323034817, + "learning_rate": 4.444646725787387e-05, + "loss": 0.6391, + "step": 1715 + }, + { + "epoch": 0.24016794961511548, + "grad_norm": 0.42430000545950824, + "learning_rate": 4.4439342965347595e-05, + "loss": 0.5713, + "step": 1716 + }, + { + "epoch": 0.24030790762771168, + "grad_norm": 0.40781271006014086, + "learning_rate": 4.4432214677839095e-05, + "loss": 0.5508, + "step": 1717 + }, + { + "epoch": 0.2404478656403079, + "grad_norm": 0.4231812063581877, + "learning_rate": 4.442508239681331e-05, + "loss": 0.59, + "step": 1718 + }, + { + "epoch": 0.24058782365290413, + "grad_norm": 0.4175498825745329, + "learning_rate": 4.4417946123736e-05, + "loss": 0.6073, + "step": 1719 + }, + { + "epoch": 0.24072778166550035, + "grad_norm": 0.43499143607770346, + "learning_rate": 4.4410805860073736e-05, + "loss": 0.562, + "step": 1720 + }, + { + "epoch": 0.24086773967809658, + "grad_norm": 0.4405506003884184, + "learning_rate": 4.440366160729392e-05, + "loss": 0.5963, + "step": 1721 + }, + { + "epoch": 0.2410076976906928, + "grad_norm": 0.4048128591600598, + "learning_rate": 4.4396513366864765e-05, + "loss": 0.5533, + "step": 1722 + }, + { + "epoch": 0.24114765570328903, + "grad_norm": 0.41424145471376667, + "learning_rate": 4.4389361140255306e-05, + "loss": 0.5814, + "step": 1723 + }, + { + "epoch": 0.24128761371588522, + "grad_norm": 0.41036719244303405, + "learning_rate": 4.43822049289354e-05, + "loss": 0.57, + "step": 1724 + }, + { + "epoch": 0.24142757172848145, + "grad_norm": 0.41201535335271977, + "learning_rate": 4.4375044734375724e-05, + "loss": 0.5484, + "step": 1725 + }, + { + "epoch": 0.24156752974107767, + "grad_norm": 0.4111942850121887, + "learning_rate": 4.436788055804777e-05, + "loss": 0.5706, + "step": 1726 + }, + { + "epoch": 0.2417074877536739, + "grad_norm": 0.4519161404524503, + "learning_rate": 4.436071240142383e-05, + "loss": 0.6018, + "step": 1727 + }, + { + "epoch": 0.24184744576627012, + "grad_norm": 0.4403101160025735, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.6072, + "step": 1728 + }, + { + "epoch": 0.24198740377886635, + "grad_norm": 0.49703818437945607, + "learning_rate": 4.4346364153181397e-05, + "loss": 0.6081, + "step": 1729 + }, + { + "epoch": 0.24212736179146255, + "grad_norm": 0.41243187874993914, + "learning_rate": 4.433918406451161e-05, + "loss": 0.5901, + "step": 1730 + }, + { + "epoch": 0.24226731980405877, + "grad_norm": 0.40745642609215477, + "learning_rate": 4.433200000144326e-05, + "loss": 0.5789, + "step": 1731 + }, + { + "epoch": 0.242407277816655, + "grad_norm": 0.42520024238957543, + "learning_rate": 4.4324811965452764e-05, + "loss": 0.5929, + "step": 1732 + }, + { + "epoch": 0.24254723582925122, + "grad_norm": 0.4483762787233949, + "learning_rate": 4.431761995801733e-05, + "loss": 0.6412, + "step": 1733 + }, + { + "epoch": 0.24268719384184745, + "grad_norm": 0.42388738271936166, + "learning_rate": 4.431042398061499e-05, + "loss": 0.6215, + "step": 1734 + }, + { + "epoch": 0.24282715185444367, + "grad_norm": 0.42006944785214295, + "learning_rate": 4.430322403472459e-05, + "loss": 0.6243, + "step": 1735 + }, + { + "epoch": 0.2429671098670399, + "grad_norm": 0.41526878760328473, + "learning_rate": 4.429602012182579e-05, + "loss": 0.5868, + "step": 1736 + }, + { + "epoch": 0.2431070678796361, + "grad_norm": 0.40029320601405294, + "learning_rate": 4.428881224339907e-05, + "loss": 0.6238, + "step": 1737 + }, + { + "epoch": 0.24324702589223232, + "grad_norm": 0.424773544460618, + "learning_rate": 4.4281600400925725e-05, + "loss": 0.6287, + "step": 1738 + }, + { + "epoch": 0.24338698390482855, + "grad_norm": 0.43089065054686443, + "learning_rate": 4.4274384595887855e-05, + "loss": 0.579, + "step": 1739 + }, + { + "epoch": 0.24352694191742477, + "grad_norm": 0.39335625658540807, + "learning_rate": 4.426716482976838e-05, + "loss": 0.5945, + "step": 1740 + }, + { + "epoch": 0.243666899930021, + "grad_norm": 0.4321714579592816, + "learning_rate": 4.425994110405105e-05, + "loss": 0.6165, + "step": 1741 + }, + { + "epoch": 0.24380685794261722, + "grad_norm": 0.3868262436570827, + "learning_rate": 4.425271342022039e-05, + "loss": 0.6322, + "step": 1742 + }, + { + "epoch": 0.24394681595521345, + "grad_norm": 0.4194791382722549, + "learning_rate": 4.424548177976179e-05, + "loss": 0.6155, + "step": 1743 + }, + { + "epoch": 0.24408677396780964, + "grad_norm": 0.4118315161780323, + "learning_rate": 4.42382461841614e-05, + "loss": 0.6144, + "step": 1744 + }, + { + "epoch": 0.24422673198040587, + "grad_norm": 0.40215952380747805, + "learning_rate": 4.4231006634906224e-05, + "loss": 0.5487, + "step": 1745 + }, + { + "epoch": 0.2443666899930021, + "grad_norm": 0.42510463260619374, + "learning_rate": 4.422376313348405e-05, + "loss": 0.5674, + "step": 1746 + }, + { + "epoch": 0.24450664800559832, + "grad_norm": 0.4220078299124082, + "learning_rate": 4.4216515681383505e-05, + "loss": 0.6159, + "step": 1747 + }, + { + "epoch": 0.24464660601819455, + "grad_norm": 0.42050182431963373, + "learning_rate": 4.420926428009401e-05, + "loss": 0.6066, + "step": 1748 + }, + { + "epoch": 0.24478656403079077, + "grad_norm": 0.43689368971554526, + "learning_rate": 4.4202008931105795e-05, + "loss": 0.5568, + "step": 1749 + }, + { + "epoch": 0.244926522043387, + "grad_norm": 0.43739145581183353, + "learning_rate": 4.4194749635909924e-05, + "loss": 0.5984, + "step": 1750 + }, + { + "epoch": 0.2450664800559832, + "grad_norm": 0.4595283770036913, + "learning_rate": 4.4187486395998235e-05, + "loss": 0.6204, + "step": 1751 + }, + { + "epoch": 0.24520643806857942, + "grad_norm": 0.4332223020604194, + "learning_rate": 4.4180219212863404e-05, + "loss": 0.6114, + "step": 1752 + }, + { + "epoch": 0.24534639608117564, + "grad_norm": 0.44490455320321953, + "learning_rate": 4.417294808799892e-05, + "loss": 0.5771, + "step": 1753 + }, + { + "epoch": 0.24548635409377187, + "grad_norm": 0.42354862515455083, + "learning_rate": 4.416567302289907e-05, + "loss": 0.6092, + "step": 1754 + }, + { + "epoch": 0.2456263121063681, + "grad_norm": 0.4252852477883581, + "learning_rate": 4.4158394019058944e-05, + "loss": 0.6034, + "step": 1755 + }, + { + "epoch": 0.24576627011896432, + "grad_norm": 0.4376952545699014, + "learning_rate": 4.415111107797445e-05, + "loss": 0.623, + "step": 1756 + }, + { + "epoch": 0.24590622813156054, + "grad_norm": 0.41667144416831586, + "learning_rate": 4.414382420114233e-05, + "loss": 0.6042, + "step": 1757 + }, + { + "epoch": 0.24604618614415674, + "grad_norm": 0.42933664014516487, + "learning_rate": 4.413653339006008e-05, + "loss": 0.6022, + "step": 1758 + }, + { + "epoch": 0.24618614415675297, + "grad_norm": 0.4162921306132023, + "learning_rate": 4.4129238646226055e-05, + "loss": 0.5887, + "step": 1759 + }, + { + "epoch": 0.2463261021693492, + "grad_norm": 0.4306173347777482, + "learning_rate": 4.4121939971139403e-05, + "loss": 0.5574, + "step": 1760 + }, + { + "epoch": 0.24646606018194542, + "grad_norm": 0.43621303487831553, + "learning_rate": 4.411463736630006e-05, + "loss": 0.5876, + "step": 1761 + }, + { + "epoch": 0.24660601819454164, + "grad_norm": 0.419540410185526, + "learning_rate": 4.410733083320879e-05, + "loss": 0.599, + "step": 1762 + }, + { + "epoch": 0.24674597620713787, + "grad_norm": 0.4510539099176357, + "learning_rate": 4.4100020373367166e-05, + "loss": 0.6325, + "step": 1763 + }, + { + "epoch": 0.2468859342197341, + "grad_norm": 0.42372431177296327, + "learning_rate": 4.409270598827756e-05, + "loss": 0.5969, + "step": 1764 + }, + { + "epoch": 0.2470258922323303, + "grad_norm": 0.45611399870187513, + "learning_rate": 4.408538767944315e-05, + "loss": 0.6076, + "step": 1765 + }, + { + "epoch": 0.24716585024492652, + "grad_norm": 0.4401540888444283, + "learning_rate": 4.407806544836792e-05, + "loss": 0.6134, + "step": 1766 + }, + { + "epoch": 0.24730580825752274, + "grad_norm": 0.43386652619598914, + "learning_rate": 4.407073929655666e-05, + "loss": 0.6015, + "step": 1767 + }, + { + "epoch": 0.24744576627011897, + "grad_norm": 0.43265086343988496, + "learning_rate": 4.406340922551499e-05, + "loss": 0.6059, + "step": 1768 + }, + { + "epoch": 0.2475857242827152, + "grad_norm": 0.4200705255951236, + "learning_rate": 4.4056075236749285e-05, + "loss": 0.5596, + "step": 1769 + }, + { + "epoch": 0.24772568229531142, + "grad_norm": 0.4327675200614844, + "learning_rate": 4.404873733176678e-05, + "loss": 0.5673, + "step": 1770 + }, + { + "epoch": 0.24786564030790761, + "grad_norm": 0.41060345330483095, + "learning_rate": 4.4041395512075464e-05, + "loss": 0.5658, + "step": 1771 + }, + { + "epoch": 0.24800559832050384, + "grad_norm": 0.4177460191821631, + "learning_rate": 4.403404977918417e-05, + "loss": 0.5871, + "step": 1772 + }, + { + "epoch": 0.24814555633310006, + "grad_norm": 0.3995644804936815, + "learning_rate": 4.402670013460252e-05, + "loss": 0.5767, + "step": 1773 + }, + { + "epoch": 0.2482855143456963, + "grad_norm": 0.40455718790823125, + "learning_rate": 4.401934657984094e-05, + "loss": 0.6232, + "step": 1774 + }, + { + "epoch": 0.24842547235829252, + "grad_norm": 0.40147880141712233, + "learning_rate": 4.401198911641066e-05, + "loss": 0.587, + "step": 1775 + }, + { + "epoch": 0.24856543037088874, + "grad_norm": 0.40405116225388327, + "learning_rate": 4.400462774582371e-05, + "loss": 0.5708, + "step": 1776 + }, + { + "epoch": 0.24870538838348497, + "grad_norm": 0.4227092160628934, + "learning_rate": 4.399726246959293e-05, + "loss": 0.6278, + "step": 1777 + }, + { + "epoch": 0.24884534639608116, + "grad_norm": 0.40251616958969094, + "learning_rate": 4.3989893289231954e-05, + "loss": 0.6146, + "step": 1778 + }, + { + "epoch": 0.2489853044086774, + "grad_norm": 0.4113564198991073, + "learning_rate": 4.398252020625523e-05, + "loss": 0.5772, + "step": 1779 + }, + { + "epoch": 0.2491252624212736, + "grad_norm": 0.4218976966069186, + "learning_rate": 4.3975143222178e-05, + "loss": 0.5956, + "step": 1780 + }, + { + "epoch": 0.24926522043386984, + "grad_norm": 0.4153141525664469, + "learning_rate": 4.39677623385163e-05, + "loss": 0.5925, + "step": 1781 + }, + { + "epoch": 0.24940517844646606, + "grad_norm": 0.408421965231002, + "learning_rate": 4.3960377556787e-05, + "loss": 0.597, + "step": 1782 + }, + { + "epoch": 0.2495451364590623, + "grad_norm": 0.41009081087351246, + "learning_rate": 4.3952988878507714e-05, + "loss": 0.5997, + "step": 1783 + }, + { + "epoch": 0.24968509447165851, + "grad_norm": 0.4126702795308356, + "learning_rate": 4.3945596305196925e-05, + "loss": 0.5847, + "step": 1784 + }, + { + "epoch": 0.2498250524842547, + "grad_norm": 0.40686925420136566, + "learning_rate": 4.393819983837385e-05, + "loss": 0.5748, + "step": 1785 + }, + { + "epoch": 0.24996501049685094, + "grad_norm": 0.42764000046837597, + "learning_rate": 4.393079947955856e-05, + "loss": 0.5689, + "step": 1786 + }, + { + "epoch": 0.2501049685094472, + "grad_norm": 0.43927412707903785, + "learning_rate": 4.39233952302719e-05, + "loss": 0.6304, + "step": 1787 + }, + { + "epoch": 0.2502449265220434, + "grad_norm": 0.41444855744545284, + "learning_rate": 4.3915987092035505e-05, + "loss": 0.5849, + "step": 1788 + }, + { + "epoch": 0.2503848845346396, + "grad_norm": 0.42682475396958397, + "learning_rate": 4.3908575066371835e-05, + "loss": 0.5862, + "step": 1789 + }, + { + "epoch": 0.25052484254723584, + "grad_norm": 0.4771191183081023, + "learning_rate": 4.390115915480414e-05, + "loss": 0.6329, + "step": 1790 + }, + { + "epoch": 0.25066480055983204, + "grad_norm": 0.4308111583911404, + "learning_rate": 4.389373935885646e-05, + "loss": 0.6113, + "step": 1791 + }, + { + "epoch": 0.2508047585724283, + "grad_norm": 0.42487713253039067, + "learning_rate": 4.388631568005364e-05, + "loss": 0.6097, + "step": 1792 + }, + { + "epoch": 0.2509447165850245, + "grad_norm": 0.44680223218467024, + "learning_rate": 4.387888811992131e-05, + "loss": 0.5852, + "step": 1793 + }, + { + "epoch": 0.25108467459762074, + "grad_norm": 0.43930277215742447, + "learning_rate": 4.387145667998591e-05, + "loss": 0.6279, + "step": 1794 + }, + { + "epoch": 0.25122463261021694, + "grad_norm": 0.45405868972289387, + "learning_rate": 4.38640213617747e-05, + "loss": 0.5879, + "step": 1795 + }, + { + "epoch": 0.25136459062281313, + "grad_norm": 0.41454960378404326, + "learning_rate": 4.385658216681569e-05, + "loss": 0.5724, + "step": 1796 + }, + { + "epoch": 0.2515045486354094, + "grad_norm": 0.4061553712342834, + "learning_rate": 4.384913909663772e-05, + "loss": 0.5411, + "step": 1797 + }, + { + "epoch": 0.2516445066480056, + "grad_norm": 0.4276501719660472, + "learning_rate": 4.384169215277041e-05, + "loss": 0.557, + "step": 1798 + }, + { + "epoch": 0.25178446466060184, + "grad_norm": 0.386857810391419, + "learning_rate": 4.383424133674419e-05, + "loss": 0.5897, + "step": 1799 + }, + { + "epoch": 0.25192442267319803, + "grad_norm": 0.4310189731603924, + "learning_rate": 4.382678665009028e-05, + "loss": 0.6095, + "step": 1800 + }, + { + "epoch": 0.2520643806857943, + "grad_norm": 0.39537072584058847, + "learning_rate": 4.381932809434068e-05, + "loss": 0.5557, + "step": 1801 + }, + { + "epoch": 0.2522043386983905, + "grad_norm": 0.3973617854406729, + "learning_rate": 4.3811865671028206e-05, + "loss": 0.5778, + "step": 1802 + }, + { + "epoch": 0.2523442967109867, + "grad_norm": 0.4326957921508078, + "learning_rate": 4.380439938168647e-05, + "loss": 0.5888, + "step": 1803 + }, + { + "epoch": 0.25248425472358293, + "grad_norm": 0.40126247732516246, + "learning_rate": 4.379692922784986e-05, + "loss": 0.5733, + "step": 1804 + }, + { + "epoch": 0.25262421273617913, + "grad_norm": 0.3962832983075247, + "learning_rate": 4.378945521105357e-05, + "loss": 0.5701, + "step": 1805 + }, + { + "epoch": 0.2527641707487754, + "grad_norm": 0.4292056775606421, + "learning_rate": 4.378197733283359e-05, + "loss": 0.6177, + "step": 1806 + }, + { + "epoch": 0.2529041287613716, + "grad_norm": 0.4598576468560811, + "learning_rate": 4.377449559472669e-05, + "loss": 0.5838, + "step": 1807 + }, + { + "epoch": 0.25304408677396784, + "grad_norm": 0.4943287880849202, + "learning_rate": 4.3767009998270464e-05, + "loss": 0.6519, + "step": 1808 + }, + { + "epoch": 0.25318404478656403, + "grad_norm": 0.41197329102085845, + "learning_rate": 4.375952054500326e-05, + "loss": 0.5685, + "step": 1809 + }, + { + "epoch": 0.25332400279916023, + "grad_norm": 0.4473419746095371, + "learning_rate": 4.375202723646424e-05, + "loss": 0.5987, + "step": 1810 + }, + { + "epoch": 0.2534639608117565, + "grad_norm": 0.42470540560047887, + "learning_rate": 4.374453007419336e-05, + "loss": 0.5944, + "step": 1811 + }, + { + "epoch": 0.2536039188243527, + "grad_norm": 0.4319391359458571, + "learning_rate": 4.373702905973135e-05, + "loss": 0.6269, + "step": 1812 + }, + { + "epoch": 0.25374387683694893, + "grad_norm": 0.4025733974145466, + "learning_rate": 4.3729524194619766e-05, + "loss": 0.5944, + "step": 1813 + }, + { + "epoch": 0.25388383484954513, + "grad_norm": 0.4420608064693045, + "learning_rate": 4.3722015480400916e-05, + "loss": 0.5987, + "step": 1814 + }, + { + "epoch": 0.2540237928621414, + "grad_norm": 0.3993911334683989, + "learning_rate": 4.371450291861792e-05, + "loss": 0.5578, + "step": 1815 + }, + { + "epoch": 0.2541637508747376, + "grad_norm": 0.43011254515928227, + "learning_rate": 4.370698651081469e-05, + "loss": 0.6051, + "step": 1816 + }, + { + "epoch": 0.2543037088873338, + "grad_norm": 0.4109991007436434, + "learning_rate": 4.369946625853593e-05, + "loss": 0.5478, + "step": 1817 + }, + { + "epoch": 0.25444366689993003, + "grad_norm": 0.42138731559636133, + "learning_rate": 4.369194216332712e-05, + "loss": 0.5464, + "step": 1818 + }, + { + "epoch": 0.25458362491252623, + "grad_norm": 0.43137639859587146, + "learning_rate": 4.368441422673453e-05, + "loss": 0.6625, + "step": 1819 + }, + { + "epoch": 0.2547235829251225, + "grad_norm": 0.42368932399104675, + "learning_rate": 4.367688245030523e-05, + "loss": 0.6049, + "step": 1820 + }, + { + "epoch": 0.2548635409377187, + "grad_norm": 0.40789716476560073, + "learning_rate": 4.36693468355871e-05, + "loss": 0.5915, + "step": 1821 + }, + { + "epoch": 0.25500349895031493, + "grad_norm": 0.4209915793540894, + "learning_rate": 4.366180738412876e-05, + "loss": 0.5932, + "step": 1822 + }, + { + "epoch": 0.25514345696291113, + "grad_norm": 0.4046845978271354, + "learning_rate": 4.365426409747965e-05, + "loss": 0.5933, + "step": 1823 + }, + { + "epoch": 0.25528341497550733, + "grad_norm": 0.3975587329369015, + "learning_rate": 4.3646716977189996e-05, + "loss": 0.5731, + "step": 1824 + }, + { + "epoch": 0.2554233729881036, + "grad_norm": 0.42825231948537235, + "learning_rate": 4.36391660248108e-05, + "loss": 0.575, + "step": 1825 + }, + { + "epoch": 0.2555633310006998, + "grad_norm": 0.4099650326475359, + "learning_rate": 4.3631611241893874e-05, + "loss": 0.612, + "step": 1826 + }, + { + "epoch": 0.25570328901329603, + "grad_norm": 0.4163478892519659, + "learning_rate": 4.362405262999178e-05, + "loss": 0.5941, + "step": 1827 + }, + { + "epoch": 0.25584324702589223, + "grad_norm": 0.4192607935058611, + "learning_rate": 4.361649019065791e-05, + "loss": 0.5861, + "step": 1828 + }, + { + "epoch": 0.2559832050384884, + "grad_norm": 0.4056602965223505, + "learning_rate": 4.3608923925446424e-05, + "loss": 0.5517, + "step": 1829 + }, + { + "epoch": 0.2561231630510847, + "grad_norm": 0.4291339795549124, + "learning_rate": 4.360135383591224e-05, + "loss": 0.6211, + "step": 1830 + }, + { + "epoch": 0.2562631210636809, + "grad_norm": 0.4214799959182231, + "learning_rate": 4.3593779923611114e-05, + "loss": 0.6101, + "step": 1831 + }, + { + "epoch": 0.25640307907627713, + "grad_norm": 0.45878889417856267, + "learning_rate": 4.3586202190099555e-05, + "loss": 0.6222, + "step": 1832 + }, + { + "epoch": 0.2565430370888733, + "grad_norm": 0.401016178107381, + "learning_rate": 4.357862063693486e-05, + "loss": 0.5759, + "step": 1833 + }, + { + "epoch": 0.2566829951014696, + "grad_norm": 0.4096702585349008, + "learning_rate": 4.357103526567511e-05, + "loss": 0.6322, + "step": 1834 + }, + { + "epoch": 0.2568229531140658, + "grad_norm": 0.4133870618187997, + "learning_rate": 4.3563446077879194e-05, + "loss": 0.5974, + "step": 1835 + }, + { + "epoch": 0.256962911126662, + "grad_norm": 0.41053680469933324, + "learning_rate": 4.355585307510675e-05, + "loss": 0.5636, + "step": 1836 + }, + { + "epoch": 0.2571028691392582, + "grad_norm": 0.4286300619381813, + "learning_rate": 4.354825625891822e-05, + "loss": 0.5705, + "step": 1837 + }, + { + "epoch": 0.2572428271518544, + "grad_norm": 0.5243484730511359, + "learning_rate": 4.354065563087484e-05, + "loss": 0.5945, + "step": 1838 + }, + { + "epoch": 0.2573827851644507, + "grad_norm": 0.4165723735814597, + "learning_rate": 4.3533051192538596e-05, + "loss": 0.5785, + "step": 1839 + }, + { + "epoch": 0.2575227431770469, + "grad_norm": 0.4192843833101577, + "learning_rate": 4.3525442945472294e-05, + "loss": 0.5873, + "step": 1840 + }, + { + "epoch": 0.25766270118964313, + "grad_norm": 0.42759680733088257, + "learning_rate": 4.3517830891239496e-05, + "loss": 0.6008, + "step": 1841 + }, + { + "epoch": 0.2578026592022393, + "grad_norm": 0.41416919735625124, + "learning_rate": 4.351021503140456e-05, + "loss": 0.6047, + "step": 1842 + }, + { + "epoch": 0.2579426172148355, + "grad_norm": 0.4183249907357826, + "learning_rate": 4.350259536753262e-05, + "loss": 0.5839, + "step": 1843 + }, + { + "epoch": 0.2580825752274318, + "grad_norm": 0.41171514577228524, + "learning_rate": 4.34949719011896e-05, + "loss": 0.5826, + "step": 1844 + }, + { + "epoch": 0.258222533240028, + "grad_norm": 0.4240796706040656, + "learning_rate": 4.348734463394219e-05, + "loss": 0.5988, + "step": 1845 + }, + { + "epoch": 0.2583624912526242, + "grad_norm": 0.4231976779667788, + "learning_rate": 4.3479713567357886e-05, + "loss": 0.5686, + "step": 1846 + }, + { + "epoch": 0.2585024492652204, + "grad_norm": 0.4027448525781189, + "learning_rate": 4.347207870300494e-05, + "loss": 0.5991, + "step": 1847 + }, + { + "epoch": 0.2586424072778167, + "grad_norm": 0.44912825496869785, + "learning_rate": 4.346444004245239e-05, + "loss": 0.5893, + "step": 1848 + }, + { + "epoch": 0.2587823652904129, + "grad_norm": 0.4267680067654748, + "learning_rate": 4.3456797587270066e-05, + "loss": 0.5951, + "step": 1849 + }, + { + "epoch": 0.25892232330300907, + "grad_norm": 0.43586393714352484, + "learning_rate": 4.344915133902856e-05, + "loss": 0.5675, + "step": 1850 + }, + { + "epoch": 0.2590622813156053, + "grad_norm": 0.4559790753468547, + "learning_rate": 4.344150129929927e-05, + "loss": 0.6084, + "step": 1851 + }, + { + "epoch": 0.2592022393282015, + "grad_norm": 0.4196684079525518, + "learning_rate": 4.3433847469654344e-05, + "loss": 0.5434, + "step": 1852 + }, + { + "epoch": 0.2593421973407978, + "grad_norm": 0.43364642603356207, + "learning_rate": 4.342618985166672e-05, + "loss": 0.602, + "step": 1853 + }, + { + "epoch": 0.259482155353394, + "grad_norm": 0.43456208436691296, + "learning_rate": 4.341852844691012e-05, + "loss": 0.5678, + "step": 1854 + }, + { + "epoch": 0.2596221133659902, + "grad_norm": 0.4445616632907679, + "learning_rate": 4.341086325695905e-05, + "loss": 0.6011, + "step": 1855 + }, + { + "epoch": 0.2597620713785864, + "grad_norm": 0.4148631528766163, + "learning_rate": 4.340319428338877e-05, + "loss": 0.5984, + "step": 1856 + }, + { + "epoch": 0.2599020293911826, + "grad_norm": 0.42267402439629853, + "learning_rate": 4.339552152777534e-05, + "loss": 0.6198, + "step": 1857 + }, + { + "epoch": 0.2600419874037789, + "grad_norm": 0.43701220197978513, + "learning_rate": 4.338784499169559e-05, + "loss": 0.5792, + "step": 1858 + }, + { + "epoch": 0.26018194541637507, + "grad_norm": 0.4498233901707297, + "learning_rate": 4.338016467672712e-05, + "loss": 0.5822, + "step": 1859 + }, + { + "epoch": 0.2603219034289713, + "grad_norm": 0.4302218516293081, + "learning_rate": 4.337248058444832e-05, + "loss": 0.6329, + "step": 1860 + }, + { + "epoch": 0.2604618614415675, + "grad_norm": 0.4149612252531319, + "learning_rate": 4.336479271643833e-05, + "loss": 0.5676, + "step": 1861 + }, + { + "epoch": 0.2606018194541638, + "grad_norm": 0.4482912811055662, + "learning_rate": 4.335710107427711e-05, + "loss": 0.6405, + "step": 1862 + }, + { + "epoch": 0.26074177746675997, + "grad_norm": 0.43212615840984736, + "learning_rate": 4.3349405659545365e-05, + "loss": 0.6291, + "step": 1863 + }, + { + "epoch": 0.26088173547935617, + "grad_norm": 0.43790968743419006, + "learning_rate": 4.334170647382457e-05, + "loss": 0.6161, + "step": 1864 + }, + { + "epoch": 0.2610216934919524, + "grad_norm": 0.4057107337252783, + "learning_rate": 4.333400351869699e-05, + "loss": 0.5897, + "step": 1865 + }, + { + "epoch": 0.2611616515045486, + "grad_norm": 0.4169972715195528, + "learning_rate": 4.332629679574566e-05, + "loss": 0.5807, + "step": 1866 + }, + { + "epoch": 0.2613016095171449, + "grad_norm": 0.4148009483085235, + "learning_rate": 4.3318586306554394e-05, + "loss": 0.5784, + "step": 1867 + }, + { + "epoch": 0.26144156752974107, + "grad_norm": 0.3916134068027622, + "learning_rate": 4.331087205270777e-05, + "loss": 0.6159, + "step": 1868 + }, + { + "epoch": 0.2615815255423373, + "grad_norm": 0.413939159109484, + "learning_rate": 4.3303154035791164e-05, + "loss": 0.5965, + "step": 1869 + }, + { + "epoch": 0.2617214835549335, + "grad_norm": 0.3965210541189402, + "learning_rate": 4.329543225739068e-05, + "loss": 0.5793, + "step": 1870 + }, + { + "epoch": 0.2618614415675297, + "grad_norm": 0.42358611314483735, + "learning_rate": 4.328770671909323e-05, + "loss": 0.5817, + "step": 1871 + }, + { + "epoch": 0.26200139958012597, + "grad_norm": 0.4039935315082308, + "learning_rate": 4.32799774224865e-05, + "loss": 0.5825, + "step": 1872 + }, + { + "epoch": 0.26214135759272217, + "grad_norm": 0.42958959652262796, + "learning_rate": 4.327224436915893e-05, + "loss": 0.5831, + "step": 1873 + }, + { + "epoch": 0.2622813156053184, + "grad_norm": 0.4520950932964586, + "learning_rate": 4.3264507560699746e-05, + "loss": 0.6003, + "step": 1874 + }, + { + "epoch": 0.2624212736179146, + "grad_norm": 0.4177363042656859, + "learning_rate": 4.3256766998698936e-05, + "loss": 0.6011, + "step": 1875 + }, + { + "epoch": 0.26256123163051087, + "grad_norm": 0.4416091225807165, + "learning_rate": 4.324902268474727e-05, + "loss": 0.6244, + "step": 1876 + }, + { + "epoch": 0.26270118964310707, + "grad_norm": 0.6065946464455126, + "learning_rate": 4.324127462043627e-05, + "loss": 0.6069, + "step": 1877 + }, + { + "epoch": 0.26284114765570327, + "grad_norm": 0.41146045832841216, + "learning_rate": 4.323352280735826e-05, + "loss": 0.6005, + "step": 1878 + }, + { + "epoch": 0.2629811056682995, + "grad_norm": 0.4003315872022978, + "learning_rate": 4.322576724710631e-05, + "loss": 0.575, + "step": 1879 + }, + { + "epoch": 0.2631210636808957, + "grad_norm": 0.40221521105567576, + "learning_rate": 4.3218007941274264e-05, + "loss": 0.5528, + "step": 1880 + }, + { + "epoch": 0.26326102169349197, + "grad_norm": 0.4161742129214681, + "learning_rate": 4.321024489145673e-05, + "loss": 0.6148, + "step": 1881 + }, + { + "epoch": 0.26340097970608817, + "grad_norm": 0.4130366243572411, + "learning_rate": 4.3202478099249105e-05, + "loss": 0.6184, + "step": 1882 + }, + { + "epoch": 0.2635409377186844, + "grad_norm": 0.44069492817846667, + "learning_rate": 4.3194707566247537e-05, + "loss": 0.5936, + "step": 1883 + }, + { + "epoch": 0.2636808957312806, + "grad_norm": 0.43421883092849856, + "learning_rate": 4.318693329404896e-05, + "loss": 0.6115, + "step": 1884 + }, + { + "epoch": 0.2638208537438768, + "grad_norm": 0.41021601219211107, + "learning_rate": 4.317915528425106e-05, + "loss": 0.5749, + "step": 1885 + }, + { + "epoch": 0.26396081175647307, + "grad_norm": 0.42123839224718174, + "learning_rate": 4.317137353845229e-05, + "loss": 0.598, + "step": 1886 + }, + { + "epoch": 0.26410076976906927, + "grad_norm": 0.431025530329678, + "learning_rate": 4.316358805825188e-05, + "loss": 0.5969, + "step": 1887 + }, + { + "epoch": 0.2642407277816655, + "grad_norm": 0.40969971019500684, + "learning_rate": 4.3155798845249827e-05, + "loss": 0.5962, + "step": 1888 + }, + { + "epoch": 0.2643806857942617, + "grad_norm": 0.4155454566156464, + "learning_rate": 4.3148005901046905e-05, + "loss": 0.5936, + "step": 1889 + }, + { + "epoch": 0.26452064380685797, + "grad_norm": 0.4317150467528267, + "learning_rate": 4.3140209227244624e-05, + "loss": 0.6169, + "step": 1890 + }, + { + "epoch": 0.26466060181945417, + "grad_norm": 0.3970247011419925, + "learning_rate": 4.313240882544529e-05, + "loss": 0.5761, + "step": 1891 + }, + { + "epoch": 0.26480055983205036, + "grad_norm": 0.4414850369173905, + "learning_rate": 4.3124604697251963e-05, + "loss": 0.5991, + "step": 1892 + }, + { + "epoch": 0.2649405178446466, + "grad_norm": 0.4499059882632428, + "learning_rate": 4.311679684426847e-05, + "loss": 0.5507, + "step": 1893 + }, + { + "epoch": 0.2650804758572428, + "grad_norm": 0.4195714682577236, + "learning_rate": 4.310898526809941e-05, + "loss": 0.5908, + "step": 1894 + }, + { + "epoch": 0.26522043386983907, + "grad_norm": 0.44322553286670324, + "learning_rate": 4.3101169970350125e-05, + "loss": 0.5807, + "step": 1895 + }, + { + "epoch": 0.26536039188243526, + "grad_norm": 0.424382131322031, + "learning_rate": 4.309335095262676e-05, + "loss": 0.5867, + "step": 1896 + }, + { + "epoch": 0.2655003498950315, + "grad_norm": 0.43960102391817163, + "learning_rate": 4.308552821653618e-05, + "loss": 0.5984, + "step": 1897 + }, + { + "epoch": 0.2656403079076277, + "grad_norm": 0.39684220956322386, + "learning_rate": 4.307770176368605e-05, + "loss": 0.5701, + "step": 1898 + }, + { + "epoch": 0.2657802659202239, + "grad_norm": 0.43672589760925556, + "learning_rate": 4.306987159568479e-05, + "loss": 0.5973, + "step": 1899 + }, + { + "epoch": 0.26592022393282017, + "grad_norm": 0.40233707019737064, + "learning_rate": 4.3062037714141565e-05, + "loss": 0.613, + "step": 1900 + }, + { + "epoch": 0.26606018194541636, + "grad_norm": 0.4264494610214143, + "learning_rate": 4.3054200120666334e-05, + "loss": 0.606, + "step": 1901 + }, + { + "epoch": 0.2662001399580126, + "grad_norm": 0.4098467135771313, + "learning_rate": 4.304635881686978e-05, + "loss": 0.5857, + "step": 1902 + }, + { + "epoch": 0.2663400979706088, + "grad_norm": 0.4240466866788527, + "learning_rate": 4.303851380436339e-05, + "loss": 0.6153, + "step": 1903 + }, + { + "epoch": 0.266480055983205, + "grad_norm": 0.403958075146994, + "learning_rate": 4.303066508475939e-05, + "loss": 0.5663, + "step": 1904 + }, + { + "epoch": 0.26662001399580126, + "grad_norm": 0.4143420243493558, + "learning_rate": 4.302281265967076e-05, + "loss": 0.5963, + "step": 1905 + }, + { + "epoch": 0.26675997200839746, + "grad_norm": 0.4444719253392314, + "learning_rate": 4.301495653071126e-05, + "loss": 0.6231, + "step": 1906 + }, + { + "epoch": 0.2668999300209937, + "grad_norm": 0.40995195237809584, + "learning_rate": 4.3007096699495406e-05, + "loss": 0.5821, + "step": 1907 + }, + { + "epoch": 0.2670398880335899, + "grad_norm": 0.405726468503472, + "learning_rate": 4.299923316763848e-05, + "loss": 0.604, + "step": 1908 + }, + { + "epoch": 0.26717984604618616, + "grad_norm": 0.4078344765445211, + "learning_rate": 4.2991365936756505e-05, + "loss": 0.576, + "step": 1909 + }, + { + "epoch": 0.26731980405878236, + "grad_norm": 0.4227354700436783, + "learning_rate": 4.2983495008466276e-05, + "loss": 0.6146, + "step": 1910 + }, + { + "epoch": 0.26745976207137856, + "grad_norm": 0.4030353788382993, + "learning_rate": 4.2975620384385364e-05, + "loss": 0.5752, + "step": 1911 + }, + { + "epoch": 0.2675997200839748, + "grad_norm": 0.4012686664661831, + "learning_rate": 4.296774206613207e-05, + "loss": 0.5892, + "step": 1912 + }, + { + "epoch": 0.267739678096571, + "grad_norm": 0.41404486946782215, + "learning_rate": 4.2959860055325474e-05, + "loss": 0.5702, + "step": 1913 + }, + { + "epoch": 0.26787963610916726, + "grad_norm": 0.41190311027197035, + "learning_rate": 4.295197435358541e-05, + "loss": 0.5803, + "step": 1914 + }, + { + "epoch": 0.26801959412176346, + "grad_norm": 0.4245596260036112, + "learning_rate": 4.294408496253246e-05, + "loss": 0.6054, + "step": 1915 + }, + { + "epoch": 0.2681595521343597, + "grad_norm": 0.4261060945315179, + "learning_rate": 4.293619188378798e-05, + "loss": 0.5609, + "step": 1916 + }, + { + "epoch": 0.2682995101469559, + "grad_norm": 0.42776522346359913, + "learning_rate": 4.292829511897409e-05, + "loss": 0.5822, + "step": 1917 + }, + { + "epoch": 0.2684394681595521, + "grad_norm": 0.41648930810429086, + "learning_rate": 4.292039466971364e-05, + "loss": 0.5746, + "step": 1918 + }, + { + "epoch": 0.26857942617214836, + "grad_norm": 0.3887697670352693, + "learning_rate": 4.2912490537630255e-05, + "loss": 0.587, + "step": 1919 + }, + { + "epoch": 0.26871938418474456, + "grad_norm": 0.44349778409152874, + "learning_rate": 4.2904582724348316e-05, + "loss": 0.6146, + "step": 1920 + }, + { + "epoch": 0.2688593421973408, + "grad_norm": 0.4179386506999582, + "learning_rate": 4.2896671231492966e-05, + "loss": 0.6399, + "step": 1921 + }, + { + "epoch": 0.268999300209937, + "grad_norm": 0.465029201453624, + "learning_rate": 4.288875606069008e-05, + "loss": 0.6134, + "step": 1922 + }, + { + "epoch": 0.26913925822253326, + "grad_norm": 0.4425974682516534, + "learning_rate": 4.2880837213566326e-05, + "loss": 0.6037, + "step": 1923 + }, + { + "epoch": 0.26927921623512946, + "grad_norm": 0.42747672129154957, + "learning_rate": 4.2872914691749086e-05, + "loss": 0.6102, + "step": 1924 + }, + { + "epoch": 0.26941917424772566, + "grad_norm": 0.4492107075459924, + "learning_rate": 4.286498849686654e-05, + "loss": 0.591, + "step": 1925 + }, + { + "epoch": 0.2695591322603219, + "grad_norm": 0.4016863287653028, + "learning_rate": 4.2857058630547594e-05, + "loss": 0.5677, + "step": 1926 + }, + { + "epoch": 0.2696990902729181, + "grad_norm": 0.4143898674573289, + "learning_rate": 4.2849125094421905e-05, + "loss": 0.5673, + "step": 1927 + }, + { + "epoch": 0.26983904828551436, + "grad_norm": 0.39235286773607386, + "learning_rate": 4.284118789011991e-05, + "loss": 0.5805, + "step": 1928 + }, + { + "epoch": 0.26997900629811056, + "grad_norm": 0.39515207584942524, + "learning_rate": 4.2833247019272775e-05, + "loss": 0.5898, + "step": 1929 + }, + { + "epoch": 0.2701189643107068, + "grad_norm": 0.4201779430133444, + "learning_rate": 4.2825302483512444e-05, + "loss": 0.6402, + "step": 1930 + }, + { + "epoch": 0.270258922323303, + "grad_norm": 0.41576770047699163, + "learning_rate": 4.281735428447158e-05, + "loss": 0.5649, + "step": 1931 + }, + { + "epoch": 0.2703988803358992, + "grad_norm": 0.41462546934983685, + "learning_rate": 4.2809402423783624e-05, + "loss": 0.5938, + "step": 1932 + }, + { + "epoch": 0.27053883834849546, + "grad_norm": 0.4096839436305104, + "learning_rate": 4.280144690308277e-05, + "loss": 0.5786, + "step": 1933 + }, + { + "epoch": 0.27067879636109166, + "grad_norm": 0.41756540359737393, + "learning_rate": 4.279348772400395e-05, + "loss": 0.5666, + "step": 1934 + }, + { + "epoch": 0.2708187543736879, + "grad_norm": 0.39443631101683757, + "learning_rate": 4.278552488818288e-05, + "loss": 0.5919, + "step": 1935 + }, + { + "epoch": 0.2709587123862841, + "grad_norm": 0.438020775582902, + "learning_rate": 4.277755839725598e-05, + "loss": 0.5852, + "step": 1936 + }, + { + "epoch": 0.27109867039888036, + "grad_norm": 0.4043207874267435, + "learning_rate": 4.2769588252860434e-05, + "loss": 0.5628, + "step": 1937 + }, + { + "epoch": 0.27123862841147656, + "grad_norm": 0.4455988825173519, + "learning_rate": 4.276161445663423e-05, + "loss": 0.6069, + "step": 1938 + }, + { + "epoch": 0.27137858642407275, + "grad_norm": 0.41404820018714156, + "learning_rate": 4.275363701021602e-05, + "loss": 0.5833, + "step": 1939 + }, + { + "epoch": 0.271518544436669, + "grad_norm": 0.4207829197564249, + "learning_rate": 4.2745655915245266e-05, + "loss": 0.5698, + "step": 1940 + }, + { + "epoch": 0.2716585024492652, + "grad_norm": 0.4300264097184269, + "learning_rate": 4.273767117336217e-05, + "loss": 0.6043, + "step": 1941 + }, + { + "epoch": 0.27179846046186146, + "grad_norm": 0.4230423530894155, + "learning_rate": 4.272968278620768e-05, + "loss": 0.5954, + "step": 1942 + }, + { + "epoch": 0.27193841847445765, + "grad_norm": 0.4104705230949498, + "learning_rate": 4.272169075542348e-05, + "loss": 0.6132, + "step": 1943 + }, + { + "epoch": 0.2720783764870539, + "grad_norm": 0.4412681694744553, + "learning_rate": 4.2713695082652015e-05, + "loss": 0.634, + "step": 1944 + }, + { + "epoch": 0.2722183344996501, + "grad_norm": 0.41281029518549284, + "learning_rate": 4.270569576953648e-05, + "loss": 0.5803, + "step": 1945 + }, + { + "epoch": 0.2723582925122463, + "grad_norm": 0.4326754906394364, + "learning_rate": 4.269769281772082e-05, + "loss": 0.5982, + "step": 1946 + }, + { + "epoch": 0.27249825052484256, + "grad_norm": 0.4143197407799747, + "learning_rate": 4.2689686228849716e-05, + "loss": 0.6203, + "step": 1947 + }, + { + "epoch": 0.27263820853743875, + "grad_norm": 0.44499388653333827, + "learning_rate": 4.26816760045686e-05, + "loss": 0.5921, + "step": 1948 + }, + { + "epoch": 0.272778166550035, + "grad_norm": 0.4198979954091138, + "learning_rate": 4.267366214652366e-05, + "loss": 0.5864, + "step": 1949 + }, + { + "epoch": 0.2729181245626312, + "grad_norm": 0.43526258690475156, + "learning_rate": 4.266564465636183e-05, + "loss": 0.5948, + "step": 1950 + }, + { + "epoch": 0.27305808257522746, + "grad_norm": 0.3940644065133908, + "learning_rate": 4.265762353573077e-05, + "loss": 0.585, + "step": 1951 + }, + { + "epoch": 0.27319804058782365, + "grad_norm": 0.40794104308496737, + "learning_rate": 4.264959878627891e-05, + "loss": 0.6, + "step": 1952 + }, + { + "epoch": 0.27333799860041985, + "grad_norm": 0.3823510808207485, + "learning_rate": 4.264157040965543e-05, + "loss": 0.547, + "step": 1953 + }, + { + "epoch": 0.2734779566130161, + "grad_norm": 0.42277154396243044, + "learning_rate": 4.263353840751022e-05, + "loss": 0.5996, + "step": 1954 + }, + { + "epoch": 0.2736179146256123, + "grad_norm": 0.39495247008059065, + "learning_rate": 4.2625502781493955e-05, + "loss": 0.5769, + "step": 1955 + }, + { + "epoch": 0.27375787263820855, + "grad_norm": 0.43172939634218205, + "learning_rate": 4.261746353325804e-05, + "loss": 0.5667, + "step": 1956 + }, + { + "epoch": 0.27389783065080475, + "grad_norm": 0.42748966931166393, + "learning_rate": 4.26094206644546e-05, + "loss": 0.619, + "step": 1957 + }, + { + "epoch": 0.274037788663401, + "grad_norm": 0.41520591372518717, + "learning_rate": 4.260137417673655e-05, + "loss": 0.5495, + "step": 1958 + }, + { + "epoch": 0.2741777466759972, + "grad_norm": 0.41675727817849284, + "learning_rate": 4.259332407175751e-05, + "loss": 0.5678, + "step": 1959 + }, + { + "epoch": 0.2743177046885934, + "grad_norm": 0.42389412857665876, + "learning_rate": 4.258527035117187e-05, + "loss": 0.5776, + "step": 1960 + }, + { + "epoch": 0.27445766270118965, + "grad_norm": 0.42650038389451433, + "learning_rate": 4.257721301663474e-05, + "loss": 0.5926, + "step": 1961 + }, + { + "epoch": 0.27459762071378585, + "grad_norm": 0.4547693632893839, + "learning_rate": 4.2569152069801994e-05, + "loss": 0.6109, + "step": 1962 + }, + { + "epoch": 0.2747375787263821, + "grad_norm": 0.4096367527079948, + "learning_rate": 4.256108751233023e-05, + "loss": 0.5742, + "step": 1963 + }, + { + "epoch": 0.2748775367389783, + "grad_norm": 0.43852841060550257, + "learning_rate": 4.255301934587679e-05, + "loss": 0.6161, + "step": 1964 + }, + { + "epoch": 0.27501749475157455, + "grad_norm": 0.4108034529541987, + "learning_rate": 4.254494757209979e-05, + "loss": 0.5909, + "step": 1965 + }, + { + "epoch": 0.27515745276417075, + "grad_norm": 0.4179747177285123, + "learning_rate": 4.2536872192658036e-05, + "loss": 0.5902, + "step": 1966 + }, + { + "epoch": 0.27529741077676695, + "grad_norm": 0.4075584545924878, + "learning_rate": 4.2528793209211106e-05, + "loss": 0.573, + "step": 1967 + }, + { + "epoch": 0.2754373687893632, + "grad_norm": 0.4201389140014726, + "learning_rate": 4.252071062341933e-05, + "loss": 0.5944, + "step": 1968 + }, + { + "epoch": 0.2755773268019594, + "grad_norm": 0.4253197821835051, + "learning_rate": 4.251262443694374e-05, + "loss": 0.569, + "step": 1969 + }, + { + "epoch": 0.27571728481455565, + "grad_norm": 0.41281050840051553, + "learning_rate": 4.2504534651446134e-05, + "loss": 0.5845, + "step": 1970 + }, + { + "epoch": 0.27585724282715185, + "grad_norm": 0.4053164541842358, + "learning_rate": 4.2496441268589046e-05, + "loss": 0.6001, + "step": 1971 + }, + { + "epoch": 0.2759972008397481, + "grad_norm": 0.4381447506546874, + "learning_rate": 4.2488344290035755e-05, + "loss": 0.607, + "step": 1972 + }, + { + "epoch": 0.2761371588523443, + "grad_norm": 0.4183122839304423, + "learning_rate": 4.248024371745027e-05, + "loss": 0.5905, + "step": 1973 + }, + { + "epoch": 0.2762771168649405, + "grad_norm": 0.4284345637823913, + "learning_rate": 4.2472139552497334e-05, + "loss": 0.6224, + "step": 1974 + }, + { + "epoch": 0.27641707487753675, + "grad_norm": 0.43624222428113946, + "learning_rate": 4.246403179684245e-05, + "loss": 0.5874, + "step": 1975 + }, + { + "epoch": 0.27655703289013295, + "grad_norm": 0.4238838475394068, + "learning_rate": 4.245592045215182e-05, + "loss": 0.5939, + "step": 1976 + }, + { + "epoch": 0.2766969909027292, + "grad_norm": 0.41691347825833375, + "learning_rate": 4.244780552009243e-05, + "loss": 0.6258, + "step": 1977 + }, + { + "epoch": 0.2768369489153254, + "grad_norm": 0.4083689917945832, + "learning_rate": 4.2439687002331974e-05, + "loss": 0.5565, + "step": 1978 + }, + { + "epoch": 0.27697690692792165, + "grad_norm": 0.4285760791242101, + "learning_rate": 4.2431564900538884e-05, + "loss": 0.5957, + "step": 1979 + }, + { + "epoch": 0.27711686494051785, + "grad_norm": 0.41197446910489155, + "learning_rate": 4.242343921638234e-05, + "loss": 0.601, + "step": 1980 + }, + { + "epoch": 0.27725682295311405, + "grad_norm": 0.38975348870242715, + "learning_rate": 4.241530995153225e-05, + "loss": 0.562, + "step": 1981 + }, + { + "epoch": 0.2773967809657103, + "grad_norm": 0.4368612257662016, + "learning_rate": 4.2407177107659256e-05, + "loss": 0.6061, + "step": 1982 + }, + { + "epoch": 0.2775367389783065, + "grad_norm": 0.40941708192964865, + "learning_rate": 4.2399040686434756e-05, + "loss": 0.5659, + "step": 1983 + }, + { + "epoch": 0.27767669699090275, + "grad_norm": 0.4233950461539689, + "learning_rate": 4.239090068953086e-05, + "loss": 0.5981, + "step": 1984 + }, + { + "epoch": 0.27781665500349895, + "grad_norm": 0.42236280001104815, + "learning_rate": 4.23827571186204e-05, + "loss": 0.5886, + "step": 1985 + }, + { + "epoch": 0.27795661301609514, + "grad_norm": 0.4144640899811079, + "learning_rate": 4.237460997537699e-05, + "loss": 0.5779, + "step": 1986 + }, + { + "epoch": 0.2780965710286914, + "grad_norm": 0.4625927000805653, + "learning_rate": 4.2366459261474933e-05, + "loss": 0.5679, + "step": 1987 + }, + { + "epoch": 0.2782365290412876, + "grad_norm": 0.4076605883733819, + "learning_rate": 4.235830497858929e-05, + "loss": 0.5655, + "step": 1988 + }, + { + "epoch": 0.27837648705388385, + "grad_norm": 0.42845289807533865, + "learning_rate": 4.235014712839586e-05, + "loss": 0.6282, + "step": 1989 + }, + { + "epoch": 0.27851644506648005, + "grad_norm": 0.3954911530156325, + "learning_rate": 4.2341985712571144e-05, + "loss": 0.5594, + "step": 1990 + }, + { + "epoch": 0.2786564030790763, + "grad_norm": 0.42832437836826653, + "learning_rate": 4.233382073279241e-05, + "loss": 0.592, + "step": 1991 + }, + { + "epoch": 0.2787963610916725, + "grad_norm": 0.3915035514180305, + "learning_rate": 4.232565219073763e-05, + "loss": 0.5767, + "step": 1992 + }, + { + "epoch": 0.2789363191042687, + "grad_norm": 0.4091633397512807, + "learning_rate": 4.231748008808554e-05, + "loss": 0.5995, + "step": 1993 + }, + { + "epoch": 0.27907627711686495, + "grad_norm": 0.43083489824001814, + "learning_rate": 4.230930442651557e-05, + "loss": 0.6254, + "step": 1994 + }, + { + "epoch": 0.27921623512946114, + "grad_norm": 0.4215052561977625, + "learning_rate": 4.230112520770792e-05, + "loss": 0.5946, + "step": 1995 + }, + { + "epoch": 0.2793561931420574, + "grad_norm": 0.41624126906364184, + "learning_rate": 4.22929424333435e-05, + "loss": 0.5912, + "step": 1996 + }, + { + "epoch": 0.2794961511546536, + "grad_norm": 0.4084391778034195, + "learning_rate": 4.228475610510394e-05, + "loss": 0.5643, + "step": 1997 + }, + { + "epoch": 0.27963610916724985, + "grad_norm": 0.4151631289731341, + "learning_rate": 4.227656622467162e-05, + "loss": 0.5762, + "step": 1998 + }, + { + "epoch": 0.27977606717984604, + "grad_norm": 0.4207154793240298, + "learning_rate": 4.226837279372965e-05, + "loss": 0.5836, + "step": 1999 + }, + { + "epoch": 0.27991602519244224, + "grad_norm": 0.44259939289210365, + "learning_rate": 4.226017581396186e-05, + "loss": 0.6026, + "step": 2000 + }, + { + "epoch": 0.2800559832050385, + "grad_norm": 0.40656254137179043, + "learning_rate": 4.2251975287052804e-05, + "loss": 0.5739, + "step": 2001 + }, + { + "epoch": 0.2801959412176347, + "grad_norm": 0.41605805412685426, + "learning_rate": 4.224377121468778e-05, + "loss": 0.6018, + "step": 2002 + }, + { + "epoch": 0.28033589923023094, + "grad_norm": 0.4181616140874938, + "learning_rate": 4.223556359855282e-05, + "loss": 0.5429, + "step": 2003 + }, + { + "epoch": 0.28047585724282714, + "grad_norm": 0.4414984932849065, + "learning_rate": 4.222735244033464e-05, + "loss": 0.6, + "step": 2004 + }, + { + "epoch": 0.2806158152554234, + "grad_norm": 0.40209497833786334, + "learning_rate": 4.221913774172076e-05, + "loss": 0.5793, + "step": 2005 + }, + { + "epoch": 0.2807557732680196, + "grad_norm": 0.40555504520183505, + "learning_rate": 4.221091950439935e-05, + "loss": 0.5522, + "step": 2006 + }, + { + "epoch": 0.2808957312806158, + "grad_norm": 0.4425009809651693, + "learning_rate": 4.220269773005935e-05, + "loss": 0.6562, + "step": 2007 + }, + { + "epoch": 0.28103568929321204, + "grad_norm": 0.4154023509957506, + "learning_rate": 4.219447242039043e-05, + "loss": 0.6121, + "step": 2008 + }, + { + "epoch": 0.28117564730580824, + "grad_norm": 0.4069493962165521, + "learning_rate": 4.2186243577082954e-05, + "loss": 0.582, + "step": 2009 + }, + { + "epoch": 0.2813156053184045, + "grad_norm": 0.42547539237875576, + "learning_rate": 4.217801120182805e-05, + "loss": 0.5737, + "step": 2010 + }, + { + "epoch": 0.2814555633310007, + "grad_norm": 0.4120143925863747, + "learning_rate": 4.2169775296317545e-05, + "loss": 0.5984, + "step": 2011 + }, + { + "epoch": 0.28159552134359694, + "grad_norm": 0.4091520558950441, + "learning_rate": 4.216153586224401e-05, + "loss": 0.5533, + "step": 2012 + }, + { + "epoch": 0.28173547935619314, + "grad_norm": 0.40387339261079624, + "learning_rate": 4.2153292901300715e-05, + "loss": 0.6078, + "step": 2013 + }, + { + "epoch": 0.28187543736878934, + "grad_norm": 0.40058528148893674, + "learning_rate": 4.214504641518169e-05, + "loss": 0.552, + "step": 2014 + }, + { + "epoch": 0.2820153953813856, + "grad_norm": 0.40857595852803164, + "learning_rate": 4.213679640558167e-05, + "loss": 0.5744, + "step": 2015 + }, + { + "epoch": 0.2821553533939818, + "grad_norm": 0.3978718427859697, + "learning_rate": 4.212854287419611e-05, + "loss": 0.5916, + "step": 2016 + }, + { + "epoch": 0.28229531140657804, + "grad_norm": 0.3821785406082015, + "learning_rate": 4.212028582272119e-05, + "loss": 0.555, + "step": 2017 + }, + { + "epoch": 0.28243526941917424, + "grad_norm": 0.4069697450232385, + "learning_rate": 4.2112025252853823e-05, + "loss": 0.564, + "step": 2018 + }, + { + "epoch": 0.2825752274317705, + "grad_norm": 0.4023231577263976, + "learning_rate": 4.210376116629165e-05, + "loss": 0.5486, + "step": 2019 + }, + { + "epoch": 0.2827151854443667, + "grad_norm": 0.4071761170849277, + "learning_rate": 4.2095493564733005e-05, + "loss": 0.5769, + "step": 2020 + }, + { + "epoch": 0.2828551434569629, + "grad_norm": 0.4303385757744145, + "learning_rate": 4.208722244987698e-05, + "loss": 0.5891, + "step": 2021 + }, + { + "epoch": 0.28299510146955914, + "grad_norm": 0.42291929930753197, + "learning_rate": 4.2078947823423364e-05, + "loss": 0.6011, + "step": 2022 + }, + { + "epoch": 0.28313505948215534, + "grad_norm": 0.4004019282036015, + "learning_rate": 4.2070669687072685e-05, + "loss": 0.5643, + "step": 2023 + }, + { + "epoch": 0.2832750174947516, + "grad_norm": 0.4294748353114633, + "learning_rate": 4.206238804252617e-05, + "loss": 0.6098, + "step": 2024 + }, + { + "epoch": 0.2834149755073478, + "grad_norm": 0.4207040878829443, + "learning_rate": 4.205410289148579e-05, + "loss": 0.5825, + "step": 2025 + }, + { + "epoch": 0.28355493351994404, + "grad_norm": 0.42868887539679973, + "learning_rate": 4.204581423565424e-05, + "loss": 0.5703, + "step": 2026 + }, + { + "epoch": 0.28369489153254024, + "grad_norm": 0.41961324792796834, + "learning_rate": 4.2037522076734895e-05, + "loss": 0.6024, + "step": 2027 + }, + { + "epoch": 0.28383484954513644, + "grad_norm": 0.426062343917317, + "learning_rate": 4.20292264164319e-05, + "loss": 0.5451, + "step": 2028 + }, + { + "epoch": 0.2839748075577327, + "grad_norm": 0.5356851221725408, + "learning_rate": 4.202092725645009e-05, + "loss": 0.566, + "step": 2029 + }, + { + "epoch": 0.2841147655703289, + "grad_norm": 0.41705129999473584, + "learning_rate": 4.2012624598495026e-05, + "loss": 0.5515, + "step": 2030 + }, + { + "epoch": 0.28425472358292514, + "grad_norm": 0.4305240936670277, + "learning_rate": 4.2004318444272985e-05, + "loss": 0.6063, + "step": 2031 + }, + { + "epoch": 0.28439468159552134, + "grad_norm": 0.4109507589908985, + "learning_rate": 4.199600879549098e-05, + "loss": 0.5841, + "step": 2032 + }, + { + "epoch": 0.2845346396081176, + "grad_norm": 0.4152517216058978, + "learning_rate": 4.198769565385671e-05, + "loss": 0.618, + "step": 2033 + }, + { + "epoch": 0.2846745976207138, + "grad_norm": 0.40875510353923156, + "learning_rate": 4.197937902107863e-05, + "loss": 0.6169, + "step": 2034 + }, + { + "epoch": 0.28481455563331, + "grad_norm": 0.4080934485529636, + "learning_rate": 4.197105889886587e-05, + "loss": 0.6191, + "step": 2035 + }, + { + "epoch": 0.28495451364590624, + "grad_norm": 0.395534357123435, + "learning_rate": 4.1962735288928305e-05, + "loss": 0.572, + "step": 2036 + }, + { + "epoch": 0.28509447165850244, + "grad_norm": 0.4123561699274625, + "learning_rate": 4.1954408192976536e-05, + "loss": 0.6112, + "step": 2037 + }, + { + "epoch": 0.2852344296710987, + "grad_norm": 0.43088641312757203, + "learning_rate": 4.1946077612721854e-05, + "loss": 0.6146, + "step": 2038 + }, + { + "epoch": 0.2853743876836949, + "grad_norm": 0.4073212886136425, + "learning_rate": 4.193774354987629e-05, + "loss": 0.5876, + "step": 2039 + }, + { + "epoch": 0.28551434569629114, + "grad_norm": 0.41104021326097206, + "learning_rate": 4.192940600615255e-05, + "loss": 0.5861, + "step": 2040 + }, + { + "epoch": 0.28565430370888734, + "grad_norm": 0.4266808368141709, + "learning_rate": 4.192106498326411e-05, + "loss": 0.5982, + "step": 2041 + }, + { + "epoch": 0.28579426172148353, + "grad_norm": 0.44040096596479616, + "learning_rate": 4.191272048292513e-05, + "loss": 0.5877, + "step": 2042 + }, + { + "epoch": 0.2859342197340798, + "grad_norm": 0.4156724474946737, + "learning_rate": 4.1904372506850484e-05, + "loss": 0.5899, + "step": 2043 + }, + { + "epoch": 0.286074177746676, + "grad_norm": 0.42353874241636685, + "learning_rate": 4.189602105675577e-05, + "loss": 0.5958, + "step": 2044 + }, + { + "epoch": 0.28621413575927224, + "grad_norm": 0.40321593874563183, + "learning_rate": 4.18876661343573e-05, + "loss": 0.5898, + "step": 2045 + }, + { + "epoch": 0.28635409377186843, + "grad_norm": 0.43950894922113853, + "learning_rate": 4.187930774137209e-05, + "loss": 0.578, + "step": 2046 + }, + { + "epoch": 0.2864940517844647, + "grad_norm": 0.4094852639707724, + "learning_rate": 4.187094587951786e-05, + "loss": 0.5808, + "step": 2047 + }, + { + "epoch": 0.2866340097970609, + "grad_norm": 0.4290190354250858, + "learning_rate": 4.1862580550513086e-05, + "loss": 0.5754, + "step": 2048 + }, + { + "epoch": 0.2867739678096571, + "grad_norm": 0.43386920098037757, + "learning_rate": 4.18542117560769e-05, + "loss": 0.6008, + "step": 2049 + }, + { + "epoch": 0.28691392582225334, + "grad_norm": 0.4112010436019356, + "learning_rate": 4.1845839497929204e-05, + "loss": 0.5878, + "step": 2050 + }, + { + "epoch": 0.28705388383484953, + "grad_norm": 0.4052022574202855, + "learning_rate": 4.183746377779055e-05, + "loss": 0.5733, + "step": 2051 + }, + { + "epoch": 0.2871938418474458, + "grad_norm": 0.4325624910156896, + "learning_rate": 4.182908459738226e-05, + "loss": 0.5954, + "step": 2052 + }, + { + "epoch": 0.287333799860042, + "grad_norm": 0.41379739337134963, + "learning_rate": 4.1820701958426325e-05, + "loss": 0.5487, + "step": 2053 + }, + { + "epoch": 0.28747375787263824, + "grad_norm": 0.41206409125142784, + "learning_rate": 4.1812315862645466e-05, + "loss": 0.5317, + "step": 2054 + }, + { + "epoch": 0.28761371588523443, + "grad_norm": 0.4410834744075679, + "learning_rate": 4.180392631176312e-05, + "loss": 0.6188, + "step": 2055 + }, + { + "epoch": 0.28775367389783063, + "grad_norm": 0.41721301374555403, + "learning_rate": 4.179553330750341e-05, + "loss": 0.5873, + "step": 2056 + }, + { + "epoch": 0.2878936319104269, + "grad_norm": 0.4140910091320955, + "learning_rate": 4.1787136851591194e-05, + "loss": 0.5913, + "step": 2057 + }, + { + "epoch": 0.2880335899230231, + "grad_norm": 0.4136362029589039, + "learning_rate": 4.177873694575202e-05, + "loss": 0.6285, + "step": 2058 + }, + { + "epoch": 0.28817354793561933, + "grad_norm": 0.42507432331536726, + "learning_rate": 4.1770333591712164e-05, + "loss": 0.5854, + "step": 2059 + }, + { + "epoch": 0.28831350594821553, + "grad_norm": 0.43737552322389167, + "learning_rate": 4.176192679119859e-05, + "loss": 0.5922, + "step": 2060 + }, + { + "epoch": 0.28845346396081173, + "grad_norm": 0.3891108181335366, + "learning_rate": 4.175351654593899e-05, + "loss": 0.5284, + "step": 2061 + }, + { + "epoch": 0.288593421973408, + "grad_norm": 0.4969916385805558, + "learning_rate": 4.174510285766175e-05, + "loss": 0.5824, + "step": 2062 + }, + { + "epoch": 0.2887333799860042, + "grad_norm": 0.42340411002126843, + "learning_rate": 4.173668572809597e-05, + "loss": 0.5689, + "step": 2063 + }, + { + "epoch": 0.28887333799860043, + "grad_norm": 0.41505954996479894, + "learning_rate": 4.172826515897146e-05, + "loss": 0.5734, + "step": 2064 + }, + { + "epoch": 0.28901329601119663, + "grad_norm": 0.3799206030872319, + "learning_rate": 4.1719841152018716e-05, + "loss": 0.5669, + "step": 2065 + }, + { + "epoch": 0.2891532540237929, + "grad_norm": 0.43628757271419605, + "learning_rate": 4.171141370896898e-05, + "loss": 0.6063, + "step": 2066 + }, + { + "epoch": 0.2892932120363891, + "grad_norm": 0.42076886960500287, + "learning_rate": 4.170298283155416e-05, + "loss": 0.5923, + "step": 2067 + }, + { + "epoch": 0.2894331700489853, + "grad_norm": 0.4074347275662083, + "learning_rate": 4.169454852150689e-05, + "loss": 0.5872, + "step": 2068 + }, + { + "epoch": 0.28957312806158153, + "grad_norm": 0.4048049815445387, + "learning_rate": 4.168611078056051e-05, + "loss": 0.5549, + "step": 2069 + }, + { + "epoch": 0.28971308607417773, + "grad_norm": 0.411926260409551, + "learning_rate": 4.167766961044907e-05, + "loss": 0.5879, + "step": 2070 + }, + { + "epoch": 0.289853044086774, + "grad_norm": 0.4144949406829952, + "learning_rate": 4.166922501290729e-05, + "loss": 0.5718, + "step": 2071 + }, + { + "epoch": 0.2899930020993702, + "grad_norm": 0.43675024412528857, + "learning_rate": 4.1660776989670646e-05, + "loss": 0.5933, + "step": 2072 + }, + { + "epoch": 0.29013296011196643, + "grad_norm": 0.394293953998829, + "learning_rate": 4.165232554247528e-05, + "loss": 0.5345, + "step": 2073 + }, + { + "epoch": 0.29027291812456263, + "grad_norm": 0.4193571632306237, + "learning_rate": 4.164387067305805e-05, + "loss": 0.6182, + "step": 2074 + }, + { + "epoch": 0.2904128761371588, + "grad_norm": 0.6437864579317593, + "learning_rate": 4.163541238315653e-05, + "loss": 0.6224, + "step": 2075 + }, + { + "epoch": 0.2905528341497551, + "grad_norm": 0.4015710103834747, + "learning_rate": 4.162695067450897e-05, + "loss": 0.6, + "step": 2076 + }, + { + "epoch": 0.2906927921623513, + "grad_norm": 0.42416882893328844, + "learning_rate": 4.161848554885434e-05, + "loss": 0.5964, + "step": 2077 + }, + { + "epoch": 0.29083275017494753, + "grad_norm": 0.39865445877876937, + "learning_rate": 4.161001700793231e-05, + "loss": 0.5379, + "step": 2078 + }, + { + "epoch": 0.2909727081875437, + "grad_norm": 0.428296814729791, + "learning_rate": 4.160154505348326e-05, + "loss": 0.565, + "step": 2079 + }, + { + "epoch": 0.29111266620014, + "grad_norm": 0.4500265708630457, + "learning_rate": 4.159306968724824e-05, + "loss": 0.6454, + "step": 2080 + }, + { + "epoch": 0.2912526242127362, + "grad_norm": 0.4113564994850615, + "learning_rate": 4.1584590910969055e-05, + "loss": 0.5792, + "step": 2081 + }, + { + "epoch": 0.2913925822253324, + "grad_norm": 0.4181766831204532, + "learning_rate": 4.157610872638815e-05, + "loss": 0.6005, + "step": 2082 + }, + { + "epoch": 0.29153254023792863, + "grad_norm": 0.42000429084184243, + "learning_rate": 4.156762313524873e-05, + "loss": 0.5565, + "step": 2083 + }, + { + "epoch": 0.2916724982505248, + "grad_norm": 0.4106324972814455, + "learning_rate": 4.155913413929463e-05, + "loss": 0.5678, + "step": 2084 + }, + { + "epoch": 0.2918124562631211, + "grad_norm": 0.3971576285170608, + "learning_rate": 4.155064174027047e-05, + "loss": 0.5972, + "step": 2085 + }, + { + "epoch": 0.2919524142757173, + "grad_norm": 0.4438093270564298, + "learning_rate": 4.154214593992149e-05, + "loss": 0.5909, + "step": 2086 + }, + { + "epoch": 0.29209237228831353, + "grad_norm": 0.4056320133949262, + "learning_rate": 4.1533646739993676e-05, + "loss": 0.5832, + "step": 2087 + }, + { + "epoch": 0.2922323303009097, + "grad_norm": 0.4129515425696569, + "learning_rate": 4.1525144142233705e-05, + "loss": 0.5995, + "step": 2088 + }, + { + "epoch": 0.2923722883135059, + "grad_norm": 0.4513903259561082, + "learning_rate": 4.151663814838893e-05, + "loss": 0.5527, + "step": 2089 + }, + { + "epoch": 0.2925122463261022, + "grad_norm": 0.3975679752001749, + "learning_rate": 4.150812876020744e-05, + "loss": 0.5579, + "step": 2090 + }, + { + "epoch": 0.2926522043386984, + "grad_norm": 0.4079824226246549, + "learning_rate": 4.1499615979437986e-05, + "loss": 0.5583, + "step": 2091 + }, + { + "epoch": 0.2927921623512946, + "grad_norm": 0.4452632659983039, + "learning_rate": 4.1491099807830044e-05, + "loss": 0.5871, + "step": 2092 + }, + { + "epoch": 0.2929321203638908, + "grad_norm": 0.416969695477689, + "learning_rate": 4.148258024713376e-05, + "loss": 0.5686, + "step": 2093 + }, + { + "epoch": 0.2930720783764871, + "grad_norm": 0.39010342813066295, + "learning_rate": 4.147405729909999e-05, + "loss": 0.5805, + "step": 2094 + }, + { + "epoch": 0.2932120363890833, + "grad_norm": 0.4146408500392661, + "learning_rate": 4.146553096548031e-05, + "loss": 0.5939, + "step": 2095 + }, + { + "epoch": 0.2933519944016795, + "grad_norm": 0.4438792319022797, + "learning_rate": 4.145700124802693e-05, + "loss": 0.6048, + "step": 2096 + }, + { + "epoch": 0.2934919524142757, + "grad_norm": 0.652029593861103, + "learning_rate": 4.144846814849282e-05, + "loss": 0.6083, + "step": 2097 + }, + { + "epoch": 0.2936319104268719, + "grad_norm": 0.4075857320797679, + "learning_rate": 4.143993166863161e-05, + "loss": 0.5861, + "step": 2098 + }, + { + "epoch": 0.2937718684394682, + "grad_norm": 0.3986323846100174, + "learning_rate": 4.143139181019764e-05, + "loss": 0.5729, + "step": 2099 + }, + { + "epoch": 0.2939118264520644, + "grad_norm": 0.42024806604839915, + "learning_rate": 4.1422848574945924e-05, + "loss": 0.5662, + "step": 2100 + }, + { + "epoch": 0.2940517844646606, + "grad_norm": 0.4174607888005454, + "learning_rate": 4.141430196463221e-05, + "loss": 0.542, + "step": 2101 + }, + { + "epoch": 0.2941917424772568, + "grad_norm": 0.4330991872091227, + "learning_rate": 4.140575198101288e-05, + "loss": 0.6237, + "step": 2102 + }, + { + "epoch": 0.294331700489853, + "grad_norm": 0.41437624963664066, + "learning_rate": 4.139719862584506e-05, + "loss": 0.5758, + "step": 2103 + }, + { + "epoch": 0.2944716585024493, + "grad_norm": 0.41455194934526307, + "learning_rate": 4.138864190088655e-05, + "loss": 0.5494, + "step": 2104 + }, + { + "epoch": 0.29461161651504547, + "grad_norm": 0.3928031972149667, + "learning_rate": 4.1380081807895846e-05, + "loss": 0.615, + "step": 2105 + }, + { + "epoch": 0.2947515745276417, + "grad_norm": 0.4097807146687554, + "learning_rate": 4.137151834863213e-05, + "loss": 0.5563, + "step": 2106 + }, + { + "epoch": 0.2948915325402379, + "grad_norm": 0.4201887917529205, + "learning_rate": 4.1362951524855275e-05, + "loss": 0.5934, + "step": 2107 + }, + { + "epoch": 0.2950314905528342, + "grad_norm": 0.408291117857594, + "learning_rate": 4.1354381338325864e-05, + "loss": 0.6003, + "step": 2108 + }, + { + "epoch": 0.29517144856543037, + "grad_norm": 0.43085354542414667, + "learning_rate": 4.1345807790805144e-05, + "loss": 0.5948, + "step": 2109 + }, + { + "epoch": 0.29531140657802657, + "grad_norm": 0.46223900630617937, + "learning_rate": 4.1337230884055073e-05, + "loss": 0.5575, + "step": 2110 + }, + { + "epoch": 0.2954513645906228, + "grad_norm": 0.42857422520776206, + "learning_rate": 4.132865061983829e-05, + "loss": 0.6218, + "step": 2111 + }, + { + "epoch": 0.295591322603219, + "grad_norm": 0.4328798834584895, + "learning_rate": 4.132006699991813e-05, + "loss": 0.572, + "step": 2112 + }, + { + "epoch": 0.2957312806158153, + "grad_norm": 0.4029310476167804, + "learning_rate": 4.131148002605861e-05, + "loss": 0.5762, + "step": 2113 + }, + { + "epoch": 0.29587123862841147, + "grad_norm": 0.4227536522427694, + "learning_rate": 4.130288970002444e-05, + "loss": 0.5716, + "step": 2114 + }, + { + "epoch": 0.2960111966410077, + "grad_norm": 0.3992813234221805, + "learning_rate": 4.1294296023581015e-05, + "loss": 0.5803, + "step": 2115 + }, + { + "epoch": 0.2961511546536039, + "grad_norm": 0.40775499827762557, + "learning_rate": 4.128569899849443e-05, + "loss": 0.5813, + "step": 2116 + }, + { + "epoch": 0.2962911126662001, + "grad_norm": 0.4187348195942707, + "learning_rate": 4.127709862653146e-05, + "loss": 0.5915, + "step": 2117 + }, + { + "epoch": 0.29643107067879637, + "grad_norm": 0.43072765488595965, + "learning_rate": 4.126849490945958e-05, + "loss": 0.5869, + "step": 2118 + }, + { + "epoch": 0.29657102869139257, + "grad_norm": 0.40631929550004103, + "learning_rate": 4.1259887849046906e-05, + "loss": 0.5783, + "step": 2119 + }, + { + "epoch": 0.2967109867039888, + "grad_norm": 0.4236924640290844, + "learning_rate": 4.1251277447062315e-05, + "loss": 0.64, + "step": 2120 + }, + { + "epoch": 0.296850944716585, + "grad_norm": 0.42013017681077297, + "learning_rate": 4.124266370527531e-05, + "loss": 0.6384, + "step": 2121 + }, + { + "epoch": 0.29699090272918127, + "grad_norm": 0.4065769677758428, + "learning_rate": 4.123404662545611e-05, + "loss": 0.6086, + "step": 2122 + }, + { + "epoch": 0.29713086074177747, + "grad_norm": 0.3886256061886866, + "learning_rate": 4.1225426209375605e-05, + "loss": 0.5219, + "step": 2123 + }, + { + "epoch": 0.29727081875437367, + "grad_norm": 0.40830396890274406, + "learning_rate": 4.121680245880539e-05, + "loss": 0.5487, + "step": 2124 + }, + { + "epoch": 0.2974107767669699, + "grad_norm": 0.40457862036093745, + "learning_rate": 4.120817537551773e-05, + "loss": 0.5797, + "step": 2125 + }, + { + "epoch": 0.2975507347795661, + "grad_norm": 0.414048107015411, + "learning_rate": 4.1199544961285574e-05, + "loss": 0.5827, + "step": 2126 + }, + { + "epoch": 0.29769069279216237, + "grad_norm": 0.4211166151095332, + "learning_rate": 4.119091121788256e-05, + "loss": 0.5869, + "step": 2127 + }, + { + "epoch": 0.29783065080475857, + "grad_norm": 0.3999830413220613, + "learning_rate": 4.1182274147083e-05, + "loss": 0.5538, + "step": 2128 + }, + { + "epoch": 0.2979706088173548, + "grad_norm": 0.43614503719146447, + "learning_rate": 4.1173633750661915e-05, + "loss": 0.6079, + "step": 2129 + }, + { + "epoch": 0.298110566829951, + "grad_norm": 0.4284744155784332, + "learning_rate": 4.116499003039499e-05, + "loss": 0.5669, + "step": 2130 + }, + { + "epoch": 0.2982505248425472, + "grad_norm": 0.4178472294126034, + "learning_rate": 4.1156342988058606e-05, + "loss": 0.5466, + "step": 2131 + }, + { + "epoch": 0.29839048285514347, + "grad_norm": 0.4104945666216928, + "learning_rate": 4.114769262542979e-05, + "loss": 0.5566, + "step": 2132 + }, + { + "epoch": 0.29853044086773967, + "grad_norm": 0.5142114902119391, + "learning_rate": 4.113903894428632e-05, + "loss": 0.5706, + "step": 2133 + }, + { + "epoch": 0.2986703988803359, + "grad_norm": 0.42568394409367394, + "learning_rate": 4.113038194640658e-05, + "loss": 0.5772, + "step": 2134 + }, + { + "epoch": 0.2988103568929321, + "grad_norm": 0.4072689251705309, + "learning_rate": 4.112172163356969e-05, + "loss": 0.5839, + "step": 2135 + }, + { + "epoch": 0.2989503149055283, + "grad_norm": 0.43230154146342653, + "learning_rate": 4.1113058007555415e-05, + "loss": 0.5762, + "step": 2136 + }, + { + "epoch": 0.29909027291812457, + "grad_norm": 0.42221307246185413, + "learning_rate": 4.110439107014423e-05, + "loss": 0.5854, + "step": 2137 + }, + { + "epoch": 0.29923023093072076, + "grad_norm": 0.4115485110413119, + "learning_rate": 4.109572082311729e-05, + "loss": 0.6144, + "step": 2138 + }, + { + "epoch": 0.299370188943317, + "grad_norm": 0.4381710550273501, + "learning_rate": 4.10870472682564e-05, + "loss": 0.5764, + "step": 2139 + }, + { + "epoch": 0.2995101469559132, + "grad_norm": 0.40672792105491173, + "learning_rate": 4.107837040734407e-05, + "loss": 0.5623, + "step": 2140 + }, + { + "epoch": 0.29965010496850947, + "grad_norm": 0.4230148958590366, + "learning_rate": 4.1069690242163484e-05, + "loss": 0.6007, + "step": 2141 + }, + { + "epoch": 0.29979006298110566, + "grad_norm": 0.42487403889075154, + "learning_rate": 4.106100677449851e-05, + "loss": 0.5952, + "step": 2142 + }, + { + "epoch": 0.29993002099370186, + "grad_norm": 0.41800613830620925, + "learning_rate": 4.105232000613367e-05, + "loss": 0.5814, + "step": 2143 + }, + { + "epoch": 0.3000699790062981, + "grad_norm": 0.42812390915287574, + "learning_rate": 4.10436299388542e-05, + "loss": 0.6044, + "step": 2144 + }, + { + "epoch": 0.3002099370188943, + "grad_norm": 0.42630144567604744, + "learning_rate": 4.103493657444599e-05, + "loss": 0.5458, + "step": 2145 + }, + { + "epoch": 0.30034989503149057, + "grad_norm": 0.40026869359194683, + "learning_rate": 4.1026239914695617e-05, + "loss": 0.567, + "step": 2146 + }, + { + "epoch": 0.30048985304408676, + "grad_norm": 0.4045104989308898, + "learning_rate": 4.101753996139033e-05, + "loss": 0.5573, + "step": 2147 + }, + { + "epoch": 0.300629811056683, + "grad_norm": 0.39480816764383886, + "learning_rate": 4.100883671631806e-05, + "loss": 0.5554, + "step": 2148 + }, + { + "epoch": 0.3007697690692792, + "grad_norm": 0.4359485340891318, + "learning_rate": 4.100013018126742e-05, + "loss": 0.5306, + "step": 2149 + }, + { + "epoch": 0.3009097270818754, + "grad_norm": 0.44274809745131777, + "learning_rate": 4.099142035802767e-05, + "loss": 0.6069, + "step": 2150 + }, + { + "epoch": 0.30104968509447166, + "grad_norm": 0.4010134732813477, + "learning_rate": 4.098270724838879e-05, + "loss": 0.5737, + "step": 2151 + }, + { + "epoch": 0.30118964310706786, + "grad_norm": 0.42583763567956523, + "learning_rate": 4.09739908541414e-05, + "loss": 0.5646, + "step": 2152 + }, + { + "epoch": 0.3013296011196641, + "grad_norm": 0.43076744664262784, + "learning_rate": 4.096527117707681e-05, + "loss": 0.6089, + "step": 2153 + }, + { + "epoch": 0.3014695591322603, + "grad_norm": 0.41900293437499037, + "learning_rate": 4.0956548218987004e-05, + "loss": 0.5818, + "step": 2154 + }, + { + "epoch": 0.30160951714485656, + "grad_norm": 0.441326688768721, + "learning_rate": 4.094782198166464e-05, + "loss": 0.601, + "step": 2155 + }, + { + "epoch": 0.30174947515745276, + "grad_norm": 0.41905876085866123, + "learning_rate": 4.093909246690303e-05, + "loss": 0.5694, + "step": 2156 + }, + { + "epoch": 0.30188943317004896, + "grad_norm": 0.40710066709424864, + "learning_rate": 4.093035967649621e-05, + "loss": 0.5571, + "step": 2157 + }, + { + "epoch": 0.3020293911826452, + "grad_norm": 0.41211015638134546, + "learning_rate": 4.092162361223884e-05, + "loss": 0.6152, + "step": 2158 + }, + { + "epoch": 0.3021693491952414, + "grad_norm": 0.42106082661170496, + "learning_rate": 4.091288427592626e-05, + "loss": 0.5927, + "step": 2159 + }, + { + "epoch": 0.30230930720783766, + "grad_norm": 0.4141524195584368, + "learning_rate": 4.0904141669354504e-05, + "loss": 0.6316, + "step": 2160 + }, + { + "epoch": 0.30244926522043386, + "grad_norm": 0.44132339838745016, + "learning_rate": 4.0895395794320265e-05, + "loss": 0.5985, + "step": 2161 + }, + { + "epoch": 0.3025892232330301, + "grad_norm": 0.4276646446648299, + "learning_rate": 4.088664665262091e-05, + "loss": 0.6097, + "step": 2162 + }, + { + "epoch": 0.3027291812456263, + "grad_norm": 0.4150132159301798, + "learning_rate": 4.087789424605447e-05, + "loss": 0.5909, + "step": 2163 + }, + { + "epoch": 0.3028691392582225, + "grad_norm": 0.39307522840638937, + "learning_rate": 4.0869138576419665e-05, + "loss": 0.5914, + "step": 2164 + }, + { + "epoch": 0.30300909727081876, + "grad_norm": 0.38594672832612004, + "learning_rate": 4.086037964551587e-05, + "loss": 0.5607, + "step": 2165 + }, + { + "epoch": 0.30314905528341496, + "grad_norm": 0.4044083531551891, + "learning_rate": 4.085161745514312e-05, + "loss": 0.5844, + "step": 2166 + }, + { + "epoch": 0.3032890132960112, + "grad_norm": 0.42910899298854893, + "learning_rate": 4.084285200710215e-05, + "loss": 0.5998, + "step": 2167 + }, + { + "epoch": 0.3034289713086074, + "grad_norm": 0.4246336566951562, + "learning_rate": 4.083408330319435e-05, + "loss": 0.5741, + "step": 2168 + }, + { + "epoch": 0.30356892932120366, + "grad_norm": 0.4257821385233403, + "learning_rate": 4.082531134522176e-05, + "loss": 0.6026, + "step": 2169 + }, + { + "epoch": 0.30370888733379986, + "grad_norm": 0.4191758702628161, + "learning_rate": 4.0816536134987124e-05, + "loss": 0.6057, + "step": 2170 + }, + { + "epoch": 0.30384884534639606, + "grad_norm": 0.4038665214466415, + "learning_rate": 4.0807757674293834e-05, + "loss": 0.5794, + "step": 2171 + }, + { + "epoch": 0.3039888033589923, + "grad_norm": 0.38253864598092674, + "learning_rate": 4.079897596494594e-05, + "loss": 0.5572, + "step": 2172 + }, + { + "epoch": 0.3041287613715885, + "grad_norm": 0.40057781747525356, + "learning_rate": 4.0790191008748193e-05, + "loss": 0.5757, + "step": 2173 + }, + { + "epoch": 0.30426871938418476, + "grad_norm": 0.4425784348877834, + "learning_rate": 4.078140280750597e-05, + "loss": 0.5909, + "step": 2174 + }, + { + "epoch": 0.30440867739678096, + "grad_norm": 0.43807792095543074, + "learning_rate": 4.077261136302536e-05, + "loss": 0.5713, + "step": 2175 + }, + { + "epoch": 0.3045486354093772, + "grad_norm": 0.42494113081632956, + "learning_rate": 4.0763816677113064e-05, + "loss": 0.5599, + "step": 2176 + }, + { + "epoch": 0.3046885934219734, + "grad_norm": 0.4147870888377848, + "learning_rate": 4.07550187515765e-05, + "loss": 0.5958, + "step": 2177 + }, + { + "epoch": 0.3048285514345696, + "grad_norm": 0.44106090618207494, + "learning_rate": 4.0746217588223724e-05, + "loss": 0.5719, + "step": 2178 + }, + { + "epoch": 0.30496850944716586, + "grad_norm": 0.43022335538791223, + "learning_rate": 4.073741318886347e-05, + "loss": 0.5666, + "step": 2179 + }, + { + "epoch": 0.30510846745976206, + "grad_norm": 0.422655488810057, + "learning_rate": 4.072860555530512e-05, + "loss": 0.5725, + "step": 2180 + }, + { + "epoch": 0.3052484254723583, + "grad_norm": 0.41560143611944184, + "learning_rate": 4.071979468935874e-05, + "loss": 0.5707, + "step": 2181 + }, + { + "epoch": 0.3053883834849545, + "grad_norm": 0.40831478451563363, + "learning_rate": 4.071098059283505e-05, + "loss": 0.5724, + "step": 2182 + }, + { + "epoch": 0.30552834149755076, + "grad_norm": 0.45412422141258324, + "learning_rate": 4.070216326754544e-05, + "loss": 0.5914, + "step": 2183 + }, + { + "epoch": 0.30566829951014696, + "grad_norm": 0.4238846330415573, + "learning_rate": 4.069334271530196e-05, + "loss": 0.5852, + "step": 2184 + }, + { + "epoch": 0.30580825752274315, + "grad_norm": 0.4168947592800387, + "learning_rate": 4.0684518937917315e-05, + "loss": 0.5668, + "step": 2185 + }, + { + "epoch": 0.3059482155353394, + "grad_norm": 0.3974079353132702, + "learning_rate": 4.0675691937204886e-05, + "loss": 0.5796, + "step": 2186 + }, + { + "epoch": 0.3060881735479356, + "grad_norm": 0.447707780183876, + "learning_rate": 4.0666861714978724e-05, + "loss": 0.6053, + "step": 2187 + }, + { + "epoch": 0.30622813156053186, + "grad_norm": 0.4241731286712751, + "learning_rate": 4.06580282730535e-05, + "loss": 0.5433, + "step": 2188 + }, + { + "epoch": 0.30636808957312806, + "grad_norm": 0.40634600846513996, + "learning_rate": 4.0649191613244596e-05, + "loss": 0.5778, + "step": 2189 + }, + { + "epoch": 0.3065080475857243, + "grad_norm": 0.40317717922477264, + "learning_rate": 4.064035173736804e-05, + "loss": 0.5654, + "step": 2190 + }, + { + "epoch": 0.3066480055983205, + "grad_norm": 0.40439388741840737, + "learning_rate": 4.06315086472405e-05, + "loss": 0.554, + "step": 2191 + }, + { + "epoch": 0.3067879636109167, + "grad_norm": 0.40052999103675635, + "learning_rate": 4.062266234467933e-05, + "loss": 0.5633, + "step": 2192 + }, + { + "epoch": 0.30692792162351296, + "grad_norm": 0.39039005393866494, + "learning_rate": 4.061381283150254e-05, + "loss": 0.5712, + "step": 2193 + }, + { + "epoch": 0.30706787963610915, + "grad_norm": 0.4452309708879627, + "learning_rate": 4.0604960109528786e-05, + "loss": 0.5951, + "step": 2194 + }, + { + "epoch": 0.3072078376487054, + "grad_norm": 0.4316897315393044, + "learning_rate": 4.059610418057739e-05, + "loss": 0.5643, + "step": 2195 + }, + { + "epoch": 0.3073477956613016, + "grad_norm": 0.43497901635487507, + "learning_rate": 4.058724504646834e-05, + "loss": 0.6104, + "step": 2196 + }, + { + "epoch": 0.30748775367389786, + "grad_norm": 0.3971370299202904, + "learning_rate": 4.057838270902228e-05, + "loss": 0.5678, + "step": 2197 + }, + { + "epoch": 0.30762771168649405, + "grad_norm": 0.4316780175497349, + "learning_rate": 4.056951717006051e-05, + "loss": 0.6105, + "step": 2198 + }, + { + "epoch": 0.30776766969909025, + "grad_norm": 0.39695048154674517, + "learning_rate": 4.056064843140498e-05, + "loss": 0.5726, + "step": 2199 + }, + { + "epoch": 0.3079076277116865, + "grad_norm": 0.41439706383436253, + "learning_rate": 4.0551776494878316e-05, + "loss": 0.568, + "step": 2200 + }, + { + "epoch": 0.3080475857242827, + "grad_norm": 0.4191940957424624, + "learning_rate": 4.054290136230379e-05, + "loss": 0.6245, + "step": 2201 + }, + { + "epoch": 0.30818754373687895, + "grad_norm": 0.4143832100728591, + "learning_rate": 4.053402303550533e-05, + "loss": 0.5795, + "step": 2202 + }, + { + "epoch": 0.30832750174947515, + "grad_norm": 0.3915343927508335, + "learning_rate": 4.052514151630752e-05, + "loss": 0.5543, + "step": 2203 + }, + { + "epoch": 0.3084674597620714, + "grad_norm": 0.41805518170359557, + "learning_rate": 4.0516256806535594e-05, + "loss": 0.5891, + "step": 2204 + }, + { + "epoch": 0.3086074177746676, + "grad_norm": 0.41287829384767516, + "learning_rate": 4.050736890801547e-05, + "loss": 0.569, + "step": 2205 + }, + { + "epoch": 0.3087473757872638, + "grad_norm": 0.4029096483520546, + "learning_rate": 4.049847782257369e-05, + "loss": 0.5859, + "step": 2206 + }, + { + "epoch": 0.30888733379986005, + "grad_norm": 0.39417183897543207, + "learning_rate": 4.048958355203746e-05, + "loss": 0.5282, + "step": 2207 + }, + { + "epoch": 0.30902729181245625, + "grad_norm": 0.3952415992081539, + "learning_rate": 4.048068609823464e-05, + "loss": 0.5668, + "step": 2208 + }, + { + "epoch": 0.3091672498250525, + "grad_norm": 0.4072747277650614, + "learning_rate": 4.047178546299376e-05, + "loss": 0.5977, + "step": 2209 + }, + { + "epoch": 0.3093072078376487, + "grad_norm": 0.4390729532532709, + "learning_rate": 4.0462881648143977e-05, + "loss": 0.6085, + "step": 2210 + }, + { + "epoch": 0.30944716585024495, + "grad_norm": 0.41550618366447567, + "learning_rate": 4.045397465551513e-05, + "loss": 0.5501, + "step": 2211 + }, + { + "epoch": 0.30958712386284115, + "grad_norm": 0.42949098915048817, + "learning_rate": 4.044506448693769e-05, + "loss": 0.5786, + "step": 2212 + }, + { + "epoch": 0.30972708187543735, + "grad_norm": 0.4180486476641866, + "learning_rate": 4.0436151144242776e-05, + "loss": 0.5759, + "step": 2213 + }, + { + "epoch": 0.3098670398880336, + "grad_norm": 0.4151740836794609, + "learning_rate": 4.042723462926219e-05, + "loss": 0.5858, + "step": 2214 + }, + { + "epoch": 0.3100069979006298, + "grad_norm": 0.41105102208793937, + "learning_rate": 4.041831494382835e-05, + "loss": 0.5285, + "step": 2215 + }, + { + "epoch": 0.31014695591322605, + "grad_norm": 0.40516018829246475, + "learning_rate": 4.040939208977435e-05, + "loss": 0.55, + "step": 2216 + }, + { + "epoch": 0.31028691392582225, + "grad_norm": 0.42729080222511606, + "learning_rate": 4.040046606893392e-05, + "loss": 0.5872, + "step": 2217 + }, + { + "epoch": 0.31042687193841845, + "grad_norm": 0.4226725585719995, + "learning_rate": 4.039153688314145e-05, + "loss": 0.5882, + "step": 2218 + }, + { + "epoch": 0.3105668299510147, + "grad_norm": 0.40702190282971207, + "learning_rate": 4.0382604534232e-05, + "loss": 0.5968, + "step": 2219 + }, + { + "epoch": 0.3107067879636109, + "grad_norm": 0.4266606910520138, + "learning_rate": 4.0373669024041226e-05, + "loss": 0.5822, + "step": 2220 + }, + { + "epoch": 0.31084674597620715, + "grad_norm": 0.4163690162383508, + "learning_rate": 4.0364730354405475e-05, + "loss": 0.5829, + "step": 2221 + }, + { + "epoch": 0.31098670398880335, + "grad_norm": 0.40577915115104807, + "learning_rate": 4.035578852716175e-05, + "loss": 0.5733, + "step": 2222 + }, + { + "epoch": 0.3111266620013996, + "grad_norm": 0.4011948223027502, + "learning_rate": 4.034684354414767e-05, + "loss": 0.6159, + "step": 2223 + }, + { + "epoch": 0.3112666200139958, + "grad_norm": 0.40735551230540823, + "learning_rate": 4.0337895407201527e-05, + "loss": 0.5897, + "step": 2224 + }, + { + "epoch": 0.311406578026592, + "grad_norm": 0.4007832693673057, + "learning_rate": 4.0328944118162255e-05, + "loss": 0.5823, + "step": 2225 + }, + { + "epoch": 0.31154653603918825, + "grad_norm": 0.4220826406609178, + "learning_rate": 4.0319989678869426e-05, + "loss": 0.603, + "step": 2226 + }, + { + "epoch": 0.31168649405178445, + "grad_norm": 0.42993833632780304, + "learning_rate": 4.031103209116328e-05, + "loss": 0.5959, + "step": 2227 + }, + { + "epoch": 0.3118264520643807, + "grad_norm": 0.405630507124848, + "learning_rate": 4.030207135688468e-05, + "loss": 0.5846, + "step": 2228 + }, + { + "epoch": 0.3119664100769769, + "grad_norm": 0.40554860006169224, + "learning_rate": 4.029310747787516e-05, + "loss": 0.5809, + "step": 2229 + }, + { + "epoch": 0.31210636808957315, + "grad_norm": 0.38272639115415097, + "learning_rate": 4.028414045597688e-05, + "loss": 0.5554, + "step": 2230 + }, + { + "epoch": 0.31224632610216935, + "grad_norm": 0.4133830894332048, + "learning_rate": 4.027517029303266e-05, + "loss": 0.5844, + "step": 2231 + }, + { + "epoch": 0.31238628411476554, + "grad_norm": 0.40469368564073166, + "learning_rate": 4.0266196990885955e-05, + "loss": 0.5692, + "step": 2232 + }, + { + "epoch": 0.3125262421273618, + "grad_norm": 0.43419090564140506, + "learning_rate": 4.025722055138087e-05, + "loss": 0.5766, + "step": 2233 + }, + { + "epoch": 0.312666200139958, + "grad_norm": 0.4192206941532246, + "learning_rate": 4.0248240976362154e-05, + "loss": 0.5916, + "step": 2234 + }, + { + "epoch": 0.31280615815255425, + "grad_norm": 0.40494171533150775, + "learning_rate": 4.02392582676752e-05, + "loss": 0.5405, + "step": 2235 + }, + { + "epoch": 0.31294611616515045, + "grad_norm": 0.41642501232810825, + "learning_rate": 4.023027242716606e-05, + "loss": 0.5658, + "step": 2236 + }, + { + "epoch": 0.3130860741777467, + "grad_norm": 0.4179541672168489, + "learning_rate": 4.022128345668139e-05, + "loss": 0.5747, + "step": 2237 + }, + { + "epoch": 0.3132260321903429, + "grad_norm": 0.42437584151566893, + "learning_rate": 4.021229135806853e-05, + "loss": 0.5839, + "step": 2238 + }, + { + "epoch": 0.3133659902029391, + "grad_norm": 0.39708478174893375, + "learning_rate": 4.020329613317545e-05, + "loss": 0.5491, + "step": 2239 + }, + { + "epoch": 0.31350594821553535, + "grad_norm": 0.40589223534794583, + "learning_rate": 4.0194297783850755e-05, + "loss": 0.5767, + "step": 2240 + }, + { + "epoch": 0.31364590622813154, + "grad_norm": 0.4186147684647929, + "learning_rate": 4.018529631194369e-05, + "loss": 0.5754, + "step": 2241 + }, + { + "epoch": 0.3137858642407278, + "grad_norm": 0.4066652101804317, + "learning_rate": 4.017629171930416e-05, + "loss": 0.5655, + "step": 2242 + }, + { + "epoch": 0.313925822253324, + "grad_norm": 0.4383552206858084, + "learning_rate": 4.01672840077827e-05, + "loss": 0.5829, + "step": 2243 + }, + { + "epoch": 0.31406578026592025, + "grad_norm": 0.422916994665018, + "learning_rate": 4.0158273179230475e-05, + "loss": 0.6183, + "step": 2244 + }, + { + "epoch": 0.31420573827851644, + "grad_norm": 0.4255816299563645, + "learning_rate": 4.0149259235499317e-05, + "loss": 0.6063, + "step": 2245 + }, + { + "epoch": 0.31434569629111264, + "grad_norm": 0.43561902805107455, + "learning_rate": 4.014024217844167e-05, + "loss": 0.572, + "step": 2246 + }, + { + "epoch": 0.3144856543037089, + "grad_norm": 0.4257236548174317, + "learning_rate": 4.013122200991064e-05, + "loss": 0.5752, + "step": 2247 + }, + { + "epoch": 0.3146256123163051, + "grad_norm": 0.38763550151633236, + "learning_rate": 4.012219873175995e-05, + "loss": 0.5677, + "step": 2248 + }, + { + "epoch": 0.31476557032890135, + "grad_norm": 0.4175429066296847, + "learning_rate": 4.0113172345843983e-05, + "loss": 0.5595, + "step": 2249 + }, + { + "epoch": 0.31490552834149754, + "grad_norm": 0.4480069652429578, + "learning_rate": 4.010414285401777e-05, + "loss": 0.5681, + "step": 2250 + }, + { + "epoch": 0.3150454863540938, + "grad_norm": 0.40107954134135065, + "learning_rate": 4.009511025813694e-05, + "loss": 0.5706, + "step": 2251 + }, + { + "epoch": 0.31518544436669, + "grad_norm": 0.40774375327862394, + "learning_rate": 4.008607456005778e-05, + "loss": 0.5684, + "step": 2252 + }, + { + "epoch": 0.3153254023792862, + "grad_norm": 0.4374841181743415, + "learning_rate": 4.007703576163724e-05, + "loss": 0.5685, + "step": 2253 + }, + { + "epoch": 0.31546536039188244, + "grad_norm": 0.41257242312452364, + "learning_rate": 4.006799386473287e-05, + "loss": 0.591, + "step": 2254 + }, + { + "epoch": 0.31560531840447864, + "grad_norm": 0.4114995646091396, + "learning_rate": 4.005894887120287e-05, + "loss": 0.5519, + "step": 2255 + }, + { + "epoch": 0.3157452764170749, + "grad_norm": 0.4076070427270775, + "learning_rate": 4.0049900782906086e-05, + "loss": 0.5844, + "step": 2256 + }, + { + "epoch": 0.3158852344296711, + "grad_norm": 0.4132478127299326, + "learning_rate": 4.004084960170199e-05, + "loss": 0.5655, + "step": 2257 + }, + { + "epoch": 0.31602519244226734, + "grad_norm": 0.4076919229201329, + "learning_rate": 4.0031795329450685e-05, + "loss": 0.5885, + "step": 2258 + }, + { + "epoch": 0.31616515045486354, + "grad_norm": 0.421676106935922, + "learning_rate": 4.002273796801292e-05, + "loss": 0.6067, + "step": 2259 + }, + { + "epoch": 0.31630510846745974, + "grad_norm": 0.3902189908536509, + "learning_rate": 4.001367751925008e-05, + "loss": 0.5666, + "step": 2260 + }, + { + "epoch": 0.316445066480056, + "grad_norm": 0.42674732993318637, + "learning_rate": 4.000461398502418e-05, + "loss": 0.5944, + "step": 2261 + }, + { + "epoch": 0.3165850244926522, + "grad_norm": 0.40075738490963975, + "learning_rate": 3.9995547367197845e-05, + "loss": 0.565, + "step": 2262 + }, + { + "epoch": 0.31672498250524844, + "grad_norm": 0.42762891854787655, + "learning_rate": 3.998647766763438e-05, + "loss": 0.6214, + "step": 2263 + }, + { + "epoch": 0.31686494051784464, + "grad_norm": 0.3961782540593822, + "learning_rate": 3.9977404888197704e-05, + "loss": 0.5454, + "step": 2264 + }, + { + "epoch": 0.3170048985304409, + "grad_norm": 0.3923939088232202, + "learning_rate": 3.996832903075235e-05, + "loss": 0.5883, + "step": 2265 + }, + { + "epoch": 0.3171448565430371, + "grad_norm": 0.42198649975518954, + "learning_rate": 3.99592500971635e-05, + "loss": 0.5809, + "step": 2266 + }, + { + "epoch": 0.3172848145556333, + "grad_norm": 0.4083907533890683, + "learning_rate": 3.995016808929698e-05, + "loss": 0.5423, + "step": 2267 + }, + { + "epoch": 0.31742477256822954, + "grad_norm": 0.39942363696045713, + "learning_rate": 3.9941083009019223e-05, + "loss": 0.561, + "step": 2268 + }, + { + "epoch": 0.31756473058082574, + "grad_norm": 0.4262274605313951, + "learning_rate": 3.993199485819731e-05, + "loss": 0.5875, + "step": 2269 + }, + { + "epoch": 0.317704688593422, + "grad_norm": 0.38915227058740126, + "learning_rate": 3.992290363869895e-05, + "loss": 0.5531, + "step": 2270 + }, + { + "epoch": 0.3178446466060182, + "grad_norm": 0.43560718886498834, + "learning_rate": 3.9913809352392474e-05, + "loss": 0.5923, + "step": 2271 + }, + { + "epoch": 0.31798460461861444, + "grad_norm": 0.44620165771992476, + "learning_rate": 3.990471200114685e-05, + "loss": 0.608, + "step": 2272 + }, + { + "epoch": 0.31812456263121064, + "grad_norm": 0.4359041969176294, + "learning_rate": 3.9895611586831685e-05, + "loss": 0.5733, + "step": 2273 + }, + { + "epoch": 0.31826452064380684, + "grad_norm": 0.42976996875212653, + "learning_rate": 3.98865081113172e-05, + "loss": 0.5701, + "step": 2274 + }, + { + "epoch": 0.3184044786564031, + "grad_norm": 0.40393224396318145, + "learning_rate": 3.987740157647426e-05, + "loss": 0.5764, + "step": 2275 + }, + { + "epoch": 0.3185444366689993, + "grad_norm": 0.42936298439237425, + "learning_rate": 3.986829198417433e-05, + "loss": 0.5922, + "step": 2276 + }, + { + "epoch": 0.31868439468159554, + "grad_norm": 0.40635602349462724, + "learning_rate": 3.985917933628955e-05, + "loss": 0.5492, + "step": 2277 + }, + { + "epoch": 0.31882435269419174, + "grad_norm": 0.38817161800821404, + "learning_rate": 3.9850063634692635e-05, + "loss": 0.6025, + "step": 2278 + }, + { + "epoch": 0.318964310706788, + "grad_norm": 0.40934279574178284, + "learning_rate": 3.984094488125698e-05, + "loss": 0.5779, + "step": 2279 + }, + { + "epoch": 0.3191042687193842, + "grad_norm": 0.4265395827777272, + "learning_rate": 3.983182307785657e-05, + "loss": 0.5947, + "step": 2280 + }, + { + "epoch": 0.3192442267319804, + "grad_norm": 0.4131217561851261, + "learning_rate": 3.982269822636602e-05, + "loss": 0.5659, + "step": 2281 + }, + { + "epoch": 0.31938418474457664, + "grad_norm": 0.40502407520544415, + "learning_rate": 3.981357032866058e-05, + "loss": 0.618, + "step": 2282 + }, + { + "epoch": 0.31952414275717284, + "grad_norm": 0.4162474667828162, + "learning_rate": 3.980443938661614e-05, + "loss": 0.5778, + "step": 2283 + }, + { + "epoch": 0.3196641007697691, + "grad_norm": 0.41057477420812005, + "learning_rate": 3.9795305402109195e-05, + "loss": 0.5432, + "step": 2284 + }, + { + "epoch": 0.3198040587823653, + "grad_norm": 0.4199988291247849, + "learning_rate": 3.9786168377016866e-05, + "loss": 0.5615, + "step": 2285 + }, + { + "epoch": 0.31994401679496154, + "grad_norm": 0.4154620895424027, + "learning_rate": 3.977702831321692e-05, + "loss": 0.5774, + "step": 2286 + }, + { + "epoch": 0.32008397480755774, + "grad_norm": 0.42692144319966024, + "learning_rate": 3.976788521258771e-05, + "loss": 0.6076, + "step": 2287 + }, + { + "epoch": 0.32022393282015393, + "grad_norm": 0.41668448669509417, + "learning_rate": 3.975873907700825e-05, + "loss": 0.5973, + "step": 2288 + }, + { + "epoch": 0.3203638908327502, + "grad_norm": 0.4268759954887578, + "learning_rate": 3.974958990835816e-05, + "loss": 0.581, + "step": 2289 + }, + { + "epoch": 0.3205038488453464, + "grad_norm": 0.3952528234342034, + "learning_rate": 3.974043770851769e-05, + "loss": 0.5613, + "step": 2290 + }, + { + "epoch": 0.32064380685794264, + "grad_norm": 0.44836986260917555, + "learning_rate": 3.9731282479367706e-05, + "loss": 0.6248, + "step": 2291 + }, + { + "epoch": 0.32078376487053883, + "grad_norm": 0.4256932127612938, + "learning_rate": 3.9722124222789705e-05, + "loss": 0.6071, + "step": 2292 + }, + { + "epoch": 0.32092372288313503, + "grad_norm": 0.44510004429106764, + "learning_rate": 3.97129629406658e-05, + "loss": 0.5797, + "step": 2293 + }, + { + "epoch": 0.3210636808957313, + "grad_norm": 0.4302627905158645, + "learning_rate": 3.970379863487872e-05, + "loss": 0.5665, + "step": 2294 + }, + { + "epoch": 0.3212036389083275, + "grad_norm": 0.41235485841130853, + "learning_rate": 3.969463130731183e-05, + "loss": 0.5634, + "step": 2295 + }, + { + "epoch": 0.32134359692092374, + "grad_norm": 0.41894260330344535, + "learning_rate": 3.9685460959849105e-05, + "loss": 0.5867, + "step": 2296 + }, + { + "epoch": 0.32148355493351993, + "grad_norm": 0.41203034907703107, + "learning_rate": 3.967628759437516e-05, + "loss": 0.59, + "step": 2297 + }, + { + "epoch": 0.3216235129461162, + "grad_norm": 0.4281005271912537, + "learning_rate": 3.96671112127752e-05, + "loss": 0.6006, + "step": 2298 + }, + { + "epoch": 0.3217634709587124, + "grad_norm": 0.4075127794042514, + "learning_rate": 3.965793181693506e-05, + "loss": 0.565, + "step": 2299 + }, + { + "epoch": 0.3219034289713086, + "grad_norm": 0.39763128617859633, + "learning_rate": 3.96487494087412e-05, + "loss": 0.5437, + "step": 2300 + }, + { + "epoch": 0.32204338698390483, + "grad_norm": 0.4552074769557467, + "learning_rate": 3.9639563990080716e-05, + "loss": 0.6201, + "step": 2301 + }, + { + "epoch": 0.32218334499650103, + "grad_norm": 0.40413041195679866, + "learning_rate": 3.9630375562841295e-05, + "loss": 0.5526, + "step": 2302 + }, + { + "epoch": 0.3223233030090973, + "grad_norm": 0.40283666703685167, + "learning_rate": 3.9621184128911234e-05, + "loss": 0.579, + "step": 2303 + }, + { + "epoch": 0.3224632610216935, + "grad_norm": 0.3980665357469519, + "learning_rate": 3.96119896901795e-05, + "loss": 0.5626, + "step": 2304 + }, + { + "epoch": 0.32260321903428973, + "grad_norm": 0.42655702692861863, + "learning_rate": 3.9602792248535606e-05, + "loss": 0.6075, + "step": 2305 + }, + { + "epoch": 0.32274317704688593, + "grad_norm": 0.4826045217103083, + "learning_rate": 3.959359180586975e-05, + "loss": 0.5799, + "step": 2306 + }, + { + "epoch": 0.32288313505948213, + "grad_norm": 0.43008580043844, + "learning_rate": 3.95843883640727e-05, + "loss": 0.5787, + "step": 2307 + }, + { + "epoch": 0.3230230930720784, + "grad_norm": 0.42351832550150303, + "learning_rate": 3.957518192503587e-05, + "loss": 0.5565, + "step": 2308 + }, + { + "epoch": 0.3231630510846746, + "grad_norm": 0.40482672190396396, + "learning_rate": 3.956597249065126e-05, + "loss": 0.5533, + "step": 2309 + }, + { + "epoch": 0.32330300909727083, + "grad_norm": 0.4107029056895161, + "learning_rate": 3.95567600628115e-05, + "loss": 0.5756, + "step": 2310 + }, + { + "epoch": 0.32344296710986703, + "grad_norm": 0.43221123666830596, + "learning_rate": 3.954754464340987e-05, + "loss": 0.6074, + "step": 2311 + }, + { + "epoch": 0.3235829251224633, + "grad_norm": 0.42051407755166026, + "learning_rate": 3.9538326234340194e-05, + "loss": 0.5948, + "step": 2312 + }, + { + "epoch": 0.3237228831350595, + "grad_norm": 0.38578825068740574, + "learning_rate": 3.9529104837496974e-05, + "loss": 0.5692, + "step": 2313 + }, + { + "epoch": 0.3238628411476557, + "grad_norm": 0.42139866225485473, + "learning_rate": 3.951988045477529e-05, + "loss": 0.5892, + "step": 2314 + }, + { + "epoch": 0.32400279916025193, + "grad_norm": 0.43110899316398094, + "learning_rate": 3.9510653088070846e-05, + "loss": 0.5698, + "step": 2315 + }, + { + "epoch": 0.32414275717284813, + "grad_norm": 0.4426212316446372, + "learning_rate": 3.9501422739279956e-05, + "loss": 0.5727, + "step": 2316 + }, + { + "epoch": 0.3242827151854444, + "grad_norm": 0.42108819792246993, + "learning_rate": 3.9492189410299566e-05, + "loss": 0.554, + "step": 2317 + }, + { + "epoch": 0.3244226731980406, + "grad_norm": 0.46665094478938685, + "learning_rate": 3.94829531030272e-05, + "loss": 0.6323, + "step": 2318 + }, + { + "epoch": 0.32456263121063683, + "grad_norm": 0.4138605211792902, + "learning_rate": 3.9473713819361015e-05, + "loss": 0.5446, + "step": 2319 + }, + { + "epoch": 0.32470258922323303, + "grad_norm": 0.43505576166032045, + "learning_rate": 3.946447156119979e-05, + "loss": 0.5664, + "step": 2320 + }, + { + "epoch": 0.3248425472358292, + "grad_norm": 0.41878878141867854, + "learning_rate": 3.945522633044289e-05, + "loss": 0.608, + "step": 2321 + }, + { + "epoch": 0.3249825052484255, + "grad_norm": 0.40004594511350894, + "learning_rate": 3.9445978128990326e-05, + "loss": 0.5454, + "step": 2322 + }, + { + "epoch": 0.3251224632610217, + "grad_norm": 0.42281186251192865, + "learning_rate": 3.943672695874267e-05, + "loss": 0.567, + "step": 2323 + }, + { + "epoch": 0.32526242127361793, + "grad_norm": 0.4068793191045584, + "learning_rate": 3.942747282160114e-05, + "loss": 0.5424, + "step": 2324 + }, + { + "epoch": 0.3254023792862141, + "grad_norm": 0.42411112127273465, + "learning_rate": 3.9418215719467565e-05, + "loss": 0.567, + "step": 2325 + }, + { + "epoch": 0.3255423372988104, + "grad_norm": 0.4111794082883435, + "learning_rate": 3.9408955654244364e-05, + "loss": 0.5682, + "step": 2326 + }, + { + "epoch": 0.3256822953114066, + "grad_norm": 0.39707493084776485, + "learning_rate": 3.939969262783457e-05, + "loss": 0.5541, + "step": 2327 + }, + { + "epoch": 0.3258222533240028, + "grad_norm": 0.3941781607373903, + "learning_rate": 3.939042664214184e-05, + "loss": 0.5511, + "step": 2328 + }, + { + "epoch": 0.32596221133659903, + "grad_norm": 0.4150783846079092, + "learning_rate": 3.9381157699070424e-05, + "loss": 0.6085, + "step": 2329 + }, + { + "epoch": 0.3261021693491952, + "grad_norm": 0.392787000567577, + "learning_rate": 3.937188580052518e-05, + "loss": 0.5834, + "step": 2330 + }, + { + "epoch": 0.3262421273617915, + "grad_norm": 0.41765704626894373, + "learning_rate": 3.9362610948411585e-05, + "loss": 0.5497, + "step": 2331 + }, + { + "epoch": 0.3263820853743877, + "grad_norm": 0.4175224418697374, + "learning_rate": 3.9353333144635706e-05, + "loss": 0.5597, + "step": 2332 + }, + { + "epoch": 0.32652204338698393, + "grad_norm": 0.4243451821154263, + "learning_rate": 3.934405239110423e-05, + "loss": 0.5884, + "step": 2333 + }, + { + "epoch": 0.3266620013995801, + "grad_norm": 0.4267515430222131, + "learning_rate": 3.9334768689724456e-05, + "loss": 0.606, + "step": 2334 + }, + { + "epoch": 0.3268019594121763, + "grad_norm": 0.4332693190274577, + "learning_rate": 3.932548204240426e-05, + "loss": 0.5995, + "step": 2335 + }, + { + "epoch": 0.3269419174247726, + "grad_norm": 0.4121506429858514, + "learning_rate": 3.931619245105216e-05, + "loss": 0.5787, + "step": 2336 + }, + { + "epoch": 0.3270818754373688, + "grad_norm": 0.4180451871893415, + "learning_rate": 3.9306899917577245e-05, + "loss": 0.5546, + "step": 2337 + }, + { + "epoch": 0.327221833449965, + "grad_norm": 0.40942456121846177, + "learning_rate": 3.9297604443889234e-05, + "loss": 0.6098, + "step": 2338 + }, + { + "epoch": 0.3273617914625612, + "grad_norm": 0.4067754001916415, + "learning_rate": 3.928830603189844e-05, + "loss": 0.5756, + "step": 2339 + }, + { + "epoch": 0.3275017494751575, + "grad_norm": 0.4140918426830124, + "learning_rate": 3.9279004683515783e-05, + "loss": 0.5641, + "step": 2340 + }, + { + "epoch": 0.3276417074877537, + "grad_norm": 0.4212820521655152, + "learning_rate": 3.926970040065278e-05, + "loss": 0.604, + "step": 2341 + }, + { + "epoch": 0.3277816655003499, + "grad_norm": 0.40003067107763557, + "learning_rate": 3.9260393185221564e-05, + "loss": 0.565, + "step": 2342 + }, + { + "epoch": 0.3279216235129461, + "grad_norm": 0.39827204127063837, + "learning_rate": 3.925108303913485e-05, + "loss": 0.5799, + "step": 2343 + }, + { + "epoch": 0.3280615815255423, + "grad_norm": 0.4233494400434675, + "learning_rate": 3.9241769964305976e-05, + "loss": 0.6037, + "step": 2344 + }, + { + "epoch": 0.3282015395381386, + "grad_norm": 0.4118719992278687, + "learning_rate": 3.9232453962648864e-05, + "loss": 0.6098, + "step": 2345 + }, + { + "epoch": 0.3283414975507348, + "grad_norm": 0.4016000541614996, + "learning_rate": 3.9223135036078064e-05, + "loss": 0.5734, + "step": 2346 + }, + { + "epoch": 0.328481455563331, + "grad_norm": 0.4237468615429023, + "learning_rate": 3.921381318650869e-05, + "loss": 0.6088, + "step": 2347 + }, + { + "epoch": 0.3286214135759272, + "grad_norm": 0.46652462106861353, + "learning_rate": 3.920448841585649e-05, + "loss": 0.5814, + "step": 2348 + }, + { + "epoch": 0.3287613715885234, + "grad_norm": 0.39685764815363, + "learning_rate": 3.9195160726037805e-05, + "loss": 0.5646, + "step": 2349 + }, + { + "epoch": 0.3289013296011197, + "grad_norm": 0.4229891713627349, + "learning_rate": 3.918583011896955e-05, + "loss": 0.5773, + "step": 2350 + }, + { + "epoch": 0.32904128761371587, + "grad_norm": 0.3879674012353026, + "learning_rate": 3.9176496596569265e-05, + "loss": 0.5397, + "step": 2351 + }, + { + "epoch": 0.3291812456263121, + "grad_norm": 0.3901223408311448, + "learning_rate": 3.91671601607551e-05, + "loss": 0.5736, + "step": 2352 + }, + { + "epoch": 0.3293212036389083, + "grad_norm": 0.4002239555751096, + "learning_rate": 3.915782081344578e-05, + "loss": 0.5967, + "step": 2353 + }, + { + "epoch": 0.3294611616515046, + "grad_norm": 0.41288801391640517, + "learning_rate": 3.914847855656062e-05, + "loss": 0.5715, + "step": 2354 + }, + { + "epoch": 0.3296011196641008, + "grad_norm": 0.4184700560414015, + "learning_rate": 3.913913339201956e-05, + "loss": 0.5507, + "step": 2355 + }, + { + "epoch": 0.32974107767669697, + "grad_norm": 0.3894927148762295, + "learning_rate": 3.9129785321743125e-05, + "loss": 0.5705, + "step": 2356 + }, + { + "epoch": 0.3298810356892932, + "grad_norm": 0.3847000287394412, + "learning_rate": 3.912043434765245e-05, + "loss": 0.5484, + "step": 2357 + }, + { + "epoch": 0.3300209937018894, + "grad_norm": 0.4403932420001484, + "learning_rate": 3.911108047166924e-05, + "loss": 0.6139, + "step": 2358 + }, + { + "epoch": 0.3301609517144857, + "grad_norm": 0.4352977938001518, + "learning_rate": 3.9101723695715805e-05, + "loss": 0.6006, + "step": 2359 + }, + { + "epoch": 0.33030090972708187, + "grad_norm": 0.40492957509367133, + "learning_rate": 3.909236402171508e-05, + "loss": 0.6026, + "step": 2360 + }, + { + "epoch": 0.3304408677396781, + "grad_norm": 0.3867521703349589, + "learning_rate": 3.908300145159055e-05, + "loss": 0.5449, + "step": 2361 + }, + { + "epoch": 0.3305808257522743, + "grad_norm": 0.4404641851361547, + "learning_rate": 3.907363598726635e-05, + "loss": 0.5905, + "step": 2362 + }, + { + "epoch": 0.3307207837648705, + "grad_norm": 0.41840972294550066, + "learning_rate": 3.906426763066714e-05, + "loss": 0.5632, + "step": 2363 + }, + { + "epoch": 0.33086074177746677, + "grad_norm": 0.4162282322770041, + "learning_rate": 3.905489638371823e-05, + "loss": 0.5913, + "step": 2364 + }, + { + "epoch": 0.33100069979006297, + "grad_norm": 0.3955450449429818, + "learning_rate": 3.9045522248345504e-05, + "loss": 0.5324, + "step": 2365 + }, + { + "epoch": 0.3311406578026592, + "grad_norm": 0.49091811650049194, + "learning_rate": 3.903614522647545e-05, + "loss": 0.5769, + "step": 2366 + }, + { + "epoch": 0.3312806158152554, + "grad_norm": 0.4173793487196645, + "learning_rate": 3.902676532003514e-05, + "loss": 0.5886, + "step": 2367 + }, + { + "epoch": 0.33142057382785167, + "grad_norm": 0.3932967546945319, + "learning_rate": 3.901738253095222e-05, + "loss": 0.5621, + "step": 2368 + }, + { + "epoch": 0.33156053184044787, + "grad_norm": 0.43569710224836344, + "learning_rate": 3.900799686115498e-05, + "loss": 0.5741, + "step": 2369 + }, + { + "epoch": 0.33170048985304407, + "grad_norm": 0.4115145852184928, + "learning_rate": 3.899860831257224e-05, + "loss": 0.5259, + "step": 2370 + }, + { + "epoch": 0.3318404478656403, + "grad_norm": 0.4381698932866813, + "learning_rate": 3.898921688713346e-05, + "loss": 0.6024, + "step": 2371 + }, + { + "epoch": 0.3319804058782365, + "grad_norm": 0.4208901188098798, + "learning_rate": 3.897982258676867e-05, + "loss": 0.5527, + "step": 2372 + }, + { + "epoch": 0.33212036389083277, + "grad_norm": 0.41912261407751483, + "learning_rate": 3.8970425413408495e-05, + "loss": 0.5822, + "step": 2373 + }, + { + "epoch": 0.33226032190342897, + "grad_norm": 0.4040927772770925, + "learning_rate": 3.896102536898415e-05, + "loss": 0.5491, + "step": 2374 + }, + { + "epoch": 0.33240027991602517, + "grad_norm": 0.4173965549079611, + "learning_rate": 3.8951622455427425e-05, + "loss": 0.6025, + "step": 2375 + }, + { + "epoch": 0.3325402379286214, + "grad_norm": 0.404116401312251, + "learning_rate": 3.894221667467074e-05, + "loss": 0.553, + "step": 2376 + }, + { + "epoch": 0.3326801959412176, + "grad_norm": 0.39814070442365174, + "learning_rate": 3.893280802864706e-05, + "loss": 0.5538, + "step": 2377 + }, + { + "epoch": 0.33282015395381387, + "grad_norm": 0.4097199303736863, + "learning_rate": 3.892339651928996e-05, + "loss": 0.5562, + "step": 2378 + }, + { + "epoch": 0.33296011196641007, + "grad_norm": 0.4251433986281386, + "learning_rate": 3.89139821485336e-05, + "loss": 0.5886, + "step": 2379 + }, + { + "epoch": 0.3331000699790063, + "grad_norm": 0.3965548232917148, + "learning_rate": 3.8904564918312737e-05, + "loss": 0.5536, + "step": 2380 + }, + { + "epoch": 0.3332400279916025, + "grad_norm": 0.4012944065952801, + "learning_rate": 3.889514483056269e-05, + "loss": 0.5742, + "step": 2381 + }, + { + "epoch": 0.3333799860041987, + "grad_norm": 0.4201060112427097, + "learning_rate": 3.8885721887219406e-05, + "loss": 0.5691, + "step": 2382 + }, + { + "epoch": 0.33351994401679497, + "grad_norm": 0.42817185933177176, + "learning_rate": 3.887629609021938e-05, + "loss": 0.5678, + "step": 2383 + }, + { + "epoch": 0.33365990202939116, + "grad_norm": 0.43405350104277596, + "learning_rate": 3.88668674414997e-05, + "loss": 0.6216, + "step": 2384 + }, + { + "epoch": 0.3337998600419874, + "grad_norm": 0.42315727231869327, + "learning_rate": 3.8857435942998074e-05, + "loss": 0.575, + "step": 2385 + }, + { + "epoch": 0.3339398180545836, + "grad_norm": 0.40648841814651454, + "learning_rate": 3.884800159665276e-05, + "loss": 0.5715, + "step": 2386 + }, + { + "epoch": 0.33407977606717987, + "grad_norm": 0.426901177821104, + "learning_rate": 3.883856440440261e-05, + "loss": 0.5788, + "step": 2387 + }, + { + "epoch": 0.33421973407977607, + "grad_norm": 0.4264982996567364, + "learning_rate": 3.882912436818705e-05, + "loss": 0.5849, + "step": 2388 + }, + { + "epoch": 0.33435969209237226, + "grad_norm": 0.41759550845666904, + "learning_rate": 3.881968148994613e-05, + "loss": 0.5524, + "step": 2389 + }, + { + "epoch": 0.3344996501049685, + "grad_norm": 0.3922959061793526, + "learning_rate": 3.881023577162044e-05, + "loss": 0.556, + "step": 2390 + }, + { + "epoch": 0.3346396081175647, + "grad_norm": 0.3988789451222829, + "learning_rate": 3.880078721515117e-05, + "loss": 0.5743, + "step": 2391 + }, + { + "epoch": 0.33477956613016097, + "grad_norm": 0.4206132245414296, + "learning_rate": 3.87913358224801e-05, + "loss": 0.5688, + "step": 2392 + }, + { + "epoch": 0.33491952414275716, + "grad_norm": 0.400794205539379, + "learning_rate": 3.878188159554959e-05, + "loss": 0.6014, + "step": 2393 + }, + { + "epoch": 0.3350594821553534, + "grad_norm": 0.4098374598496262, + "learning_rate": 3.8772424536302564e-05, + "loss": 0.6093, + "step": 2394 + }, + { + "epoch": 0.3351994401679496, + "grad_norm": 0.4250977633595455, + "learning_rate": 3.876296464668257e-05, + "loss": 0.606, + "step": 2395 + }, + { + "epoch": 0.3353393981805458, + "grad_norm": 0.4220441415552252, + "learning_rate": 3.875350192863368e-05, + "loss": 0.5897, + "step": 2396 + }, + { + "epoch": 0.33547935619314206, + "grad_norm": 0.41667093576332226, + "learning_rate": 3.87440363841006e-05, + "loss": 0.633, + "step": 2397 + }, + { + "epoch": 0.33561931420573826, + "grad_norm": 0.4157738741324718, + "learning_rate": 3.873456801502859e-05, + "loss": 0.5703, + "step": 2398 + }, + { + "epoch": 0.3357592722183345, + "grad_norm": 0.4258564142322055, + "learning_rate": 3.872509682336351e-05, + "loss": 0.5919, + "step": 2399 + }, + { + "epoch": 0.3358992302309307, + "grad_norm": 0.42470627438795794, + "learning_rate": 3.871562281105175e-05, + "loss": 0.5662, + "step": 2400 + }, + { + "epoch": 0.33603918824352697, + "grad_norm": 0.4186178764102384, + "learning_rate": 3.8706145980040344e-05, + "loss": 0.581, + "step": 2401 + }, + { + "epoch": 0.33617914625612316, + "grad_norm": 0.3987870833859228, + "learning_rate": 3.8696666332276875e-05, + "loss": 0.585, + "step": 2402 + }, + { + "epoch": 0.33631910426871936, + "grad_norm": 0.40669896712395964, + "learning_rate": 3.86871838697095e-05, + "loss": 0.5793, + "step": 2403 + }, + { + "epoch": 0.3364590622813156, + "grad_norm": 0.4324505891171469, + "learning_rate": 3.867769859428697e-05, + "loss": 0.5895, + "step": 2404 + }, + { + "epoch": 0.3365990202939118, + "grad_norm": 0.4089670782754292, + "learning_rate": 3.866821050795859e-05, + "loss": 0.614, + "step": 2405 + }, + { + "epoch": 0.33673897830650806, + "grad_norm": 0.5537632601572994, + "learning_rate": 3.865871961267427e-05, + "loss": 0.5969, + "step": 2406 + }, + { + "epoch": 0.33687893631910426, + "grad_norm": 0.4149474055553234, + "learning_rate": 3.864922591038448e-05, + "loss": 0.5521, + "step": 2407 + }, + { + "epoch": 0.3370188943317005, + "grad_norm": 0.426552813649065, + "learning_rate": 3.863972940304028e-05, + "loss": 0.6032, + "step": 2408 + }, + { + "epoch": 0.3371588523442967, + "grad_norm": 0.41223595995661894, + "learning_rate": 3.863023009259329e-05, + "loss": 0.5785, + "step": 2409 + }, + { + "epoch": 0.3372988103568929, + "grad_norm": 0.4024674567365589, + "learning_rate": 3.8620727980995716e-05, + "loss": 0.5589, + "step": 2410 + }, + { + "epoch": 0.33743876836948916, + "grad_norm": 0.41476951241547977, + "learning_rate": 3.861122307020034e-05, + "loss": 0.5648, + "step": 2411 + }, + { + "epoch": 0.33757872638208536, + "grad_norm": 0.4024211020152443, + "learning_rate": 3.860171536216052e-05, + "loss": 0.5569, + "step": 2412 + }, + { + "epoch": 0.3377186843946816, + "grad_norm": 0.41785686679485745, + "learning_rate": 3.8592204858830175e-05, + "loss": 0.6093, + "step": 2413 + }, + { + "epoch": 0.3378586424072778, + "grad_norm": 0.41460218899468515, + "learning_rate": 3.858269156216383e-05, + "loss": 0.5795, + "step": 2414 + }, + { + "epoch": 0.33799860041987406, + "grad_norm": 0.40446354182657146, + "learning_rate": 3.8573175474116545e-05, + "loss": 0.549, + "step": 2415 + }, + { + "epoch": 0.33813855843247026, + "grad_norm": 0.4037463725904796, + "learning_rate": 3.856365659664399e-05, + "loss": 0.5356, + "step": 2416 + }, + { + "epoch": 0.33827851644506646, + "grad_norm": 0.4182076163889934, + "learning_rate": 3.855413493170237e-05, + "loss": 0.6045, + "step": 2417 + }, + { + "epoch": 0.3384184744576627, + "grad_norm": 0.42186938322484663, + "learning_rate": 3.85446104812485e-05, + "loss": 0.6155, + "step": 2418 + }, + { + "epoch": 0.3385584324702589, + "grad_norm": 0.41470003229003616, + "learning_rate": 3.853508324723976e-05, + "loss": 0.5701, + "step": 2419 + }, + { + "epoch": 0.33869839048285516, + "grad_norm": 0.41867300233582005, + "learning_rate": 3.852555323163406e-05, + "loss": 0.5762, + "step": 2420 + }, + { + "epoch": 0.33883834849545136, + "grad_norm": 0.43083623709965385, + "learning_rate": 3.851602043638994e-05, + "loss": 0.5928, + "step": 2421 + }, + { + "epoch": 0.3389783065080476, + "grad_norm": 0.4112941630964681, + "learning_rate": 3.850648486346649e-05, + "loss": 0.5671, + "step": 2422 + }, + { + "epoch": 0.3391182645206438, + "grad_norm": 0.4081030227054361, + "learning_rate": 3.849694651482335e-05, + "loss": 0.5662, + "step": 2423 + }, + { + "epoch": 0.33925822253324, + "grad_norm": 0.41990746427748654, + "learning_rate": 3.848740539242075e-05, + "loss": 0.6186, + "step": 2424 + }, + { + "epoch": 0.33939818054583626, + "grad_norm": 0.41509855881507357, + "learning_rate": 3.84778614982195e-05, + "loss": 0.5777, + "step": 2425 + }, + { + "epoch": 0.33953813855843246, + "grad_norm": 0.41529612645730785, + "learning_rate": 3.8468314834180954e-05, + "loss": 0.5834, + "step": 2426 + }, + { + "epoch": 0.3396780965710287, + "grad_norm": 0.4612886169509395, + "learning_rate": 3.845876540226706e-05, + "loss": 0.5898, + "step": 2427 + }, + { + "epoch": 0.3398180545836249, + "grad_norm": 0.40004450008966375, + "learning_rate": 3.844921320444031e-05, + "loss": 0.5677, + "step": 2428 + }, + { + "epoch": 0.33995801259622116, + "grad_norm": 0.42665920823328024, + "learning_rate": 3.843965824266379e-05, + "loss": 0.5643, + "step": 2429 + }, + { + "epoch": 0.34009797060881736, + "grad_norm": 0.4124069827888389, + "learning_rate": 3.843010051890114e-05, + "loss": 0.6075, + "step": 2430 + }, + { + "epoch": 0.34023792862141355, + "grad_norm": 0.41303867930204274, + "learning_rate": 3.842054003511656e-05, + "loss": 0.5811, + "step": 2431 + }, + { + "epoch": 0.3403778866340098, + "grad_norm": 0.4196697445658453, + "learning_rate": 3.841097679327483e-05, + "loss": 0.5525, + "step": 2432 + }, + { + "epoch": 0.340517844646606, + "grad_norm": 0.4360442351288732, + "learning_rate": 3.840141079534131e-05, + "loss": 0.6244, + "step": 2433 + }, + { + "epoch": 0.34065780265920226, + "grad_norm": 0.39392921636528555, + "learning_rate": 3.839184204328188e-05, + "loss": 0.5494, + "step": 2434 + }, + { + "epoch": 0.34079776067179846, + "grad_norm": 0.39649367381567424, + "learning_rate": 3.8382270539063035e-05, + "loss": 0.5741, + "step": 2435 + }, + { + "epoch": 0.3409377186843947, + "grad_norm": 0.4170815607651056, + "learning_rate": 3.8372696284651814e-05, + "loss": 0.5759, + "step": 2436 + }, + { + "epoch": 0.3410776766969909, + "grad_norm": 0.3997155811593266, + "learning_rate": 3.8363119282015816e-05, + "loss": 0.5702, + "step": 2437 + }, + { + "epoch": 0.3412176347095871, + "grad_norm": 0.4282009363533291, + "learning_rate": 3.835353953312322e-05, + "loss": 0.5891, + "step": 2438 + }, + { + "epoch": 0.34135759272218336, + "grad_norm": 0.41921744383670845, + "learning_rate": 3.834395703994276e-05, + "loss": 0.5803, + "step": 2439 + }, + { + "epoch": 0.34149755073477955, + "grad_norm": 0.40474108379603385, + "learning_rate": 3.833437180444373e-05, + "loss": 0.6013, + "step": 2440 + }, + { + "epoch": 0.3416375087473758, + "grad_norm": 0.42511536225444024, + "learning_rate": 3.8324783828596e-05, + "loss": 0.5705, + "step": 2441 + }, + { + "epoch": 0.341777466759972, + "grad_norm": 0.3949585768953575, + "learning_rate": 3.8315193114369996e-05, + "loss": 0.5617, + "step": 2442 + }, + { + "epoch": 0.34191742477256826, + "grad_norm": 0.39765016272736614, + "learning_rate": 3.830559966373671e-05, + "loss": 0.5164, + "step": 2443 + }, + { + "epoch": 0.34205738278516445, + "grad_norm": 0.3878551915452198, + "learning_rate": 3.829600347866768e-05, + "loss": 0.5661, + "step": 2444 + }, + { + "epoch": 0.34219734079776065, + "grad_norm": 0.4346559633685042, + "learning_rate": 3.828640456113504e-05, + "loss": 0.6013, + "step": 2445 + }, + { + "epoch": 0.3423372988103569, + "grad_norm": 0.4218764947636453, + "learning_rate": 3.827680291311143e-05, + "loss": 0.5684, + "step": 2446 + }, + { + "epoch": 0.3424772568229531, + "grad_norm": 0.40065712577833346, + "learning_rate": 3.8267198536570123e-05, + "loss": 0.5313, + "step": 2447 + }, + { + "epoch": 0.34261721483554936, + "grad_norm": 0.4050494830959936, + "learning_rate": 3.8257591433484906e-05, + "loss": 0.5632, + "step": 2448 + }, + { + "epoch": 0.34275717284814555, + "grad_norm": 0.41579240423995417, + "learning_rate": 3.824798160583012e-05, + "loss": 0.5719, + "step": 2449 + }, + { + "epoch": 0.34289713086074175, + "grad_norm": 0.41342697420725416, + "learning_rate": 3.823836905558071e-05, + "loss": 0.5825, + "step": 2450 + }, + { + "epoch": 0.343037088873338, + "grad_norm": 0.4242841636188481, + "learning_rate": 3.8228753784712126e-05, + "loss": 0.5917, + "step": 2451 + }, + { + "epoch": 0.3431770468859342, + "grad_norm": 0.3909799623795335, + "learning_rate": 3.8219135795200417e-05, + "loss": 0.5294, + "step": 2452 + }, + { + "epoch": 0.34331700489853045, + "grad_norm": 0.4191631316056565, + "learning_rate": 3.8209515089022175e-05, + "loss": 0.6145, + "step": 2453 + }, + { + "epoch": 0.34345696291112665, + "grad_norm": 0.41444745056939064, + "learning_rate": 3.819989166815455e-05, + "loss": 0.5448, + "step": 2454 + }, + { + "epoch": 0.3435969209237229, + "grad_norm": 0.41319072983975424, + "learning_rate": 3.8190265534575256e-05, + "loss": 0.5493, + "step": 2455 + }, + { + "epoch": 0.3437368789363191, + "grad_norm": 0.4144210912873605, + "learning_rate": 3.818063669026256e-05, + "loss": 0.5699, + "step": 2456 + }, + { + "epoch": 0.3438768369489153, + "grad_norm": 0.43371658028513654, + "learning_rate": 3.817100513719529e-05, + "loss": 0.5994, + "step": 2457 + }, + { + "epoch": 0.34401679496151155, + "grad_norm": 0.39358381227775163, + "learning_rate": 3.8161370877352825e-05, + "loss": 0.5772, + "step": 2458 + }, + { + "epoch": 0.34415675297410775, + "grad_norm": 0.4524806967713819, + "learning_rate": 3.815173391271511e-05, + "loss": 0.5801, + "step": 2459 + }, + { + "epoch": 0.344296710986704, + "grad_norm": 0.42248629141745564, + "learning_rate": 3.814209424526262e-05, + "loss": 0.5344, + "step": 2460 + }, + { + "epoch": 0.3444366689993002, + "grad_norm": 0.4222729879954454, + "learning_rate": 3.813245187697643e-05, + "loss": 0.6305, + "step": 2461 + }, + { + "epoch": 0.34457662701189645, + "grad_norm": 0.40274756440667137, + "learning_rate": 3.812280680983812e-05, + "loss": 0.5587, + "step": 2462 + }, + { + "epoch": 0.34471658502449265, + "grad_norm": 0.4041971062889942, + "learning_rate": 3.8113159045829864e-05, + "loss": 0.5837, + "step": 2463 + }, + { + "epoch": 0.34485654303708885, + "grad_norm": 0.437567215932465, + "learning_rate": 3.8103508586934365e-05, + "loss": 0.6047, + "step": 2464 + }, + { + "epoch": 0.3449965010496851, + "grad_norm": 0.4015334832237547, + "learning_rate": 3.8093855435134914e-05, + "loss": 0.5504, + "step": 2465 + }, + { + "epoch": 0.3451364590622813, + "grad_norm": 0.42379890127600006, + "learning_rate": 3.8084199592415305e-05, + "loss": 0.5723, + "step": 2466 + }, + { + "epoch": 0.34527641707487755, + "grad_norm": 0.41715521655535315, + "learning_rate": 3.8074541060759925e-05, + "loss": 0.5857, + "step": 2467 + }, + { + "epoch": 0.34541637508747375, + "grad_norm": 0.43326783152483606, + "learning_rate": 3.806487984215369e-05, + "loss": 0.5603, + "step": 2468 + }, + { + "epoch": 0.34555633310007, + "grad_norm": 0.403241790805183, + "learning_rate": 3.8055215938582086e-05, + "loss": 0.5727, + "step": 2469 + }, + { + "epoch": 0.3456962911126662, + "grad_norm": 0.4061061612551589, + "learning_rate": 3.804554935203115e-05, + "loss": 0.5846, + "step": 2470 + }, + { + "epoch": 0.3458362491252624, + "grad_norm": 0.40214409369497894, + "learning_rate": 3.803588008448745e-05, + "loss": 0.5655, + "step": 2471 + }, + { + "epoch": 0.34597620713785865, + "grad_norm": 0.4213923914769129, + "learning_rate": 3.802620813793814e-05, + "loss": 0.5632, + "step": 2472 + }, + { + "epoch": 0.34611616515045485, + "grad_norm": 0.41515367784081497, + "learning_rate": 3.801653351437087e-05, + "loss": 0.562, + "step": 2473 + }, + { + "epoch": 0.3462561231630511, + "grad_norm": 0.424690045157795, + "learning_rate": 3.8006856215773895e-05, + "loss": 0.5889, + "step": 2474 + }, + { + "epoch": 0.3463960811756473, + "grad_norm": 0.43537881056181427, + "learning_rate": 3.799717624413599e-05, + "loss": 0.6304, + "step": 2475 + }, + { + "epoch": 0.34653603918824355, + "grad_norm": 0.40770798716655837, + "learning_rate": 3.798749360144651e-05, + "loss": 0.5743, + "step": 2476 + }, + { + "epoch": 0.34667599720083975, + "grad_norm": 0.42860038780017473, + "learning_rate": 3.7977808289695306e-05, + "loss": 0.6029, + "step": 2477 + }, + { + "epoch": 0.34681595521343594, + "grad_norm": 0.41741937935524637, + "learning_rate": 3.796812031087281e-05, + "loss": 0.5745, + "step": 2478 + }, + { + "epoch": 0.3469559132260322, + "grad_norm": 0.4169469655781413, + "learning_rate": 3.7958429666970024e-05, + "loss": 0.5932, + "step": 2479 + }, + { + "epoch": 0.3470958712386284, + "grad_norm": 0.41794717504089257, + "learning_rate": 3.7948736359978455e-05, + "loss": 0.5957, + "step": 2480 + }, + { + "epoch": 0.34723582925122465, + "grad_norm": 0.42237578550439164, + "learning_rate": 3.793904039189018e-05, + "loss": 0.5809, + "step": 2481 + }, + { + "epoch": 0.34737578726382085, + "grad_norm": 0.42457946983643247, + "learning_rate": 3.7929341764697816e-05, + "loss": 0.5741, + "step": 2482 + }, + { + "epoch": 0.3475157452764171, + "grad_norm": 0.40686808588079204, + "learning_rate": 3.791964048039454e-05, + "loss": 0.577, + "step": 2483 + }, + { + "epoch": 0.3476557032890133, + "grad_norm": 0.3917392399092806, + "learning_rate": 3.790993654097405e-05, + "loss": 0.5389, + "step": 2484 + }, + { + "epoch": 0.3477956613016095, + "grad_norm": 0.41842414036596237, + "learning_rate": 3.790022994843061e-05, + "loss": 0.6111, + "step": 2485 + }, + { + "epoch": 0.34793561931420575, + "grad_norm": 0.40264506155704255, + "learning_rate": 3.789052070475903e-05, + "loss": 0.5691, + "step": 2486 + }, + { + "epoch": 0.34807557732680194, + "grad_norm": 0.41841559375670284, + "learning_rate": 3.788080881195465e-05, + "loss": 0.5796, + "step": 2487 + }, + { + "epoch": 0.3482155353393982, + "grad_norm": 0.42238316240978235, + "learning_rate": 3.787109427201337e-05, + "loss": 0.5721, + "step": 2488 + }, + { + "epoch": 0.3483554933519944, + "grad_norm": 0.4261343625599289, + "learning_rate": 3.7861377086931615e-05, + "loss": 0.5762, + "step": 2489 + }, + { + "epoch": 0.34849545136459065, + "grad_norm": 0.40050526138599146, + "learning_rate": 3.785165725870638e-05, + "loss": 0.5771, + "step": 2490 + }, + { + "epoch": 0.34863540937718684, + "grad_norm": 0.4011183639320898, + "learning_rate": 3.7841934789335164e-05, + "loss": 0.5722, + "step": 2491 + }, + { + "epoch": 0.34877536738978304, + "grad_norm": 0.41515525438995227, + "learning_rate": 3.783220968081606e-05, + "loss": 0.5863, + "step": 2492 + }, + { + "epoch": 0.3489153254023793, + "grad_norm": 0.3930742764120805, + "learning_rate": 3.782248193514766e-05, + "loss": 0.531, + "step": 2493 + }, + { + "epoch": 0.3490552834149755, + "grad_norm": 0.41871895607441606, + "learning_rate": 3.7812751554329116e-05, + "loss": 0.5925, + "step": 2494 + }, + { + "epoch": 0.34919524142757175, + "grad_norm": 0.39386200492237877, + "learning_rate": 3.780301854036013e-05, + "loss": 0.6283, + "step": 2495 + }, + { + "epoch": 0.34933519944016794, + "grad_norm": 0.4562215128945137, + "learning_rate": 3.7793282895240926e-05, + "loss": 0.6435, + "step": 2496 + }, + { + "epoch": 0.3494751574527642, + "grad_norm": 0.40380935634400195, + "learning_rate": 3.778354462097229e-05, + "loss": 0.5874, + "step": 2497 + }, + { + "epoch": 0.3496151154653604, + "grad_norm": 0.37718467161463654, + "learning_rate": 3.7773803719555514e-05, + "loss": 0.5451, + "step": 2498 + }, + { + "epoch": 0.3497550734779566, + "grad_norm": 0.49862732423279105, + "learning_rate": 3.776406019299247e-05, + "loss": 0.566, + "step": 2499 + }, + { + "epoch": 0.34989503149055284, + "grad_norm": 0.3890315425799742, + "learning_rate": 3.7754314043285556e-05, + "loss": 0.541, + "step": 2500 + }, + { + "epoch": 0.35003498950314904, + "grad_norm": 0.4020611603166375, + "learning_rate": 3.774456527243768e-05, + "loss": 0.5703, + "step": 2501 + }, + { + "epoch": 0.3501749475157453, + "grad_norm": 0.4024001534789896, + "learning_rate": 3.7734813882452334e-05, + "loss": 0.5596, + "step": 2502 + }, + { + "epoch": 0.3503149055283415, + "grad_norm": 0.41123367006288974, + "learning_rate": 3.772505987533352e-05, + "loss": 0.5635, + "step": 2503 + }, + { + "epoch": 0.35045486354093774, + "grad_norm": 0.42428174810444297, + "learning_rate": 3.771530325308579e-05, + "loss": 0.5814, + "step": 2504 + }, + { + "epoch": 0.35059482155353394, + "grad_norm": 0.4038080676069653, + "learning_rate": 3.770554401771423e-05, + "loss": 0.5727, + "step": 2505 + }, + { + "epoch": 0.35073477956613014, + "grad_norm": 0.3839599878758453, + "learning_rate": 3.769578217122446e-05, + "loss": 0.5834, + "step": 2506 + }, + { + "epoch": 0.3508747375787264, + "grad_norm": 0.4158023938010911, + "learning_rate": 3.768601771562262e-05, + "loss": 0.5712, + "step": 2507 + }, + { + "epoch": 0.3510146955913226, + "grad_norm": 0.3943326004175278, + "learning_rate": 3.767625065291544e-05, + "loss": 0.568, + "step": 2508 + }, + { + "epoch": 0.35115465360391884, + "grad_norm": 0.3964702492091683, + "learning_rate": 3.766648098511012e-05, + "loss": 0.576, + "step": 2509 + }, + { + "epoch": 0.35129461161651504, + "grad_norm": 0.3921536448421572, + "learning_rate": 3.765670871421445e-05, + "loss": 0.5209, + "step": 2510 + }, + { + "epoch": 0.3514345696291113, + "grad_norm": 0.4351764473354882, + "learning_rate": 3.764693384223671e-05, + "loss": 0.5926, + "step": 2511 + }, + { + "epoch": 0.3515745276417075, + "grad_norm": 0.40588260989157027, + "learning_rate": 3.763715637118575e-05, + "loss": 0.548, + "step": 2512 + }, + { + "epoch": 0.3517144856543037, + "grad_norm": 0.4170639037160172, + "learning_rate": 3.762737630307093e-05, + "loss": 0.5875, + "step": 2513 + }, + { + "epoch": 0.35185444366689994, + "grad_norm": 0.4115129851905205, + "learning_rate": 3.761759363990215e-05, + "loss": 0.5462, + "step": 2514 + }, + { + "epoch": 0.35199440167949614, + "grad_norm": 0.4147105457172198, + "learning_rate": 3.7607808383689856e-05, + "loss": 0.5463, + "step": 2515 + }, + { + "epoch": 0.3521343596920924, + "grad_norm": 0.4071302253511469, + "learning_rate": 3.7598020536445017e-05, + "loss": 0.5445, + "step": 2516 + }, + { + "epoch": 0.3522743177046886, + "grad_norm": 0.39539443956776693, + "learning_rate": 3.758823010017913e-05, + "loss": 0.5455, + "step": 2517 + }, + { + "epoch": 0.35241427571728484, + "grad_norm": 0.39791097766830325, + "learning_rate": 3.7578437076904236e-05, + "loss": 0.5258, + "step": 2518 + }, + { + "epoch": 0.35255423372988104, + "grad_norm": 0.42035164161239397, + "learning_rate": 3.75686414686329e-05, + "loss": 0.5639, + "step": 2519 + }, + { + "epoch": 0.35269419174247724, + "grad_norm": 0.3830433028113876, + "learning_rate": 3.7558843277378206e-05, + "loss": 0.536, + "step": 2520 + }, + { + "epoch": 0.3528341497550735, + "grad_norm": 0.3933038656978647, + "learning_rate": 3.75490425051538e-05, + "loss": 0.5452, + "step": 2521 + }, + { + "epoch": 0.3529741077676697, + "grad_norm": 0.41406652639293745, + "learning_rate": 3.753923915397383e-05, + "loss": 0.5667, + "step": 2522 + }, + { + "epoch": 0.35311406578026594, + "grad_norm": 0.4201955355989494, + "learning_rate": 3.752943322585297e-05, + "loss": 0.5417, + "step": 2523 + }, + { + "epoch": 0.35325402379286214, + "grad_norm": 0.41614348716949034, + "learning_rate": 3.751962472280647e-05, + "loss": 0.5817, + "step": 2524 + }, + { + "epoch": 0.35339398180545833, + "grad_norm": 0.40621639691384537, + "learning_rate": 3.750981364685005e-05, + "loss": 0.5815, + "step": 2525 + }, + { + "epoch": 0.3535339398180546, + "grad_norm": 0.4007740627071781, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.5635, + "step": 2526 + }, + { + "epoch": 0.3536738978306508, + "grad_norm": 0.4088088272074455, + "learning_rate": 3.749018378427312e-05, + "loss": 0.599, + "step": 2527 + }, + { + "epoch": 0.35381385584324704, + "grad_norm": 0.4035751382941894, + "learning_rate": 3.7480365001686746e-05, + "loss": 0.601, + "step": 2528 + }, + { + "epoch": 0.35395381385584324, + "grad_norm": 0.41808007576821743, + "learning_rate": 3.747054365425872e-05, + "loss": 0.5625, + "step": 2529 + }, + { + "epoch": 0.3540937718684395, + "grad_norm": 0.4142769655137648, + "learning_rate": 3.7460719744007446e-05, + "loss": 0.5321, + "step": 2530 + }, + { + "epoch": 0.3542337298810357, + "grad_norm": 0.4258627449069856, + "learning_rate": 3.745089327295184e-05, + "loss": 0.5915, + "step": 2531 + }, + { + "epoch": 0.3543736878936319, + "grad_norm": 0.3979599134535474, + "learning_rate": 3.744106424311133e-05, + "loss": 0.528, + "step": 2532 + }, + { + "epoch": 0.35451364590622814, + "grad_norm": 0.40828188091566914, + "learning_rate": 3.7431232656505885e-05, + "loss": 0.584, + "step": 2533 + }, + { + "epoch": 0.35465360391882433, + "grad_norm": 0.41378663443999314, + "learning_rate": 3.7421398515155994e-05, + "loss": 0.5735, + "step": 2534 + }, + { + "epoch": 0.3547935619314206, + "grad_norm": 0.40350752325849487, + "learning_rate": 3.741156182108268e-05, + "loss": 0.5856, + "step": 2535 + }, + { + "epoch": 0.3549335199440168, + "grad_norm": 0.42065994177021454, + "learning_rate": 3.740172257630747e-05, + "loss": 0.5996, + "step": 2536 + }, + { + "epoch": 0.35507347795661304, + "grad_norm": 0.40860097335167056, + "learning_rate": 3.739188078285244e-05, + "loss": 0.6183, + "step": 2537 + }, + { + "epoch": 0.35521343596920923, + "grad_norm": 0.4122420177259316, + "learning_rate": 3.738203644274018e-05, + "loss": 0.5549, + "step": 2538 + }, + { + "epoch": 0.35535339398180543, + "grad_norm": 0.4181580931065648, + "learning_rate": 3.7372189557993794e-05, + "loss": 0.5945, + "step": 2539 + }, + { + "epoch": 0.3554933519944017, + "grad_norm": 0.41214229514921485, + "learning_rate": 3.7362340130636925e-05, + "loss": 0.5964, + "step": 2540 + }, + { + "epoch": 0.3556333100069979, + "grad_norm": 0.5379104247408596, + "learning_rate": 3.735248816269372e-05, + "loss": 0.5855, + "step": 2541 + }, + { + "epoch": 0.35577326801959414, + "grad_norm": 0.4221656282811352, + "learning_rate": 3.734263365618886e-05, + "loss": 0.611, + "step": 2542 + }, + { + "epoch": 0.35591322603219033, + "grad_norm": 0.4014247264225883, + "learning_rate": 3.7332776613147564e-05, + "loss": 0.5706, + "step": 2543 + }, + { + "epoch": 0.3560531840447866, + "grad_norm": 0.42213996299337353, + "learning_rate": 3.732291703559553e-05, + "loss": 0.5995, + "step": 2544 + }, + { + "epoch": 0.3561931420573828, + "grad_norm": 0.41540133704593746, + "learning_rate": 3.731305492555901e-05, + "loss": 0.5574, + "step": 2545 + }, + { + "epoch": 0.356333100069979, + "grad_norm": 0.4104571121982501, + "learning_rate": 3.7303190285064776e-05, + "loss": 0.5627, + "step": 2546 + }, + { + "epoch": 0.35647305808257523, + "grad_norm": 0.3942675491175812, + "learning_rate": 3.72933231161401e-05, + "loss": 0.558, + "step": 2547 + }, + { + "epoch": 0.35661301609517143, + "grad_norm": 0.39670035734631537, + "learning_rate": 3.7283453420812786e-05, + "loss": 0.5475, + "step": 2548 + }, + { + "epoch": 0.3567529741077677, + "grad_norm": 0.42150223375815987, + "learning_rate": 3.727358120111117e-05, + "loss": 0.5721, + "step": 2549 + }, + { + "epoch": 0.3568929321203639, + "grad_norm": 0.4008262706510209, + "learning_rate": 3.726370645906407e-05, + "loss": 0.5561, + "step": 2550 + }, + { + "epoch": 0.35703289013296013, + "grad_norm": 0.44040584835645996, + "learning_rate": 3.7253829196700876e-05, + "loss": 0.5846, + "step": 2551 + }, + { + "epoch": 0.35717284814555633, + "grad_norm": 0.4126914987733967, + "learning_rate": 3.7243949416051435e-05, + "loss": 0.5469, + "step": 2552 + }, + { + "epoch": 0.35731280615815253, + "grad_norm": 0.39603138357670026, + "learning_rate": 3.723406711914617e-05, + "loss": 0.5942, + "step": 2553 + }, + { + "epoch": 0.3574527641707488, + "grad_norm": 0.3852892928635738, + "learning_rate": 3.7224182308015975e-05, + "loss": 0.5928, + "step": 2554 + }, + { + "epoch": 0.357592722183345, + "grad_norm": 0.41121367387986824, + "learning_rate": 3.7214294984692285e-05, + "loss": 0.5853, + "step": 2555 + }, + { + "epoch": 0.35773268019594123, + "grad_norm": 0.4165573669555882, + "learning_rate": 3.7204405151207036e-05, + "loss": 0.5897, + "step": 2556 + }, + { + "epoch": 0.35787263820853743, + "grad_norm": 0.45676174939260117, + "learning_rate": 3.71945128095927e-05, + "loss": 0.5853, + "step": 2557 + }, + { + "epoch": 0.3580125962211337, + "grad_norm": 0.39957833133810244, + "learning_rate": 3.718461796188225e-05, + "loss": 0.5287, + "step": 2558 + }, + { + "epoch": 0.3581525542337299, + "grad_norm": 0.4145657616919604, + "learning_rate": 3.717472061010918e-05, + "loss": 0.593, + "step": 2559 + }, + { + "epoch": 0.3582925122463261, + "grad_norm": 0.4080507721332056, + "learning_rate": 3.71648207563075e-05, + "loss": 0.5634, + "step": 2560 + }, + { + "epoch": 0.35843247025892233, + "grad_norm": 0.3912915057782903, + "learning_rate": 3.715491840251172e-05, + "loss": 0.5949, + "step": 2561 + }, + { + "epoch": 0.35857242827151853, + "grad_norm": 0.41837049449187164, + "learning_rate": 3.714501355075688e-05, + "loss": 0.572, + "step": 2562 + }, + { + "epoch": 0.3587123862841148, + "grad_norm": 0.40328972090381965, + "learning_rate": 3.713510620307852e-05, + "loss": 0.5856, + "step": 2563 + }, + { + "epoch": 0.358852344296711, + "grad_norm": 0.431299030659745, + "learning_rate": 3.712519636151272e-05, + "loss": 0.5952, + "step": 2564 + }, + { + "epoch": 0.35899230230930723, + "grad_norm": 0.3985200361199881, + "learning_rate": 3.711528402809603e-05, + "loss": 0.5533, + "step": 2565 + }, + { + "epoch": 0.35913226032190343, + "grad_norm": 0.4192753590584814, + "learning_rate": 3.710536920486555e-05, + "loss": 0.5667, + "step": 2566 + }, + { + "epoch": 0.3592722183344996, + "grad_norm": 0.42240052564081954, + "learning_rate": 3.709545189385887e-05, + "loss": 0.5708, + "step": 2567 + }, + { + "epoch": 0.3594121763470959, + "grad_norm": 0.4236767334123938, + "learning_rate": 3.708553209711409e-05, + "loss": 0.5485, + "step": 2568 + }, + { + "epoch": 0.3595521343596921, + "grad_norm": 0.40080357675210276, + "learning_rate": 3.707560981666986e-05, + "loss": 0.557, + "step": 2569 + }, + { + "epoch": 0.35969209237228833, + "grad_norm": 0.410487921988623, + "learning_rate": 3.706568505456527e-05, + "loss": 0.5547, + "step": 2570 + }, + { + "epoch": 0.3598320503848845, + "grad_norm": 0.41368891130760344, + "learning_rate": 3.705575781283999e-05, + "loss": 0.5673, + "step": 2571 + }, + { + "epoch": 0.3599720083974808, + "grad_norm": 0.40193894214258574, + "learning_rate": 3.704582809353415e-05, + "loss": 0.5925, + "step": 2572 + }, + { + "epoch": 0.360111966410077, + "grad_norm": 0.407102772883237, + "learning_rate": 3.703589589868841e-05, + "loss": 0.5658, + "step": 2573 + }, + { + "epoch": 0.3602519244226732, + "grad_norm": 0.4100475297715309, + "learning_rate": 3.702596123034395e-05, + "loss": 0.5946, + "step": 2574 + }, + { + "epoch": 0.36039188243526943, + "grad_norm": 0.4005884577498317, + "learning_rate": 3.701602409054243e-05, + "loss": 0.5293, + "step": 2575 + }, + { + "epoch": 0.3605318404478656, + "grad_norm": 0.37159137413391713, + "learning_rate": 3.700608448132604e-05, + "loss": 0.5973, + "step": 2576 + }, + { + "epoch": 0.3606717984604619, + "grad_norm": 0.41775069760426675, + "learning_rate": 3.699614240473748e-05, + "loss": 0.5731, + "step": 2577 + }, + { + "epoch": 0.3608117564730581, + "grad_norm": 0.4536333746914714, + "learning_rate": 3.6986197862819934e-05, + "loss": 0.5814, + "step": 2578 + }, + { + "epoch": 0.36095171448565433, + "grad_norm": 0.415665084054717, + "learning_rate": 3.69762508576171e-05, + "loss": 0.5813, + "step": 2579 + }, + { + "epoch": 0.3610916724982505, + "grad_norm": 0.4111649076881108, + "learning_rate": 3.6966301391173206e-05, + "loss": 0.5687, + "step": 2580 + }, + { + "epoch": 0.3612316305108467, + "grad_norm": 0.45348364261588203, + "learning_rate": 3.695634946553296e-05, + "loss": 0.5789, + "step": 2581 + }, + { + "epoch": 0.361371588523443, + "grad_norm": 0.46524048585079214, + "learning_rate": 3.694639508274158e-05, + "loss": 0.605, + "step": 2582 + }, + { + "epoch": 0.3615115465360392, + "grad_norm": 0.43254745661664457, + "learning_rate": 3.69364382448448e-05, + "loss": 0.5723, + "step": 2583 + }, + { + "epoch": 0.3616515045486354, + "grad_norm": 0.5926681683204299, + "learning_rate": 3.692647895388884e-05, + "loss": 0.5657, + "step": 2584 + }, + { + "epoch": 0.3617914625612316, + "grad_norm": 0.3984247442681412, + "learning_rate": 3.691651721192046e-05, + "loss": 0.5375, + "step": 2585 + }, + { + "epoch": 0.3619314205738279, + "grad_norm": 0.4256089941995705, + "learning_rate": 3.6906553020986876e-05, + "loss": 0.5663, + "step": 2586 + }, + { + "epoch": 0.3620713785864241, + "grad_norm": 0.3955060934018287, + "learning_rate": 3.6896586383135835e-05, + "loss": 0.5367, + "step": 2587 + }, + { + "epoch": 0.3622113365990203, + "grad_norm": 0.3983798844884697, + "learning_rate": 3.688661730041559e-05, + "loss": 0.5757, + "step": 2588 + }, + { + "epoch": 0.3623512946116165, + "grad_norm": 0.40920669971645085, + "learning_rate": 3.6876645774874876e-05, + "loss": 0.5445, + "step": 2589 + }, + { + "epoch": 0.3624912526242127, + "grad_norm": 0.43900374382517116, + "learning_rate": 3.6866671808562945e-05, + "loss": 0.5702, + "step": 2590 + }, + { + "epoch": 0.362631210636809, + "grad_norm": 0.4009385370546209, + "learning_rate": 3.685669540352957e-05, + "loss": 0.5873, + "step": 2591 + }, + { + "epoch": 0.3627711686494052, + "grad_norm": 0.4010104073772178, + "learning_rate": 3.6846716561824965e-05, + "loss": 0.5693, + "step": 2592 + }, + { + "epoch": 0.3629111266620014, + "grad_norm": 0.3942578614306398, + "learning_rate": 3.6836735285499924e-05, + "loss": 0.5459, + "step": 2593 + }, + { + "epoch": 0.3630510846745976, + "grad_norm": 0.4052354355699724, + "learning_rate": 3.682675157660567e-05, + "loss": 0.5753, + "step": 2594 + }, + { + "epoch": 0.3631910426871938, + "grad_norm": 0.4111971028442228, + "learning_rate": 3.681676543719396e-05, + "loss": 0.6152, + "step": 2595 + }, + { + "epoch": 0.3633310006997901, + "grad_norm": 0.38664588990396315, + "learning_rate": 3.680677686931707e-05, + "loss": 0.5354, + "step": 2596 + }, + { + "epoch": 0.36347095871238627, + "grad_norm": 0.4207284242435013, + "learning_rate": 3.679678587502773e-05, + "loss": 0.576, + "step": 2597 + }, + { + "epoch": 0.3636109167249825, + "grad_norm": 0.40595710076845204, + "learning_rate": 3.67867924563792e-05, + "loss": 0.5885, + "step": 2598 + }, + { + "epoch": 0.3637508747375787, + "grad_norm": 0.41445075167878537, + "learning_rate": 3.6776796615425224e-05, + "loss": 0.5732, + "step": 2599 + }, + { + "epoch": 0.363890832750175, + "grad_norm": 0.38888704179883676, + "learning_rate": 3.6766798354220054e-05, + "loss": 0.5658, + "step": 2600 + }, + { + "epoch": 0.3640307907627712, + "grad_norm": 0.40093975960461736, + "learning_rate": 3.675679767481842e-05, + "loss": 0.5597, + "step": 2601 + }, + { + "epoch": 0.36417074877536737, + "grad_norm": 0.4321442568384402, + "learning_rate": 3.674679457927559e-05, + "loss": 0.5451, + "step": 2602 + }, + { + "epoch": 0.3643107067879636, + "grad_norm": 0.41193216461597326, + "learning_rate": 3.673678906964727e-05, + "loss": 0.5562, + "step": 2603 + }, + { + "epoch": 0.3644506648005598, + "grad_norm": 0.3982633594790959, + "learning_rate": 3.672678114798972e-05, + "loss": 0.5463, + "step": 2604 + }, + { + "epoch": 0.3645906228131561, + "grad_norm": 0.42438474880866467, + "learning_rate": 3.671677081635966e-05, + "loss": 0.5809, + "step": 2605 + }, + { + "epoch": 0.36473058082575227, + "grad_norm": 0.4217628754836848, + "learning_rate": 3.67067580768143e-05, + "loss": 0.553, + "step": 2606 + }, + { + "epoch": 0.36487053883834847, + "grad_norm": 0.42309673305357437, + "learning_rate": 3.669674293141139e-05, + "loss": 0.6209, + "step": 2607 + }, + { + "epoch": 0.3650104968509447, + "grad_norm": 0.3920738753776142, + "learning_rate": 3.6686725382209114e-05, + "loss": 0.5617, + "step": 2608 + }, + { + "epoch": 0.3651504548635409, + "grad_norm": 0.39484584407664886, + "learning_rate": 3.66767054312662e-05, + "loss": 0.5167, + "step": 2609 + }, + { + "epoch": 0.36529041287613717, + "grad_norm": 0.4255886790117686, + "learning_rate": 3.6666683080641846e-05, + "loss": 0.6193, + "step": 2610 + }, + { + "epoch": 0.36543037088873337, + "grad_norm": 0.40579442495363904, + "learning_rate": 3.665665833239574e-05, + "loss": 0.5342, + "step": 2611 + }, + { + "epoch": 0.3655703289013296, + "grad_norm": 0.4233421685048626, + "learning_rate": 3.664663118858808e-05, + "loss": 0.5465, + "step": 2612 + }, + { + "epoch": 0.3657102869139258, + "grad_norm": 0.4174294680767461, + "learning_rate": 3.6636601651279524e-05, + "loss": 0.5849, + "step": 2613 + }, + { + "epoch": 0.365850244926522, + "grad_norm": 0.44476352179830164, + "learning_rate": 3.662656972253127e-05, + "loss": 0.5717, + "step": 2614 + }, + { + "epoch": 0.36599020293911827, + "grad_norm": 0.3948209367210007, + "learning_rate": 3.6616535404404964e-05, + "loss": 0.5529, + "step": 2615 + }, + { + "epoch": 0.36613016095171447, + "grad_norm": 0.40738682961697953, + "learning_rate": 3.660649869896276e-05, + "loss": 0.5268, + "step": 2616 + }, + { + "epoch": 0.3662701189643107, + "grad_norm": 0.40020906641099585, + "learning_rate": 3.659645960826732e-05, + "loss": 0.5567, + "step": 2617 + }, + { + "epoch": 0.3664100769769069, + "grad_norm": 0.4237523890631533, + "learning_rate": 3.658641813438176e-05, + "loss": 0.573, + "step": 2618 + }, + { + "epoch": 0.36655003498950317, + "grad_norm": 0.4325798007082038, + "learning_rate": 3.657637427936972e-05, + "loss": 0.5791, + "step": 2619 + }, + { + "epoch": 0.36668999300209937, + "grad_norm": 0.43907042323555606, + "learning_rate": 3.65663280452953e-05, + "loss": 0.5786, + "step": 2620 + }, + { + "epoch": 0.36682995101469557, + "grad_norm": 0.4323098724184291, + "learning_rate": 3.6556279434223116e-05, + "loss": 0.6016, + "step": 2621 + }, + { + "epoch": 0.3669699090272918, + "grad_norm": 0.4265584212865507, + "learning_rate": 3.654622844821825e-05, + "loss": 0.623, + "step": 2622 + }, + { + "epoch": 0.367109867039888, + "grad_norm": 0.4179019694136672, + "learning_rate": 3.6536175089346285e-05, + "loss": 0.5468, + "step": 2623 + }, + { + "epoch": 0.36724982505248427, + "grad_norm": 0.41889421083309525, + "learning_rate": 3.6526119359673284e-05, + "loss": 0.5574, + "step": 2624 + }, + { + "epoch": 0.36738978306508047, + "grad_norm": 0.41710292728469345, + "learning_rate": 3.651606126126581e-05, + "loss": 0.5781, + "step": 2625 + }, + { + "epoch": 0.3675297410776767, + "grad_norm": 0.3897312626800565, + "learning_rate": 3.65060007961909e-05, + "loss": 0.5193, + "step": 2626 + }, + { + "epoch": 0.3676696990902729, + "grad_norm": 0.42386091892910743, + "learning_rate": 3.649593796651608e-05, + "loss": 0.5917, + "step": 2627 + }, + { + "epoch": 0.3678096571028691, + "grad_norm": 0.414524194950392, + "learning_rate": 3.648587277430936e-05, + "loss": 0.5758, + "step": 2628 + }, + { + "epoch": 0.36794961511546537, + "grad_norm": 0.4038428167881955, + "learning_rate": 3.647580522163925e-05, + "loss": 0.5621, + "step": 2629 + }, + { + "epoch": 0.36808957312806156, + "grad_norm": 0.4006767303991816, + "learning_rate": 3.646573531057473e-05, + "loss": 0.5586, + "step": 2630 + }, + { + "epoch": 0.3682295311406578, + "grad_norm": 0.40655169612049585, + "learning_rate": 3.645566304318526e-05, + "loss": 0.6016, + "step": 2631 + }, + { + "epoch": 0.368369489153254, + "grad_norm": 0.434790153457983, + "learning_rate": 3.644558842154081e-05, + "loss": 0.5481, + "step": 2632 + }, + { + "epoch": 0.36850944716585027, + "grad_norm": 0.4099645764830532, + "learning_rate": 3.6435511447711804e-05, + "loss": 0.5738, + "step": 2633 + }, + { + "epoch": 0.36864940517844647, + "grad_norm": 0.41274450825271575, + "learning_rate": 3.642543212376916e-05, + "loss": 0.5991, + "step": 2634 + }, + { + "epoch": 0.36878936319104266, + "grad_norm": 0.4032135476124307, + "learning_rate": 3.6415350451784294e-05, + "loss": 0.5499, + "step": 2635 + }, + { + "epoch": 0.3689293212036389, + "grad_norm": 0.4188775277615029, + "learning_rate": 3.6405266433829075e-05, + "loss": 0.5654, + "step": 2636 + }, + { + "epoch": 0.3690692792162351, + "grad_norm": 0.5178966570845596, + "learning_rate": 3.6395180071975885e-05, + "loss": 0.5396, + "step": 2637 + }, + { + "epoch": 0.36920923722883137, + "grad_norm": 0.40182828296097955, + "learning_rate": 3.638509136829758e-05, + "loss": 0.5652, + "step": 2638 + }, + { + "epoch": 0.36934919524142756, + "grad_norm": 0.390078684434463, + "learning_rate": 3.637500032486747e-05, + "loss": 0.557, + "step": 2639 + }, + { + "epoch": 0.3694891532540238, + "grad_norm": 0.3931686164476163, + "learning_rate": 3.636490694375938e-05, + "loss": 0.5841, + "step": 2640 + }, + { + "epoch": 0.36962911126662, + "grad_norm": 0.39382269721567603, + "learning_rate": 3.63548112270476e-05, + "loss": 0.5575, + "step": 2641 + }, + { + "epoch": 0.3697690692792162, + "grad_norm": 0.42173997864130025, + "learning_rate": 3.63447131768069e-05, + "loss": 0.6018, + "step": 2642 + }, + { + "epoch": 0.36990902729181246, + "grad_norm": 0.43303379767097616, + "learning_rate": 3.6334612795112534e-05, + "loss": 0.5894, + "step": 2643 + }, + { + "epoch": 0.37004898530440866, + "grad_norm": 0.40551708557859245, + "learning_rate": 3.632451008404024e-05, + "loss": 0.5822, + "step": 2644 + }, + { + "epoch": 0.3701889433170049, + "grad_norm": 0.4199307056582574, + "learning_rate": 3.631440504566621e-05, + "loss": 0.5359, + "step": 2645 + }, + { + "epoch": 0.3703289013296011, + "grad_norm": 0.3856025757451338, + "learning_rate": 3.6304297682067144e-05, + "loss": 0.5279, + "step": 2646 + }, + { + "epoch": 0.37046885934219737, + "grad_norm": 0.39790868545989994, + "learning_rate": 3.6294187995320214e-05, + "loss": 0.5685, + "step": 2647 + }, + { + "epoch": 0.37060881735479356, + "grad_norm": 0.41011662651634145, + "learning_rate": 3.628407598750305e-05, + "loss": 0.5732, + "step": 2648 + }, + { + "epoch": 0.37074877536738976, + "grad_norm": 0.41740394404930137, + "learning_rate": 3.627396166069377e-05, + "loss": 0.5922, + "step": 2649 + }, + { + "epoch": 0.370888733379986, + "grad_norm": 0.41661946277247025, + "learning_rate": 3.626384501697099e-05, + "loss": 0.5501, + "step": 2650 + }, + { + "epoch": 0.3710286913925822, + "grad_norm": 0.42202366075884673, + "learning_rate": 3.625372605841376e-05, + "loss": 0.5659, + "step": 2651 + }, + { + "epoch": 0.37116864940517846, + "grad_norm": 0.39038696179315746, + "learning_rate": 3.624360478710165e-05, + "loss": 0.5729, + "step": 2652 + }, + { + "epoch": 0.37130860741777466, + "grad_norm": 0.41764113003828224, + "learning_rate": 3.623348120511466e-05, + "loss": 0.5663, + "step": 2653 + }, + { + "epoch": 0.3714485654303709, + "grad_norm": 0.39913226890826564, + "learning_rate": 3.622335531453331e-05, + "loss": 0.5645, + "step": 2654 + }, + { + "epoch": 0.3715885234429671, + "grad_norm": 0.3939385454277571, + "learning_rate": 3.621322711743858e-05, + "loss": 0.5631, + "step": 2655 + }, + { + "epoch": 0.3717284814555633, + "grad_norm": 0.41308594557194117, + "learning_rate": 3.6203096615911884e-05, + "loss": 0.6052, + "step": 2656 + }, + { + "epoch": 0.37186843946815956, + "grad_norm": 0.4298728275791088, + "learning_rate": 3.619296381203517e-05, + "loss": 0.5396, + "step": 2657 + }, + { + "epoch": 0.37200839748075576, + "grad_norm": 0.4544709711994261, + "learning_rate": 3.6182828707890816e-05, + "loss": 0.5973, + "step": 2658 + }, + { + "epoch": 0.372148355493352, + "grad_norm": 0.417274676645833, + "learning_rate": 3.61726913055617e-05, + "loss": 0.58, + "step": 2659 + }, + { + "epoch": 0.3722883135059482, + "grad_norm": 0.4027341706183872, + "learning_rate": 3.6162551607131164e-05, + "loss": 0.5793, + "step": 2660 + }, + { + "epoch": 0.37242827151854446, + "grad_norm": 0.41668849630230104, + "learning_rate": 3.615240961468301e-05, + "loss": 0.5573, + "step": 2661 + }, + { + "epoch": 0.37256822953114066, + "grad_norm": 0.40880781367892177, + "learning_rate": 3.614226533030152e-05, + "loss": 0.5821, + "step": 2662 + }, + { + "epoch": 0.37270818754373686, + "grad_norm": 0.4154234374761176, + "learning_rate": 3.6132118756071456e-05, + "loss": 0.5932, + "step": 2663 + }, + { + "epoch": 0.3728481455563331, + "grad_norm": 0.3839166628649522, + "learning_rate": 3.6121969894078024e-05, + "loss": 0.5498, + "step": 2664 + }, + { + "epoch": 0.3729881035689293, + "grad_norm": 0.42314063405669433, + "learning_rate": 3.611181874640694e-05, + "loss": 0.571, + "step": 2665 + }, + { + "epoch": 0.37312806158152556, + "grad_norm": 0.3944581050181605, + "learning_rate": 3.610166531514436e-05, + "loss": 0.5382, + "step": 2666 + }, + { + "epoch": 0.37326801959412176, + "grad_norm": 0.39463547519248415, + "learning_rate": 3.60915096023769e-05, + "loss": 0.5544, + "step": 2667 + }, + { + "epoch": 0.373407977606718, + "grad_norm": 0.41268049741997787, + "learning_rate": 3.6081351610191696e-05, + "loss": 0.5427, + "step": 2668 + }, + { + "epoch": 0.3735479356193142, + "grad_norm": 0.4095949213776676, + "learning_rate": 3.607119134067629e-05, + "loss": 0.5564, + "step": 2669 + }, + { + "epoch": 0.3736878936319104, + "grad_norm": 0.39819288148024584, + "learning_rate": 3.606102879591874e-05, + "loss": 0.5584, + "step": 2670 + }, + { + "epoch": 0.37382785164450666, + "grad_norm": 0.4133173629042419, + "learning_rate": 3.605086397800753e-05, + "loss": 0.5563, + "step": 2671 + }, + { + "epoch": 0.37396780965710286, + "grad_norm": 0.39980433778091934, + "learning_rate": 3.604069688903165e-05, + "loss": 0.5536, + "step": 2672 + }, + { + "epoch": 0.3741077676696991, + "grad_norm": 0.4143096692418758, + "learning_rate": 3.603052753108053e-05, + "loss": 0.6135, + "step": 2673 + }, + { + "epoch": 0.3742477256822953, + "grad_norm": 0.4220107842655034, + "learning_rate": 3.602035590624409e-05, + "loss": 0.5875, + "step": 2674 + }, + { + "epoch": 0.37438768369489156, + "grad_norm": 0.41274829656377915, + "learning_rate": 3.6010182016612695e-05, + "loss": 0.553, + "step": 2675 + }, + { + "epoch": 0.37452764170748776, + "grad_norm": 0.4077340731853976, + "learning_rate": 3.600000586427718e-05, + "loss": 0.5626, + "step": 2676 + }, + { + "epoch": 0.37466759972008395, + "grad_norm": 0.432520643568383, + "learning_rate": 3.598982745132885e-05, + "loss": 0.5764, + "step": 2677 + }, + { + "epoch": 0.3748075577326802, + "grad_norm": 0.3972351037179022, + "learning_rate": 3.597964677985946e-05, + "loss": 0.5439, + "step": 2678 + }, + { + "epoch": 0.3749475157452764, + "grad_norm": 0.421002231272715, + "learning_rate": 3.596946385196126e-05, + "loss": 0.5546, + "step": 2679 + }, + { + "epoch": 0.37508747375787266, + "grad_norm": 0.41810855014069603, + "learning_rate": 3.5959278669726935e-05, + "loss": 0.5862, + "step": 2680 + }, + { + "epoch": 0.37522743177046886, + "grad_norm": 0.3799508514194255, + "learning_rate": 3.594909123524965e-05, + "loss": 0.5434, + "step": 2681 + }, + { + "epoch": 0.37536738978306505, + "grad_norm": 0.4035591025821781, + "learning_rate": 3.593890155062302e-05, + "loss": 0.5798, + "step": 2682 + }, + { + "epoch": 0.3755073477956613, + "grad_norm": 0.4114244071207664, + "learning_rate": 3.592870961794113e-05, + "loss": 0.6205, + "step": 2683 + }, + { + "epoch": 0.3756473058082575, + "grad_norm": 0.4076569672449696, + "learning_rate": 3.5918515439298526e-05, + "loss": 0.5798, + "step": 2684 + }, + { + "epoch": 0.37578726382085376, + "grad_norm": 0.4051977237739006, + "learning_rate": 3.5908319016790214e-05, + "loss": 0.5707, + "step": 2685 + }, + { + "epoch": 0.37592722183344995, + "grad_norm": 0.41477449458013194, + "learning_rate": 3.589812035251167e-05, + "loss": 0.58, + "step": 2686 + }, + { + "epoch": 0.3760671798460462, + "grad_norm": 0.4265813533603491, + "learning_rate": 3.588791944855881e-05, + "loss": 0.5891, + "step": 2687 + }, + { + "epoch": 0.3762071378586424, + "grad_norm": 0.4009173028671792, + "learning_rate": 3.587771630702803e-05, + "loss": 0.5559, + "step": 2688 + }, + { + "epoch": 0.3763470958712386, + "grad_norm": 0.41516714213673056, + "learning_rate": 3.586751093001618e-05, + "loss": 0.5518, + "step": 2689 + }, + { + "epoch": 0.37648705388383485, + "grad_norm": 0.4246576996594534, + "learning_rate": 3.5857303319620566e-05, + "loss": 0.5804, + "step": 2690 + }, + { + "epoch": 0.37662701189643105, + "grad_norm": 0.3882381660222785, + "learning_rate": 3.5847093477938956e-05, + "loss": 0.5588, + "step": 2691 + }, + { + "epoch": 0.3767669699090273, + "grad_norm": 0.41694303450281095, + "learning_rate": 3.583688140706958e-05, + "loss": 0.5694, + "step": 2692 + }, + { + "epoch": 0.3769069279216235, + "grad_norm": 0.39378160604472995, + "learning_rate": 3.5826667109111115e-05, + "loss": 0.5595, + "step": 2693 + }, + { + "epoch": 0.37704688593421976, + "grad_norm": 0.38592197692564867, + "learning_rate": 3.581645058616271e-05, + "loss": 0.5418, + "step": 2694 + }, + { + "epoch": 0.37718684394681595, + "grad_norm": 0.3930395842122637, + "learning_rate": 3.580623184032396e-05, + "loss": 0.5424, + "step": 2695 + }, + { + "epoch": 0.37732680195941215, + "grad_norm": 0.4176452454515421, + "learning_rate": 3.579601087369492e-05, + "loss": 0.5433, + "step": 2696 + }, + { + "epoch": 0.3774667599720084, + "grad_norm": 0.4189236803610346, + "learning_rate": 3.5785787688376104e-05, + "loss": 0.5822, + "step": 2697 + }, + { + "epoch": 0.3776067179846046, + "grad_norm": 0.38962623396875146, + "learning_rate": 3.577556228646849e-05, + "loss": 0.5357, + "step": 2698 + }, + { + "epoch": 0.37774667599720085, + "grad_norm": 0.4020721519778404, + "learning_rate": 3.576533467007349e-05, + "loss": 0.5658, + "step": 2699 + }, + { + "epoch": 0.37788663400979705, + "grad_norm": 0.43467388992700556, + "learning_rate": 3.5755104841292974e-05, + "loss": 0.6269, + "step": 2700 + }, + { + "epoch": 0.3780265920223933, + "grad_norm": 0.41515218431176615, + "learning_rate": 3.5744872802229296e-05, + "loss": 0.5777, + "step": 2701 + }, + { + "epoch": 0.3781665500349895, + "grad_norm": 0.4299136904928923, + "learning_rate": 3.5734638554985236e-05, + "loss": 0.6001, + "step": 2702 + }, + { + "epoch": 0.3783065080475857, + "grad_norm": 0.4051757493692295, + "learning_rate": 3.5724402101664023e-05, + "loss": 0.5337, + "step": 2703 + }, + { + "epoch": 0.37844646606018195, + "grad_norm": 0.41621036812349377, + "learning_rate": 3.571416344436938e-05, + "loss": 0.5754, + "step": 2704 + }, + { + "epoch": 0.37858642407277815, + "grad_norm": 0.4041482546847394, + "learning_rate": 3.5703922585205416e-05, + "loss": 0.5556, + "step": 2705 + }, + { + "epoch": 0.3787263820853744, + "grad_norm": 0.4120614594717955, + "learning_rate": 3.569367952627677e-05, + "loss": 0.5881, + "step": 2706 + }, + { + "epoch": 0.3788663400979706, + "grad_norm": 0.4225642230493687, + "learning_rate": 3.5683434269688485e-05, + "loss": 0.5639, + "step": 2707 + }, + { + "epoch": 0.37900629811056685, + "grad_norm": 0.42743372284510195, + "learning_rate": 3.567318681754605e-05, + "loss": 0.5866, + "step": 2708 + }, + { + "epoch": 0.37914625612316305, + "grad_norm": 0.39443077799329296, + "learning_rate": 3.566293717195543e-05, + "loss": 0.5282, + "step": 2709 + }, + { + "epoch": 0.37928621413575925, + "grad_norm": 0.3985257910214282, + "learning_rate": 3.565268533502303e-05, + "loss": 0.5773, + "step": 2710 + }, + { + "epoch": 0.3794261721483555, + "grad_norm": 0.4048944059124916, + "learning_rate": 3.5642431308855705e-05, + "loss": 0.5366, + "step": 2711 + }, + { + "epoch": 0.3795661301609517, + "grad_norm": 0.39796926870963256, + "learning_rate": 3.563217509556076e-05, + "loss": 0.5375, + "step": 2712 + }, + { + "epoch": 0.37970608817354795, + "grad_norm": 0.42405605140429314, + "learning_rate": 3.562191669724597e-05, + "loss": 0.5735, + "step": 2713 + }, + { + "epoch": 0.37984604618614415, + "grad_norm": 0.415293963968241, + "learning_rate": 3.56116561160195e-05, + "loss": 0.5616, + "step": 2714 + }, + { + "epoch": 0.3799860041987404, + "grad_norm": 0.40013621152671924, + "learning_rate": 3.5601393353990046e-05, + "loss": 0.5454, + "step": 2715 + }, + { + "epoch": 0.3801259622113366, + "grad_norm": 0.4248537718600079, + "learning_rate": 3.5591128413266686e-05, + "loss": 0.5732, + "step": 2716 + }, + { + "epoch": 0.3802659202239328, + "grad_norm": 0.4520252264272862, + "learning_rate": 3.558086129595898e-05, + "loss": 0.53, + "step": 2717 + }, + { + "epoch": 0.38040587823652905, + "grad_norm": 0.43210620967696467, + "learning_rate": 3.557059200417691e-05, + "loss": 0.572, + "step": 2718 + }, + { + "epoch": 0.38054583624912525, + "grad_norm": 0.4125830527654036, + "learning_rate": 3.556032054003093e-05, + "loss": 0.5677, + "step": 2719 + }, + { + "epoch": 0.3806857942617215, + "grad_norm": 0.4121722034558122, + "learning_rate": 3.555004690563193e-05, + "loss": 0.557, + "step": 2720 + }, + { + "epoch": 0.3808257522743177, + "grad_norm": 0.4349037334371718, + "learning_rate": 3.553977110309125e-05, + "loss": 0.6072, + "step": 2721 + }, + { + "epoch": 0.38096571028691395, + "grad_norm": 0.4145990063820183, + "learning_rate": 3.552949313452067e-05, + "loss": 0.5716, + "step": 2722 + }, + { + "epoch": 0.38110566829951015, + "grad_norm": 0.4199300937861851, + "learning_rate": 3.5519213002032404e-05, + "loss": 0.55, + "step": 2723 + }, + { + "epoch": 0.38124562631210634, + "grad_norm": 0.4125752251581557, + "learning_rate": 3.550893070773914e-05, + "loss": 0.5758, + "step": 2724 + }, + { + "epoch": 0.3813855843247026, + "grad_norm": 0.42548401480941495, + "learning_rate": 3.5498646253753986e-05, + "loss": 0.5751, + "step": 2725 + }, + { + "epoch": 0.3815255423372988, + "grad_norm": 0.42522116248828645, + "learning_rate": 3.54883596421905e-05, + "loss": 0.5816, + "step": 2726 + }, + { + "epoch": 0.38166550034989505, + "grad_norm": 0.41791254635783553, + "learning_rate": 3.5478070875162694e-05, + "loss": 0.5298, + "step": 2727 + }, + { + "epoch": 0.38180545836249125, + "grad_norm": 0.4058287630837772, + "learning_rate": 3.5467779954785e-05, + "loss": 0.5567, + "step": 2728 + }, + { + "epoch": 0.3819454163750875, + "grad_norm": 0.42582677147713893, + "learning_rate": 3.545748688317232e-05, + "loss": 0.6058, + "step": 2729 + }, + { + "epoch": 0.3820853743876837, + "grad_norm": 0.4103609286165152, + "learning_rate": 3.544719166243998e-05, + "loss": 0.5774, + "step": 2730 + }, + { + "epoch": 0.3822253324002799, + "grad_norm": 0.401572700005355, + "learning_rate": 3.543689429470375e-05, + "loss": 0.5411, + "step": 2731 + }, + { + "epoch": 0.38236529041287615, + "grad_norm": 0.3929209916840099, + "learning_rate": 3.5426594782079846e-05, + "loss": 0.5373, + "step": 2732 + }, + { + "epoch": 0.38250524842547234, + "grad_norm": 0.40599385466099075, + "learning_rate": 3.541629312668492e-05, + "loss": 0.6015, + "step": 2733 + }, + { + "epoch": 0.3826452064380686, + "grad_norm": 0.41006832838571494, + "learning_rate": 3.540598933063607e-05, + "loss": 0.5509, + "step": 2734 + }, + { + "epoch": 0.3827851644506648, + "grad_norm": 0.4107953458690507, + "learning_rate": 3.5395683396050825e-05, + "loss": 0.5818, + "step": 2735 + }, + { + "epoch": 0.38292512246326105, + "grad_norm": 0.4164444405879751, + "learning_rate": 3.5385375325047166e-05, + "loss": 0.5664, + "step": 2736 + }, + { + "epoch": 0.38306508047585724, + "grad_norm": 0.39131447269722003, + "learning_rate": 3.53750651197435e-05, + "loss": 0.5853, + "step": 2737 + }, + { + "epoch": 0.38320503848845344, + "grad_norm": 0.4251859387890631, + "learning_rate": 3.5364752782258684e-05, + "loss": 0.6136, + "step": 2738 + }, + { + "epoch": 0.3833449965010497, + "grad_norm": 0.4074457295926625, + "learning_rate": 3.535443831471201e-05, + "loss": 0.5696, + "step": 2739 + }, + { + "epoch": 0.3834849545136459, + "grad_norm": 0.4275456468988683, + "learning_rate": 3.534412171922319e-05, + "loss": 0.5782, + "step": 2740 + }, + { + "epoch": 0.38362491252624215, + "grad_norm": 0.3958129275901648, + "learning_rate": 3.533380299791241e-05, + "loss": 0.6016, + "step": 2741 + }, + { + "epoch": 0.38376487053883834, + "grad_norm": 0.41339333649591314, + "learning_rate": 3.5323482152900254e-05, + "loss": 0.5741, + "step": 2742 + }, + { + "epoch": 0.3839048285514346, + "grad_norm": 0.4018942596879924, + "learning_rate": 3.5313159186307784e-05, + "loss": 0.5567, + "step": 2743 + }, + { + "epoch": 0.3840447865640308, + "grad_norm": 0.40959426159798856, + "learning_rate": 3.530283410025645e-05, + "loss": 0.5545, + "step": 2744 + }, + { + "epoch": 0.384184744576627, + "grad_norm": 0.39955019012640813, + "learning_rate": 3.529250689686817e-05, + "loss": 0.5491, + "step": 2745 + }, + { + "epoch": 0.38432470258922324, + "grad_norm": 0.41725417707805595, + "learning_rate": 3.5282177578265296e-05, + "loss": 0.5569, + "step": 2746 + }, + { + "epoch": 0.38446466060181944, + "grad_norm": 0.4045943366257942, + "learning_rate": 3.527184614657059e-05, + "loss": 0.5303, + "step": 2747 + }, + { + "epoch": 0.3846046186144157, + "grad_norm": 0.39867086864097645, + "learning_rate": 3.526151260390729e-05, + "loss": 0.5567, + "step": 2748 + }, + { + "epoch": 0.3847445766270119, + "grad_norm": 0.3971092164877937, + "learning_rate": 3.525117695239903e-05, + "loss": 0.5517, + "step": 2749 + }, + { + "epoch": 0.38488453463960814, + "grad_norm": 0.40368782181512425, + "learning_rate": 3.5240839194169885e-05, + "loss": 0.554, + "step": 2750 + }, + { + "epoch": 0.38502449265220434, + "grad_norm": 0.4012824600908186, + "learning_rate": 3.523049933134439e-05, + "loss": 0.5622, + "step": 2751 + }, + { + "epoch": 0.38516445066480054, + "grad_norm": 0.40807955297557885, + "learning_rate": 3.522015736604747e-05, + "loss": 0.5757, + "step": 2752 + }, + { + "epoch": 0.3853044086773968, + "grad_norm": 0.401439172166039, + "learning_rate": 3.5209813300404516e-05, + "loss": 0.5523, + "step": 2753 + }, + { + "epoch": 0.385444366689993, + "grad_norm": 0.4188699833926191, + "learning_rate": 3.519946713654134e-05, + "loss": 0.585, + "step": 2754 + }, + { + "epoch": 0.38558432470258924, + "grad_norm": 0.4123520096515134, + "learning_rate": 3.518911887658418e-05, + "loss": 0.5681, + "step": 2755 + }, + { + "epoch": 0.38572428271518544, + "grad_norm": 0.41652405577544405, + "learning_rate": 3.5178768522659697e-05, + "loss": 0.571, + "step": 2756 + }, + { + "epoch": 0.3858642407277817, + "grad_norm": 0.39453566372212684, + "learning_rate": 3.516841607689501e-05, + "loss": 0.5368, + "step": 2757 + }, + { + "epoch": 0.3860041987403779, + "grad_norm": 0.41756153998856244, + "learning_rate": 3.5158061541417644e-05, + "loss": 0.5799, + "step": 2758 + }, + { + "epoch": 0.3861441567529741, + "grad_norm": 0.41467425735936986, + "learning_rate": 3.514770491835556e-05, + "loss": 0.5743, + "step": 2759 + }, + { + "epoch": 0.38628411476557034, + "grad_norm": 0.3947895347030933, + "learning_rate": 3.5137346209837165e-05, + "loss": 0.5059, + "step": 2760 + }, + { + "epoch": 0.38642407277816654, + "grad_norm": 0.4139447535493024, + "learning_rate": 3.5126985417991254e-05, + "loss": 0.5196, + "step": 2761 + }, + { + "epoch": 0.3865640307907628, + "grad_norm": 0.4002485124686976, + "learning_rate": 3.5116622544947085e-05, + "loss": 0.5704, + "step": 2762 + }, + { + "epoch": 0.386703988803359, + "grad_norm": 0.41066912276683826, + "learning_rate": 3.5106257592834343e-05, + "loss": 0.5808, + "step": 2763 + }, + { + "epoch": 0.3868439468159552, + "grad_norm": 0.40281746677069036, + "learning_rate": 3.509589056378312e-05, + "loss": 0.5779, + "step": 2764 + }, + { + "epoch": 0.38698390482855144, + "grad_norm": 0.4010030729597531, + "learning_rate": 3.5085521459923954e-05, + "loss": 0.5476, + "step": 2765 + }, + { + "epoch": 0.38712386284114764, + "grad_norm": 0.4116926245763738, + "learning_rate": 3.507515028338779e-05, + "loss": 0.5448, + "step": 2766 + }, + { + "epoch": 0.3872638208537439, + "grad_norm": 0.4048451003049423, + "learning_rate": 3.5064777036306015e-05, + "loss": 0.5119, + "step": 2767 + }, + { + "epoch": 0.3874037788663401, + "grad_norm": 0.41709617305694274, + "learning_rate": 3.505440172081044e-05, + "loss": 0.5769, + "step": 2768 + }, + { + "epoch": 0.38754373687893634, + "grad_norm": 0.41584372992034574, + "learning_rate": 3.5044024339033297e-05, + "loss": 0.5302, + "step": 2769 + }, + { + "epoch": 0.38768369489153254, + "grad_norm": 0.4065168099113291, + "learning_rate": 3.503364489310723e-05, + "loss": 0.602, + "step": 2770 + }, + { + "epoch": 0.38782365290412874, + "grad_norm": 0.4163553923214921, + "learning_rate": 3.502326338516534e-05, + "loss": 0.5694, + "step": 2771 + }, + { + "epoch": 0.387963610916725, + "grad_norm": 0.40029772682589904, + "learning_rate": 3.501287981734113e-05, + "loss": 0.5593, + "step": 2772 + }, + { + "epoch": 0.3881035689293212, + "grad_norm": 0.4078208586874547, + "learning_rate": 3.5002494191768516e-05, + "loss": 0.5501, + "step": 2773 + }, + { + "epoch": 0.38824352694191744, + "grad_norm": 0.392227014886232, + "learning_rate": 3.499210651058185e-05, + "loss": 0.5752, + "step": 2774 + }, + { + "epoch": 0.38838348495451364, + "grad_norm": 0.41190554114378175, + "learning_rate": 3.498171677591593e-05, + "loss": 0.5359, + "step": 2775 + }, + { + "epoch": 0.3885234429671099, + "grad_norm": 0.4137674619685686, + "learning_rate": 3.497132498990592e-05, + "loss": 0.5648, + "step": 2776 + }, + { + "epoch": 0.3886634009797061, + "grad_norm": 0.4063767289121146, + "learning_rate": 3.496093115468745e-05, + "loss": 0.5903, + "step": 2777 + }, + { + "epoch": 0.3888033589923023, + "grad_norm": 0.40730098042246266, + "learning_rate": 3.495053527239656e-05, + "loss": 0.5907, + "step": 2778 + }, + { + "epoch": 0.38894331700489854, + "grad_norm": 0.3939529427168923, + "learning_rate": 3.494013734516971e-05, + "loss": 0.5748, + "step": 2779 + }, + { + "epoch": 0.38908327501749473, + "grad_norm": 0.40700996952853286, + "learning_rate": 3.492973737514378e-05, + "loss": 0.57, + "step": 2780 + }, + { + "epoch": 0.389223233030091, + "grad_norm": 0.4127250869631396, + "learning_rate": 3.491933536445606e-05, + "loss": 0.576, + "step": 2781 + }, + { + "epoch": 0.3893631910426872, + "grad_norm": 0.42198868253989263, + "learning_rate": 3.490893131524429e-05, + "loss": 0.5982, + "step": 2782 + }, + { + "epoch": 0.38950314905528344, + "grad_norm": 0.3655454661119806, + "learning_rate": 3.489852522964658e-05, + "loss": 0.565, + "step": 2783 + }, + { + "epoch": 0.38964310706787963, + "grad_norm": 0.4144948208886313, + "learning_rate": 3.488811710980151e-05, + "loss": 0.5674, + "step": 2784 + }, + { + "epoch": 0.38978306508047583, + "grad_norm": 0.4074829627997144, + "learning_rate": 3.487770695784805e-05, + "loss": 0.5748, + "step": 2785 + }, + { + "epoch": 0.3899230230930721, + "grad_norm": 0.41858532067834603, + "learning_rate": 3.486729477592558e-05, + "loss": 0.5814, + "step": 2786 + }, + { + "epoch": 0.3900629811056683, + "grad_norm": 0.41786381819649815, + "learning_rate": 3.485688056617391e-05, + "loss": 0.5818, + "step": 2787 + }, + { + "epoch": 0.39020293911826454, + "grad_norm": 0.43164286265234475, + "learning_rate": 3.484646433073328e-05, + "loss": 0.6056, + "step": 2788 + }, + { + "epoch": 0.39034289713086073, + "grad_norm": 0.41022430212941835, + "learning_rate": 3.483604607174432e-05, + "loss": 0.586, + "step": 2789 + }, + { + "epoch": 0.390482855143457, + "grad_norm": 0.4232828423199565, + "learning_rate": 3.4825625791348096e-05, + "loss": 0.5569, + "step": 2790 + }, + { + "epoch": 0.3906228131560532, + "grad_norm": 0.400556356726097, + "learning_rate": 3.481520349168607e-05, + "loss": 0.5853, + "step": 2791 + }, + { + "epoch": 0.3907627711686494, + "grad_norm": 0.4163389824429049, + "learning_rate": 3.480477917490014e-05, + "loss": 0.5423, + "step": 2792 + }, + { + "epoch": 0.39090272918124563, + "grad_norm": 0.40119034319759783, + "learning_rate": 3.479435284313261e-05, + "loss": 0.5618, + "step": 2793 + }, + { + "epoch": 0.39104268719384183, + "grad_norm": 0.41230633715814125, + "learning_rate": 3.4783924498526184e-05, + "loss": 0.566, + "step": 2794 + }, + { + "epoch": 0.3911826452064381, + "grad_norm": 0.38406879859249193, + "learning_rate": 3.4773494143224e-05, + "loss": 0.5728, + "step": 2795 + }, + { + "epoch": 0.3913226032190343, + "grad_norm": 0.42250079129339874, + "learning_rate": 3.476306177936961e-05, + "loss": 0.5937, + "step": 2796 + }, + { + "epoch": 0.39146256123163053, + "grad_norm": 0.39662665146228837, + "learning_rate": 3.475262740910696e-05, + "loss": 0.5644, + "step": 2797 + }, + { + "epoch": 0.39160251924422673, + "grad_norm": 0.42760877117863655, + "learning_rate": 3.474219103458043e-05, + "loss": 0.5762, + "step": 2798 + }, + { + "epoch": 0.39174247725682293, + "grad_norm": 0.3990049510043462, + "learning_rate": 3.4731752657934794e-05, + "loss": 0.5234, + "step": 2799 + }, + { + "epoch": 0.3918824352694192, + "grad_norm": 0.3870544184519613, + "learning_rate": 3.4721312281315236e-05, + "loss": 0.5921, + "step": 2800 + }, + { + "epoch": 0.3920223932820154, + "grad_norm": 0.3907379222640409, + "learning_rate": 3.471086990686737e-05, + "loss": 0.5402, + "step": 2801 + }, + { + "epoch": 0.39216235129461163, + "grad_norm": 0.42863016339615406, + "learning_rate": 3.470042553673721e-05, + "loss": 0.6075, + "step": 2802 + }, + { + "epoch": 0.39230230930720783, + "grad_norm": 0.40157662221583645, + "learning_rate": 3.468997917307118e-05, + "loss": 0.5723, + "step": 2803 + }, + { + "epoch": 0.3924422673198041, + "grad_norm": 0.42138157521984687, + "learning_rate": 3.46795308180161e-05, + "loss": 0.5605, + "step": 2804 + }, + { + "epoch": 0.3925822253324003, + "grad_norm": 0.39033795656224785, + "learning_rate": 3.466908047371923e-05, + "loss": 0.5814, + "step": 2805 + }, + { + "epoch": 0.3927221833449965, + "grad_norm": 0.3904474581637845, + "learning_rate": 3.465862814232822e-05, + "loss": 0.5662, + "step": 2806 + }, + { + "epoch": 0.39286214135759273, + "grad_norm": 0.4019162642970993, + "learning_rate": 3.464817382599112e-05, + "loss": 0.5509, + "step": 2807 + }, + { + "epoch": 0.39300209937018893, + "grad_norm": 0.4060877755013805, + "learning_rate": 3.4637717526856406e-05, + "loss": 0.5642, + "step": 2808 + }, + { + "epoch": 0.3931420573827852, + "grad_norm": 0.41153563140737226, + "learning_rate": 3.462725924707295e-05, + "loss": 0.574, + "step": 2809 + }, + { + "epoch": 0.3932820153953814, + "grad_norm": 0.4177210107228342, + "learning_rate": 3.461679898879004e-05, + "loss": 0.6133, + "step": 2810 + }, + { + "epoch": 0.39342197340797763, + "grad_norm": 0.3996252120493831, + "learning_rate": 3.460633675415736e-05, + "loss": 0.5635, + "step": 2811 + }, + { + "epoch": 0.39356193142057383, + "grad_norm": 0.417067202150554, + "learning_rate": 3.459587254532502e-05, + "loss": 0.548, + "step": 2812 + }, + { + "epoch": 0.39370188943317, + "grad_norm": 0.4855752987156052, + "learning_rate": 3.458540636444349e-05, + "loss": 0.5833, + "step": 2813 + }, + { + "epoch": 0.3938418474457663, + "grad_norm": 0.40124325073974404, + "learning_rate": 3.457493821366369e-05, + "loss": 0.5403, + "step": 2814 + }, + { + "epoch": 0.3939818054583625, + "grad_norm": 0.41052139871938925, + "learning_rate": 3.456446809513695e-05, + "loss": 0.5639, + "step": 2815 + }, + { + "epoch": 0.39412176347095873, + "grad_norm": 0.429168200503203, + "learning_rate": 3.455399601101497e-05, + "loss": 0.6069, + "step": 2816 + }, + { + "epoch": 0.3942617214835549, + "grad_norm": 0.4130675301409534, + "learning_rate": 3.4543521963449857e-05, + "loss": 0.5628, + "step": 2817 + }, + { + "epoch": 0.3944016794961512, + "grad_norm": 0.4028085056325733, + "learning_rate": 3.4533045954594164e-05, + "loss": 0.5806, + "step": 2818 + }, + { + "epoch": 0.3945416375087474, + "grad_norm": 0.40759739919737564, + "learning_rate": 3.452256798660079e-05, + "loss": 0.6007, + "step": 2819 + }, + { + "epoch": 0.3946815955213436, + "grad_norm": 0.41359320045974896, + "learning_rate": 3.4512088061623075e-05, + "loss": 0.5912, + "step": 2820 + }, + { + "epoch": 0.39482155353393983, + "grad_norm": 0.3896223345474862, + "learning_rate": 3.450160618181476e-05, + "loss": 0.5583, + "step": 2821 + }, + { + "epoch": 0.394961511546536, + "grad_norm": 0.41092585101647605, + "learning_rate": 3.449112234932996e-05, + "loss": 0.5571, + "step": 2822 + }, + { + "epoch": 0.3951014695591323, + "grad_norm": 0.42614487405443774, + "learning_rate": 3.4480636566323215e-05, + "loss": 0.6322, + "step": 2823 + }, + { + "epoch": 0.3952414275717285, + "grad_norm": 0.41152848618890253, + "learning_rate": 3.447014883494946e-05, + "loss": 0.5636, + "step": 2824 + }, + { + "epoch": 0.39538138558432473, + "grad_norm": 0.4043580579983999, + "learning_rate": 3.445965915736403e-05, + "loss": 0.5338, + "step": 2825 + }, + { + "epoch": 0.3955213435969209, + "grad_norm": 0.3944340553638311, + "learning_rate": 3.444916753572266e-05, + "loss": 0.5345, + "step": 2826 + }, + { + "epoch": 0.3956613016095171, + "grad_norm": 0.4032299392463221, + "learning_rate": 3.44386739721815e-05, + "loss": 0.5645, + "step": 2827 + }, + { + "epoch": 0.3958012596221134, + "grad_norm": 0.43128038862284984, + "learning_rate": 3.442817846889705e-05, + "loss": 0.5793, + "step": 2828 + }, + { + "epoch": 0.3959412176347096, + "grad_norm": 0.3904629770048097, + "learning_rate": 3.4417681028026276e-05, + "loss": 0.5594, + "step": 2829 + }, + { + "epoch": 0.3960811756473058, + "grad_norm": 0.4032671372019056, + "learning_rate": 3.4407181651726495e-05, + "loss": 0.6305, + "step": 2830 + }, + { + "epoch": 0.396221133659902, + "grad_norm": 0.41716969232369355, + "learning_rate": 3.439668034215543e-05, + "loss": 0.5661, + "step": 2831 + }, + { + "epoch": 0.3963610916724983, + "grad_norm": 0.3785072143764436, + "learning_rate": 3.4386177101471216e-05, + "loss": 0.5378, + "step": 2832 + }, + { + "epoch": 0.3965010496850945, + "grad_norm": 0.4062464075615544, + "learning_rate": 3.437567193183237e-05, + "loss": 0.5613, + "step": 2833 + }, + { + "epoch": 0.3966410076976907, + "grad_norm": 0.41547338740317635, + "learning_rate": 3.436516483539781e-05, + "loss": 0.5652, + "step": 2834 + }, + { + "epoch": 0.3967809657102869, + "grad_norm": 0.4212387411967819, + "learning_rate": 3.4354655814326845e-05, + "loss": 0.5759, + "step": 2835 + }, + { + "epoch": 0.3969209237228831, + "grad_norm": 0.39412673101104495, + "learning_rate": 3.434414487077921e-05, + "loss": 0.5455, + "step": 2836 + }, + { + "epoch": 0.3970608817354794, + "grad_norm": 0.40728215607499646, + "learning_rate": 3.433363200691498e-05, + "loss": 0.5677, + "step": 2837 + }, + { + "epoch": 0.3972008397480756, + "grad_norm": 0.4180649072512069, + "learning_rate": 3.432311722489467e-05, + "loss": 0.5845, + "step": 2838 + }, + { + "epoch": 0.39734079776067177, + "grad_norm": 0.4237453415621929, + "learning_rate": 3.431260052687919e-05, + "loss": 0.5543, + "step": 2839 + }, + { + "epoch": 0.397480755773268, + "grad_norm": 0.47750072518183245, + "learning_rate": 3.430208191502979e-05, + "loss": 0.5799, + "step": 2840 + }, + { + "epoch": 0.3976207137858642, + "grad_norm": 0.40826704545334747, + "learning_rate": 3.4291561391508185e-05, + "loss": 0.5524, + "step": 2841 + }, + { + "epoch": 0.3977606717984605, + "grad_norm": 0.4353573374399257, + "learning_rate": 3.428103895847644e-05, + "loss": 0.5489, + "step": 2842 + }, + { + "epoch": 0.39790062981105667, + "grad_norm": 0.4164859565724006, + "learning_rate": 3.427051461809702e-05, + "loss": 0.5694, + "step": 2843 + }, + { + "epoch": 0.3980405878236529, + "grad_norm": 0.4285455810589633, + "learning_rate": 3.425998837253278e-05, + "loss": 0.5879, + "step": 2844 + }, + { + "epoch": 0.3981805458362491, + "grad_norm": 0.3797736872864129, + "learning_rate": 3.4249460223946975e-05, + "loss": 0.555, + "step": 2845 + }, + { + "epoch": 0.3983205038488453, + "grad_norm": 0.39764697478634364, + "learning_rate": 3.4238930174503245e-05, + "loss": 0.5683, + "step": 2846 + }, + { + "epoch": 0.3984604618614416, + "grad_norm": 0.40768853681588346, + "learning_rate": 3.422839822636562e-05, + "loss": 0.5822, + "step": 2847 + }, + { + "epoch": 0.39860041987403777, + "grad_norm": 0.40077863607268926, + "learning_rate": 3.421786438169852e-05, + "loss": 0.5218, + "step": 2848 + }, + { + "epoch": 0.398740377886634, + "grad_norm": 0.43178105768051894, + "learning_rate": 3.420732864266677e-05, + "loss": 0.5917, + "step": 2849 + }, + { + "epoch": 0.3988803358992302, + "grad_norm": 0.4012531476036567, + "learning_rate": 3.4196791011435554e-05, + "loss": 0.5575, + "step": 2850 + }, + { + "epoch": 0.3990202939118265, + "grad_norm": 0.42052995566229917, + "learning_rate": 3.418625149017047e-05, + "loss": 0.5898, + "step": 2851 + }, + { + "epoch": 0.39916025192442267, + "grad_norm": 0.48024947250916206, + "learning_rate": 3.4175710081037505e-05, + "loss": 0.5438, + "step": 2852 + }, + { + "epoch": 0.39930020993701887, + "grad_norm": 0.3857911120539825, + "learning_rate": 3.4165166786203015e-05, + "loss": 0.5641, + "step": 2853 + }, + { + "epoch": 0.3994401679496151, + "grad_norm": 0.3948093390858907, + "learning_rate": 3.415462160783375e-05, + "loss": 0.5655, + "step": 2854 + }, + { + "epoch": 0.3995801259622113, + "grad_norm": 0.4003575934592238, + "learning_rate": 3.4144074548096866e-05, + "loss": 0.6106, + "step": 2855 + }, + { + "epoch": 0.39972008397480757, + "grad_norm": 0.4321737583237654, + "learning_rate": 3.413352560915988e-05, + "loss": 0.5514, + "step": 2856 + }, + { + "epoch": 0.39986004198740377, + "grad_norm": 0.45894659025440354, + "learning_rate": 3.412297479319071e-05, + "loss": 0.5669, + "step": 2857 + }, + { + "epoch": 0.4, + "grad_norm": 0.42987083801881937, + "learning_rate": 3.411242210235765e-05, + "loss": 0.5974, + "step": 2858 + }, + { + "epoch": 0.4001399580125962, + "grad_norm": 0.40207997706206416, + "learning_rate": 3.4101867538829394e-05, + "loss": 0.5286, + "step": 2859 + }, + { + "epoch": 0.4002799160251924, + "grad_norm": 0.40235651191545074, + "learning_rate": 3.4091311104775e-05, + "loss": 0.5427, + "step": 2860 + }, + { + "epoch": 0.40041987403778867, + "grad_norm": 0.40231984058011133, + "learning_rate": 3.4080752802363936e-05, + "loss": 0.5341, + "step": 2861 + }, + { + "epoch": 0.40055983205038487, + "grad_norm": 0.4240007925517186, + "learning_rate": 3.4070192633766025e-05, + "loss": 0.5571, + "step": 2862 + }, + { + "epoch": 0.4006997900629811, + "grad_norm": 0.3961994599738629, + "learning_rate": 3.4059630601151504e-05, + "loss": 0.5585, + "step": 2863 + }, + { + "epoch": 0.4008397480755773, + "grad_norm": 0.3908273444014209, + "learning_rate": 3.404906670669097e-05, + "loss": 0.5444, + "step": 2864 + }, + { + "epoch": 0.40097970608817357, + "grad_norm": 0.4011478963975767, + "learning_rate": 3.403850095255542e-05, + "loss": 0.601, + "step": 2865 + }, + { + "epoch": 0.40111966410076977, + "grad_norm": 0.4492421654680456, + "learning_rate": 3.402793334091621e-05, + "loss": 0.5953, + "step": 2866 + }, + { + "epoch": 0.40125962211336597, + "grad_norm": 0.4207454261749868, + "learning_rate": 3.40173638739451e-05, + "loss": 0.5931, + "step": 2867 + }, + { + "epoch": 0.4013995801259622, + "grad_norm": 0.40623369999208175, + "learning_rate": 3.400679255381421e-05, + "loss": 0.5528, + "step": 2868 + }, + { + "epoch": 0.4015395381385584, + "grad_norm": 0.4169914185414076, + "learning_rate": 3.399621938269606e-05, + "loss": 0.5823, + "step": 2869 + }, + { + "epoch": 0.40167949615115467, + "grad_norm": 0.42169370060384803, + "learning_rate": 3.3985644362763566e-05, + "loss": 0.5565, + "step": 2870 + }, + { + "epoch": 0.40181945416375087, + "grad_norm": 0.38157604657299227, + "learning_rate": 3.3975067496189965e-05, + "loss": 0.525, + "step": 2871 + }, + { + "epoch": 0.4019594121763471, + "grad_norm": 0.42241067538345, + "learning_rate": 3.396448878514894e-05, + "loss": 0.577, + "step": 2872 + }, + { + "epoch": 0.4020993701889433, + "grad_norm": 0.4164649305478365, + "learning_rate": 3.395390823181451e-05, + "loss": 0.5334, + "step": 2873 + }, + { + "epoch": 0.4022393282015395, + "grad_norm": 0.4224179515123039, + "learning_rate": 3.3943325838361084e-05, + "loss": 0.6014, + "step": 2874 + }, + { + "epoch": 0.40237928621413577, + "grad_norm": 0.4243031111453756, + "learning_rate": 3.393274160696346e-05, + "loss": 0.5759, + "step": 2875 + }, + { + "epoch": 0.40251924422673196, + "grad_norm": 0.4214607029417847, + "learning_rate": 3.392215553979679e-05, + "loss": 0.6041, + "step": 2876 + }, + { + "epoch": 0.4026592022393282, + "grad_norm": 0.39897243417459854, + "learning_rate": 3.391156763903665e-05, + "loss": 0.5425, + "step": 2877 + }, + { + "epoch": 0.4027991602519244, + "grad_norm": 0.39128320703090635, + "learning_rate": 3.390097790685892e-05, + "loss": 0.579, + "step": 2878 + }, + { + "epoch": 0.40293911826452067, + "grad_norm": 0.40615873009003534, + "learning_rate": 3.389038634543993e-05, + "loss": 0.5828, + "step": 2879 + }, + { + "epoch": 0.40307907627711687, + "grad_norm": 0.41878420947323897, + "learning_rate": 3.387979295695632e-05, + "loss": 0.5828, + "step": 2880 + }, + { + "epoch": 0.40321903428971306, + "grad_norm": 0.383331359104639, + "learning_rate": 3.386919774358518e-05, + "loss": 0.5384, + "step": 2881 + }, + { + "epoch": 0.4033589923023093, + "grad_norm": 0.41913779228819376, + "learning_rate": 3.38586007075039e-05, + "loss": 0.5591, + "step": 2882 + }, + { + "epoch": 0.4034989503149055, + "grad_norm": 0.38348801468846716, + "learning_rate": 3.3848001850890296e-05, + "loss": 0.5393, + "step": 2883 + }, + { + "epoch": 0.40363890832750177, + "grad_norm": 0.4113992124017339, + "learning_rate": 3.383740117592254e-05, + "loss": 0.5567, + "step": 2884 + }, + { + "epoch": 0.40377886634009796, + "grad_norm": 0.40383694106615, + "learning_rate": 3.382679868477917e-05, + "loss": 0.5686, + "step": 2885 + }, + { + "epoch": 0.4039188243526942, + "grad_norm": 0.4275501335912839, + "learning_rate": 3.381619437963911e-05, + "loss": 0.5709, + "step": 2886 + }, + { + "epoch": 0.4040587823652904, + "grad_norm": 0.410136210120104, + "learning_rate": 3.3805588262681655e-05, + "loss": 0.5578, + "step": 2887 + }, + { + "epoch": 0.4041987403778866, + "grad_norm": 0.39945476818253844, + "learning_rate": 3.379498033608646e-05, + "loss": 0.5576, + "step": 2888 + }, + { + "epoch": 0.40433869839048286, + "grad_norm": 0.41503488018923496, + "learning_rate": 3.378437060203357e-05, + "loss": 0.5621, + "step": 2889 + }, + { + "epoch": 0.40447865640307906, + "grad_norm": 0.4205244822133629, + "learning_rate": 3.3773759062703396e-05, + "loss": 0.571, + "step": 2890 + }, + { + "epoch": 0.4046186144156753, + "grad_norm": 0.40721392794208344, + "learning_rate": 3.376314572027671e-05, + "loss": 0.5539, + "step": 2891 + }, + { + "epoch": 0.4047585724282715, + "grad_norm": 0.4061261621378902, + "learning_rate": 3.375253057693466e-05, + "loss": 0.5502, + "step": 2892 + }, + { + "epoch": 0.40489853044086777, + "grad_norm": 0.4040934767631151, + "learning_rate": 3.374191363485877e-05, + "loss": 0.5502, + "step": 2893 + }, + { + "epoch": 0.40503848845346396, + "grad_norm": 0.4140086272358693, + "learning_rate": 3.373129489623093e-05, + "loss": 0.5752, + "step": 2894 + }, + { + "epoch": 0.40517844646606016, + "grad_norm": 0.4285410515445642, + "learning_rate": 3.3720674363233396e-05, + "loss": 0.5641, + "step": 2895 + }, + { + "epoch": 0.4053184044786564, + "grad_norm": 0.3886401656745267, + "learning_rate": 3.37100520380488e-05, + "loss": 0.5721, + "step": 2896 + }, + { + "epoch": 0.4054583624912526, + "grad_norm": 0.4252355420954435, + "learning_rate": 3.369942792286013e-05, + "loss": 0.5494, + "step": 2897 + }, + { + "epoch": 0.40559832050384886, + "grad_norm": 0.44644708491027574, + "learning_rate": 3.368880201985076e-05, + "loss": 0.5722, + "step": 2898 + }, + { + "epoch": 0.40573827851644506, + "grad_norm": 0.42435542539228505, + "learning_rate": 3.367817433120441e-05, + "loss": 0.5744, + "step": 2899 + }, + { + "epoch": 0.4058782365290413, + "grad_norm": 0.41641484378908183, + "learning_rate": 3.366754485910518e-05, + "loss": 0.5874, + "step": 2900 + }, + { + "epoch": 0.4060181945416375, + "grad_norm": 0.40783494684210103, + "learning_rate": 3.365691360573754e-05, + "loss": 0.5561, + "step": 2901 + }, + { + "epoch": 0.4061581525542337, + "grad_norm": 0.3940864417785125, + "learning_rate": 3.3646280573286314e-05, + "loss": 0.5437, + "step": 2902 + }, + { + "epoch": 0.40629811056682996, + "grad_norm": 0.41032563545142275, + "learning_rate": 3.363564576393671e-05, + "loss": 0.5679, + "step": 2903 + }, + { + "epoch": 0.40643806857942616, + "grad_norm": 0.43293587153461693, + "learning_rate": 3.3625009179874265e-05, + "loss": 0.6005, + "step": 2904 + }, + { + "epoch": 0.4065780265920224, + "grad_norm": 0.41913595404532833, + "learning_rate": 3.361437082328493e-05, + "loss": 0.5841, + "step": 2905 + }, + { + "epoch": 0.4067179846046186, + "grad_norm": 0.39955188014787174, + "learning_rate": 3.360373069635498e-05, + "loss": 0.5762, + "step": 2906 + }, + { + "epoch": 0.40685794261721486, + "grad_norm": 0.40532054476042145, + "learning_rate": 3.359308880127108e-05, + "loss": 0.5811, + "step": 2907 + }, + { + "epoch": 0.40699790062981106, + "grad_norm": 0.3950198242019259, + "learning_rate": 3.358244514022025e-05, + "loss": 0.5795, + "step": 2908 + }, + { + "epoch": 0.40713785864240726, + "grad_norm": 0.4211880514353567, + "learning_rate": 3.357179971538985e-05, + "loss": 0.5606, + "step": 2909 + }, + { + "epoch": 0.4072778166550035, + "grad_norm": 0.4142064942386179, + "learning_rate": 3.3561152528967646e-05, + "loss": 0.5802, + "step": 2910 + }, + { + "epoch": 0.4074177746675997, + "grad_norm": 0.41844753492725145, + "learning_rate": 3.355050358314172e-05, + "loss": 0.5274, + "step": 2911 + }, + { + "epoch": 0.40755773268019596, + "grad_norm": 0.41589536675167615, + "learning_rate": 3.353985288010056e-05, + "loss": 0.5938, + "step": 2912 + }, + { + "epoch": 0.40769769069279216, + "grad_norm": 0.39984810028580087, + "learning_rate": 3.352920042203298e-05, + "loss": 0.58, + "step": 2913 + }, + { + "epoch": 0.40783764870538836, + "grad_norm": 0.4140807749556365, + "learning_rate": 3.3518546211128166e-05, + "loss": 0.5491, + "step": 2914 + }, + { + "epoch": 0.4079776067179846, + "grad_norm": 0.4328655244745967, + "learning_rate": 3.350789024957568e-05, + "loss": 0.5972, + "step": 2915 + }, + { + "epoch": 0.4081175647305808, + "grad_norm": 0.3996263118365605, + "learning_rate": 3.349723253956542e-05, + "loss": 0.5629, + "step": 2916 + }, + { + "epoch": 0.40825752274317706, + "grad_norm": 0.4138825890216495, + "learning_rate": 3.348657308328766e-05, + "loss": 0.5461, + "step": 2917 + }, + { + "epoch": 0.40839748075577326, + "grad_norm": 0.45384712853313164, + "learning_rate": 3.3475911882933015e-05, + "loss": 0.5456, + "step": 2918 + }, + { + "epoch": 0.4085374387683695, + "grad_norm": 0.4085437808720945, + "learning_rate": 3.346524894069248e-05, + "loss": 0.5453, + "step": 2919 + }, + { + "epoch": 0.4086773967809657, + "grad_norm": 0.4038494007288198, + "learning_rate": 3.3454584258757404e-05, + "loss": 0.5623, + "step": 2920 + }, + { + "epoch": 0.4088173547935619, + "grad_norm": 0.4566167974871496, + "learning_rate": 3.344391783931947e-05, + "loss": 0.5907, + "step": 2921 + }, + { + "epoch": 0.40895731280615816, + "grad_norm": 0.3969364122046131, + "learning_rate": 3.343324968457076e-05, + "loss": 0.5131, + "step": 2922 + }, + { + "epoch": 0.40909727081875435, + "grad_norm": 0.4203451679837855, + "learning_rate": 3.342257979670365e-05, + "loss": 0.5644, + "step": 2923 + }, + { + "epoch": 0.4092372288313506, + "grad_norm": 0.40925859811350057, + "learning_rate": 3.341190817791094e-05, + "loss": 0.5498, + "step": 2924 + }, + { + "epoch": 0.4093771868439468, + "grad_norm": 0.4083989313014574, + "learning_rate": 3.3401234830385756e-05, + "loss": 0.5854, + "step": 2925 + }, + { + "epoch": 0.40951714485654306, + "grad_norm": 0.39780928336254934, + "learning_rate": 3.3390559756321566e-05, + "loss": 0.5659, + "step": 2926 + }, + { + "epoch": 0.40965710286913926, + "grad_norm": 0.4109873033834379, + "learning_rate": 3.337988295791221e-05, + "loss": 0.5422, + "step": 2927 + }, + { + "epoch": 0.40979706088173545, + "grad_norm": 0.3952927193405086, + "learning_rate": 3.3369204437351886e-05, + "loss": 0.5592, + "step": 2928 + }, + { + "epoch": 0.4099370188943317, + "grad_norm": 0.40670959006044405, + "learning_rate": 3.335852419683513e-05, + "loss": 0.5711, + "step": 2929 + }, + { + "epoch": 0.4100769769069279, + "grad_norm": 0.3974994882518793, + "learning_rate": 3.3347842238556836e-05, + "loss": 0.5315, + "step": 2930 + }, + { + "epoch": 0.41021693491952416, + "grad_norm": 0.44234521625168705, + "learning_rate": 3.3337158564712267e-05, + "loss": 0.5552, + "step": 2931 + }, + { + "epoch": 0.41035689293212035, + "grad_norm": 0.40022379185542395, + "learning_rate": 3.332647317749702e-05, + "loss": 0.5745, + "step": 2932 + }, + { + "epoch": 0.4104968509447166, + "grad_norm": 0.418577329208144, + "learning_rate": 3.3315786079107055e-05, + "loss": 0.5947, + "step": 2933 + }, + { + "epoch": 0.4106368089573128, + "grad_norm": 0.40576821316747064, + "learning_rate": 3.3305097271738665e-05, + "loss": 0.6025, + "step": 2934 + }, + { + "epoch": 0.410776766969909, + "grad_norm": 0.4066641994376824, + "learning_rate": 3.329440675758853e-05, + "loss": 0.5475, + "step": 2935 + }, + { + "epoch": 0.41091672498250525, + "grad_norm": 0.4150792589177556, + "learning_rate": 3.3283714538853636e-05, + "loss": 0.5866, + "step": 2936 + }, + { + "epoch": 0.41105668299510145, + "grad_norm": 0.397074447582423, + "learning_rate": 3.327302061773136e-05, + "loss": 0.5799, + "step": 2937 + }, + { + "epoch": 0.4111966410076977, + "grad_norm": 0.40754727507905664, + "learning_rate": 3.3262324996419405e-05, + "loss": 0.5469, + "step": 2938 + }, + { + "epoch": 0.4113365990202939, + "grad_norm": 0.40164275820649376, + "learning_rate": 3.325162767711583e-05, + "loss": 0.5232, + "step": 2939 + }, + { + "epoch": 0.41147655703289016, + "grad_norm": 0.4080075519925706, + "learning_rate": 3.324092866201904e-05, + "loss": 0.5661, + "step": 2940 + }, + { + "epoch": 0.41161651504548635, + "grad_norm": 0.40149245258055244, + "learning_rate": 3.3230227953327796e-05, + "loss": 0.5474, + "step": 2941 + }, + { + "epoch": 0.41175647305808255, + "grad_norm": 0.4158994899194992, + "learning_rate": 3.321952555324121e-05, + "loss": 0.5258, + "step": 2942 + }, + { + "epoch": 0.4118964310706788, + "grad_norm": 0.4019593770294339, + "learning_rate": 3.320882146395871e-05, + "loss": 0.5762, + "step": 2943 + }, + { + "epoch": 0.412036389083275, + "grad_norm": 0.4249496622474027, + "learning_rate": 3.3198115687680115e-05, + "loss": 0.5462, + "step": 2944 + }, + { + "epoch": 0.41217634709587125, + "grad_norm": 0.43225282993592656, + "learning_rate": 3.318740822660556e-05, + "loss": 0.5446, + "step": 2945 + }, + { + "epoch": 0.41231630510846745, + "grad_norm": 0.41726887193732093, + "learning_rate": 3.3176699082935545e-05, + "loss": 0.5493, + "step": 2946 + }, + { + "epoch": 0.4124562631210637, + "grad_norm": 0.39903806845546885, + "learning_rate": 3.31659882588709e-05, + "loss": 0.5888, + "step": 2947 + }, + { + "epoch": 0.4125962211336599, + "grad_norm": 0.4023782994195669, + "learning_rate": 3.315527575661282e-05, + "loss": 0.5684, + "step": 2948 + }, + { + "epoch": 0.4127361791462561, + "grad_norm": 0.3936024186034419, + "learning_rate": 3.314456157836281e-05, + "loss": 0.5117, + "step": 2949 + }, + { + "epoch": 0.41287613715885235, + "grad_norm": 0.3952468427687398, + "learning_rate": 3.313384572632277e-05, + "loss": 0.5547, + "step": 2950 + }, + { + "epoch": 0.41301609517144855, + "grad_norm": 0.4190384583627527, + "learning_rate": 3.31231282026949e-05, + "loss": 0.5805, + "step": 2951 + }, + { + "epoch": 0.4131560531840448, + "grad_norm": 0.39658407316177025, + "learning_rate": 3.3112409009681766e-05, + "loss": 0.5406, + "step": 2952 + }, + { + "epoch": 0.413296011196641, + "grad_norm": 0.4076292942631321, + "learning_rate": 3.310168814948627e-05, + "loss": 0.588, + "step": 2953 + }, + { + "epoch": 0.41343596920923725, + "grad_norm": 0.41540765697070914, + "learning_rate": 3.3090965624311654e-05, + "loss": 0.5594, + "step": 2954 + }, + { + "epoch": 0.41357592722183345, + "grad_norm": 0.4035372057640521, + "learning_rate": 3.3080241436361506e-05, + "loss": 0.5636, + "step": 2955 + }, + { + "epoch": 0.41371588523442965, + "grad_norm": 0.41459880243511804, + "learning_rate": 3.3069515587839754e-05, + "loss": 0.5532, + "step": 2956 + }, + { + "epoch": 0.4138558432470259, + "grad_norm": 0.39031658646005546, + "learning_rate": 3.305878808095068e-05, + "loss": 0.5939, + "step": 2957 + }, + { + "epoch": 0.4139958012596221, + "grad_norm": 0.4182734311842286, + "learning_rate": 3.304805891789888e-05, + "loss": 0.5872, + "step": 2958 + }, + { + "epoch": 0.41413575927221835, + "grad_norm": 0.4222389070033302, + "learning_rate": 3.303732810088931e-05, + "loss": 0.6106, + "step": 2959 + }, + { + "epoch": 0.41427571728481455, + "grad_norm": 0.39648114038614457, + "learning_rate": 3.302659563212727e-05, + "loss": 0.5457, + "step": 2960 + }, + { + "epoch": 0.4144156752974108, + "grad_norm": 0.42978337071116823, + "learning_rate": 3.301586151381839e-05, + "loss": 0.5644, + "step": 2961 + }, + { + "epoch": 0.414555633310007, + "grad_norm": 0.4005463311390214, + "learning_rate": 3.300512574816863e-05, + "loss": 0.5676, + "step": 2962 + }, + { + "epoch": 0.4146955913226032, + "grad_norm": 0.39670767820561714, + "learning_rate": 3.2994388337384306e-05, + "loss": 0.5683, + "step": 2963 + }, + { + "epoch": 0.41483554933519945, + "grad_norm": 0.414169510604101, + "learning_rate": 3.298364928367207e-05, + "loss": 0.5685, + "step": 2964 + }, + { + "epoch": 0.41497550734779565, + "grad_norm": 0.39829585786394445, + "learning_rate": 3.2972908589238896e-05, + "loss": 0.5627, + "step": 2965 + }, + { + "epoch": 0.4151154653603919, + "grad_norm": 0.40155559541312513, + "learning_rate": 3.2962166256292113e-05, + "loss": 0.5805, + "step": 2966 + }, + { + "epoch": 0.4152554233729881, + "grad_norm": 0.4105341813553997, + "learning_rate": 3.295142228703938e-05, + "loss": 0.6003, + "step": 2967 + }, + { + "epoch": 0.41539538138558435, + "grad_norm": 0.4006968222058459, + "learning_rate": 3.2940676683688677e-05, + "loss": 0.546, + "step": 2968 + }, + { + "epoch": 0.41553533939818055, + "grad_norm": 0.39978495742611814, + "learning_rate": 3.292992944844836e-05, + "loss": 0.5892, + "step": 2969 + }, + { + "epoch": 0.41567529741077675, + "grad_norm": 0.4016166800083618, + "learning_rate": 3.291918058352706e-05, + "loss": 0.5708, + "step": 2970 + }, + { + "epoch": 0.415815255423373, + "grad_norm": 0.40924317132383004, + "learning_rate": 3.290843009113382e-05, + "loss": 0.5291, + "step": 2971 + }, + { + "epoch": 0.4159552134359692, + "grad_norm": 0.41496924484348124, + "learning_rate": 3.2897677973477936e-05, + "loss": 0.5638, + "step": 2972 + }, + { + "epoch": 0.41609517144856545, + "grad_norm": 0.4170073162428262, + "learning_rate": 3.288692423276911e-05, + "loss": 0.5659, + "step": 2973 + }, + { + "epoch": 0.41623512946116165, + "grad_norm": 0.4393981644887219, + "learning_rate": 3.2876168871217325e-05, + "loss": 0.5674, + "step": 2974 + }, + { + "epoch": 0.4163750874737579, + "grad_norm": 0.39928933014260387, + "learning_rate": 3.2865411891032916e-05, + "loss": 0.5589, + "step": 2975 + }, + { + "epoch": 0.4165150454863541, + "grad_norm": 0.6076938066952381, + "learning_rate": 3.2854653294426566e-05, + "loss": 0.5781, + "step": 2976 + }, + { + "epoch": 0.4166550034989503, + "grad_norm": 0.4206690009626964, + "learning_rate": 3.284389308360927e-05, + "loss": 0.5945, + "step": 2977 + }, + { + "epoch": 0.41679496151154655, + "grad_norm": 0.38925483204393885, + "learning_rate": 3.2833131260792345e-05, + "loss": 0.527, + "step": 2978 + }, + { + "epoch": 0.41693491952414274, + "grad_norm": 0.43019105225997517, + "learning_rate": 3.282236782818747e-05, + "loss": 0.5975, + "step": 2979 + }, + { + "epoch": 0.417074877536739, + "grad_norm": 0.4151005702647614, + "learning_rate": 3.2811602788006645e-05, + "loss": 0.5512, + "step": 2980 + }, + { + "epoch": 0.4172148355493352, + "grad_norm": 0.43155499812179654, + "learning_rate": 3.280083614246218e-05, + "loss": 0.5582, + "step": 2981 + }, + { + "epoch": 0.41735479356193145, + "grad_norm": 0.4147678068443396, + "learning_rate": 3.279006789376674e-05, + "loss": 0.5355, + "step": 2982 + }, + { + "epoch": 0.41749475157452764, + "grad_norm": 0.40986730591235127, + "learning_rate": 3.2779298044133304e-05, + "loss": 0.5521, + "step": 2983 + }, + { + "epoch": 0.41763470958712384, + "grad_norm": 0.41750246578218936, + "learning_rate": 3.276852659577519e-05, + "loss": 0.5642, + "step": 2984 + }, + { + "epoch": 0.4177746675997201, + "grad_norm": 0.4008235425339085, + "learning_rate": 3.275775355090603e-05, + "loss": 0.5385, + "step": 2985 + }, + { + "epoch": 0.4179146256123163, + "grad_norm": 0.42115213204670543, + "learning_rate": 3.274697891173982e-05, + "loss": 0.597, + "step": 2986 + }, + { + "epoch": 0.41805458362491255, + "grad_norm": 0.40847799222276204, + "learning_rate": 3.273620268049083e-05, + "loss": 0.5372, + "step": 2987 + }, + { + "epoch": 0.41819454163750874, + "grad_norm": 0.39981522543632353, + "learning_rate": 3.272542485937369e-05, + "loss": 0.5812, + "step": 2988 + }, + { + "epoch": 0.418334499650105, + "grad_norm": 0.4281898310803181, + "learning_rate": 3.271464545060336e-05, + "loss": 0.6052, + "step": 2989 + }, + { + "epoch": 0.4184744576627012, + "grad_norm": 0.3960705407504793, + "learning_rate": 3.2703864456395106e-05, + "loss": 0.5535, + "step": 2990 + }, + { + "epoch": 0.4186144156752974, + "grad_norm": 0.40782715021367766, + "learning_rate": 3.2693081878964546e-05, + "loss": 0.5444, + "step": 2991 + }, + { + "epoch": 0.41875437368789364, + "grad_norm": 0.4108892545513627, + "learning_rate": 3.2682297720527596e-05, + "loss": 0.588, + "step": 2992 + }, + { + "epoch": 0.41889433170048984, + "grad_norm": 0.4052095905789736, + "learning_rate": 3.267151198330053e-05, + "loss": 0.6009, + "step": 2993 + }, + { + "epoch": 0.4190342897130861, + "grad_norm": 0.3957928902994827, + "learning_rate": 3.2660724669499906e-05, + "loss": 0.5503, + "step": 2994 + }, + { + "epoch": 0.4191742477256823, + "grad_norm": 0.3945291064646862, + "learning_rate": 3.264993578134263e-05, + "loss": 0.5456, + "step": 2995 + }, + { + "epoch": 0.4193142057382785, + "grad_norm": 0.38055971820367784, + "learning_rate": 3.263914532104593e-05, + "loss": 0.575, + "step": 2996 + }, + { + "epoch": 0.41945416375087474, + "grad_norm": 0.3990796286153784, + "learning_rate": 3.2628353290827365e-05, + "loss": 0.5653, + "step": 2997 + }, + { + "epoch": 0.41959412176347094, + "grad_norm": 0.39699323320776897, + "learning_rate": 3.2617559692904784e-05, + "loss": 0.5567, + "step": 2998 + }, + { + "epoch": 0.4197340797760672, + "grad_norm": 0.42070498470477863, + "learning_rate": 3.260676452949641e-05, + "loss": 0.5643, + "step": 2999 + }, + { + "epoch": 0.4198740377886634, + "grad_norm": 0.41895847299595507, + "learning_rate": 3.259596780282074e-05, + "loss": 0.5866, + "step": 3000 + }, + { + "epoch": 0.42001399580125964, + "grad_norm": 0.4054149661332881, + "learning_rate": 3.2585169515096615e-05, + "loss": 0.5221, + "step": 3001 + }, + { + "epoch": 0.42015395381385584, + "grad_norm": 0.41159580867468976, + "learning_rate": 3.257436966854319e-05, + "loss": 0.5409, + "step": 3002 + }, + { + "epoch": 0.42029391182645204, + "grad_norm": 0.41565485170065436, + "learning_rate": 3.256356826537994e-05, + "loss": 0.572, + "step": 3003 + }, + { + "epoch": 0.4204338698390483, + "grad_norm": 0.4011710239277276, + "learning_rate": 3.255276530782667e-05, + "loss": 0.5399, + "step": 3004 + }, + { + "epoch": 0.4205738278516445, + "grad_norm": 0.45007473746050475, + "learning_rate": 3.25419607981035e-05, + "loss": 0.5283, + "step": 3005 + }, + { + "epoch": 0.42071378586424074, + "grad_norm": 0.4258234246567913, + "learning_rate": 3.253115473843086e-05, + "loss": 0.6111, + "step": 3006 + }, + { + "epoch": 0.42085374387683694, + "grad_norm": 0.41531516468042334, + "learning_rate": 3.252034713102951e-05, + "loss": 0.5644, + "step": 3007 + }, + { + "epoch": 0.4209937018894332, + "grad_norm": 0.41220387089241467, + "learning_rate": 3.250953797812051e-05, + "loss": 0.5798, + "step": 3008 + }, + { + "epoch": 0.4211336599020294, + "grad_norm": 0.4165684547155104, + "learning_rate": 3.249872728192527e-05, + "loss": 0.578, + "step": 3009 + }, + { + "epoch": 0.4212736179146256, + "grad_norm": 0.41641177346916974, + "learning_rate": 3.248791504466548e-05, + "loss": 0.5847, + "step": 3010 + }, + { + "epoch": 0.42141357592722184, + "grad_norm": 0.4230423352548566, + "learning_rate": 3.2477101268563184e-05, + "loss": 0.5578, + "step": 3011 + }, + { + "epoch": 0.42155353393981804, + "grad_norm": 0.40182190163643416, + "learning_rate": 3.24662859558407e-05, + "loss": 0.5322, + "step": 3012 + }, + { + "epoch": 0.4216934919524143, + "grad_norm": 0.43511787431355525, + "learning_rate": 3.245546910872071e-05, + "loss": 0.5473, + "step": 3013 + }, + { + "epoch": 0.4218334499650105, + "grad_norm": 0.41338327961202165, + "learning_rate": 3.244465072942615e-05, + "loss": 0.5883, + "step": 3014 + }, + { + "epoch": 0.42197340797760674, + "grad_norm": 0.39971647364371277, + "learning_rate": 3.2433830820180346e-05, + "loss": 0.5384, + "step": 3015 + }, + { + "epoch": 0.42211336599020294, + "grad_norm": 0.4417586008166198, + "learning_rate": 3.2423009383206876e-05, + "loss": 0.5662, + "step": 3016 + }, + { + "epoch": 0.42225332400279914, + "grad_norm": 0.41967998090343933, + "learning_rate": 3.241218642072966e-05, + "loss": 0.5456, + "step": 3017 + }, + { + "epoch": 0.4223932820153954, + "grad_norm": 0.41986891673954385, + "learning_rate": 3.240136193497293e-05, + "loss": 0.5633, + "step": 3018 + }, + { + "epoch": 0.4225332400279916, + "grad_norm": 0.4038319504416983, + "learning_rate": 3.239053592816122e-05, + "loss": 0.5405, + "step": 3019 + }, + { + "epoch": 0.42267319804058784, + "grad_norm": 0.42072486493920963, + "learning_rate": 3.2379708402519394e-05, + "loss": 0.5397, + "step": 3020 + }, + { + "epoch": 0.42281315605318404, + "grad_norm": 0.3963667588002371, + "learning_rate": 3.2368879360272606e-05, + "loss": 0.5437, + "step": 3021 + }, + { + "epoch": 0.4229531140657803, + "grad_norm": 0.43813839588807524, + "learning_rate": 3.235804880364635e-05, + "loss": 0.6218, + "step": 3022 + }, + { + "epoch": 0.4230930720783765, + "grad_norm": 0.420130728007189, + "learning_rate": 3.23472167348664e-05, + "loss": 0.5528, + "step": 3023 + }, + { + "epoch": 0.4232330300909727, + "grad_norm": 0.3931327455825961, + "learning_rate": 3.233638315615887e-05, + "loss": 0.5493, + "step": 3024 + }, + { + "epoch": 0.42337298810356894, + "grad_norm": 0.4231921022929699, + "learning_rate": 3.232554806975016e-05, + "loss": 0.5567, + "step": 3025 + }, + { + "epoch": 0.42351294611616513, + "grad_norm": 0.4136408960013234, + "learning_rate": 3.231471147786699e-05, + "loss": 0.6057, + "step": 3026 + }, + { + "epoch": 0.4236529041287614, + "grad_norm": 0.3990285743665062, + "learning_rate": 3.23038733827364e-05, + "loss": 0.5888, + "step": 3027 + }, + { + "epoch": 0.4237928621413576, + "grad_norm": 0.38749988478653646, + "learning_rate": 3.2293033786585716e-05, + "loss": 0.5386, + "step": 3028 + }, + { + "epoch": 0.42393282015395384, + "grad_norm": 0.4001033239780827, + "learning_rate": 3.228219269164259e-05, + "loss": 0.5741, + "step": 3029 + }, + { + "epoch": 0.42407277816655004, + "grad_norm": 0.38245836943637973, + "learning_rate": 3.2271350100134975e-05, + "loss": 0.5658, + "step": 3030 + }, + { + "epoch": 0.42421273617914623, + "grad_norm": 0.3936968710553453, + "learning_rate": 3.226050601429115e-05, + "loss": 0.572, + "step": 3031 + }, + { + "epoch": 0.4243526941917425, + "grad_norm": 0.3902441652129766, + "learning_rate": 3.224966043633966e-05, + "loss": 0.5199, + "step": 3032 + }, + { + "epoch": 0.4244926522043387, + "grad_norm": 0.43154246579160466, + "learning_rate": 3.223881336850939e-05, + "loss": 0.5621, + "step": 3033 + }, + { + "epoch": 0.42463261021693494, + "grad_norm": 0.41498447902123786, + "learning_rate": 3.222796481302953e-05, + "loss": 0.5886, + "step": 3034 + }, + { + "epoch": 0.42477256822953113, + "grad_norm": 0.38695499906221675, + "learning_rate": 3.221711477212956e-05, + "loss": 0.5449, + "step": 3035 + }, + { + "epoch": 0.4249125262421274, + "grad_norm": 0.3974805369088236, + "learning_rate": 3.2206263248039276e-05, + "loss": 0.5602, + "step": 3036 + }, + { + "epoch": 0.4250524842547236, + "grad_norm": 0.40115361129111843, + "learning_rate": 3.2195410242988776e-05, + "loss": 0.5675, + "step": 3037 + }, + { + "epoch": 0.4251924422673198, + "grad_norm": 0.3992286694324687, + "learning_rate": 3.2184555759208465e-05, + "loss": 0.5365, + "step": 3038 + }, + { + "epoch": 0.42533240027991603, + "grad_norm": 0.39187949837957153, + "learning_rate": 3.217369979892905e-05, + "loss": 0.5514, + "step": 3039 + }, + { + "epoch": 0.42547235829251223, + "grad_norm": 0.42850832602682615, + "learning_rate": 3.216284236438154e-05, + "loss": 0.5432, + "step": 3040 + }, + { + "epoch": 0.4256123163051085, + "grad_norm": 0.43436314625968364, + "learning_rate": 3.215198345779723e-05, + "loss": 0.586, + "step": 3041 + }, + { + "epoch": 0.4257522743177047, + "grad_norm": 0.4099577503194474, + "learning_rate": 3.214112308140777e-05, + "loss": 0.5915, + "step": 3042 + }, + { + "epoch": 0.42589223233030093, + "grad_norm": 0.42233874886106193, + "learning_rate": 3.213026123744506e-05, + "loss": 0.6066, + "step": 3043 + }, + { + "epoch": 0.42603219034289713, + "grad_norm": 0.3978279257361451, + "learning_rate": 3.211939792814131e-05, + "loss": 0.5882, + "step": 3044 + }, + { + "epoch": 0.42617214835549333, + "grad_norm": 0.40970969740070096, + "learning_rate": 3.210853315572906e-05, + "loss": 0.5647, + "step": 3045 + }, + { + "epoch": 0.4263121063680896, + "grad_norm": 0.3976277940502661, + "learning_rate": 3.20976669224411e-05, + "loss": 0.5873, + "step": 3046 + }, + { + "epoch": 0.4264520643806858, + "grad_norm": 0.400367465814101, + "learning_rate": 3.208679923051059e-05, + "loss": 0.5595, + "step": 3047 + }, + { + "epoch": 0.42659202239328203, + "grad_norm": 0.40911631062240816, + "learning_rate": 3.207593008217092e-05, + "loss": 0.566, + "step": 3048 + }, + { + "epoch": 0.42673198040587823, + "grad_norm": 0.4245921605186627, + "learning_rate": 3.206505947965583e-05, + "loss": 0.5643, + "step": 3049 + }, + { + "epoch": 0.4268719384184745, + "grad_norm": 0.42320217692212986, + "learning_rate": 3.205418742519933e-05, + "loss": 0.5512, + "step": 3050 + }, + { + "epoch": 0.4270118964310707, + "grad_norm": 0.4050992683553409, + "learning_rate": 3.2043313921035743e-05, + "loss": 0.5712, + "step": 3051 + }, + { + "epoch": 0.4271518544436669, + "grad_norm": 0.39118318821429826, + "learning_rate": 3.203243896939968e-05, + "loss": 0.5839, + "step": 3052 + }, + { + "epoch": 0.42729181245626313, + "grad_norm": 0.4116673535552461, + "learning_rate": 3.202156257252606e-05, + "loss": 0.5599, + "step": 3053 + }, + { + "epoch": 0.42743177046885933, + "grad_norm": 0.42864138348780956, + "learning_rate": 3.201068473265007e-05, + "loss": 0.5877, + "step": 3054 + }, + { + "epoch": 0.4275717284814556, + "grad_norm": 0.4084556626969551, + "learning_rate": 3.1999805452007245e-05, + "loss": 0.578, + "step": 3055 + }, + { + "epoch": 0.4277116864940518, + "grad_norm": 0.40349943769579744, + "learning_rate": 3.1988924732833384e-05, + "loss": 0.562, + "step": 3056 + }, + { + "epoch": 0.42785164450664803, + "grad_norm": 0.41607503622672326, + "learning_rate": 3.197804257736456e-05, + "loss": 0.5766, + "step": 3057 + }, + { + "epoch": 0.42799160251924423, + "grad_norm": 0.390556372929388, + "learning_rate": 3.19671589878372e-05, + "loss": 0.5343, + "step": 3058 + }, + { + "epoch": 0.4281315605318404, + "grad_norm": 0.40189243879585396, + "learning_rate": 3.195627396648796e-05, + "loss": 0.5923, + "step": 3059 + }, + { + "epoch": 0.4282715185444367, + "grad_norm": 0.383550649614106, + "learning_rate": 3.1945387515553846e-05, + "loss": 0.5356, + "step": 3060 + }, + { + "epoch": 0.4284114765570329, + "grad_norm": 0.39467142307894415, + "learning_rate": 3.193449963727213e-05, + "loss": 0.5402, + "step": 3061 + }, + { + "epoch": 0.42855143456962913, + "grad_norm": 0.3980289276258243, + "learning_rate": 3.192361033388037e-05, + "loss": 0.5769, + "step": 3062 + }, + { + "epoch": 0.42869139258222533, + "grad_norm": 0.40844869461585276, + "learning_rate": 3.191271960761645e-05, + "loss": 0.5635, + "step": 3063 + }, + { + "epoch": 0.4288313505948216, + "grad_norm": 0.4137330574117752, + "learning_rate": 3.19018274607185e-05, + "loss": 0.5866, + "step": 3064 + }, + { + "epoch": 0.4289713086074178, + "grad_norm": 0.4089150211795348, + "learning_rate": 3.1890933895424976e-05, + "loss": 0.5783, + "step": 3065 + }, + { + "epoch": 0.429111266620014, + "grad_norm": 0.40728615720674294, + "learning_rate": 3.188003891397463e-05, + "loss": 0.6048, + "step": 3066 + }, + { + "epoch": 0.42925122463261023, + "grad_norm": 0.3988606193766422, + "learning_rate": 3.186914251860648e-05, + "loss": 0.5234, + "step": 3067 + }, + { + "epoch": 0.4293911826452064, + "grad_norm": 0.4081704250933805, + "learning_rate": 3.185824471155983e-05, + "loss": 0.5593, + "step": 3068 + }, + { + "epoch": 0.4295311406578027, + "grad_norm": 0.42949202285429056, + "learning_rate": 3.184734549507431e-05, + "loss": 0.5385, + "step": 3069 + }, + { + "epoch": 0.4296710986703989, + "grad_norm": 0.39800337635965866, + "learning_rate": 3.183644487138982e-05, + "loss": 0.5213, + "step": 3070 + }, + { + "epoch": 0.4298110566829951, + "grad_norm": 0.3968235944285963, + "learning_rate": 3.182554284274654e-05, + "loss": 0.5508, + "step": 3071 + }, + { + "epoch": 0.4299510146955913, + "grad_norm": 0.41991412353150687, + "learning_rate": 3.181463941138495e-05, + "loss": 0.5829, + "step": 3072 + }, + { + "epoch": 0.4300909727081875, + "grad_norm": 0.42186424178865006, + "learning_rate": 3.180373457954581e-05, + "loss": 0.5444, + "step": 3073 + }, + { + "epoch": 0.4302309307207838, + "grad_norm": 0.409043031506044, + "learning_rate": 3.17928283494702e-05, + "loss": 0.5473, + "step": 3074 + }, + { + "epoch": 0.43037088873338, + "grad_norm": 0.4090988482060365, + "learning_rate": 3.178192072339942e-05, + "loss": 0.5293, + "step": 3075 + }, + { + "epoch": 0.43051084674597623, + "grad_norm": 0.40615205351515493, + "learning_rate": 3.177101170357513e-05, + "loss": 0.5869, + "step": 3076 + }, + { + "epoch": 0.4306508047585724, + "grad_norm": 0.39773350024022835, + "learning_rate": 3.176010129223923e-05, + "loss": 0.5456, + "step": 3077 + }, + { + "epoch": 0.4307907627711686, + "grad_norm": 0.4199551126566616, + "learning_rate": 3.174918949163392e-05, + "loss": 0.5642, + "step": 3078 + }, + { + "epoch": 0.4309307207837649, + "grad_norm": 0.43088576609462415, + "learning_rate": 3.17382763040017e-05, + "loss": 0.5563, + "step": 3079 + }, + { + "epoch": 0.4310706787963611, + "grad_norm": 0.40278017058920423, + "learning_rate": 3.172736173158532e-05, + "loss": 0.5279, + "step": 3080 + }, + { + "epoch": 0.4312106368089573, + "grad_norm": 0.4423703616752564, + "learning_rate": 3.171644577662785e-05, + "loss": 0.5701, + "step": 3081 + }, + { + "epoch": 0.4313505948215535, + "grad_norm": 0.4240618723379521, + "learning_rate": 3.1705528441372626e-05, + "loss": 0.5876, + "step": 3082 + }, + { + "epoch": 0.4314905528341498, + "grad_norm": 0.39848697262264293, + "learning_rate": 3.169460972806327e-05, + "loss": 0.5549, + "step": 3083 + }, + { + "epoch": 0.431630510846746, + "grad_norm": 0.40754000265622903, + "learning_rate": 3.1683689638943684e-05, + "loss": 0.5772, + "step": 3084 + }, + { + "epoch": 0.43177046885934217, + "grad_norm": 0.4018754339809231, + "learning_rate": 3.167276817625806e-05, + "loss": 0.5783, + "step": 3085 + }, + { + "epoch": 0.4319104268719384, + "grad_norm": 0.430233233796668, + "learning_rate": 3.166184534225087e-05, + "loss": 0.5775, + "step": 3086 + }, + { + "epoch": 0.4320503848845346, + "grad_norm": 0.4210970260038791, + "learning_rate": 3.165092113916688e-05, + "loss": 0.5593, + "step": 3087 + }, + { + "epoch": 0.4321903428971309, + "grad_norm": 0.4162185793360081, + "learning_rate": 3.163999556925111e-05, + "loss": 0.5587, + "step": 3088 + }, + { + "epoch": 0.43233030090972707, + "grad_norm": 0.38464238839448883, + "learning_rate": 3.162906863474887e-05, + "loss": 0.5527, + "step": 3089 + }, + { + "epoch": 0.4324702589223233, + "grad_norm": 0.3899859914956373, + "learning_rate": 3.161814033790577e-05, + "loss": 0.5734, + "step": 3090 + }, + { + "epoch": 0.4326102169349195, + "grad_norm": 0.40900462690982553, + "learning_rate": 3.160721068096768e-05, + "loss": 0.5399, + "step": 3091 + }, + { + "epoch": 0.4327501749475157, + "grad_norm": 0.4220096792046019, + "learning_rate": 3.159627966618075e-05, + "loss": 0.5727, + "step": 3092 + }, + { + "epoch": 0.432890132960112, + "grad_norm": 0.4233571391991352, + "learning_rate": 3.158534729579142e-05, + "loss": 0.5949, + "step": 3093 + }, + { + "epoch": 0.43303009097270817, + "grad_norm": 0.40740179328267645, + "learning_rate": 3.157441357204641e-05, + "loss": 0.5645, + "step": 3094 + }, + { + "epoch": 0.4331700489853044, + "grad_norm": 0.38846491608664135, + "learning_rate": 3.15634784971927e-05, + "loss": 0.5352, + "step": 3095 + }, + { + "epoch": 0.4333100069979006, + "grad_norm": 0.40243615629816265, + "learning_rate": 3.1552542073477555e-05, + "loss": 0.5672, + "step": 3096 + }, + { + "epoch": 0.4334499650104969, + "grad_norm": 0.38997133690973523, + "learning_rate": 3.154160430314854e-05, + "loss": 0.5756, + "step": 3097 + }, + { + "epoch": 0.43358992302309307, + "grad_norm": 0.3815517517819305, + "learning_rate": 3.1530665188453464e-05, + "loss": 0.5596, + "step": 3098 + }, + { + "epoch": 0.43372988103568927, + "grad_norm": 0.39845289899145564, + "learning_rate": 3.1519724731640424e-05, + "loss": 0.5914, + "step": 3099 + }, + { + "epoch": 0.4338698390482855, + "grad_norm": 0.39018815516325334, + "learning_rate": 3.15087829349578e-05, + "loss": 0.5351, + "step": 3100 + }, + { + "epoch": 0.4340097970608817, + "grad_norm": 0.4129017336286664, + "learning_rate": 3.149783980065425e-05, + "loss": 0.562, + "step": 3101 + }, + { + "epoch": 0.43414975507347797, + "grad_norm": 0.42547377124636576, + "learning_rate": 3.1486895330978685e-05, + "loss": 0.5924, + "step": 3102 + }, + { + "epoch": 0.43428971308607417, + "grad_norm": 0.41352999220449604, + "learning_rate": 3.147594952818031e-05, + "loss": 0.5417, + "step": 3103 + }, + { + "epoch": 0.4344296710986704, + "grad_norm": 0.42347168990756195, + "learning_rate": 3.14650023945086e-05, + "loss": 0.5479, + "step": 3104 + }, + { + "epoch": 0.4345696291112666, + "grad_norm": 0.389428898967484, + "learning_rate": 3.1454053932213304e-05, + "loss": 0.5552, + "step": 3105 + }, + { + "epoch": 0.4347095871238628, + "grad_norm": 0.38385485279343906, + "learning_rate": 3.144310414354444e-05, + "loss": 0.5197, + "step": 3106 + }, + { + "epoch": 0.43484954513645907, + "grad_norm": 0.41030767770871096, + "learning_rate": 3.1432153030752295e-05, + "loss": 0.5597, + "step": 3107 + }, + { + "epoch": 0.43498950314905527, + "grad_norm": 0.41349967870711807, + "learning_rate": 3.142120059608744e-05, + "loss": 0.5663, + "step": 3108 + }, + { + "epoch": 0.4351294611616515, + "grad_norm": 0.4082370728237636, + "learning_rate": 3.141024684180071e-05, + "loss": 0.5275, + "step": 3109 + }, + { + "epoch": 0.4352694191742477, + "grad_norm": 0.4122249486824059, + "learning_rate": 3.139929177014322e-05, + "loss": 0.5451, + "step": 3110 + }, + { + "epoch": 0.43540937718684397, + "grad_norm": 0.41854855746910635, + "learning_rate": 3.138833538336633e-05, + "loss": 0.5418, + "step": 3111 + }, + { + "epoch": 0.43554933519944017, + "grad_norm": 0.4259659771695767, + "learning_rate": 3.137737768372171e-05, + "loss": 0.5931, + "step": 3112 + }, + { + "epoch": 0.43568929321203637, + "grad_norm": 0.43545087247200576, + "learning_rate": 3.1366418673461254e-05, + "loss": 0.5899, + "step": 3113 + }, + { + "epoch": 0.4358292512246326, + "grad_norm": 0.4096432845087902, + "learning_rate": 3.135545835483718e-05, + "loss": 0.5686, + "step": 3114 + }, + { + "epoch": 0.4359692092372288, + "grad_norm": 0.40360816791162424, + "learning_rate": 3.134449673010192e-05, + "loss": 0.5471, + "step": 3115 + }, + { + "epoch": 0.43610916724982507, + "grad_norm": 0.4293932793165949, + "learning_rate": 3.1333533801508204e-05, + "loss": 0.5721, + "step": 3116 + }, + { + "epoch": 0.43624912526242127, + "grad_norm": 0.3963350874406529, + "learning_rate": 3.132256957130904e-05, + "loss": 0.5685, + "step": 3117 + }, + { + "epoch": 0.4363890832750175, + "grad_norm": 0.3871923444763041, + "learning_rate": 3.131160404175767e-05, + "loss": 0.5544, + "step": 3118 + }, + { + "epoch": 0.4365290412876137, + "grad_norm": 0.41730033342791306, + "learning_rate": 3.130063721510763e-05, + "loss": 0.5594, + "step": 3119 + }, + { + "epoch": 0.4366689993002099, + "grad_norm": 0.43319861497508905, + "learning_rate": 3.1289669093612714e-05, + "loss": 0.5896, + "step": 3120 + }, + { + "epoch": 0.43680895731280617, + "grad_norm": 0.3896513313595881, + "learning_rate": 3.127869967952698e-05, + "loss": 0.5799, + "step": 3121 + }, + { + "epoch": 0.43694891532540237, + "grad_norm": 0.4391301390474321, + "learning_rate": 3.126772897510476e-05, + "loss": 0.582, + "step": 3122 + }, + { + "epoch": 0.4370888733379986, + "grad_norm": 0.3951829444712206, + "learning_rate": 3.125675698260065e-05, + "loss": 0.5577, + "step": 3123 + }, + { + "epoch": 0.4372288313505948, + "grad_norm": 0.4088921127679525, + "learning_rate": 3.124578370426947e-05, + "loss": 0.5471, + "step": 3124 + }, + { + "epoch": 0.43736878936319107, + "grad_norm": 0.40507594886972204, + "learning_rate": 3.1234809142366376e-05, + "loss": 0.5787, + "step": 3125 + }, + { + "epoch": 0.43750874737578727, + "grad_norm": 0.48880201961534253, + "learning_rate": 3.1223833299146756e-05, + "loss": 0.5665, + "step": 3126 + }, + { + "epoch": 0.43764870538838346, + "grad_norm": 0.40104306636405085, + "learning_rate": 3.1212856176866226e-05, + "loss": 0.5403, + "step": 3127 + }, + { + "epoch": 0.4377886634009797, + "grad_norm": 0.38729222817598974, + "learning_rate": 3.120187777778073e-05, + "loss": 0.5298, + "step": 3128 + }, + { + "epoch": 0.4379286214135759, + "grad_norm": 0.38974703851837783, + "learning_rate": 3.11908981041464e-05, + "loss": 0.5612, + "step": 3129 + }, + { + "epoch": 0.43806857942617217, + "grad_norm": 0.4100334816926811, + "learning_rate": 3.117991715821972e-05, + "loss": 0.5903, + "step": 3130 + }, + { + "epoch": 0.43820853743876836, + "grad_norm": 0.4156843103680473, + "learning_rate": 3.116893494225734e-05, + "loss": 0.5886, + "step": 3131 + }, + { + "epoch": 0.4383484954513646, + "grad_norm": 0.39607057982434785, + "learning_rate": 3.115795145851625e-05, + "loss": 0.5621, + "step": 3132 + }, + { + "epoch": 0.4384884534639608, + "grad_norm": 0.4036233265018997, + "learning_rate": 3.114696670925365e-05, + "loss": 0.5538, + "step": 3133 + }, + { + "epoch": 0.438628411476557, + "grad_norm": 0.4101277411308792, + "learning_rate": 3.113598069672702e-05, + "loss": 0.5961, + "step": 3134 + }, + { + "epoch": 0.43876836948915326, + "grad_norm": 0.399711153788506, + "learning_rate": 3.11249934231941e-05, + "loss": 0.5594, + "step": 3135 + }, + { + "epoch": 0.43890832750174946, + "grad_norm": 0.3867721840790253, + "learning_rate": 3.111400489091288e-05, + "loss": 0.5122, + "step": 3136 + }, + { + "epoch": 0.4390482855143457, + "grad_norm": 0.4029896269340335, + "learning_rate": 3.110301510214162e-05, + "loss": 0.5724, + "step": 3137 + }, + { + "epoch": 0.4391882435269419, + "grad_norm": 0.4323192969018167, + "learning_rate": 3.1092024059138836e-05, + "loss": 0.5901, + "step": 3138 + }, + { + "epoch": 0.43932820153953817, + "grad_norm": 0.4030250752398442, + "learning_rate": 3.108103176416329e-05, + "loss": 0.5369, + "step": 3139 + }, + { + "epoch": 0.43946815955213436, + "grad_norm": 0.40699746634257905, + "learning_rate": 3.1070038219474026e-05, + "loss": 0.5236, + "step": 3140 + }, + { + "epoch": 0.43960811756473056, + "grad_norm": 0.4054193975355776, + "learning_rate": 3.105904342733032e-05, + "loss": 0.555, + "step": 3141 + }, + { + "epoch": 0.4397480755773268, + "grad_norm": 0.43325267645820853, + "learning_rate": 3.104804738999169e-05, + "loss": 0.5704, + "step": 3142 + }, + { + "epoch": 0.439888033589923, + "grad_norm": 0.41581039947645454, + "learning_rate": 3.103705010971797e-05, + "loss": 0.5733, + "step": 3143 + }, + { + "epoch": 0.44002799160251926, + "grad_norm": 0.397475655616127, + "learning_rate": 3.1026051588769204e-05, + "loss": 0.5681, + "step": 3144 + }, + { + "epoch": 0.44016794961511546, + "grad_norm": 0.4037137237750616, + "learning_rate": 3.101505182940568e-05, + "loss": 0.5641, + "step": 3145 + }, + { + "epoch": 0.44030790762771166, + "grad_norm": 0.395909015440236, + "learning_rate": 3.1004050833887985e-05, + "loss": 0.5453, + "step": 3146 + }, + { + "epoch": 0.4404478656403079, + "grad_norm": 0.40746541718886475, + "learning_rate": 3.099304860447692e-05, + "loss": 0.5576, + "step": 3147 + }, + { + "epoch": 0.4405878236529041, + "grad_norm": 0.40891529518862035, + "learning_rate": 3.098204514343356e-05, + "loss": 0.5744, + "step": 3148 + }, + { + "epoch": 0.44072778166550036, + "grad_norm": 0.42039074958765454, + "learning_rate": 3.097104045301922e-05, + "loss": 0.5945, + "step": 3149 + }, + { + "epoch": 0.44086773967809656, + "grad_norm": 0.4233543661398848, + "learning_rate": 3.096003453549549e-05, + "loss": 0.5174, + "step": 3150 + }, + { + "epoch": 0.4410076976906928, + "grad_norm": 0.41103834231055475, + "learning_rate": 3.0949027393124185e-05, + "loss": 0.5859, + "step": 3151 + }, + { + "epoch": 0.441147655703289, + "grad_norm": 0.4071396114989415, + "learning_rate": 3.093801902816739e-05, + "loss": 0.5513, + "step": 3152 + }, + { + "epoch": 0.4412876137158852, + "grad_norm": 0.5084306343751018, + "learning_rate": 3.092700944288744e-05, + "loss": 0.5871, + "step": 3153 + }, + { + "epoch": 0.44142757172848146, + "grad_norm": 0.41487834345004226, + "learning_rate": 3.091599863954691e-05, + "loss": 0.5789, + "step": 3154 + }, + { + "epoch": 0.44156752974107766, + "grad_norm": 0.38353879873996594, + "learning_rate": 3.090498662040863e-05, + "loss": 0.5864, + "step": 3155 + }, + { + "epoch": 0.4417074877536739, + "grad_norm": 0.4037917238728566, + "learning_rate": 3.0893973387735687e-05, + "loss": 0.5649, + "step": 3156 + }, + { + "epoch": 0.4418474457662701, + "grad_norm": 0.4017882825846787, + "learning_rate": 3.0882958943791405e-05, + "loss": 0.5314, + "step": 3157 + }, + { + "epoch": 0.44198740377886636, + "grad_norm": 0.413358406501944, + "learning_rate": 3.087194329083937e-05, + "loss": 0.5433, + "step": 3158 + }, + { + "epoch": 0.44212736179146256, + "grad_norm": 0.38344575892281724, + "learning_rate": 3.0860926431143415e-05, + "loss": 0.5612, + "step": 3159 + }, + { + "epoch": 0.44226731980405876, + "grad_norm": 0.4087942341185106, + "learning_rate": 3.0849908366967605e-05, + "loss": 0.5256, + "step": 3160 + }, + { + "epoch": 0.442407277816655, + "grad_norm": 0.4055705356412522, + "learning_rate": 3.083888910057627e-05, + "loss": 0.5661, + "step": 3161 + }, + { + "epoch": 0.4425472358292512, + "grad_norm": 0.3826502993612589, + "learning_rate": 3.082786863423399e-05, + "loss": 0.5183, + "step": 3162 + }, + { + "epoch": 0.44268719384184746, + "grad_norm": 0.4043315813890686, + "learning_rate": 3.0816846970205556e-05, + "loss": 0.5805, + "step": 3163 + }, + { + "epoch": 0.44282715185444366, + "grad_norm": 0.6100429100368088, + "learning_rate": 3.0805824110756064e-05, + "loss": 0.5527, + "step": 3164 + }, + { + "epoch": 0.4429671098670399, + "grad_norm": 0.39290972200856517, + "learning_rate": 3.079480005815081e-05, + "loss": 0.5529, + "step": 3165 + }, + { + "epoch": 0.4431070678796361, + "grad_norm": 0.38604414946838106, + "learning_rate": 3.078377481465534e-05, + "loss": 0.5216, + "step": 3166 + }, + { + "epoch": 0.4432470258922323, + "grad_norm": 0.4231758177347116, + "learning_rate": 3.0772748382535463e-05, + "loss": 0.5315, + "step": 3167 + }, + { + "epoch": 0.44338698390482856, + "grad_norm": 0.3991789110660417, + "learning_rate": 3.076172076405722e-05, + "loss": 0.5488, + "step": 3168 + }, + { + "epoch": 0.44352694191742476, + "grad_norm": 0.5750580981426986, + "learning_rate": 3.07506919614869e-05, + "loss": 0.5191, + "step": 3169 + }, + { + "epoch": 0.443666899930021, + "grad_norm": 0.41221569096533744, + "learning_rate": 3.073966197709103e-05, + "loss": 0.5505, + "step": 3170 + }, + { + "epoch": 0.4438068579426172, + "grad_norm": 0.4397217342239035, + "learning_rate": 3.072863081313639e-05, + "loss": 0.594, + "step": 3171 + }, + { + "epoch": 0.44394681595521346, + "grad_norm": 0.4109178364737107, + "learning_rate": 3.071759847188998e-05, + "loss": 0.5438, + "step": 3172 + }, + { + "epoch": 0.44408677396780966, + "grad_norm": 0.4037573317008806, + "learning_rate": 3.070656495561909e-05, + "loss": 0.56, + "step": 3173 + }, + { + "epoch": 0.44422673198040585, + "grad_norm": 0.4029097868342135, + "learning_rate": 3.069553026659119e-05, + "loss": 0.5552, + "step": 3174 + }, + { + "epoch": 0.4443666899930021, + "grad_norm": 0.4100219354350298, + "learning_rate": 3.068449440707404e-05, + "loss": 0.5533, + "step": 3175 + }, + { + "epoch": 0.4445066480055983, + "grad_norm": 0.3892720780012183, + "learning_rate": 3.067345737933561e-05, + "loss": 0.5478, + "step": 3176 + }, + { + "epoch": 0.44464660601819456, + "grad_norm": 0.389146645661204, + "learning_rate": 3.0662419185644115e-05, + "loss": 0.5506, + "step": 3177 + }, + { + "epoch": 0.44478656403079075, + "grad_norm": 0.3873299876663967, + "learning_rate": 3.065137982826802e-05, + "loss": 0.5245, + "step": 3178 + }, + { + "epoch": 0.444926522043387, + "grad_norm": 0.38222672454752543, + "learning_rate": 3.064033930947604e-05, + "loss": 0.5451, + "step": 3179 + }, + { + "epoch": 0.4450664800559832, + "grad_norm": 0.39505544780509594, + "learning_rate": 3.06292976315371e-05, + "loss": 0.5617, + "step": 3180 + }, + { + "epoch": 0.4452064380685794, + "grad_norm": 0.410933368734618, + "learning_rate": 3.0618254796720375e-05, + "loss": 0.5598, + "step": 3181 + }, + { + "epoch": 0.44534639608117566, + "grad_norm": 0.4357975405512465, + "learning_rate": 3.060721080729529e-05, + "loss": 0.5558, + "step": 3182 + }, + { + "epoch": 0.44548635409377185, + "grad_norm": 0.43046775321062414, + "learning_rate": 3.059616566553149e-05, + "loss": 0.5637, + "step": 3183 + }, + { + "epoch": 0.4456263121063681, + "grad_norm": 0.3985416977600822, + "learning_rate": 3.058511937369886e-05, + "loss": 0.525, + "step": 3184 + }, + { + "epoch": 0.4457662701189643, + "grad_norm": 0.40340048691929675, + "learning_rate": 3.057407193406753e-05, + "loss": 0.5249, + "step": 3185 + }, + { + "epoch": 0.44590622813156056, + "grad_norm": 0.4135936572888444, + "learning_rate": 3.056302334890786e-05, + "loss": 0.5549, + "step": 3186 + }, + { + "epoch": 0.44604618614415675, + "grad_norm": 0.3920888452917507, + "learning_rate": 3.0551973620490456e-05, + "loss": 0.5491, + "step": 3187 + }, + { + "epoch": 0.44618614415675295, + "grad_norm": 0.4035469272259035, + "learning_rate": 3.0540922751086135e-05, + "loss": 0.5465, + "step": 3188 + }, + { + "epoch": 0.4463261021693492, + "grad_norm": 0.4216381953710183, + "learning_rate": 3.052987074296596e-05, + "loss": 0.5535, + "step": 3189 + }, + { + "epoch": 0.4464660601819454, + "grad_norm": 0.41233431611024285, + "learning_rate": 3.051881759840124e-05, + "loss": 0.5498, + "step": 3190 + }, + { + "epoch": 0.44660601819454165, + "grad_norm": 0.4114941998250367, + "learning_rate": 3.0507763319663517e-05, + "loss": 0.5718, + "step": 3191 + }, + { + "epoch": 0.44674597620713785, + "grad_norm": 0.407680668986072, + "learning_rate": 3.0496707909024542e-05, + "loss": 0.5298, + "step": 3192 + }, + { + "epoch": 0.4468859342197341, + "grad_norm": 0.3977959198257157, + "learning_rate": 3.0485651368756323e-05, + "loss": 0.549, + "step": 3193 + }, + { + "epoch": 0.4470258922323303, + "grad_norm": 0.3998186038348927, + "learning_rate": 3.0474593701131084e-05, + "loss": 0.5743, + "step": 3194 + }, + { + "epoch": 0.4471658502449265, + "grad_norm": 0.4154710015319719, + "learning_rate": 3.0463534908421298e-05, + "loss": 0.5515, + "step": 3195 + }, + { + "epoch": 0.44730580825752275, + "grad_norm": 0.3943039960321348, + "learning_rate": 3.0452474992899643e-05, + "loss": 0.5391, + "step": 3196 + }, + { + "epoch": 0.44744576627011895, + "grad_norm": 0.4199054832560681, + "learning_rate": 3.044141395683906e-05, + "loss": 0.5741, + "step": 3197 + }, + { + "epoch": 0.4475857242827152, + "grad_norm": 0.41750106544298576, + "learning_rate": 3.0430351802512698e-05, + "loss": 0.6249, + "step": 3198 + }, + { + "epoch": 0.4477256822953114, + "grad_norm": 0.3946839624490567, + "learning_rate": 3.041928853219394e-05, + "loss": 0.5692, + "step": 3199 + }, + { + "epoch": 0.44786564030790765, + "grad_norm": 0.4224068149128873, + "learning_rate": 3.0408224148156407e-05, + "loss": 0.5772, + "step": 3200 + }, + { + "epoch": 0.44800559832050385, + "grad_norm": 0.44042046806838825, + "learning_rate": 3.039715865267393e-05, + "loss": 0.6051, + "step": 3201 + }, + { + "epoch": 0.44814555633310005, + "grad_norm": 0.7011787045122906, + "learning_rate": 3.0386092048020593e-05, + "loss": 0.6074, + "step": 3202 + }, + { + "epoch": 0.4482855143456963, + "grad_norm": 0.41427835190534584, + "learning_rate": 3.037502433647068e-05, + "loss": 0.5624, + "step": 3203 + }, + { + "epoch": 0.4484254723582925, + "grad_norm": 0.4053190934113154, + "learning_rate": 3.0363955520298742e-05, + "loss": 0.5951, + "step": 3204 + }, + { + "epoch": 0.44856543037088875, + "grad_norm": 0.43048899640751676, + "learning_rate": 3.0352885601779512e-05, + "loss": 0.5652, + "step": 3205 + }, + { + "epoch": 0.44870538838348495, + "grad_norm": 0.3964539941995037, + "learning_rate": 3.0341814583187978e-05, + "loss": 0.5595, + "step": 3206 + }, + { + "epoch": 0.4488453463960812, + "grad_norm": 0.4009111440660274, + "learning_rate": 3.033074246679935e-05, + "loss": 0.5861, + "step": 3207 + }, + { + "epoch": 0.4489853044086774, + "grad_norm": 0.38864197775544784, + "learning_rate": 3.0319669254889055e-05, + "loss": 0.598, + "step": 3208 + }, + { + "epoch": 0.4491252624212736, + "grad_norm": 0.4292785996534306, + "learning_rate": 3.0308594949732755e-05, + "loss": 0.5901, + "step": 3209 + }, + { + "epoch": 0.44926522043386985, + "grad_norm": 0.37775134402140526, + "learning_rate": 3.029751955360633e-05, + "loss": 0.5327, + "step": 3210 + }, + { + "epoch": 0.44940517844646605, + "grad_norm": 0.37253895115039465, + "learning_rate": 3.0286443068785885e-05, + "loss": 0.5649, + "step": 3211 + }, + { + "epoch": 0.4495451364590623, + "grad_norm": 0.43373031007480495, + "learning_rate": 3.027536549754775e-05, + "loss": 0.541, + "step": 3212 + }, + { + "epoch": 0.4496850944716585, + "grad_norm": 0.42039610931604904, + "learning_rate": 3.0264286842168477e-05, + "loss": 0.5549, + "step": 3213 + }, + { + "epoch": 0.44982505248425475, + "grad_norm": 0.40625318217496825, + "learning_rate": 3.025320710492484e-05, + "loss": 0.568, + "step": 3214 + }, + { + "epoch": 0.44996501049685095, + "grad_norm": 0.4025993136298079, + "learning_rate": 3.0242126288093846e-05, + "loss": 0.5824, + "step": 3215 + }, + { + "epoch": 0.45010496850944715, + "grad_norm": 0.3984904672668101, + "learning_rate": 3.0231044393952712e-05, + "loss": 0.5615, + "step": 3216 + }, + { + "epoch": 0.4502449265220434, + "grad_norm": 0.4137524109943405, + "learning_rate": 3.0219961424778877e-05, + "loss": 0.5588, + "step": 3217 + }, + { + "epoch": 0.4503848845346396, + "grad_norm": 0.39240078172637105, + "learning_rate": 3.020887738285001e-05, + "loss": 0.521, + "step": 3218 + }, + { + "epoch": 0.45052484254723585, + "grad_norm": 0.41138657372667015, + "learning_rate": 3.0197792270443982e-05, + "loss": 0.5471, + "step": 3219 + }, + { + "epoch": 0.45066480055983205, + "grad_norm": 0.4192515121968027, + "learning_rate": 3.0186706089838913e-05, + "loss": 0.527, + "step": 3220 + }, + { + "epoch": 0.4508047585724283, + "grad_norm": 0.4089670890583155, + "learning_rate": 3.017561884331311e-05, + "loss": 0.5726, + "step": 3221 + }, + { + "epoch": 0.4509447165850245, + "grad_norm": 0.40191329030749956, + "learning_rate": 3.0164530533145123e-05, + "loss": 0.5444, + "step": 3222 + }, + { + "epoch": 0.4510846745976207, + "grad_norm": 0.41118884176015386, + "learning_rate": 3.0153441161613704e-05, + "loss": 0.5541, + "step": 3223 + }, + { + "epoch": 0.45122463261021695, + "grad_norm": 0.4309170413845665, + "learning_rate": 3.0142350730997837e-05, + "loss": 0.5794, + "step": 3224 + }, + { + "epoch": 0.45136459062281314, + "grad_norm": 0.41219057005882026, + "learning_rate": 3.0131259243576726e-05, + "loss": 0.5601, + "step": 3225 + }, + { + "epoch": 0.4515045486354094, + "grad_norm": 0.3941521672183907, + "learning_rate": 3.012016670162977e-05, + "loss": 0.521, + "step": 3226 + }, + { + "epoch": 0.4516445066480056, + "grad_norm": 0.4198018097191238, + "learning_rate": 3.0109073107436608e-05, + "loss": 0.5573, + "step": 3227 + }, + { + "epoch": 0.4517844646606018, + "grad_norm": 0.4135610840050145, + "learning_rate": 3.0097978463277076e-05, + "loss": 0.5718, + "step": 3228 + }, + { + "epoch": 0.45192442267319805, + "grad_norm": 0.405778139396707, + "learning_rate": 3.0086882771431258e-05, + "loss": 0.5468, + "step": 3229 + }, + { + "epoch": 0.45206438068579424, + "grad_norm": 0.4401596995437583, + "learning_rate": 3.0075786034179405e-05, + "loss": 0.5915, + "step": 3230 + }, + { + "epoch": 0.4522043386983905, + "grad_norm": 0.4270515847507665, + "learning_rate": 3.0064688253802026e-05, + "loss": 0.5704, + "step": 3231 + }, + { + "epoch": 0.4523442967109867, + "grad_norm": 0.3969545038717031, + "learning_rate": 3.0053589432579827e-05, + "loss": 0.539, + "step": 3232 + }, + { + "epoch": 0.45248425472358295, + "grad_norm": 0.4231180537782753, + "learning_rate": 3.004248957279372e-05, + "loss": 0.5993, + "step": 3233 + }, + { + "epoch": 0.45262421273617914, + "grad_norm": 0.3821272498454476, + "learning_rate": 3.0031388676724836e-05, + "loss": 0.5444, + "step": 3234 + }, + { + "epoch": 0.45276417074877534, + "grad_norm": 0.38147859259770744, + "learning_rate": 3.0020286746654525e-05, + "loss": 0.5893, + "step": 3235 + }, + { + "epoch": 0.4529041287613716, + "grad_norm": 0.40206232247150925, + "learning_rate": 3.0009183784864365e-05, + "loss": 0.5567, + "step": 3236 + }, + { + "epoch": 0.4530440867739678, + "grad_norm": 0.3712974814446054, + "learning_rate": 2.9998079793636098e-05, + "loss": 0.5148, + "step": 3237 + }, + { + "epoch": 0.45318404478656404, + "grad_norm": 0.4047911445642038, + "learning_rate": 2.998697477525173e-05, + "loss": 0.5543, + "step": 3238 + }, + { + "epoch": 0.45332400279916024, + "grad_norm": 0.41083909886481995, + "learning_rate": 2.997586873199344e-05, + "loss": 0.5411, + "step": 3239 + }, + { + "epoch": 0.4534639608117565, + "grad_norm": 0.3979787826364129, + "learning_rate": 2.996476166614364e-05, + "loss": 0.526, + "step": 3240 + }, + { + "epoch": 0.4536039188243527, + "grad_norm": 0.39694745383011404, + "learning_rate": 2.9953653579984942e-05, + "loss": 0.5523, + "step": 3241 + }, + { + "epoch": 0.4537438768369489, + "grad_norm": 0.3902000936154921, + "learning_rate": 2.9942544475800172e-05, + "loss": 0.5502, + "step": 3242 + }, + { + "epoch": 0.45388383484954514, + "grad_norm": 0.42203355743945187, + "learning_rate": 2.9931434355872367e-05, + "loss": 0.5609, + "step": 3243 + }, + { + "epoch": 0.45402379286214134, + "grad_norm": 0.4454869866225551, + "learning_rate": 2.992032322248476e-05, + "loss": 0.6, + "step": 3244 + }, + { + "epoch": 0.4541637508747376, + "grad_norm": 0.405554964175632, + "learning_rate": 2.9909211077920805e-05, + "loss": 0.572, + "step": 3245 + }, + { + "epoch": 0.4543037088873338, + "grad_norm": 0.41415547431542904, + "learning_rate": 2.989809792446417e-05, + "loss": 0.5578, + "step": 3246 + }, + { + "epoch": 0.45444366689993004, + "grad_norm": 0.41647413135178146, + "learning_rate": 2.9886983764398707e-05, + "loss": 0.5691, + "step": 3247 + }, + { + "epoch": 0.45458362491252624, + "grad_norm": 0.3958733230625823, + "learning_rate": 2.9875868600008496e-05, + "loss": 0.5708, + "step": 3248 + }, + { + "epoch": 0.45472358292512244, + "grad_norm": 0.39688745966500016, + "learning_rate": 2.986475243357782e-05, + "loss": 0.5497, + "step": 3249 + }, + { + "epoch": 0.4548635409377187, + "grad_norm": 0.3859490510740407, + "learning_rate": 2.985363526739115e-05, + "loss": 0.5227, + "step": 3250 + }, + { + "epoch": 0.4550034989503149, + "grad_norm": 0.3934619917655379, + "learning_rate": 2.9842517103733192e-05, + "loss": 0.5662, + "step": 3251 + }, + { + "epoch": 0.45514345696291114, + "grad_norm": 0.5131902290403141, + "learning_rate": 2.9831397944888833e-05, + "loss": 0.5677, + "step": 3252 + }, + { + "epoch": 0.45528341497550734, + "grad_norm": 0.42208973839755937, + "learning_rate": 2.9820277793143177e-05, + "loss": 0.5347, + "step": 3253 + }, + { + "epoch": 0.4554233729881036, + "grad_norm": 0.4047958403557277, + "learning_rate": 2.9809156650781528e-05, + "loss": 0.5533, + "step": 3254 + }, + { + "epoch": 0.4555633310006998, + "grad_norm": 0.41534042335678106, + "learning_rate": 2.9798034520089385e-05, + "loss": 0.6129, + "step": 3255 + }, + { + "epoch": 0.455703289013296, + "grad_norm": 0.405760676923056, + "learning_rate": 2.978691140335247e-05, + "loss": 0.575, + "step": 3256 + }, + { + "epoch": 0.45584324702589224, + "grad_norm": 0.4049361175064025, + "learning_rate": 2.9775787302856683e-05, + "loss": 0.5287, + "step": 3257 + }, + { + "epoch": 0.45598320503848844, + "grad_norm": 0.4212785218180886, + "learning_rate": 2.9764662220888157e-05, + "loss": 0.5649, + "step": 3258 + }, + { + "epoch": 0.4561231630510847, + "grad_norm": 0.4014033439696915, + "learning_rate": 2.9753536159733196e-05, + "loss": 0.5457, + "step": 3259 + }, + { + "epoch": 0.4562631210636809, + "grad_norm": 0.38748328739856236, + "learning_rate": 2.974240912167833e-05, + "loss": 0.5409, + "step": 3260 + }, + { + "epoch": 0.45640307907627714, + "grad_norm": 0.40938166793163544, + "learning_rate": 2.9731281109010256e-05, + "loss": 0.6002, + "step": 3261 + }, + { + "epoch": 0.45654303708887334, + "grad_norm": 0.40068504604922145, + "learning_rate": 2.9720152124015916e-05, + "loss": 0.5767, + "step": 3262 + }, + { + "epoch": 0.45668299510146954, + "grad_norm": 0.39286585333437835, + "learning_rate": 2.9709022168982426e-05, + "loss": 0.5823, + "step": 3263 + }, + { + "epoch": 0.4568229531140658, + "grad_norm": 0.39448349470052235, + "learning_rate": 2.9697891246197097e-05, + "loss": 0.5702, + "step": 3264 + }, + { + "epoch": 0.456962911126662, + "grad_norm": 0.42230524457915075, + "learning_rate": 2.9686759357947446e-05, + "loss": 0.5695, + "step": 3265 + }, + { + "epoch": 0.45710286913925824, + "grad_norm": 0.3829307935804848, + "learning_rate": 2.9675626506521205e-05, + "loss": 0.5486, + "step": 3266 + }, + { + "epoch": 0.45724282715185444, + "grad_norm": 0.4002606373918937, + "learning_rate": 2.966449269420627e-05, + "loss": 0.5953, + "step": 3267 + }, + { + "epoch": 0.4573827851644507, + "grad_norm": 0.4081380916674823, + "learning_rate": 2.9653357923290753e-05, + "loss": 0.5605, + "step": 3268 + }, + { + "epoch": 0.4575227431770469, + "grad_norm": 0.4173988076758946, + "learning_rate": 2.9642222196062973e-05, + "loss": 0.5941, + "step": 3269 + }, + { + "epoch": 0.4576627011896431, + "grad_norm": 0.41804559043749046, + "learning_rate": 2.9631085514811423e-05, + "loss": 0.5979, + "step": 3270 + }, + { + "epoch": 0.45780265920223934, + "grad_norm": 0.3906986098038151, + "learning_rate": 2.9619947881824818e-05, + "loss": 0.5567, + "step": 3271 + }, + { + "epoch": 0.45794261721483553, + "grad_norm": 0.4110120727152376, + "learning_rate": 2.9608809299392043e-05, + "loss": 0.5732, + "step": 3272 + }, + { + "epoch": 0.4580825752274318, + "grad_norm": 0.40920966877207876, + "learning_rate": 2.9597669769802196e-05, + "loss": 0.5673, + "step": 3273 + }, + { + "epoch": 0.458222533240028, + "grad_norm": 0.3959304691526394, + "learning_rate": 2.958652929534456e-05, + "loss": 0.5719, + "step": 3274 + }, + { + "epoch": 0.45836249125262424, + "grad_norm": 0.3844580981772225, + "learning_rate": 2.9575387878308615e-05, + "loss": 0.5441, + "step": 3275 + }, + { + "epoch": 0.45850244926522044, + "grad_norm": 0.4425235109241173, + "learning_rate": 2.9564245520984047e-05, + "loss": 0.5896, + "step": 3276 + }, + { + "epoch": 0.45864240727781663, + "grad_norm": 0.4206583711187626, + "learning_rate": 2.9553102225660706e-05, + "loss": 0.5718, + "step": 3277 + }, + { + "epoch": 0.4587823652904129, + "grad_norm": 0.43363119264281536, + "learning_rate": 2.954195799462866e-05, + "loss": 0.5584, + "step": 3278 + }, + { + "epoch": 0.4589223233030091, + "grad_norm": 0.40918971785473857, + "learning_rate": 2.9530812830178162e-05, + "loss": 0.5386, + "step": 3279 + }, + { + "epoch": 0.45906228131560534, + "grad_norm": 0.4195378947004359, + "learning_rate": 2.951966673459965e-05, + "loss": 0.5862, + "step": 3280 + }, + { + "epoch": 0.45920223932820153, + "grad_norm": 0.3984648051790019, + "learning_rate": 2.9508519710183772e-05, + "loss": 0.5745, + "step": 3281 + }, + { + "epoch": 0.4593421973407978, + "grad_norm": 0.42412243772475006, + "learning_rate": 2.9497371759221347e-05, + "loss": 0.5765, + "step": 3282 + }, + { + "epoch": 0.459482155353394, + "grad_norm": 0.4155137965851478, + "learning_rate": 2.9486222884003393e-05, + "loss": 0.5742, + "step": 3283 + }, + { + "epoch": 0.4596221133659902, + "grad_norm": 0.3929372634486545, + "learning_rate": 2.9475073086821115e-05, + "loss": 0.5467, + "step": 3284 + }, + { + "epoch": 0.45976207137858643, + "grad_norm": 0.4188647769421151, + "learning_rate": 2.9463922369965917e-05, + "loss": 0.6122, + "step": 3285 + }, + { + "epoch": 0.45990202939118263, + "grad_norm": 0.39936115798519434, + "learning_rate": 2.9452770735729374e-05, + "loss": 0.5592, + "step": 3286 + }, + { + "epoch": 0.4600419874037789, + "grad_norm": 0.41793644764176796, + "learning_rate": 2.944161818640327e-05, + "loss": 0.5679, + "step": 3287 + }, + { + "epoch": 0.4601819454163751, + "grad_norm": 0.4053297174029381, + "learning_rate": 2.943046472427956e-05, + "loss": 0.5731, + "step": 3288 + }, + { + "epoch": 0.46032190342897134, + "grad_norm": 0.4026483342781206, + "learning_rate": 2.9419310351650392e-05, + "loss": 0.5884, + "step": 3289 + }, + { + "epoch": 0.46046186144156753, + "grad_norm": 0.4014937430945868, + "learning_rate": 2.9408155070808113e-05, + "loss": 0.5753, + "step": 3290 + }, + { + "epoch": 0.46060181945416373, + "grad_norm": 0.4168370671409344, + "learning_rate": 2.9396998884045235e-05, + "loss": 0.563, + "step": 3291 + }, + { + "epoch": 0.46074177746676, + "grad_norm": 0.4015569692729826, + "learning_rate": 2.9385841793654474e-05, + "loss": 0.603, + "step": 3292 + }, + { + "epoch": 0.4608817354793562, + "grad_norm": 0.3918827502679464, + "learning_rate": 2.937468380192872e-05, + "loss": 0.5587, + "step": 3293 + }, + { + "epoch": 0.46102169349195243, + "grad_norm": 0.4129995494719521, + "learning_rate": 2.936352491116106e-05, + "loss": 0.5554, + "step": 3294 + }, + { + "epoch": 0.46116165150454863, + "grad_norm": 0.4281727152857462, + "learning_rate": 2.9352365123644755e-05, + "loss": 0.5632, + "step": 3295 + }, + { + "epoch": 0.4613016095171449, + "grad_norm": 0.4078100385278002, + "learning_rate": 2.9341204441673266e-05, + "loss": 0.5717, + "step": 3296 + }, + { + "epoch": 0.4614415675297411, + "grad_norm": 0.4670973011247201, + "learning_rate": 2.93300428675402e-05, + "loss": 0.5701, + "step": 3297 + }, + { + "epoch": 0.4615815255423373, + "grad_norm": 0.4149076463346234, + "learning_rate": 2.93188804035394e-05, + "loss": 0.5477, + "step": 3298 + }, + { + "epoch": 0.46172148355493353, + "grad_norm": 0.41094417493173746, + "learning_rate": 2.9307717051964862e-05, + "loss": 0.5499, + "step": 3299 + }, + { + "epoch": 0.46186144156752973, + "grad_norm": 0.40933405766552283, + "learning_rate": 2.929655281511075e-05, + "loss": 0.5389, + "step": 3300 + }, + { + "epoch": 0.462001399580126, + "grad_norm": 0.4235858324194054, + "learning_rate": 2.9285387695271444e-05, + "loss": 0.5568, + "step": 3301 + }, + { + "epoch": 0.4621413575927222, + "grad_norm": 0.3997529823349059, + "learning_rate": 2.9274221694741484e-05, + "loss": 0.5384, + "step": 3302 + }, + { + "epoch": 0.4622813156053184, + "grad_norm": 0.42004597023741636, + "learning_rate": 2.92630548158156e-05, + "loss": 0.5613, + "step": 3303 + }, + { + "epoch": 0.46242127361791463, + "grad_norm": 0.4076618497301079, + "learning_rate": 2.925188706078869e-05, + "loss": 0.533, + "step": 3304 + }, + { + "epoch": 0.4625612316305108, + "grad_norm": 0.4131076996508653, + "learning_rate": 2.9240718431955855e-05, + "loss": 0.5809, + "step": 3305 + }, + { + "epoch": 0.4627011896431071, + "grad_norm": 0.4246693787392879, + "learning_rate": 2.9229548931612348e-05, + "loss": 0.553, + "step": 3306 + }, + { + "epoch": 0.4628411476557033, + "grad_norm": 0.4081853198228349, + "learning_rate": 2.9218378562053623e-05, + "loss": 0.5269, + "step": 3307 + }, + { + "epoch": 0.46298110566829953, + "grad_norm": 0.4185513766982846, + "learning_rate": 2.9207207325575304e-05, + "loss": 0.5391, + "step": 3308 + }, + { + "epoch": 0.46312106368089573, + "grad_norm": 0.40971729273974855, + "learning_rate": 2.9196035224473196e-05, + "loss": 0.5562, + "step": 3309 + }, + { + "epoch": 0.4632610216934919, + "grad_norm": 0.39899049100540745, + "learning_rate": 2.918486226104327e-05, + "loss": 0.5574, + "step": 3310 + }, + { + "epoch": 0.4634009797060882, + "grad_norm": 0.4210144546249132, + "learning_rate": 2.917368843758168e-05, + "loss": 0.5593, + "step": 3311 + }, + { + "epoch": 0.4635409377186844, + "grad_norm": 0.43313267085828555, + "learning_rate": 2.916251375638478e-05, + "loss": 0.5827, + "step": 3312 + }, + { + "epoch": 0.46368089573128063, + "grad_norm": 0.4159179322901382, + "learning_rate": 2.9151338219749065e-05, + "loss": 0.555, + "step": 3313 + }, + { + "epoch": 0.4638208537438768, + "grad_norm": 0.4383620819549179, + "learning_rate": 2.9140161829971223e-05, + "loss": 0.5638, + "step": 3314 + }, + { + "epoch": 0.4639608117564731, + "grad_norm": 0.4176966506673776, + "learning_rate": 2.9128984589348114e-05, + "loss": 0.5802, + "step": 3315 + }, + { + "epoch": 0.4641007697690693, + "grad_norm": 0.3985420143299249, + "learning_rate": 2.9117806500176774e-05, + "loss": 0.5614, + "step": 3316 + }, + { + "epoch": 0.4642407277816655, + "grad_norm": 0.40635759034414026, + "learning_rate": 2.910662756475443e-05, + "loss": 0.5502, + "step": 3317 + }, + { + "epoch": 0.4643806857942617, + "grad_norm": 0.3920809381768575, + "learning_rate": 2.9095447785378443e-05, + "loss": 0.5187, + "step": 3318 + }, + { + "epoch": 0.4645206438068579, + "grad_norm": 0.4065824247557009, + "learning_rate": 2.9084267164346386e-05, + "loss": 0.5285, + "step": 3319 + }, + { + "epoch": 0.4646606018194542, + "grad_norm": 0.47569552376008784, + "learning_rate": 2.9073085703955987e-05, + "loss": 0.5592, + "step": 3320 + }, + { + "epoch": 0.4648005598320504, + "grad_norm": 0.41348964126371485, + "learning_rate": 2.9061903406505154e-05, + "loss": 0.573, + "step": 3321 + }, + { + "epoch": 0.46494051784464663, + "grad_norm": 0.3990216785800373, + "learning_rate": 2.9050720274291943e-05, + "loss": 0.5686, + "step": 3322 + }, + { + "epoch": 0.4650804758572428, + "grad_norm": 0.3834656155781017, + "learning_rate": 2.903953630961463e-05, + "loss": 0.5297, + "step": 3323 + }, + { + "epoch": 0.465220433869839, + "grad_norm": 0.4046993834562908, + "learning_rate": 2.9028351514771606e-05, + "loss": 0.5515, + "step": 3324 + }, + { + "epoch": 0.4653603918824353, + "grad_norm": 0.43593159937116366, + "learning_rate": 2.901716589206147e-05, + "loss": 0.5706, + "step": 3325 + }, + { + "epoch": 0.4655003498950315, + "grad_norm": 0.41907649826033294, + "learning_rate": 2.9005979443782993e-05, + "loss": 0.5738, + "step": 3326 + }, + { + "epoch": 0.4656403079076277, + "grad_norm": 0.39650396383746195, + "learning_rate": 2.899479217223509e-05, + "loss": 0.5678, + "step": 3327 + }, + { + "epoch": 0.4657802659202239, + "grad_norm": 0.41237846145466506, + "learning_rate": 2.898360407971687e-05, + "loss": 0.5743, + "step": 3328 + }, + { + "epoch": 0.4659202239328202, + "grad_norm": 0.39685046167702703, + "learning_rate": 2.8972415168527584e-05, + "loss": 0.5439, + "step": 3329 + }, + { + "epoch": 0.4660601819454164, + "grad_norm": 0.4050270937895372, + "learning_rate": 2.896122544096668e-05, + "loss": 0.5541, + "step": 3330 + }, + { + "epoch": 0.46620013995801257, + "grad_norm": 0.40567639713038367, + "learning_rate": 2.895003489933375e-05, + "loss": 0.5712, + "step": 3331 + }, + { + "epoch": 0.4663400979706088, + "grad_norm": 0.4174104058437715, + "learning_rate": 2.8938843545928573e-05, + "loss": 0.5634, + "step": 3332 + }, + { + "epoch": 0.466480055983205, + "grad_norm": 0.4174890988710686, + "learning_rate": 2.892765138305108e-05, + "loss": 0.5633, + "step": 3333 + }, + { + "epoch": 0.4666200139958013, + "grad_norm": 0.4085166170083092, + "learning_rate": 2.8916458413001375e-05, + "loss": 0.5661, + "step": 3334 + }, + { + "epoch": 0.4667599720083975, + "grad_norm": 0.416230735105726, + "learning_rate": 2.8905264638079732e-05, + "loss": 0.5736, + "step": 3335 + }, + { + "epoch": 0.4668999300209937, + "grad_norm": 0.3905749725084316, + "learning_rate": 2.8894070060586576e-05, + "loss": 0.5483, + "step": 3336 + }, + { + "epoch": 0.4670398880335899, + "grad_norm": 0.40890374703665905, + "learning_rate": 2.888287468282252e-05, + "loss": 0.5615, + "step": 3337 + }, + { + "epoch": 0.4671798460461861, + "grad_norm": 0.41635405487950694, + "learning_rate": 2.8871678507088312e-05, + "loss": 0.5467, + "step": 3338 + }, + { + "epoch": 0.4673198040587824, + "grad_norm": 0.4336846869668269, + "learning_rate": 2.886048153568489e-05, + "loss": 0.5511, + "step": 3339 + }, + { + "epoch": 0.46745976207137857, + "grad_norm": 0.38470189006529587, + "learning_rate": 2.8849283770913337e-05, + "loss": 0.5488, + "step": 3340 + }, + { + "epoch": 0.4675997200839748, + "grad_norm": 0.43922504036128623, + "learning_rate": 2.8838085215074923e-05, + "loss": 0.592, + "step": 3341 + }, + { + "epoch": 0.467739678096571, + "grad_norm": 0.4046341572147428, + "learning_rate": 2.8826885870471043e-05, + "loss": 0.5618, + "step": 3342 + }, + { + "epoch": 0.4678796361091673, + "grad_norm": 0.38781336644907494, + "learning_rate": 2.8815685739403298e-05, + "loss": 0.556, + "step": 3343 + }, + { + "epoch": 0.46801959412176347, + "grad_norm": 0.4120764675677611, + "learning_rate": 2.8804484824173417e-05, + "loss": 0.5407, + "step": 3344 + }, + { + "epoch": 0.46815955213435967, + "grad_norm": 0.4309704301588916, + "learning_rate": 2.8793283127083292e-05, + "loss": 0.5491, + "step": 3345 + }, + { + "epoch": 0.4682995101469559, + "grad_norm": 0.3937722091804054, + "learning_rate": 2.8782080650435006e-05, + "loss": 0.5799, + "step": 3346 + }, + { + "epoch": 0.4684394681595521, + "grad_norm": 0.39196249510935055, + "learning_rate": 2.8770877396530766e-05, + "loss": 0.5386, + "step": 3347 + }, + { + "epoch": 0.46857942617214837, + "grad_norm": 0.410878891101837, + "learning_rate": 2.875967336767296e-05, + "loss": 0.603, + "step": 3348 + }, + { + "epoch": 0.46871938418474457, + "grad_norm": 0.4020037749774897, + "learning_rate": 2.8748468566164134e-05, + "loss": 0.5499, + "step": 3349 + }, + { + "epoch": 0.4688593421973408, + "grad_norm": 0.45651213620060477, + "learning_rate": 2.8737262994306985e-05, + "loss": 0.6113, + "step": 3350 + }, + { + "epoch": 0.468999300209937, + "grad_norm": 0.4187097393109256, + "learning_rate": 2.872605665440436e-05, + "loss": 0.548, + "step": 3351 + }, + { + "epoch": 0.4691392582225332, + "grad_norm": 0.4096624177728241, + "learning_rate": 2.8714849548759293e-05, + "loss": 0.5629, + "step": 3352 + }, + { + "epoch": 0.46927921623512947, + "grad_norm": 0.4106487034595635, + "learning_rate": 2.8703641679674954e-05, + "loss": 0.5743, + "step": 3353 + }, + { + "epoch": 0.46941917424772567, + "grad_norm": 0.4160938815791059, + "learning_rate": 2.869243304945467e-05, + "loss": 0.5366, + "step": 3354 + }, + { + "epoch": 0.4695591322603219, + "grad_norm": 0.401353123051922, + "learning_rate": 2.8681223660401935e-05, + "loss": 0.5601, + "step": 3355 + }, + { + "epoch": 0.4696990902729181, + "grad_norm": 0.3960216537704636, + "learning_rate": 2.8670013514820375e-05, + "loss": 0.5472, + "step": 3356 + }, + { + "epoch": 0.46983904828551437, + "grad_norm": 0.40124295326684595, + "learning_rate": 2.8658802615013807e-05, + "loss": 0.5582, + "step": 3357 + }, + { + "epoch": 0.46997900629811057, + "grad_norm": 0.4061763668246899, + "learning_rate": 2.8647590963286175e-05, + "loss": 0.5582, + "step": 3358 + }, + { + "epoch": 0.47011896431070677, + "grad_norm": 0.42540877415275535, + "learning_rate": 2.8636378561941592e-05, + "loss": 0.5555, + "step": 3359 + }, + { + "epoch": 0.470258922323303, + "grad_norm": 0.40556363836960674, + "learning_rate": 2.862516541328431e-05, + "loss": 0.5677, + "step": 3360 + }, + { + "epoch": 0.4703988803358992, + "grad_norm": 0.4163599612619921, + "learning_rate": 2.8613951519618765e-05, + "loss": 0.5648, + "step": 3361 + }, + { + "epoch": 0.47053883834849547, + "grad_norm": 0.4088068121264862, + "learning_rate": 2.8602736883249503e-05, + "loss": 0.5592, + "step": 3362 + }, + { + "epoch": 0.47067879636109167, + "grad_norm": 0.37314677325016854, + "learning_rate": 2.859152150648126e-05, + "loss": 0.5145, + "step": 3363 + }, + { + "epoch": 0.4708187543736879, + "grad_norm": 0.7930198911401396, + "learning_rate": 2.8580305391618912e-05, + "loss": 0.5536, + "step": 3364 + }, + { + "epoch": 0.4709587123862841, + "grad_norm": 0.41677939251494683, + "learning_rate": 2.8569088540967476e-05, + "loss": 0.5369, + "step": 3365 + }, + { + "epoch": 0.4710986703988803, + "grad_norm": 0.3922638579854847, + "learning_rate": 2.8557870956832132e-05, + "loss": 0.5565, + "step": 3366 + }, + { + "epoch": 0.47123862841147657, + "grad_norm": 0.4290159510473203, + "learning_rate": 2.8546652641518208e-05, + "loss": 0.5655, + "step": 3367 + }, + { + "epoch": 0.47137858642407277, + "grad_norm": 0.41113045647723195, + "learning_rate": 2.8535433597331173e-05, + "loss": 0.5651, + "step": 3368 + }, + { + "epoch": 0.471518544436669, + "grad_norm": 0.4241008004580075, + "learning_rate": 2.8524213826576667e-05, + "loss": 0.5634, + "step": 3369 + }, + { + "epoch": 0.4716585024492652, + "grad_norm": 0.4461069068987371, + "learning_rate": 2.851299333156046e-05, + "loss": 0.5641, + "step": 3370 + }, + { + "epoch": 0.47179846046186147, + "grad_norm": 0.4284897736677648, + "learning_rate": 2.8501772114588476e-05, + "loss": 0.5711, + "step": 3371 + }, + { + "epoch": 0.47193841847445767, + "grad_norm": 0.3880573082608547, + "learning_rate": 2.8490550177966797e-05, + "loss": 0.4844, + "step": 3372 + }, + { + "epoch": 0.47207837648705386, + "grad_norm": 0.4343172019973716, + "learning_rate": 2.8479327524001636e-05, + "loss": 0.5544, + "step": 3373 + }, + { + "epoch": 0.4722183344996501, + "grad_norm": 0.40211668252102023, + "learning_rate": 2.8468104154999366e-05, + "loss": 0.5577, + "step": 3374 + }, + { + "epoch": 0.4723582925122463, + "grad_norm": 0.4165983505211132, + "learning_rate": 2.845688007326651e-05, + "loss": 0.5462, + "step": 3375 + }, + { + "epoch": 0.47249825052484257, + "grad_norm": 0.41318809239332804, + "learning_rate": 2.8445655281109718e-05, + "loss": 0.5821, + "step": 3376 + }, + { + "epoch": 0.47263820853743876, + "grad_norm": 0.43834923618793703, + "learning_rate": 2.8434429780835807e-05, + "loss": 0.5682, + "step": 3377 + }, + { + "epoch": 0.472778166550035, + "grad_norm": 0.4069711220875038, + "learning_rate": 2.8423203574751727e-05, + "loss": 0.5339, + "step": 3378 + }, + { + "epoch": 0.4729181245626312, + "grad_norm": 0.39860818609724685, + "learning_rate": 2.8411976665164585e-05, + "loss": 0.5379, + "step": 3379 + }, + { + "epoch": 0.4730580825752274, + "grad_norm": 0.41823989656114025, + "learning_rate": 2.840074905438161e-05, + "loss": 0.5801, + "step": 3380 + }, + { + "epoch": 0.47319804058782367, + "grad_norm": 0.41218220409199874, + "learning_rate": 2.8389520744710196e-05, + "loss": 0.5221, + "step": 3381 + }, + { + "epoch": 0.47333799860041986, + "grad_norm": 0.42461569878650546, + "learning_rate": 2.8378291738457887e-05, + "loss": 0.5785, + "step": 3382 + }, + { + "epoch": 0.4734779566130161, + "grad_norm": 0.4076091943362167, + "learning_rate": 2.8367062037932342e-05, + "loss": 0.5415, + "step": 3383 + }, + { + "epoch": 0.4736179146256123, + "grad_norm": 0.41452613569881464, + "learning_rate": 2.8355831645441388e-05, + "loss": 0.5649, + "step": 3384 + }, + { + "epoch": 0.4737578726382085, + "grad_norm": 0.4037756702805257, + "learning_rate": 2.834460056329298e-05, + "loss": 0.5523, + "step": 3385 + }, + { + "epoch": 0.47389783065080476, + "grad_norm": 0.4011452181351811, + "learning_rate": 2.8333368793795224e-05, + "loss": 0.5246, + "step": 3386 + }, + { + "epoch": 0.47403778866340096, + "grad_norm": 0.413887567092737, + "learning_rate": 2.8322136339256356e-05, + "loss": 0.5281, + "step": 3387 + }, + { + "epoch": 0.4741777466759972, + "grad_norm": 0.42257963112995583, + "learning_rate": 2.8310903201984763e-05, + "loss": 0.5612, + "step": 3388 + }, + { + "epoch": 0.4743177046885934, + "grad_norm": 0.6050510544766738, + "learning_rate": 2.829966938428897e-05, + "loss": 0.5807, + "step": 3389 + }, + { + "epoch": 0.47445766270118966, + "grad_norm": 0.42337956289277995, + "learning_rate": 2.8288434888477627e-05, + "loss": 0.5648, + "step": 3390 + }, + { + "epoch": 0.47459762071378586, + "grad_norm": 0.39066953133834564, + "learning_rate": 2.827719971685956e-05, + "loss": 0.5896, + "step": 3391 + }, + { + "epoch": 0.47473757872638206, + "grad_norm": 0.40333322646376846, + "learning_rate": 2.8265963871743696e-05, + "loss": 0.5761, + "step": 3392 + }, + { + "epoch": 0.4748775367389783, + "grad_norm": 0.41387027644417956, + "learning_rate": 2.825472735543912e-05, + "loss": 0.5925, + "step": 3393 + }, + { + "epoch": 0.4750174947515745, + "grad_norm": 0.3928152148704509, + "learning_rate": 2.8243490170255043e-05, + "loss": 0.5828, + "step": 3394 + }, + { + "epoch": 0.47515745276417076, + "grad_norm": 0.3929042322405297, + "learning_rate": 2.8232252318500834e-05, + "loss": 0.5831, + "step": 3395 + }, + { + "epoch": 0.47529741077676696, + "grad_norm": 0.4673310747874466, + "learning_rate": 2.8221013802485975e-05, + "loss": 0.5564, + "step": 3396 + }, + { + "epoch": 0.4754373687893632, + "grad_norm": 0.3888740851313229, + "learning_rate": 2.8209774624520097e-05, + "loss": 0.5575, + "step": 3397 + }, + { + "epoch": 0.4755773268019594, + "grad_norm": 0.40361151709091403, + "learning_rate": 2.8198534786912965e-05, + "loss": 0.5496, + "step": 3398 + }, + { + "epoch": 0.4757172848145556, + "grad_norm": 0.4106448187857887, + "learning_rate": 2.818729429197448e-05, + "loss": 0.5607, + "step": 3399 + }, + { + "epoch": 0.47585724282715186, + "grad_norm": 0.42784648142356363, + "learning_rate": 2.8176053142014687e-05, + "loss": 0.6264, + "step": 3400 + }, + { + "epoch": 0.47599720083974806, + "grad_norm": 0.40344827404813044, + "learning_rate": 2.8164811339343732e-05, + "loss": 0.5812, + "step": 3401 + }, + { + "epoch": 0.4761371588523443, + "grad_norm": 0.41503031802135976, + "learning_rate": 2.815356888627195e-05, + "loss": 0.5635, + "step": 3402 + }, + { + "epoch": 0.4762771168649405, + "grad_norm": 0.3854795531096736, + "learning_rate": 2.814232578510975e-05, + "loss": 0.5635, + "step": 3403 + }, + { + "epoch": 0.47641707487753676, + "grad_norm": 0.4009076755410169, + "learning_rate": 2.8131082038167735e-05, + "loss": 0.5731, + "step": 3404 + }, + { + "epoch": 0.47655703289013296, + "grad_norm": 0.38835090703799413, + "learning_rate": 2.8119837647756574e-05, + "loss": 0.565, + "step": 3405 + }, + { + "epoch": 0.47669699090272916, + "grad_norm": 0.4286357705154638, + "learning_rate": 2.8108592616187133e-05, + "loss": 0.549, + "step": 3406 + }, + { + "epoch": 0.4768369489153254, + "grad_norm": 0.4275083145446453, + "learning_rate": 2.8097346945770364e-05, + "loss": 0.5719, + "step": 3407 + }, + { + "epoch": 0.4769769069279216, + "grad_norm": 0.3940008020310687, + "learning_rate": 2.808610063881737e-05, + "loss": 0.5208, + "step": 3408 + }, + { + "epoch": 0.47711686494051786, + "grad_norm": 0.4229335142753006, + "learning_rate": 2.807485369763938e-05, + "loss": 0.5763, + "step": 3409 + }, + { + "epoch": 0.47725682295311406, + "grad_norm": 0.4161121773460535, + "learning_rate": 2.8063606124547765e-05, + "loss": 0.5443, + "step": 3410 + }, + { + "epoch": 0.4773967809657103, + "grad_norm": 0.40765919854648125, + "learning_rate": 2.8052357921854e-05, + "loss": 0.5702, + "step": 3411 + }, + { + "epoch": 0.4775367389783065, + "grad_norm": 0.4163390293585378, + "learning_rate": 2.804110909186971e-05, + "loss": 0.5728, + "step": 3412 + }, + { + "epoch": 0.4776766969909027, + "grad_norm": 0.4377093314827132, + "learning_rate": 2.8029859636906654e-05, + "loss": 0.5785, + "step": 3413 + }, + { + "epoch": 0.47781665500349896, + "grad_norm": 0.3997496345348747, + "learning_rate": 2.8018609559276686e-05, + "loss": 0.5872, + "step": 3414 + }, + { + "epoch": 0.47795661301609516, + "grad_norm": 0.41218479804115327, + "learning_rate": 2.800735886129184e-05, + "loss": 0.5606, + "step": 3415 + }, + { + "epoch": 0.4780965710286914, + "grad_norm": 0.4227372321145177, + "learning_rate": 2.7996107545264223e-05, + "loss": 0.5708, + "step": 3416 + }, + { + "epoch": 0.4782365290412876, + "grad_norm": 0.42665670114635346, + "learning_rate": 2.7984855613506107e-05, + "loss": 0.5704, + "step": 3417 + }, + { + "epoch": 0.47837648705388386, + "grad_norm": 0.4216359313020928, + "learning_rate": 2.7973603068329884e-05, + "loss": 0.5378, + "step": 3418 + }, + { + "epoch": 0.47851644506648006, + "grad_norm": 0.4320525421405627, + "learning_rate": 2.796234991204805e-05, + "loss": 0.5626, + "step": 3419 + }, + { + "epoch": 0.47865640307907625, + "grad_norm": 0.4087593290796549, + "learning_rate": 2.795109614697326e-05, + "loss": 0.5472, + "step": 3420 + }, + { + "epoch": 0.4787963610916725, + "grad_norm": 0.4311719285838966, + "learning_rate": 2.793984177541827e-05, + "loss": 0.5585, + "step": 3421 + }, + { + "epoch": 0.4789363191042687, + "grad_norm": 0.4101348935742422, + "learning_rate": 2.792858679969596e-05, + "loss": 0.5539, + "step": 3422 + }, + { + "epoch": 0.47907627711686496, + "grad_norm": 0.4240688742729598, + "learning_rate": 2.7917331222119346e-05, + "loss": 0.5853, + "step": 3423 + }, + { + "epoch": 0.47921623512946115, + "grad_norm": 0.4192530280413205, + "learning_rate": 2.790607504500157e-05, + "loss": 0.5476, + "step": 3424 + }, + { + "epoch": 0.4793561931420574, + "grad_norm": 0.39900981412049086, + "learning_rate": 2.7894818270655882e-05, + "loss": 0.5582, + "step": 3425 + }, + { + "epoch": 0.4794961511546536, + "grad_norm": 0.3965711350604459, + "learning_rate": 2.7883560901395667e-05, + "loss": 0.5419, + "step": 3426 + }, + { + "epoch": 0.4796361091672498, + "grad_norm": 0.4312456683665289, + "learning_rate": 2.7872302939534433e-05, + "loss": 0.5536, + "step": 3427 + }, + { + "epoch": 0.47977606717984606, + "grad_norm": 0.40864635902503055, + "learning_rate": 2.78610443873858e-05, + "loss": 0.5812, + "step": 3428 + }, + { + "epoch": 0.47991602519244225, + "grad_norm": 0.40901705450143366, + "learning_rate": 2.7849785247263515e-05, + "loss": 0.5284, + "step": 3429 + }, + { + "epoch": 0.4800559832050385, + "grad_norm": 0.40430272789500593, + "learning_rate": 2.7838525521481444e-05, + "loss": 0.5587, + "step": 3430 + }, + { + "epoch": 0.4801959412176347, + "grad_norm": 0.3934946456153737, + "learning_rate": 2.7827265212353587e-05, + "loss": 0.5509, + "step": 3431 + }, + { + "epoch": 0.48033589923023096, + "grad_norm": 0.42366137833048706, + "learning_rate": 2.781600432219404e-05, + "loss": 0.5478, + "step": 3432 + }, + { + "epoch": 0.48047585724282715, + "grad_norm": 0.4067675335773988, + "learning_rate": 2.7804742853317027e-05, + "loss": 0.5348, + "step": 3433 + }, + { + "epoch": 0.48061581525542335, + "grad_norm": 0.3990940892590606, + "learning_rate": 2.7793480808036898e-05, + "loss": 0.5449, + "step": 3434 + }, + { + "epoch": 0.4807557732680196, + "grad_norm": 0.40172847447577514, + "learning_rate": 2.7782218188668118e-05, + "loss": 0.5898, + "step": 3435 + }, + { + "epoch": 0.4808957312806158, + "grad_norm": 0.3955704115554125, + "learning_rate": 2.7770954997525277e-05, + "loss": 0.5357, + "step": 3436 + }, + { + "epoch": 0.48103568929321205, + "grad_norm": 0.37493116627514156, + "learning_rate": 2.7759691236923064e-05, + "loss": 0.5197, + "step": 3437 + }, + { + "epoch": 0.48117564730580825, + "grad_norm": 0.40865836884433765, + "learning_rate": 2.7748426909176307e-05, + "loss": 0.5567, + "step": 3438 + }, + { + "epoch": 0.4813156053184045, + "grad_norm": 0.41498192510745757, + "learning_rate": 2.7737162016599927e-05, + "loss": 0.595, + "step": 3439 + }, + { + "epoch": 0.4814555633310007, + "grad_norm": 0.4299760976868325, + "learning_rate": 2.7725896561508983e-05, + "loss": 0.5465, + "step": 3440 + }, + { + "epoch": 0.4815955213435969, + "grad_norm": 0.40031579978939574, + "learning_rate": 2.7714630546218635e-05, + "loss": 0.5965, + "step": 3441 + }, + { + "epoch": 0.48173547935619315, + "grad_norm": 0.40096718317347985, + "learning_rate": 2.770336397304417e-05, + "loss": 0.5321, + "step": 3442 + }, + { + "epoch": 0.48187543736878935, + "grad_norm": 0.3981130281220963, + "learning_rate": 2.769209684430098e-05, + "loss": 0.5558, + "step": 3443 + }, + { + "epoch": 0.4820153953813856, + "grad_norm": 0.45577292418268894, + "learning_rate": 2.7680829162304567e-05, + "loss": 0.5586, + "step": 3444 + }, + { + "epoch": 0.4821553533939818, + "grad_norm": 0.4112624187601042, + "learning_rate": 2.7669560929370564e-05, + "loss": 0.5457, + "step": 3445 + }, + { + "epoch": 0.48229531140657805, + "grad_norm": 0.39264802624439415, + "learning_rate": 2.76582921478147e-05, + "loss": 0.547, + "step": 3446 + }, + { + "epoch": 0.48243526941917425, + "grad_norm": 0.4141847607595186, + "learning_rate": 2.7647022819952835e-05, + "loss": 0.5274, + "step": 3447 + }, + { + "epoch": 0.48257522743177045, + "grad_norm": 0.40886257454682706, + "learning_rate": 2.763575294810091e-05, + "loss": 0.5305, + "step": 3448 + }, + { + "epoch": 0.4827151854443667, + "grad_norm": 0.39545619852603475, + "learning_rate": 2.7624482534575026e-05, + "loss": 0.5839, + "step": 3449 + }, + { + "epoch": 0.4828551434569629, + "grad_norm": 0.40937623131741985, + "learning_rate": 2.761321158169134e-05, + "loss": 0.5638, + "step": 3450 + }, + { + "epoch": 0.48299510146955915, + "grad_norm": 0.407186107707272, + "learning_rate": 2.7601940091766164e-05, + "loss": 0.5158, + "step": 3451 + }, + { + "epoch": 0.48313505948215535, + "grad_norm": 0.4076092863747084, + "learning_rate": 2.7590668067115895e-05, + "loss": 0.5874, + "step": 3452 + }, + { + "epoch": 0.4832750174947516, + "grad_norm": 0.4221563990293702, + "learning_rate": 2.757939551005706e-05, + "loss": 0.5484, + "step": 3453 + }, + { + "epoch": 0.4834149755073478, + "grad_norm": 0.38895523634768725, + "learning_rate": 2.7568122422906273e-05, + "loss": 0.5166, + "step": 3454 + }, + { + "epoch": 0.483554933519944, + "grad_norm": 0.4268904833505735, + "learning_rate": 2.755684880798026e-05, + "loss": 0.5862, + "step": 3455 + }, + { + "epoch": 0.48369489153254025, + "grad_norm": 0.41707456856060987, + "learning_rate": 2.754557466759589e-05, + "loss": 0.5704, + "step": 3456 + }, + { + "epoch": 0.48383484954513645, + "grad_norm": 0.41401351680807885, + "learning_rate": 2.7534300004070086e-05, + "loss": 0.5452, + "step": 3457 + }, + { + "epoch": 0.4839748075577327, + "grad_norm": 0.4000475828565154, + "learning_rate": 2.7523024819719922e-05, + "loss": 0.5862, + "step": 3458 + }, + { + "epoch": 0.4841147655703289, + "grad_norm": 0.4126946869503196, + "learning_rate": 2.7511749116862558e-05, + "loss": 0.5499, + "step": 3459 + }, + { + "epoch": 0.4842547235829251, + "grad_norm": 0.4046742301519851, + "learning_rate": 2.7500472897815265e-05, + "loss": 0.5295, + "step": 3460 + }, + { + "epoch": 0.48439468159552135, + "grad_norm": 0.42125194708107067, + "learning_rate": 2.748919616489542e-05, + "loss": 0.5356, + "step": 3461 + }, + { + "epoch": 0.48453463960811755, + "grad_norm": 0.4040649587533329, + "learning_rate": 2.7477918920420504e-05, + "loss": 0.553, + "step": 3462 + }, + { + "epoch": 0.4846745976207138, + "grad_norm": 0.3886454789211703, + "learning_rate": 2.7466641166708113e-05, + "loss": 0.5567, + "step": 3463 + }, + { + "epoch": 0.48481455563331, + "grad_norm": 0.4996582332055803, + "learning_rate": 2.7455362906075932e-05, + "loss": 0.5722, + "step": 3464 + }, + { + "epoch": 0.48495451364590625, + "grad_norm": 0.423979275094494, + "learning_rate": 2.7444084140841765e-05, + "loss": 0.5837, + "step": 3465 + }, + { + "epoch": 0.48509447165850245, + "grad_norm": 0.39551615527180584, + "learning_rate": 2.74328048733235e-05, + "loss": 0.5754, + "step": 3466 + }, + { + "epoch": 0.48523442967109864, + "grad_norm": 0.45978728132052005, + "learning_rate": 2.7421525105839152e-05, + "loss": 0.5367, + "step": 3467 + }, + { + "epoch": 0.4853743876836949, + "grad_norm": 0.4193876985062947, + "learning_rate": 2.741024484070682e-05, + "loss": 0.5576, + "step": 3468 + }, + { + "epoch": 0.4855143456962911, + "grad_norm": 0.397308979018639, + "learning_rate": 2.739896408024473e-05, + "loss": 0.5577, + "step": 3469 + }, + { + "epoch": 0.48565430370888735, + "grad_norm": 0.4127845230541042, + "learning_rate": 2.7387682826771173e-05, + "loss": 0.5876, + "step": 3470 + }, + { + "epoch": 0.48579426172148354, + "grad_norm": 0.40489978417434586, + "learning_rate": 2.7376401082604564e-05, + "loss": 0.5622, + "step": 3471 + }, + { + "epoch": 0.4859342197340798, + "grad_norm": 0.3839599583629985, + "learning_rate": 2.736511885006343e-05, + "loss": 0.5432, + "step": 3472 + }, + { + "epoch": 0.486074177746676, + "grad_norm": 0.39225392267678383, + "learning_rate": 2.7353836131466372e-05, + "loss": 0.5639, + "step": 3473 + }, + { + "epoch": 0.4862141357592722, + "grad_norm": 0.4051784046488118, + "learning_rate": 2.7342552929132104e-05, + "loss": 0.5664, + "step": 3474 + }, + { + "epoch": 0.48635409377186845, + "grad_norm": 0.40996524593878775, + "learning_rate": 2.7331269245379443e-05, + "loss": 0.5601, + "step": 3475 + }, + { + "epoch": 0.48649405178446464, + "grad_norm": 0.42173107295905043, + "learning_rate": 2.7319985082527292e-05, + "loss": 0.5497, + "step": 3476 + }, + { + "epoch": 0.4866340097970609, + "grad_norm": 0.40466464089704407, + "learning_rate": 2.7308700442894675e-05, + "loss": 0.5397, + "step": 3477 + }, + { + "epoch": 0.4867739678096571, + "grad_norm": 0.3942022841659981, + "learning_rate": 2.7297415328800692e-05, + "loss": 0.5313, + "step": 3478 + }, + { + "epoch": 0.48691392582225335, + "grad_norm": 0.40743336481779613, + "learning_rate": 2.728612974256454e-05, + "loss": 0.5313, + "step": 3479 + }, + { + "epoch": 0.48705388383484954, + "grad_norm": 0.4332085486342721, + "learning_rate": 2.7274843686505536e-05, + "loss": 0.5455, + "step": 3480 + }, + { + "epoch": 0.48719384184744574, + "grad_norm": 0.4094748981538972, + "learning_rate": 2.726355716294307e-05, + "loss": 0.5727, + "step": 3481 + }, + { + "epoch": 0.487333799860042, + "grad_norm": 0.4111671844080047, + "learning_rate": 2.7252270174196643e-05, + "loss": 0.5635, + "step": 3482 + }, + { + "epoch": 0.4874737578726382, + "grad_norm": 0.39174881532897554, + "learning_rate": 2.724098272258584e-05, + "loss": 0.5414, + "step": 3483 + }, + { + "epoch": 0.48761371588523444, + "grad_norm": 0.3907484104141741, + "learning_rate": 2.7229694810430344e-05, + "loss": 0.5766, + "step": 3484 + }, + { + "epoch": 0.48775367389783064, + "grad_norm": 0.4010543272236602, + "learning_rate": 2.7218406440049954e-05, + "loss": 0.5371, + "step": 3485 + }, + { + "epoch": 0.4878936319104269, + "grad_norm": 0.4346615757792164, + "learning_rate": 2.720711761376452e-05, + "loss": 0.5729, + "step": 3486 + }, + { + "epoch": 0.4880335899230231, + "grad_norm": 0.4341695351875495, + "learning_rate": 2.719582833389403e-05, + "loss": 0.6109, + "step": 3487 + }, + { + "epoch": 0.4881735479356193, + "grad_norm": 0.43134633918438864, + "learning_rate": 2.7184538602758536e-05, + "loss": 0.5904, + "step": 3488 + }, + { + "epoch": 0.48831350594821554, + "grad_norm": 0.40924934907989946, + "learning_rate": 2.71732484226782e-05, + "loss": 0.5533, + "step": 3489 + }, + { + "epoch": 0.48845346396081174, + "grad_norm": 0.4117613614479626, + "learning_rate": 2.7161957795973246e-05, + "loss": 0.519, + "step": 3490 + }, + { + "epoch": 0.488593421973408, + "grad_norm": 0.3876515998640682, + "learning_rate": 2.7150666724964035e-05, + "loss": 0.5543, + "step": 3491 + }, + { + "epoch": 0.4887333799860042, + "grad_norm": 0.407828217550299, + "learning_rate": 2.7139375211970996e-05, + "loss": 0.5515, + "step": 3492 + }, + { + "epoch": 0.48887333799860044, + "grad_norm": 0.40672013393542944, + "learning_rate": 2.712808325931464e-05, + "loss": 0.5445, + "step": 3493 + }, + { + "epoch": 0.48901329601119664, + "grad_norm": 0.40868169613662847, + "learning_rate": 2.7116790869315582e-05, + "loss": 0.5712, + "step": 3494 + }, + { + "epoch": 0.48915325402379284, + "grad_norm": 0.4092633353198152, + "learning_rate": 2.710549804429452e-05, + "loss": 0.5515, + "step": 3495 + }, + { + "epoch": 0.4892932120363891, + "grad_norm": 0.4366896965056272, + "learning_rate": 2.7094204786572254e-05, + "loss": 0.5883, + "step": 3496 + }, + { + "epoch": 0.4894331700489853, + "grad_norm": 0.41669884030755355, + "learning_rate": 2.7082911098469648e-05, + "loss": 0.5386, + "step": 3497 + }, + { + "epoch": 0.48957312806158154, + "grad_norm": 0.39572509135352624, + "learning_rate": 2.7071616982307684e-05, + "loss": 0.5563, + "step": 3498 + }, + { + "epoch": 0.48971308607417774, + "grad_norm": 0.41695121258573753, + "learning_rate": 2.706032244040741e-05, + "loss": 0.5796, + "step": 3499 + }, + { + "epoch": 0.489853044086774, + "grad_norm": 0.4236891067324582, + "learning_rate": 2.704902747508996e-05, + "loss": 0.5799, + "step": 3500 + }, + { + "epoch": 0.4899930020993702, + "grad_norm": 0.3977818931519091, + "learning_rate": 2.7037732088676582e-05, + "loss": 0.6042, + "step": 3501 + }, + { + "epoch": 0.4901329601119664, + "grad_norm": 0.4233529044212103, + "learning_rate": 2.7026436283488583e-05, + "loss": 0.5673, + "step": 3502 + }, + { + "epoch": 0.49027291812456264, + "grad_norm": 0.4082847133569342, + "learning_rate": 2.7015140061847365e-05, + "loss": 0.513, + "step": 3503 + }, + { + "epoch": 0.49041287613715884, + "grad_norm": 0.4200304747170798, + "learning_rate": 2.7003843426074416e-05, + "loss": 0.5791, + "step": 3504 + }, + { + "epoch": 0.4905528341497551, + "grad_norm": 0.39533501738229987, + "learning_rate": 2.6992546378491318e-05, + "loss": 0.5704, + "step": 3505 + }, + { + "epoch": 0.4906927921623513, + "grad_norm": 0.41764209923224177, + "learning_rate": 2.698124892141971e-05, + "loss": 0.5457, + "step": 3506 + }, + { + "epoch": 0.49083275017494754, + "grad_norm": 0.4064990179760341, + "learning_rate": 2.6969951057181358e-05, + "loss": 0.5574, + "step": 3507 + }, + { + "epoch": 0.49097270818754374, + "grad_norm": 0.420505722395169, + "learning_rate": 2.695865278809807e-05, + "loss": 0.5699, + "step": 3508 + }, + { + "epoch": 0.49111266620013994, + "grad_norm": 0.40016382436365205, + "learning_rate": 2.6947354116491763e-05, + "loss": 0.5567, + "step": 3509 + }, + { + "epoch": 0.4912526242127362, + "grad_norm": 0.4021262284998323, + "learning_rate": 2.693605504468443e-05, + "loss": 0.5676, + "step": 3510 + }, + { + "epoch": 0.4913925822253324, + "grad_norm": 0.39530073122269704, + "learning_rate": 2.692475557499813e-05, + "loss": 0.5328, + "step": 3511 + }, + { + "epoch": 0.49153254023792864, + "grad_norm": 0.397282341609004, + "learning_rate": 2.6913455709755042e-05, + "loss": 0.5567, + "step": 3512 + }, + { + "epoch": 0.49167249825052484, + "grad_norm": 0.41008312840461486, + "learning_rate": 2.6902155451277377e-05, + "loss": 0.5393, + "step": 3513 + }, + { + "epoch": 0.4918124562631211, + "grad_norm": 0.4096259675286922, + "learning_rate": 2.6890854801887478e-05, + "loss": 0.5951, + "step": 3514 + }, + { + "epoch": 0.4919524142757173, + "grad_norm": 0.3919817250852191, + "learning_rate": 2.6879553763907726e-05, + "loss": 0.536, + "step": 3515 + }, + { + "epoch": 0.4920923722883135, + "grad_norm": 0.39561344260712744, + "learning_rate": 2.686825233966061e-05, + "loss": 0.5536, + "step": 3516 + }, + { + "epoch": 0.49223233030090974, + "grad_norm": 0.409797831934292, + "learning_rate": 2.685695053146868e-05, + "loss": 0.5395, + "step": 3517 + }, + { + "epoch": 0.49237228831350593, + "grad_norm": 0.3998242277512286, + "learning_rate": 2.684564834165457e-05, + "loss": 0.5404, + "step": 3518 + }, + { + "epoch": 0.4925122463261022, + "grad_norm": 0.4267398317265081, + "learning_rate": 2.6834345772541002e-05, + "loss": 0.5711, + "step": 3519 + }, + { + "epoch": 0.4926522043386984, + "grad_norm": 0.42572746496869157, + "learning_rate": 2.6823042826450774e-05, + "loss": 0.5921, + "step": 3520 + }, + { + "epoch": 0.49279216235129464, + "grad_norm": 0.400101008335866, + "learning_rate": 2.681173950570674e-05, + "loss": 0.5325, + "step": 3521 + }, + { + "epoch": 0.49293212036389084, + "grad_norm": 0.4029901094408229, + "learning_rate": 2.6800435812631854e-05, + "loss": 0.5678, + "step": 3522 + }, + { + "epoch": 0.49307207837648703, + "grad_norm": 0.41541374559781874, + "learning_rate": 2.678913174954914e-05, + "loss": 0.5285, + "step": 3523 + }, + { + "epoch": 0.4932120363890833, + "grad_norm": 0.4370900148410592, + "learning_rate": 2.6777827318781697e-05, + "loss": 0.5666, + "step": 3524 + }, + { + "epoch": 0.4933519944016795, + "grad_norm": 0.42959019276244964, + "learning_rate": 2.6766522522652704e-05, + "loss": 0.5402, + "step": 3525 + }, + { + "epoch": 0.49349195241427574, + "grad_norm": 0.40725453736472333, + "learning_rate": 2.6755217363485406e-05, + "loss": 0.5171, + "step": 3526 + }, + { + "epoch": 0.49363191042687193, + "grad_norm": 0.40719701171899414, + "learning_rate": 2.674391184360313e-05, + "loss": 0.5283, + "step": 3527 + }, + { + "epoch": 0.4937718684394682, + "grad_norm": 0.40279902099076637, + "learning_rate": 2.6732605965329283e-05, + "loss": 0.5477, + "step": 3528 + }, + { + "epoch": 0.4939118264520644, + "grad_norm": 0.3802988661016701, + "learning_rate": 2.6721299730987324e-05, + "loss": 0.548, + "step": 3529 + }, + { + "epoch": 0.4940517844646606, + "grad_norm": 0.42211573921030193, + "learning_rate": 2.670999314290081e-05, + "loss": 0.5365, + "step": 3530 + }, + { + "epoch": 0.49419174247725683, + "grad_norm": 0.4001947861590466, + "learning_rate": 2.6698686203393354e-05, + "loss": 0.5495, + "step": 3531 + }, + { + "epoch": 0.49433170048985303, + "grad_norm": 0.4075041830130491, + "learning_rate": 2.6687378914788645e-05, + "loss": 0.5598, + "step": 3532 + }, + { + "epoch": 0.4944716585024493, + "grad_norm": 0.42826625295627585, + "learning_rate": 2.6676071279410448e-05, + "loss": 0.546, + "step": 3533 + }, + { + "epoch": 0.4946116165150455, + "grad_norm": 0.4325314823046203, + "learning_rate": 2.6664763299582602e-05, + "loss": 0.5549, + "step": 3534 + }, + { + "epoch": 0.4947515745276417, + "grad_norm": 0.4257334322531565, + "learning_rate": 2.6653454977629e-05, + "loss": 0.5714, + "step": 3535 + }, + { + "epoch": 0.49489153254023793, + "grad_norm": 0.41145085533484926, + "learning_rate": 2.6642146315873622e-05, + "loss": 0.5637, + "step": 3536 + }, + { + "epoch": 0.49503149055283413, + "grad_norm": 0.41951469265452185, + "learning_rate": 2.6630837316640523e-05, + "loss": 0.5531, + "step": 3537 + }, + { + "epoch": 0.4951714485654304, + "grad_norm": 0.42251053392716215, + "learning_rate": 2.6619527982253794e-05, + "loss": 0.5631, + "step": 3538 + }, + { + "epoch": 0.4953114065780266, + "grad_norm": 0.4071751122281306, + "learning_rate": 2.6608218315037648e-05, + "loss": 0.5791, + "step": 3539 + }, + { + "epoch": 0.49545136459062283, + "grad_norm": 0.41230607754128246, + "learning_rate": 2.659690831731631e-05, + "loss": 0.533, + "step": 3540 + }, + { + "epoch": 0.49559132260321903, + "grad_norm": 0.4049581094697219, + "learning_rate": 2.6585597991414114e-05, + "loss": 0.5314, + "step": 3541 + }, + { + "epoch": 0.49573128061581523, + "grad_norm": 0.38934941480909324, + "learning_rate": 2.6574287339655447e-05, + "loss": 0.5359, + "step": 3542 + }, + { + "epoch": 0.4958712386284115, + "grad_norm": 0.4183426624654952, + "learning_rate": 2.656297636436475e-05, + "loss": 0.5885, + "step": 3543 + }, + { + "epoch": 0.4960111966410077, + "grad_norm": 0.4426159573559465, + "learning_rate": 2.6551665067866556e-05, + "loss": 0.5809, + "step": 3544 + }, + { + "epoch": 0.49615115465360393, + "grad_norm": 0.5149102513551677, + "learning_rate": 2.6540353452485443e-05, + "loss": 0.5692, + "step": 3545 + }, + { + "epoch": 0.49629111266620013, + "grad_norm": 0.4109744615781001, + "learning_rate": 2.652904152054607e-05, + "loss": 0.5789, + "step": 3546 + }, + { + "epoch": 0.4964310706787964, + "grad_norm": 0.44177673319774247, + "learning_rate": 2.651772927437315e-05, + "loss": 0.5912, + "step": 3547 + }, + { + "epoch": 0.4965710286913926, + "grad_norm": 0.4052859032621841, + "learning_rate": 2.6506416716291465e-05, + "loss": 0.5616, + "step": 3548 + }, + { + "epoch": 0.4967109867039888, + "grad_norm": 0.393563528563957, + "learning_rate": 2.649510384862586e-05, + "loss": 0.5379, + "step": 3549 + }, + { + "epoch": 0.49685094471658503, + "grad_norm": 0.41866504482839206, + "learning_rate": 2.6483790673701242e-05, + "loss": 0.5486, + "step": 3550 + }, + { + "epoch": 0.4969909027291812, + "grad_norm": 0.41154455431337716, + "learning_rate": 2.6472477193842583e-05, + "loss": 0.5647, + "step": 3551 + }, + { + "epoch": 0.4971308607417775, + "grad_norm": 0.39736456239087703, + "learning_rate": 2.6461163411374923e-05, + "loss": 0.5901, + "step": 3552 + }, + { + "epoch": 0.4972708187543737, + "grad_norm": 0.4060568204824789, + "learning_rate": 2.6449849328623355e-05, + "loss": 0.5435, + "step": 3553 + }, + { + "epoch": 0.49741077676696993, + "grad_norm": 0.4115697803723554, + "learning_rate": 2.6438534947913047e-05, + "loss": 0.5655, + "step": 3554 + }, + { + "epoch": 0.49755073477956613, + "grad_norm": 0.3956194134720466, + "learning_rate": 2.6427220271569203e-05, + "loss": 0.5394, + "step": 3555 + }, + { + "epoch": 0.4976906927921623, + "grad_norm": 0.38712507902856763, + "learning_rate": 2.6415905301917114e-05, + "loss": 0.5345, + "step": 3556 + }, + { + "epoch": 0.4978306508047586, + "grad_norm": 0.9891507537143042, + "learning_rate": 2.6404590041282116e-05, + "loss": 0.5452, + "step": 3557 + }, + { + "epoch": 0.4979706088173548, + "grad_norm": 0.40961536120801306, + "learning_rate": 2.6393274491989617e-05, + "loss": 0.5193, + "step": 3558 + }, + { + "epoch": 0.49811056682995103, + "grad_norm": 0.43943809296308844, + "learning_rate": 2.6381958656365073e-05, + "loss": 0.5351, + "step": 3559 + }, + { + "epoch": 0.4982505248425472, + "grad_norm": 0.4108272260762031, + "learning_rate": 2.6370642536734004e-05, + "loss": 0.5348, + "step": 3560 + }, + { + "epoch": 0.4983904828551435, + "grad_norm": 0.4119616541998687, + "learning_rate": 2.6359326135421986e-05, + "loss": 0.5255, + "step": 3561 + }, + { + "epoch": 0.4985304408677397, + "grad_norm": 0.4495829632664617, + "learning_rate": 2.6348009454754653e-05, + "loss": 0.5828, + "step": 3562 + }, + { + "epoch": 0.4986703988803359, + "grad_norm": 0.41137247203721966, + "learning_rate": 2.6336692497057696e-05, + "loss": 0.5541, + "step": 3563 + }, + { + "epoch": 0.4988103568929321, + "grad_norm": 0.4367447061362204, + "learning_rate": 2.632537526465687e-05, + "loss": 0.5993, + "step": 3564 + }, + { + "epoch": 0.4989503149055283, + "grad_norm": 0.4326908998762443, + "learning_rate": 2.6314057759877985e-05, + "loss": 0.5765, + "step": 3565 + }, + { + "epoch": 0.4990902729181246, + "grad_norm": 0.4192288623903318, + "learning_rate": 2.6302739985046898e-05, + "loss": 0.5376, + "step": 3566 + }, + { + "epoch": 0.4992302309307208, + "grad_norm": 0.4248836776012947, + "learning_rate": 2.629142194248952e-05, + "loss": 0.5938, + "step": 3567 + }, + { + "epoch": 0.49937018894331703, + "grad_norm": 0.4047565365438917, + "learning_rate": 2.6280103634531833e-05, + "loss": 0.5811, + "step": 3568 + }, + { + "epoch": 0.4995101469559132, + "grad_norm": 0.4144316245441231, + "learning_rate": 2.626878506349986e-05, + "loss": 0.5902, + "step": 3569 + }, + { + "epoch": 0.4996501049685094, + "grad_norm": 0.4178965245669895, + "learning_rate": 2.625746623171968e-05, + "loss": 0.525, + "step": 3570 + }, + { + "epoch": 0.4997900629811057, + "grad_norm": 0.4159969957434166, + "learning_rate": 2.624614714151743e-05, + "loss": 0.5576, + "step": 3571 + }, + { + "epoch": 0.4999300209937019, + "grad_norm": 0.40173942508061367, + "learning_rate": 2.6234827795219297e-05, + "loss": 0.5358, + "step": 3572 + }, + { + "epoch": 0.5000699790062981, + "grad_norm": 0.41640033764604484, + "learning_rate": 2.622350819515153e-05, + "loss": 0.5571, + "step": 3573 + }, + { + "epoch": 0.5002099370188944, + "grad_norm": 0.40378307872853386, + "learning_rate": 2.621218834364041e-05, + "loss": 0.5472, + "step": 3574 + }, + { + "epoch": 0.5003498950314905, + "grad_norm": 0.398494511998799, + "learning_rate": 2.620086824301229e-05, + "loss": 0.5331, + "step": 3575 + }, + { + "epoch": 0.5004898530440868, + "grad_norm": 0.40959256585724063, + "learning_rate": 2.6189547895593562e-05, + "loss": 0.5496, + "step": 3576 + }, + { + "epoch": 0.500629811056683, + "grad_norm": 0.41352440579598176, + "learning_rate": 2.6178227303710673e-05, + "loss": 0.5697, + "step": 3577 + }, + { + "epoch": 0.5007697690692792, + "grad_norm": 0.408240609494089, + "learning_rate": 2.616690646969011e-05, + "loss": 0.56, + "step": 3578 + }, + { + "epoch": 0.5009097270818754, + "grad_norm": 0.3832170219371199, + "learning_rate": 2.6155585395858435e-05, + "loss": 0.523, + "step": 3579 + }, + { + "epoch": 0.5010496850944717, + "grad_norm": 0.405736469986696, + "learning_rate": 2.6144264084542224e-05, + "loss": 0.5743, + "step": 3580 + }, + { + "epoch": 0.5011896431070679, + "grad_norm": 0.4036128469748438, + "learning_rate": 2.6132942538068146e-05, + "loss": 0.5807, + "step": 3581 + }, + { + "epoch": 0.5013296011196641, + "grad_norm": 0.40173202764431026, + "learning_rate": 2.6121620758762877e-05, + "loss": 0.5703, + "step": 3582 + }, + { + "epoch": 0.5014695591322603, + "grad_norm": 0.42011031965776624, + "learning_rate": 2.6110298748953153e-05, + "loss": 0.5602, + "step": 3583 + }, + { + "epoch": 0.5016095171448566, + "grad_norm": 0.39388389881683084, + "learning_rate": 2.6098976510965788e-05, + "loss": 0.5268, + "step": 3584 + }, + { + "epoch": 0.5017494751574527, + "grad_norm": 0.5531371102734028, + "learning_rate": 2.6087654047127587e-05, + "loss": 0.5644, + "step": 3585 + }, + { + "epoch": 0.501889433170049, + "grad_norm": 0.40679492045140125, + "learning_rate": 2.6076331359765448e-05, + "loss": 0.5047, + "step": 3586 + }, + { + "epoch": 0.5020293911826452, + "grad_norm": 0.4165461319341358, + "learning_rate": 2.6065008451206296e-05, + "loss": 0.5296, + "step": 3587 + }, + { + "epoch": 0.5021693491952415, + "grad_norm": 0.39190580835869326, + "learning_rate": 2.60536853237771e-05, + "loss": 0.5346, + "step": 3588 + }, + { + "epoch": 0.5023093072078376, + "grad_norm": 0.3991014002270092, + "learning_rate": 2.6042361979804874e-05, + "loss": 0.5006, + "step": 3589 + }, + { + "epoch": 0.5024492652204339, + "grad_norm": 0.4371615498576483, + "learning_rate": 2.6031038421616683e-05, + "loss": 0.5738, + "step": 3590 + }, + { + "epoch": 0.5025892232330301, + "grad_norm": 0.42427696712386304, + "learning_rate": 2.6019714651539646e-05, + "loss": 0.5857, + "step": 3591 + }, + { + "epoch": 0.5027291812456263, + "grad_norm": 0.4155090445862282, + "learning_rate": 2.600839067190089e-05, + "loss": 0.584, + "step": 3592 + }, + { + "epoch": 0.5028691392582225, + "grad_norm": 0.4233864118860884, + "learning_rate": 2.5997066485027626e-05, + "loss": 0.5393, + "step": 3593 + }, + { + "epoch": 0.5030090972708188, + "grad_norm": 0.4286876156223743, + "learning_rate": 2.5985742093247078e-05, + "loss": 0.5787, + "step": 3594 + }, + { + "epoch": 0.503149055283415, + "grad_norm": 0.4103875069797372, + "learning_rate": 2.5974417498886532e-05, + "loss": 0.5617, + "step": 3595 + }, + { + "epoch": 0.5032890132960112, + "grad_norm": 0.43666777401439383, + "learning_rate": 2.59630927042733e-05, + "loss": 0.6215, + "step": 3596 + }, + { + "epoch": 0.5034289713086074, + "grad_norm": 0.3967739646837927, + "learning_rate": 2.5951767711734753e-05, + "loss": 0.5521, + "step": 3597 + }, + { + "epoch": 0.5035689293212037, + "grad_norm": 0.4134700097976131, + "learning_rate": 2.594044252359828e-05, + "loss": 0.5564, + "step": 3598 + }, + { + "epoch": 0.5037088873337998, + "grad_norm": 0.4193374425121163, + "learning_rate": 2.592911714219132e-05, + "loss": 0.5436, + "step": 3599 + }, + { + "epoch": 0.5038488453463961, + "grad_norm": 0.41979302117803585, + "learning_rate": 2.591779156984137e-05, + "loss": 0.5439, + "step": 3600 + }, + { + "epoch": 0.5039888033589923, + "grad_norm": 0.40958102976844213, + "learning_rate": 2.590646580887593e-05, + "loss": 0.5436, + "step": 3601 + }, + { + "epoch": 0.5041287613715886, + "grad_norm": 0.4201739272976801, + "learning_rate": 2.589513986162258e-05, + "loss": 0.5466, + "step": 3602 + }, + { + "epoch": 0.5042687193841847, + "grad_norm": 0.4258535420468519, + "learning_rate": 2.5883813730408894e-05, + "loss": 0.58, + "step": 3603 + }, + { + "epoch": 0.504408677396781, + "grad_norm": 0.41534287545104986, + "learning_rate": 2.587248741756253e-05, + "loss": 0.576, + "step": 3604 + }, + { + "epoch": 0.5045486354093772, + "grad_norm": 0.42287342650084503, + "learning_rate": 2.5861160925411138e-05, + "loss": 0.5832, + "step": 3605 + }, + { + "epoch": 0.5046885934219734, + "grad_norm": 0.4501273841496384, + "learning_rate": 2.5849834256282447e-05, + "loss": 0.6017, + "step": 3606 + }, + { + "epoch": 0.5048285514345696, + "grad_norm": 0.4274714205267487, + "learning_rate": 2.5838507412504187e-05, + "loss": 0.5484, + "step": 3607 + }, + { + "epoch": 0.5049685094471659, + "grad_norm": 0.3888114387545627, + "learning_rate": 2.5827180396404156e-05, + "loss": 0.5101, + "step": 3608 + }, + { + "epoch": 0.5051084674597621, + "grad_norm": 0.43140565124104524, + "learning_rate": 2.5815853210310152e-05, + "loss": 0.5742, + "step": 3609 + }, + { + "epoch": 0.5052484254723583, + "grad_norm": 0.41280804310487207, + "learning_rate": 2.580452585655004e-05, + "loss": 0.5187, + "step": 3610 + }, + { + "epoch": 0.5053883834849545, + "grad_norm": 0.4010119649911929, + "learning_rate": 2.5793198337451696e-05, + "loss": 0.5268, + "step": 3611 + }, + { + "epoch": 0.5055283414975508, + "grad_norm": 0.43498896699155226, + "learning_rate": 2.5781870655343045e-05, + "loss": 0.5975, + "step": 3612 + }, + { + "epoch": 0.5056682995101469, + "grad_norm": 0.3871046921614993, + "learning_rate": 2.5770542812552047e-05, + "loss": 0.5777, + "step": 3613 + }, + { + "epoch": 0.5058082575227432, + "grad_norm": 0.3836731432807253, + "learning_rate": 2.5759214811406678e-05, + "loss": 0.5591, + "step": 3614 + }, + { + "epoch": 0.5059482155353394, + "grad_norm": 0.4186045435948251, + "learning_rate": 2.5747886654234967e-05, + "loss": 0.5929, + "step": 3615 + }, + { + "epoch": 0.5060881735479357, + "grad_norm": 0.3989221255230432, + "learning_rate": 2.5736558343364953e-05, + "loss": 0.5484, + "step": 3616 + }, + { + "epoch": 0.5062281315605318, + "grad_norm": 0.3802059534422492, + "learning_rate": 2.5725229881124734e-05, + "loss": 0.5194, + "step": 3617 + }, + { + "epoch": 0.5063680895731281, + "grad_norm": 0.40770746004051617, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.5294, + "step": 3618 + }, + { + "epoch": 0.5065080475857243, + "grad_norm": 0.4179059282812127, + "learning_rate": 2.5702572511846134e-05, + "loss": 0.56, + "step": 3619 + }, + { + "epoch": 0.5066480055983205, + "grad_norm": 0.402518977593649, + "learning_rate": 2.569124360946407e-05, + "loss": 0.5518, + "step": 3620 + }, + { + "epoch": 0.5067879636109167, + "grad_norm": 0.40070397027204235, + "learning_rate": 2.5679914565024443e-05, + "loss": 0.5229, + "step": 3621 + }, + { + "epoch": 0.506927921623513, + "grad_norm": 0.41261781341428777, + "learning_rate": 2.5668585380855475e-05, + "loss": 0.5716, + "step": 3622 + }, + { + "epoch": 0.5070678796361092, + "grad_norm": 0.4086792886609616, + "learning_rate": 2.5657256059285417e-05, + "loss": 0.5469, + "step": 3623 + }, + { + "epoch": 0.5072078376487054, + "grad_norm": 0.41885939317076915, + "learning_rate": 2.564592660264258e-05, + "loss": 0.5896, + "step": 3624 + }, + { + "epoch": 0.5073477956613016, + "grad_norm": 0.3962896649868784, + "learning_rate": 2.563459701325526e-05, + "loss": 0.5581, + "step": 3625 + }, + { + "epoch": 0.5074877536738979, + "grad_norm": 0.39540667903653226, + "learning_rate": 2.5623267293451826e-05, + "loss": 0.5078, + "step": 3626 + }, + { + "epoch": 0.507627711686494, + "grad_norm": 0.42585105817986957, + "learning_rate": 2.5611937445560634e-05, + "loss": 0.5645, + "step": 3627 + }, + { + "epoch": 0.5077676696990903, + "grad_norm": 0.38842241786452814, + "learning_rate": 2.5600607471910088e-05, + "loss": 0.574, + "step": 3628 + }, + { + "epoch": 0.5079076277116865, + "grad_norm": 0.4605380093032065, + "learning_rate": 2.5589277374828613e-05, + "loss": 0.5743, + "step": 3629 + }, + { + "epoch": 0.5080475857242828, + "grad_norm": 0.4103790649353433, + "learning_rate": 2.5577947156644655e-05, + "loss": 0.5652, + "step": 3630 + }, + { + "epoch": 0.5081875437368789, + "grad_norm": 0.4037333605021007, + "learning_rate": 2.55666168196867e-05, + "loss": 0.5374, + "step": 3631 + }, + { + "epoch": 0.5083275017494752, + "grad_norm": 0.40487097389779486, + "learning_rate": 2.5555286366283237e-05, + "loss": 0.5631, + "step": 3632 + }, + { + "epoch": 0.5084674597620714, + "grad_norm": 0.4082664258736793, + "learning_rate": 2.5543955798762798e-05, + "loss": 0.5598, + "step": 3633 + }, + { + "epoch": 0.5086074177746676, + "grad_norm": 0.41864775722779846, + "learning_rate": 2.553262511945391e-05, + "loss": 0.5544, + "step": 3634 + }, + { + "epoch": 0.5087473757872638, + "grad_norm": 0.41778038341235235, + "learning_rate": 2.5521294330685165e-05, + "loss": 0.545, + "step": 3635 + }, + { + "epoch": 0.5088873337998601, + "grad_norm": 0.3965960636107479, + "learning_rate": 2.550996343478514e-05, + "loss": 0.5629, + "step": 3636 + }, + { + "epoch": 0.5090272918124563, + "grad_norm": 0.39741586166804516, + "learning_rate": 2.5498632434082452e-05, + "loss": 0.5261, + "step": 3637 + }, + { + "epoch": 0.5091672498250525, + "grad_norm": 0.4036662033874021, + "learning_rate": 2.5487301330905745e-05, + "loss": 0.5913, + "step": 3638 + }, + { + "epoch": 0.5093072078376487, + "grad_norm": 0.41863388691988607, + "learning_rate": 2.5475970127583666e-05, + "loss": 0.5442, + "step": 3639 + }, + { + "epoch": 0.509447165850245, + "grad_norm": 0.4135621919846456, + "learning_rate": 2.5464638826444904e-05, + "loss": 0.5563, + "step": 3640 + }, + { + "epoch": 0.5095871238628411, + "grad_norm": 0.4351748088714143, + "learning_rate": 2.545330742981814e-05, + "loss": 0.5551, + "step": 3641 + }, + { + "epoch": 0.5097270818754374, + "grad_norm": 0.4234047595420709, + "learning_rate": 2.544197594003211e-05, + "loss": 0.5876, + "step": 3642 + }, + { + "epoch": 0.5098670398880336, + "grad_norm": 0.4191593238313807, + "learning_rate": 2.5430644359415535e-05, + "loss": 0.5505, + "step": 3643 + }, + { + "epoch": 0.5100069979006299, + "grad_norm": 0.4163770493254072, + "learning_rate": 2.5419312690297176e-05, + "loss": 0.5234, + "step": 3644 + }, + { + "epoch": 0.510146955913226, + "grad_norm": 0.4326672484948109, + "learning_rate": 2.54079809350058e-05, + "loss": 0.5565, + "step": 3645 + }, + { + "epoch": 0.5102869139258223, + "grad_norm": 0.4293727352297737, + "learning_rate": 2.5396649095870202e-05, + "loss": 0.5651, + "step": 3646 + }, + { + "epoch": 0.5104268719384185, + "grad_norm": 0.3928814112239087, + "learning_rate": 2.5385317175219193e-05, + "loss": 0.5223, + "step": 3647 + }, + { + "epoch": 0.5105668299510147, + "grad_norm": 0.39498197465504736, + "learning_rate": 2.5373985175381594e-05, + "loss": 0.546, + "step": 3648 + }, + { + "epoch": 0.5107067879636109, + "grad_norm": 0.3934270157464578, + "learning_rate": 2.5362653098686245e-05, + "loss": 0.532, + "step": 3649 + }, + { + "epoch": 0.5108467459762072, + "grad_norm": 0.41869564796385766, + "learning_rate": 2.5351320947462005e-05, + "loss": 0.5768, + "step": 3650 + }, + { + "epoch": 0.5109867039888033, + "grad_norm": 0.42028182516820867, + "learning_rate": 2.533998872403775e-05, + "loss": 0.5859, + "step": 3651 + }, + { + "epoch": 0.5111266620013996, + "grad_norm": 0.39465753139334825, + "learning_rate": 2.532865643074236e-05, + "loss": 0.5363, + "step": 3652 + }, + { + "epoch": 0.5112666200139958, + "grad_norm": 0.3962177186218334, + "learning_rate": 2.531732406990474e-05, + "loss": 0.5417, + "step": 3653 + }, + { + "epoch": 0.5114065780265921, + "grad_norm": 0.4175265234416395, + "learning_rate": 2.5305991643853806e-05, + "loss": 0.542, + "step": 3654 + }, + { + "epoch": 0.5115465360391882, + "grad_norm": 0.4149054282303681, + "learning_rate": 2.5294659154918475e-05, + "loss": 0.5876, + "step": 3655 + }, + { + "epoch": 0.5116864940517845, + "grad_norm": 0.433229070178671, + "learning_rate": 2.528332660542771e-05, + "loss": 0.5516, + "step": 3656 + }, + { + "epoch": 0.5118264520643807, + "grad_norm": 0.4482742289905204, + "learning_rate": 2.527199399771044e-05, + "loss": 0.5785, + "step": 3657 + }, + { + "epoch": 0.5119664100769769, + "grad_norm": 0.41488401148674847, + "learning_rate": 2.526066133409566e-05, + "loss": 0.5107, + "step": 3658 + }, + { + "epoch": 0.5121063680895731, + "grad_norm": 0.3865352913695365, + "learning_rate": 2.5249328616912316e-05, + "loss": 0.4911, + "step": 3659 + }, + { + "epoch": 0.5122463261021694, + "grad_norm": 0.47638789508637014, + "learning_rate": 2.523799584848942e-05, + "loss": 0.5472, + "step": 3660 + }, + { + "epoch": 0.5123862841147656, + "grad_norm": 0.4067671688157654, + "learning_rate": 2.5226663031155954e-05, + "loss": 0.532, + "step": 3661 + }, + { + "epoch": 0.5125262421273618, + "grad_norm": 0.36222693303435777, + "learning_rate": 2.5215330167240947e-05, + "loss": 0.5085, + "step": 3662 + }, + { + "epoch": 0.512666200139958, + "grad_norm": 0.403199238488262, + "learning_rate": 2.52039972590734e-05, + "loss": 0.5497, + "step": 3663 + }, + { + "epoch": 0.5128061581525543, + "grad_norm": 0.4052356626415982, + "learning_rate": 2.5192664308982344e-05, + "loss": 0.5224, + "step": 3664 + }, + { + "epoch": 0.5129461161651504, + "grad_norm": 0.40489338303806344, + "learning_rate": 2.5181331319296825e-05, + "loss": 0.5621, + "step": 3665 + }, + { + "epoch": 0.5130860741777467, + "grad_norm": 0.40195392432260063, + "learning_rate": 2.516999829234587e-05, + "loss": 0.5221, + "step": 3666 + }, + { + "epoch": 0.5132260321903429, + "grad_norm": 0.44319445941617425, + "learning_rate": 2.515866523045855e-05, + "loss": 0.5607, + "step": 3667 + }, + { + "epoch": 0.5133659902029392, + "grad_norm": 0.42549718785093565, + "learning_rate": 2.514733213596391e-05, + "loss": 0.5678, + "step": 3668 + }, + { + "epoch": 0.5135059482155353, + "grad_norm": 0.4012452520954793, + "learning_rate": 2.5135999011191018e-05, + "loss": 0.574, + "step": 3669 + }, + { + "epoch": 0.5136459062281316, + "grad_norm": 0.3988377623943865, + "learning_rate": 2.5124665858468954e-05, + "loss": 0.5638, + "step": 3670 + }, + { + "epoch": 0.5137858642407278, + "grad_norm": 0.4068508250249883, + "learning_rate": 2.5113332680126795e-05, + "loss": 0.531, + "step": 3671 + }, + { + "epoch": 0.513925822253324, + "grad_norm": 0.40607709238953865, + "learning_rate": 2.510199947849361e-05, + "loss": 0.5555, + "step": 3672 + }, + { + "epoch": 0.5140657802659202, + "grad_norm": 0.40363285844425767, + "learning_rate": 2.50906662558985e-05, + "loss": 0.549, + "step": 3673 + }, + { + "epoch": 0.5142057382785165, + "grad_norm": 0.42540315547468743, + "learning_rate": 2.507933301467056e-05, + "loss": 0.562, + "step": 3674 + }, + { + "epoch": 0.5143456962911127, + "grad_norm": 0.41197452211967045, + "learning_rate": 2.5067999757138884e-05, + "loss": 0.5572, + "step": 3675 + }, + { + "epoch": 0.5144856543037089, + "grad_norm": 0.42328084101201563, + "learning_rate": 2.505666648563256e-05, + "loss": 0.5651, + "step": 3676 + }, + { + "epoch": 0.5146256123163051, + "grad_norm": 0.39617586204555455, + "learning_rate": 2.5045333202480698e-05, + "loss": 0.5505, + "step": 3677 + }, + { + "epoch": 0.5147655703289014, + "grad_norm": 0.40450493806111026, + "learning_rate": 2.5033999910012414e-05, + "loss": 0.5387, + "step": 3678 + }, + { + "epoch": 0.5149055283414975, + "grad_norm": 0.41125264241739257, + "learning_rate": 2.502266661055679e-05, + "loss": 0.5338, + "step": 3679 + }, + { + "epoch": 0.5150454863540938, + "grad_norm": 0.41606733051241485, + "learning_rate": 2.5011333306442953e-05, + "loss": 0.5472, + "step": 3680 + }, + { + "epoch": 0.51518544436669, + "grad_norm": 0.3940959299812521, + "learning_rate": 2.5e-05, + "loss": 0.5264, + "step": 3681 + }, + { + "epoch": 0.5153254023792863, + "grad_norm": 0.4160510111032202, + "learning_rate": 2.498866669355706e-05, + "loss": 0.5365, + "step": 3682 + }, + { + "epoch": 0.5154653603918824, + "grad_norm": 0.40780249685180014, + "learning_rate": 2.497733338944321e-05, + "loss": 0.5137, + "step": 3683 + }, + { + "epoch": 0.5156053184044787, + "grad_norm": 0.41930837922293446, + "learning_rate": 2.4966000089987596e-05, + "loss": 0.549, + "step": 3684 + }, + { + "epoch": 0.5157452764170749, + "grad_norm": 0.41175787454056617, + "learning_rate": 2.4954666797519305e-05, + "loss": 0.5445, + "step": 3685 + }, + { + "epoch": 0.515885234429671, + "grad_norm": 0.40112997035094405, + "learning_rate": 2.4943333514367448e-05, + "loss": 0.5323, + "step": 3686 + }, + { + "epoch": 0.5160251924422673, + "grad_norm": 0.41601050967799824, + "learning_rate": 2.4932000242861122e-05, + "loss": 0.5705, + "step": 3687 + }, + { + "epoch": 0.5161651504548636, + "grad_norm": 0.3938594765036972, + "learning_rate": 2.4920666985329443e-05, + "loss": 0.5147, + "step": 3688 + }, + { + "epoch": 0.5163051084674598, + "grad_norm": 0.4187589160579126, + "learning_rate": 2.4909333744101503e-05, + "loss": 0.5461, + "step": 3689 + }, + { + "epoch": 0.516445066480056, + "grad_norm": 0.4447365782109597, + "learning_rate": 2.48980005215064e-05, + "loss": 0.5388, + "step": 3690 + }, + { + "epoch": 0.5165850244926522, + "grad_norm": 0.40572901279081225, + "learning_rate": 2.4886667319873214e-05, + "loss": 0.5746, + "step": 3691 + }, + { + "epoch": 0.5167249825052485, + "grad_norm": 0.3937401384845004, + "learning_rate": 2.4875334141531052e-05, + "loss": 0.5514, + "step": 3692 + }, + { + "epoch": 0.5168649405178446, + "grad_norm": 0.4251082426934148, + "learning_rate": 2.486400098880899e-05, + "loss": 0.5531, + "step": 3693 + }, + { + "epoch": 0.5170048985304408, + "grad_norm": 0.4233026623262488, + "learning_rate": 2.4852667864036093e-05, + "loss": 0.5698, + "step": 3694 + }, + { + "epoch": 0.5171448565430371, + "grad_norm": 0.4184619207216555, + "learning_rate": 2.484133476954146e-05, + "loss": 0.5445, + "step": 3695 + }, + { + "epoch": 0.5172848145556334, + "grad_norm": 0.38758737521657194, + "learning_rate": 2.4830001707654134e-05, + "loss": 0.5383, + "step": 3696 + }, + { + "epoch": 0.5174247725682295, + "grad_norm": 0.41121949288648046, + "learning_rate": 2.4818668680703187e-05, + "loss": 0.5548, + "step": 3697 + }, + { + "epoch": 0.5175647305808257, + "grad_norm": 0.3912926343783076, + "learning_rate": 2.4807335691017662e-05, + "loss": 0.5993, + "step": 3698 + }, + { + "epoch": 0.517704688593422, + "grad_norm": 0.4352916580631968, + "learning_rate": 2.479600274092661e-05, + "loss": 0.5947, + "step": 3699 + }, + { + "epoch": 0.5178446466060181, + "grad_norm": 0.4117712323458744, + "learning_rate": 2.4784669832759065e-05, + "loss": 0.566, + "step": 3700 + }, + { + "epoch": 0.5179846046186144, + "grad_norm": 0.48543517712737916, + "learning_rate": 2.4773336968844045e-05, + "loss": 0.5217, + "step": 3701 + }, + { + "epoch": 0.5181245626312107, + "grad_norm": 0.4201945605915251, + "learning_rate": 2.4762004151510584e-05, + "loss": 0.5817, + "step": 3702 + }, + { + "epoch": 0.5182645206438069, + "grad_norm": 0.40028199720572183, + "learning_rate": 2.475067138308769e-05, + "loss": 0.5305, + "step": 3703 + }, + { + "epoch": 0.518404478656403, + "grad_norm": 0.45130189575312646, + "learning_rate": 2.4739338665904356e-05, + "loss": 0.5762, + "step": 3704 + }, + { + "epoch": 0.5185444366689993, + "grad_norm": 0.40520084504522474, + "learning_rate": 2.4728006002289555e-05, + "loss": 0.5618, + "step": 3705 + }, + { + "epoch": 0.5186843946815956, + "grad_norm": 0.39762527724883795, + "learning_rate": 2.4716673394572297e-05, + "loss": 0.5519, + "step": 3706 + }, + { + "epoch": 0.5188243526941917, + "grad_norm": 0.4303843474015456, + "learning_rate": 2.4705340845081527e-05, + "loss": 0.56, + "step": 3707 + }, + { + "epoch": 0.518964310706788, + "grad_norm": 0.4169723723648604, + "learning_rate": 2.4694008356146207e-05, + "loss": 0.5436, + "step": 3708 + }, + { + "epoch": 0.5191042687193842, + "grad_norm": 0.40055909799700246, + "learning_rate": 2.4682675930095263e-05, + "loss": 0.536, + "step": 3709 + }, + { + "epoch": 0.5192442267319805, + "grad_norm": 0.4109688655223356, + "learning_rate": 2.4671343569257647e-05, + "loss": 0.5324, + "step": 3710 + }, + { + "epoch": 0.5193841847445766, + "grad_norm": 0.4149175387680195, + "learning_rate": 2.4660011275962258e-05, + "loss": 0.5388, + "step": 3711 + }, + { + "epoch": 0.5195241427571728, + "grad_norm": 0.4264139741717757, + "learning_rate": 2.4648679052537994e-05, + "loss": 0.5625, + "step": 3712 + }, + { + "epoch": 0.5196641007697691, + "grad_norm": 0.4289357178860241, + "learning_rate": 2.4637346901313757e-05, + "loss": 0.5689, + "step": 3713 + }, + { + "epoch": 0.5198040587823652, + "grad_norm": 0.42933380182165265, + "learning_rate": 2.4626014824618415e-05, + "loss": 0.5751, + "step": 3714 + }, + { + "epoch": 0.5199440167949615, + "grad_norm": 0.41237458627558815, + "learning_rate": 2.461468282478082e-05, + "loss": 0.5708, + "step": 3715 + }, + { + "epoch": 0.5200839748075577, + "grad_norm": 0.40783837110167326, + "learning_rate": 2.46033509041298e-05, + "loss": 0.5163, + "step": 3716 + }, + { + "epoch": 0.520223932820154, + "grad_norm": 0.3934394614065878, + "learning_rate": 2.4592019064994207e-05, + "loss": 0.5123, + "step": 3717 + }, + { + "epoch": 0.5203638908327501, + "grad_norm": 0.42393320677683094, + "learning_rate": 2.4580687309702836e-05, + "loss": 0.5459, + "step": 3718 + }, + { + "epoch": 0.5205038488453464, + "grad_norm": 0.4201737718371511, + "learning_rate": 2.456935564058447e-05, + "loss": 0.5484, + "step": 3719 + }, + { + "epoch": 0.5206438068579426, + "grad_norm": 0.4037513584821948, + "learning_rate": 2.45580240599679e-05, + "loss": 0.539, + "step": 3720 + }, + { + "epoch": 0.5207837648705388, + "grad_norm": 0.40384281587615306, + "learning_rate": 2.4546692570181863e-05, + "loss": 0.5594, + "step": 3721 + }, + { + "epoch": 0.520923722883135, + "grad_norm": 0.39646345117822984, + "learning_rate": 2.453536117355511e-05, + "loss": 0.5344, + "step": 3722 + }, + { + "epoch": 0.5210636808957313, + "grad_norm": 0.402351444419845, + "learning_rate": 2.4524029872416333e-05, + "loss": 0.548, + "step": 3723 + }, + { + "epoch": 0.5212036389083275, + "grad_norm": 0.4310017935995615, + "learning_rate": 2.451269866909426e-05, + "loss": 0.5654, + "step": 3724 + }, + { + "epoch": 0.5213435969209237, + "grad_norm": 0.39852961133599507, + "learning_rate": 2.4501367565917554e-05, + "loss": 0.5248, + "step": 3725 + }, + { + "epoch": 0.5214835549335199, + "grad_norm": 0.3978940832017032, + "learning_rate": 2.4490036565214873e-05, + "loss": 0.5423, + "step": 3726 + }, + { + "epoch": 0.5216235129461162, + "grad_norm": 0.4032221188907521, + "learning_rate": 2.447870566931484e-05, + "loss": 0.5625, + "step": 3727 + }, + { + "epoch": 0.5217634709587123, + "grad_norm": 0.40124015142391184, + "learning_rate": 2.4467374880546095e-05, + "loss": 0.5173, + "step": 3728 + }, + { + "epoch": 0.5219034289713086, + "grad_norm": 0.419761697749084, + "learning_rate": 2.4456044201237215e-05, + "loss": 0.577, + "step": 3729 + }, + { + "epoch": 0.5220433869839048, + "grad_norm": 0.4068938084974087, + "learning_rate": 2.4444713633716765e-05, + "loss": 0.5089, + "step": 3730 + }, + { + "epoch": 0.5221833449965011, + "grad_norm": 0.4372637791239717, + "learning_rate": 2.4433383180313303e-05, + "loss": 0.6125, + "step": 3731 + }, + { + "epoch": 0.5223233030090972, + "grad_norm": 0.4211367854727307, + "learning_rate": 2.4422052843355347e-05, + "loss": 0.5837, + "step": 3732 + }, + { + "epoch": 0.5224632610216935, + "grad_norm": 0.4138228399759103, + "learning_rate": 2.4410722625171396e-05, + "loss": 0.5904, + "step": 3733 + }, + { + "epoch": 0.5226032190342897, + "grad_norm": 0.43035321865290377, + "learning_rate": 2.4399392528089915e-05, + "loss": 0.5543, + "step": 3734 + }, + { + "epoch": 0.5227431770468859, + "grad_norm": 0.393376380647825, + "learning_rate": 2.4388062554439372e-05, + "loss": 0.5625, + "step": 3735 + }, + { + "epoch": 0.5228831350594821, + "grad_norm": 0.4057572933128106, + "learning_rate": 2.4376732706548183e-05, + "loss": 0.5386, + "step": 3736 + }, + { + "epoch": 0.5230230930720784, + "grad_norm": 0.4127940511172469, + "learning_rate": 2.4365402986744738e-05, + "loss": 0.5424, + "step": 3737 + }, + { + "epoch": 0.5231630510846746, + "grad_norm": 0.43512898556014096, + "learning_rate": 2.4354073397357427e-05, + "loss": 0.5886, + "step": 3738 + }, + { + "epoch": 0.5233030090972708, + "grad_norm": 0.429632898236204, + "learning_rate": 2.434274394071459e-05, + "loss": 0.5897, + "step": 3739 + }, + { + "epoch": 0.523442967109867, + "grad_norm": 0.5024683991705309, + "learning_rate": 2.4331414619144537e-05, + "loss": 0.5389, + "step": 3740 + }, + { + "epoch": 0.5235829251224633, + "grad_norm": 0.426305756774051, + "learning_rate": 2.432008543497556e-05, + "loss": 0.5319, + "step": 3741 + }, + { + "epoch": 0.5237228831350594, + "grad_norm": 0.4067282442774484, + "learning_rate": 2.430875639053593e-05, + "loss": 0.5364, + "step": 3742 + }, + { + "epoch": 0.5238628411476557, + "grad_norm": 0.40523378137952, + "learning_rate": 2.4297427488153872e-05, + "loss": 0.5512, + "step": 3743 + }, + { + "epoch": 0.5240027991602519, + "grad_norm": 0.42309697388292367, + "learning_rate": 2.42860987301576e-05, + "loss": 0.5832, + "step": 3744 + }, + { + "epoch": 0.5241427571728482, + "grad_norm": 0.39749349792214256, + "learning_rate": 2.427477011887527e-05, + "loss": 0.5453, + "step": 3745 + }, + { + "epoch": 0.5242827151854443, + "grad_norm": 0.4128499134441993, + "learning_rate": 2.4263441656635053e-05, + "loss": 0.5614, + "step": 3746 + }, + { + "epoch": 0.5244226731980406, + "grad_norm": 0.4307471936718753, + "learning_rate": 2.4252113345765046e-05, + "loss": 0.5426, + "step": 3747 + }, + { + "epoch": 0.5245626312106368, + "grad_norm": 0.40120347811779566, + "learning_rate": 2.4240785188593325e-05, + "loss": 0.5562, + "step": 3748 + }, + { + "epoch": 0.524702589223233, + "grad_norm": 0.40709801793493633, + "learning_rate": 2.4229457187447956e-05, + "loss": 0.5326, + "step": 3749 + }, + { + "epoch": 0.5248425472358292, + "grad_norm": 0.38783041950630354, + "learning_rate": 2.4218129344656958e-05, + "loss": 0.5801, + "step": 3750 + }, + { + "epoch": 0.5249825052484255, + "grad_norm": 0.41702390350925966, + "learning_rate": 2.4206801662548314e-05, + "loss": 0.543, + "step": 3751 + }, + { + "epoch": 0.5251224632610217, + "grad_norm": 0.40842859770281464, + "learning_rate": 2.419547414344997e-05, + "loss": 0.5719, + "step": 3752 + }, + { + "epoch": 0.5252624212736179, + "grad_norm": 0.40966706938584196, + "learning_rate": 2.418414678968985e-05, + "loss": 0.5679, + "step": 3753 + }, + { + "epoch": 0.5254023792862141, + "grad_norm": 0.4217418453013296, + "learning_rate": 2.4172819603595853e-05, + "loss": 0.5721, + "step": 3754 + }, + { + "epoch": 0.5255423372988104, + "grad_norm": 0.41200690319133665, + "learning_rate": 2.4161492587495812e-05, + "loss": 0.5681, + "step": 3755 + }, + { + "epoch": 0.5256822953114065, + "grad_norm": 0.4307870892864373, + "learning_rate": 2.4150165743717556e-05, + "loss": 0.5991, + "step": 3756 + }, + { + "epoch": 0.5258222533240028, + "grad_norm": 0.3964187463617023, + "learning_rate": 2.4138839074588868e-05, + "loss": 0.5381, + "step": 3757 + }, + { + "epoch": 0.525962211336599, + "grad_norm": 0.41691171273978006, + "learning_rate": 2.4127512582437485e-05, + "loss": 0.5336, + "step": 3758 + }, + { + "epoch": 0.5261021693491953, + "grad_norm": 0.4243161504581818, + "learning_rate": 2.4116186269591105e-05, + "loss": 0.5466, + "step": 3759 + }, + { + "epoch": 0.5262421273617914, + "grad_norm": 0.41746283237341214, + "learning_rate": 2.410486013837743e-05, + "loss": 0.5292, + "step": 3760 + }, + { + "epoch": 0.5263820853743877, + "grad_norm": 0.4258425078942338, + "learning_rate": 2.4093534191124072e-05, + "loss": 0.597, + "step": 3761 + }, + { + "epoch": 0.5265220433869839, + "grad_norm": 0.4082388057932214, + "learning_rate": 2.408220843015864e-05, + "loss": 0.5604, + "step": 3762 + }, + { + "epoch": 0.5266620013995801, + "grad_norm": 0.40439792503437516, + "learning_rate": 2.4070882857808678e-05, + "loss": 0.5599, + "step": 3763 + }, + { + "epoch": 0.5268019594121763, + "grad_norm": 0.403671819972436, + "learning_rate": 2.4059557476401726e-05, + "loss": 0.5701, + "step": 3764 + }, + { + "epoch": 0.5269419174247726, + "grad_norm": 0.4115634599388499, + "learning_rate": 2.4048232288265253e-05, + "loss": 0.5613, + "step": 3765 + }, + { + "epoch": 0.5270818754373688, + "grad_norm": 0.4196728696799674, + "learning_rate": 2.40369072957267e-05, + "loss": 0.5456, + "step": 3766 + }, + { + "epoch": 0.527221833449965, + "grad_norm": 0.45632616155283706, + "learning_rate": 2.4025582501113474e-05, + "loss": 0.5986, + "step": 3767 + }, + { + "epoch": 0.5273617914625612, + "grad_norm": 0.41907920921841135, + "learning_rate": 2.4014257906752928e-05, + "loss": 0.5691, + "step": 3768 + }, + { + "epoch": 0.5275017494751575, + "grad_norm": 0.4593046177587895, + "learning_rate": 2.4002933514972383e-05, + "loss": 0.5491, + "step": 3769 + }, + { + "epoch": 0.5276417074877536, + "grad_norm": 0.4141383704701082, + "learning_rate": 2.399160932809911e-05, + "loss": 0.5405, + "step": 3770 + }, + { + "epoch": 0.5277816655003499, + "grad_norm": 0.40874131996221996, + "learning_rate": 2.3980285348460363e-05, + "loss": 0.5549, + "step": 3771 + }, + { + "epoch": 0.5279216235129461, + "grad_norm": 0.42334424503896645, + "learning_rate": 2.3968961578383323e-05, + "loss": 0.5596, + "step": 3772 + }, + { + "epoch": 0.5280615815255424, + "grad_norm": 0.40408381518564374, + "learning_rate": 2.395763802019513e-05, + "loss": 0.5541, + "step": 3773 + }, + { + "epoch": 0.5282015395381385, + "grad_norm": 0.3892924948143325, + "learning_rate": 2.3946314676222905e-05, + "loss": 0.5561, + "step": 3774 + }, + { + "epoch": 0.5283414975507348, + "grad_norm": 0.38424570502320887, + "learning_rate": 2.393499154879371e-05, + "loss": 0.5513, + "step": 3775 + }, + { + "epoch": 0.528481455563331, + "grad_norm": 0.40542835424755946, + "learning_rate": 2.3923668640234558e-05, + "loss": 0.5511, + "step": 3776 + }, + { + "epoch": 0.5286214135759272, + "grad_norm": 0.43305443591622345, + "learning_rate": 2.3912345952872416e-05, + "loss": 0.5534, + "step": 3777 + }, + { + "epoch": 0.5287613715885234, + "grad_norm": 0.39079289404660855, + "learning_rate": 2.3901023489034218e-05, + "loss": 0.5693, + "step": 3778 + }, + { + "epoch": 0.5289013296011197, + "grad_norm": 0.4993326382660523, + "learning_rate": 2.388970125104685e-05, + "loss": 0.5665, + "step": 3779 + }, + { + "epoch": 0.5290412876137159, + "grad_norm": 0.4013031423647386, + "learning_rate": 2.3878379241237136e-05, + "loss": 0.5059, + "step": 3780 + }, + { + "epoch": 0.5291812456263121, + "grad_norm": 0.4122753651945142, + "learning_rate": 2.3867057461931857e-05, + "loss": 0.5643, + "step": 3781 + }, + { + "epoch": 0.5293212036389083, + "grad_norm": 0.3958098061908628, + "learning_rate": 2.3855735915457778e-05, + "loss": 0.5783, + "step": 3782 + }, + { + "epoch": 0.5294611616515046, + "grad_norm": 0.40430666249800307, + "learning_rate": 2.384441460414158e-05, + "loss": 0.532, + "step": 3783 + }, + { + "epoch": 0.5296011196641007, + "grad_norm": 0.43159094392859726, + "learning_rate": 2.383309353030989e-05, + "loss": 0.5648, + "step": 3784 + }, + { + "epoch": 0.529741077676697, + "grad_norm": 0.4278430956102745, + "learning_rate": 2.3821772696289336e-05, + "loss": 0.5525, + "step": 3785 + }, + { + "epoch": 0.5298810356892932, + "grad_norm": 0.42297623639530246, + "learning_rate": 2.3810452104406444e-05, + "loss": 0.5557, + "step": 3786 + }, + { + "epoch": 0.5300209937018895, + "grad_norm": 0.4194124443087706, + "learning_rate": 2.3799131756987716e-05, + "loss": 0.5805, + "step": 3787 + }, + { + "epoch": 0.5301609517144856, + "grad_norm": 0.42223996989148205, + "learning_rate": 2.378781165635959e-05, + "loss": 0.553, + "step": 3788 + }, + { + "epoch": 0.5303009097270819, + "grad_norm": 0.4051480666509419, + "learning_rate": 2.3776491804848474e-05, + "loss": 0.5439, + "step": 3789 + }, + { + "epoch": 0.5304408677396781, + "grad_norm": 0.6913758982209272, + "learning_rate": 2.3765172204780705e-05, + "loss": 0.5037, + "step": 3790 + }, + { + "epoch": 0.5305808257522743, + "grad_norm": 0.39883852712442486, + "learning_rate": 2.375385285848257e-05, + "loss": 0.5373, + "step": 3791 + }, + { + "epoch": 0.5307207837648705, + "grad_norm": 0.41473538961504164, + "learning_rate": 2.3742533768280322e-05, + "loss": 0.558, + "step": 3792 + }, + { + "epoch": 0.5308607417774668, + "grad_norm": 0.4338240458587132, + "learning_rate": 2.3731214936500147e-05, + "loss": 0.5703, + "step": 3793 + }, + { + "epoch": 0.531000699790063, + "grad_norm": 0.44632810155747676, + "learning_rate": 2.3719896365468176e-05, + "loss": 0.5428, + "step": 3794 + }, + { + "epoch": 0.5311406578026592, + "grad_norm": 0.4693366131147214, + "learning_rate": 2.370857805751048e-05, + "loss": 0.5601, + "step": 3795 + }, + { + "epoch": 0.5312806158152554, + "grad_norm": 0.4328684886571571, + "learning_rate": 2.3697260014953108e-05, + "loss": 0.5984, + "step": 3796 + }, + { + "epoch": 0.5314205738278517, + "grad_norm": 0.4150511950700261, + "learning_rate": 2.3685942240122017e-05, + "loss": 0.5578, + "step": 3797 + }, + { + "epoch": 0.5315605318404478, + "grad_norm": 0.4140437625288529, + "learning_rate": 2.3674624735343133e-05, + "loss": 0.57, + "step": 3798 + }, + { + "epoch": 0.5317004898530441, + "grad_norm": 0.4075001120603276, + "learning_rate": 2.3663307502942306e-05, + "loss": 0.5392, + "step": 3799 + }, + { + "epoch": 0.5318404478656403, + "grad_norm": 0.4132932952092581, + "learning_rate": 2.3651990545245356e-05, + "loss": 0.5826, + "step": 3800 + }, + { + "epoch": 0.5319804058782366, + "grad_norm": 0.4260946689124058, + "learning_rate": 2.3640673864578023e-05, + "loss": 0.5291, + "step": 3801 + }, + { + "epoch": 0.5321203638908327, + "grad_norm": 0.42352605916443814, + "learning_rate": 2.3629357463265995e-05, + "loss": 0.5619, + "step": 3802 + }, + { + "epoch": 0.532260321903429, + "grad_norm": 0.4071157003238009, + "learning_rate": 2.361804134363493e-05, + "loss": 0.5693, + "step": 3803 + }, + { + "epoch": 0.5324002799160252, + "grad_norm": 0.40527112461883336, + "learning_rate": 2.360672550801039e-05, + "loss": 0.5573, + "step": 3804 + }, + { + "epoch": 0.5325402379286214, + "grad_norm": 0.4185227436712144, + "learning_rate": 2.359540995871789e-05, + "loss": 0.5519, + "step": 3805 + }, + { + "epoch": 0.5326801959412176, + "grad_norm": 0.40474881222817827, + "learning_rate": 2.3584094698082888e-05, + "loss": 0.5416, + "step": 3806 + }, + { + "epoch": 0.5328201539538139, + "grad_norm": 0.4035189283812099, + "learning_rate": 2.35727797284308e-05, + "loss": 0.5386, + "step": 3807 + }, + { + "epoch": 0.53296011196641, + "grad_norm": 0.4126483303377044, + "learning_rate": 2.3561465052086962e-05, + "loss": 0.5513, + "step": 3808 + }, + { + "epoch": 0.5331000699790063, + "grad_norm": 0.41908332941234616, + "learning_rate": 2.3550150671376644e-05, + "loss": 0.598, + "step": 3809 + }, + { + "epoch": 0.5332400279916025, + "grad_norm": 0.42507750506666686, + "learning_rate": 2.353883658862508e-05, + "loss": 0.5289, + "step": 3810 + }, + { + "epoch": 0.5333799860041988, + "grad_norm": 0.3908992343447489, + "learning_rate": 2.3527522806157422e-05, + "loss": 0.5955, + "step": 3811 + }, + { + "epoch": 0.5335199440167949, + "grad_norm": 0.4088150381618844, + "learning_rate": 2.351620932629877e-05, + "loss": 0.5434, + "step": 3812 + }, + { + "epoch": 0.5336599020293912, + "grad_norm": 0.4313685833974606, + "learning_rate": 2.3504896151374144e-05, + "loss": 0.5751, + "step": 3813 + }, + { + "epoch": 0.5337998600419874, + "grad_norm": 0.4274490027569291, + "learning_rate": 2.349358328370854e-05, + "loss": 0.5459, + "step": 3814 + }, + { + "epoch": 0.5339398180545836, + "grad_norm": 0.4224342493673025, + "learning_rate": 2.3482270725626856e-05, + "loss": 0.5372, + "step": 3815 + }, + { + "epoch": 0.5340797760671798, + "grad_norm": 0.508038847256669, + "learning_rate": 2.3470958479453938e-05, + "loss": 0.5545, + "step": 3816 + }, + { + "epoch": 0.5342197340797761, + "grad_norm": 0.40285477548710635, + "learning_rate": 2.345964654751456e-05, + "loss": 0.5348, + "step": 3817 + }, + { + "epoch": 0.5343596920923723, + "grad_norm": 0.4204468078785021, + "learning_rate": 2.3448334932133446e-05, + "loss": 0.5873, + "step": 3818 + }, + { + "epoch": 0.5344996501049685, + "grad_norm": 0.3926665806611564, + "learning_rate": 2.3437023635635254e-05, + "loss": 0.5856, + "step": 3819 + }, + { + "epoch": 0.5346396081175647, + "grad_norm": 0.4281667973965022, + "learning_rate": 2.342571266034456e-05, + "loss": 0.5829, + "step": 3820 + }, + { + "epoch": 0.534779566130161, + "grad_norm": 0.4235798408466391, + "learning_rate": 2.3414402008585888e-05, + "loss": 0.5345, + "step": 3821 + }, + { + "epoch": 0.5349195241427571, + "grad_norm": 0.41560783094264714, + "learning_rate": 2.34030916826837e-05, + "loss": 0.5745, + "step": 3822 + }, + { + "epoch": 0.5350594821553534, + "grad_norm": 0.4120864111860111, + "learning_rate": 2.3391781684962368e-05, + "loss": 0.5304, + "step": 3823 + }, + { + "epoch": 0.5351994401679496, + "grad_norm": 0.3998366908738693, + "learning_rate": 2.3380472017746202e-05, + "loss": 0.5579, + "step": 3824 + }, + { + "epoch": 0.5353393981805459, + "grad_norm": 0.4078921865892698, + "learning_rate": 2.3369162683359486e-05, + "loss": 0.5532, + "step": 3825 + }, + { + "epoch": 0.535479356193142, + "grad_norm": 0.42102589317173017, + "learning_rate": 2.3357853684126384e-05, + "loss": 0.5367, + "step": 3826 + }, + { + "epoch": 0.5356193142057383, + "grad_norm": 0.4303168029475006, + "learning_rate": 2.3346545022371015e-05, + "loss": 0.5275, + "step": 3827 + }, + { + "epoch": 0.5357592722183345, + "grad_norm": 0.4084962687065381, + "learning_rate": 2.3335236700417404e-05, + "loss": 0.5643, + "step": 3828 + }, + { + "epoch": 0.5358992302309307, + "grad_norm": 0.42704269523521693, + "learning_rate": 2.3323928720589555e-05, + "loss": 0.5145, + "step": 3829 + }, + { + "epoch": 0.5360391882435269, + "grad_norm": 0.4251581534304365, + "learning_rate": 2.331262108521136e-05, + "loss": 0.5768, + "step": 3830 + }, + { + "epoch": 0.5361791462561232, + "grad_norm": 0.4296437560622346, + "learning_rate": 2.3301313796606652e-05, + "loss": 0.5804, + "step": 3831 + }, + { + "epoch": 0.5363191042687194, + "grad_norm": 0.4160558480550044, + "learning_rate": 2.3290006857099194e-05, + "loss": 0.5618, + "step": 3832 + }, + { + "epoch": 0.5364590622813156, + "grad_norm": 0.43852307849271205, + "learning_rate": 2.327870026901268e-05, + "loss": 0.582, + "step": 3833 + }, + { + "epoch": 0.5365990202939118, + "grad_norm": 0.42358328346140833, + "learning_rate": 2.326739403467073e-05, + "loss": 0.5265, + "step": 3834 + }, + { + "epoch": 0.5367389783065081, + "grad_norm": 0.4344046385915696, + "learning_rate": 2.3256088156396868e-05, + "loss": 0.5855, + "step": 3835 + }, + { + "epoch": 0.5368789363191042, + "grad_norm": 0.4318461945295688, + "learning_rate": 2.3244782636514596e-05, + "loss": 0.5665, + "step": 3836 + }, + { + "epoch": 0.5370188943317005, + "grad_norm": 0.4129881442304272, + "learning_rate": 2.3233477477347305e-05, + "loss": 0.5313, + "step": 3837 + }, + { + "epoch": 0.5371588523442967, + "grad_norm": 0.4277048919308099, + "learning_rate": 2.3222172681218302e-05, + "loss": 0.5632, + "step": 3838 + }, + { + "epoch": 0.537298810356893, + "grad_norm": 0.4200455327121287, + "learning_rate": 2.3210868250450865e-05, + "loss": 0.5406, + "step": 3839 + }, + { + "epoch": 0.5374387683694891, + "grad_norm": 0.4205148102378079, + "learning_rate": 2.3199564187368156e-05, + "loss": 0.5609, + "step": 3840 + }, + { + "epoch": 0.5375787263820854, + "grad_norm": 0.41158756784707023, + "learning_rate": 2.3188260494293273e-05, + "loss": 0.5872, + "step": 3841 + }, + { + "epoch": 0.5377186843946816, + "grad_norm": 0.43038344004576323, + "learning_rate": 2.3176957173549235e-05, + "loss": 0.5768, + "step": 3842 + }, + { + "epoch": 0.5378586424072778, + "grad_norm": 0.4358694072281102, + "learning_rate": 2.3165654227459004e-05, + "loss": 0.5357, + "step": 3843 + }, + { + "epoch": 0.537998600419874, + "grad_norm": 0.4343491080187233, + "learning_rate": 2.3154351658345437e-05, + "loss": 0.5608, + "step": 3844 + }, + { + "epoch": 0.5381385584324703, + "grad_norm": 0.42585193830336604, + "learning_rate": 2.3143049468531334e-05, + "loss": 0.6127, + "step": 3845 + }, + { + "epoch": 0.5382785164450665, + "grad_norm": 0.4168594356306299, + "learning_rate": 2.3131747660339394e-05, + "loss": 0.5421, + "step": 3846 + }, + { + "epoch": 0.5384184744576627, + "grad_norm": 0.4143482797388666, + "learning_rate": 2.3120446236092276e-05, + "loss": 0.5634, + "step": 3847 + }, + { + "epoch": 0.5385584324702589, + "grad_norm": 0.42347488623228974, + "learning_rate": 2.310914519811253e-05, + "loss": 0.553, + "step": 3848 + }, + { + "epoch": 0.5386983904828552, + "grad_norm": 0.4232797867273413, + "learning_rate": 2.309784454872262e-05, + "loss": 0.597, + "step": 3849 + }, + { + "epoch": 0.5388383484954513, + "grad_norm": 0.4226580404199005, + "learning_rate": 2.3086544290244967e-05, + "loss": 0.5418, + "step": 3850 + }, + { + "epoch": 0.5389783065080476, + "grad_norm": 0.43647901356290086, + "learning_rate": 2.3075244425001874e-05, + "loss": 0.6088, + "step": 3851 + }, + { + "epoch": 0.5391182645206438, + "grad_norm": 0.4110315073763923, + "learning_rate": 2.3063944955315584e-05, + "loss": 0.54, + "step": 3852 + }, + { + "epoch": 0.5392582225332401, + "grad_norm": 0.370641365809909, + "learning_rate": 2.3052645883508242e-05, + "loss": 0.5216, + "step": 3853 + }, + { + "epoch": 0.5393981805458362, + "grad_norm": 0.4035453298163194, + "learning_rate": 2.3041347211901935e-05, + "loss": 0.559, + "step": 3854 + }, + { + "epoch": 0.5395381385584325, + "grad_norm": 0.426639073461407, + "learning_rate": 2.303004894281865e-05, + "loss": 0.5409, + "step": 3855 + }, + { + "epoch": 0.5396780965710287, + "grad_norm": 0.42512057290523336, + "learning_rate": 2.3018751078580287e-05, + "loss": 0.607, + "step": 3856 + }, + { + "epoch": 0.5398180545836249, + "grad_norm": 0.42427900709447586, + "learning_rate": 2.300745362150869e-05, + "loss": 0.5736, + "step": 3857 + }, + { + "epoch": 0.5399580125962211, + "grad_norm": 0.4308088797406061, + "learning_rate": 2.299615657392559e-05, + "loss": 0.5737, + "step": 3858 + }, + { + "epoch": 0.5400979706088174, + "grad_norm": 0.39893806446075136, + "learning_rate": 2.2984859938152644e-05, + "loss": 0.5167, + "step": 3859 + }, + { + "epoch": 0.5402379286214136, + "grad_norm": 0.4227311792800813, + "learning_rate": 2.297356371651142e-05, + "loss": 0.5559, + "step": 3860 + }, + { + "epoch": 0.5403778866340098, + "grad_norm": 0.4307602196718643, + "learning_rate": 2.296226791132342e-05, + "loss": 0.567, + "step": 3861 + }, + { + "epoch": 0.540517844646606, + "grad_norm": 0.4048870602226458, + "learning_rate": 2.2950972524910045e-05, + "loss": 0.5474, + "step": 3862 + }, + { + "epoch": 0.5406578026592023, + "grad_norm": 0.4130105755887349, + "learning_rate": 2.2939677559592605e-05, + "loss": 0.5438, + "step": 3863 + }, + { + "epoch": 0.5407977606717984, + "grad_norm": 0.4388089801883774, + "learning_rate": 2.2928383017692322e-05, + "loss": 0.562, + "step": 3864 + }, + { + "epoch": 0.5409377186843947, + "grad_norm": 0.4591220588088252, + "learning_rate": 2.2917088901530358e-05, + "loss": 0.5396, + "step": 3865 + }, + { + "epoch": 0.5410776766969909, + "grad_norm": 0.42951530189135795, + "learning_rate": 2.290579521342776e-05, + "loss": 0.5726, + "step": 3866 + }, + { + "epoch": 0.5412176347095872, + "grad_norm": 0.40253890454723895, + "learning_rate": 2.2894501955705477e-05, + "loss": 0.5234, + "step": 3867 + }, + { + "epoch": 0.5413575927221833, + "grad_norm": 0.42549136435980095, + "learning_rate": 2.288320913068442e-05, + "loss": 0.5505, + "step": 3868 + }, + { + "epoch": 0.5414975507347796, + "grad_norm": 0.4060031912669076, + "learning_rate": 2.2871916740685366e-05, + "loss": 0.5863, + "step": 3869 + }, + { + "epoch": 0.5416375087473758, + "grad_norm": 0.4242326527971933, + "learning_rate": 2.2860624788029013e-05, + "loss": 0.5408, + "step": 3870 + }, + { + "epoch": 0.541777466759972, + "grad_norm": 0.43284739833626334, + "learning_rate": 2.2849333275035964e-05, + "loss": 0.5781, + "step": 3871 + }, + { + "epoch": 0.5419174247725682, + "grad_norm": 0.38138784670752485, + "learning_rate": 2.283804220402676e-05, + "loss": 0.5374, + "step": 3872 + }, + { + "epoch": 0.5420573827851645, + "grad_norm": 0.42596112184398915, + "learning_rate": 2.2826751577321813e-05, + "loss": 0.5407, + "step": 3873 + }, + { + "epoch": 0.5421973407977607, + "grad_norm": 0.4016705521461287, + "learning_rate": 2.2815461397241466e-05, + "loss": 0.543, + "step": 3874 + }, + { + "epoch": 0.5423372988103569, + "grad_norm": 0.41007081598495276, + "learning_rate": 2.2804171666105976e-05, + "loss": 0.5606, + "step": 3875 + }, + { + "epoch": 0.5424772568229531, + "grad_norm": 0.41478676365527245, + "learning_rate": 2.2792882386235485e-05, + "loss": 0.5264, + "step": 3876 + }, + { + "epoch": 0.5426172148355494, + "grad_norm": 0.4120034933406958, + "learning_rate": 2.2781593559950052e-05, + "loss": 0.5392, + "step": 3877 + }, + { + "epoch": 0.5427571728481455, + "grad_norm": 0.4154088448683998, + "learning_rate": 2.277030518956965e-05, + "loss": 0.5667, + "step": 3878 + }, + { + "epoch": 0.5428971308607418, + "grad_norm": 0.4099026076541546, + "learning_rate": 2.2759017277414166e-05, + "loss": 0.5248, + "step": 3879 + }, + { + "epoch": 0.543037088873338, + "grad_norm": 0.44769525609475164, + "learning_rate": 2.2747729825803366e-05, + "loss": 0.5772, + "step": 3880 + }, + { + "epoch": 0.5431770468859343, + "grad_norm": 0.4131464054022151, + "learning_rate": 2.273644283705694e-05, + "loss": 0.5797, + "step": 3881 + }, + { + "epoch": 0.5433170048985304, + "grad_norm": 0.4261856744618942, + "learning_rate": 2.2725156313494466e-05, + "loss": 0.549, + "step": 3882 + }, + { + "epoch": 0.5434569629111267, + "grad_norm": 0.4207414434352504, + "learning_rate": 2.271387025743546e-05, + "loss": 0.5738, + "step": 3883 + }, + { + "epoch": 0.5435969209237229, + "grad_norm": 0.4087475690432694, + "learning_rate": 2.2702584671199317e-05, + "loss": 0.5674, + "step": 3884 + }, + { + "epoch": 0.5437368789363191, + "grad_norm": 0.40838344825260353, + "learning_rate": 2.2691299557105328e-05, + "loss": 0.5503, + "step": 3885 + }, + { + "epoch": 0.5438768369489153, + "grad_norm": 0.42246887497450447, + "learning_rate": 2.268001491747271e-05, + "loss": 0.5276, + "step": 3886 + }, + { + "epoch": 0.5440167949615116, + "grad_norm": 0.4325429890817475, + "learning_rate": 2.266873075462056e-05, + "loss": 0.5538, + "step": 3887 + }, + { + "epoch": 0.5441567529741078, + "grad_norm": 0.420910542262214, + "learning_rate": 2.2657447070867902e-05, + "loss": 0.5475, + "step": 3888 + }, + { + "epoch": 0.544296710986704, + "grad_norm": 0.422876814725259, + "learning_rate": 2.264616386853363e-05, + "loss": 0.5477, + "step": 3889 + }, + { + "epoch": 0.5444366689993002, + "grad_norm": 0.4143598457765582, + "learning_rate": 2.2634881149936575e-05, + "loss": 0.5419, + "step": 3890 + }, + { + "epoch": 0.5445766270118965, + "grad_norm": 0.4105061536286951, + "learning_rate": 2.2623598917395438e-05, + "loss": 0.5475, + "step": 3891 + }, + { + "epoch": 0.5447165850244926, + "grad_norm": 0.41659655042186283, + "learning_rate": 2.261231717322883e-05, + "loss": 0.5753, + "step": 3892 + }, + { + "epoch": 0.5448565430370889, + "grad_norm": 0.4530820570399967, + "learning_rate": 2.2601035919755274e-05, + "loss": 0.5281, + "step": 3893 + }, + { + "epoch": 0.5449965010496851, + "grad_norm": 0.39850319238936455, + "learning_rate": 2.258975515929318e-05, + "loss": 0.5278, + "step": 3894 + }, + { + "epoch": 0.5451364590622814, + "grad_norm": 0.3978091253457419, + "learning_rate": 2.2578474894160857e-05, + "loss": 0.5453, + "step": 3895 + }, + { + "epoch": 0.5452764170748775, + "grad_norm": 0.4231460383028299, + "learning_rate": 2.2567195126676507e-05, + "loss": 0.5963, + "step": 3896 + }, + { + "epoch": 0.5454163750874738, + "grad_norm": 0.42063829224368665, + "learning_rate": 2.2555915859158244e-05, + "loss": 0.5614, + "step": 3897 + }, + { + "epoch": 0.54555633310007, + "grad_norm": 0.4083469272821722, + "learning_rate": 2.2544637093924074e-05, + "loss": 0.5389, + "step": 3898 + }, + { + "epoch": 0.5456962911126662, + "grad_norm": 0.4217541606558565, + "learning_rate": 2.2533358833291896e-05, + "loss": 0.5544, + "step": 3899 + }, + { + "epoch": 0.5458362491252624, + "grad_norm": 0.4303357717984438, + "learning_rate": 2.2522081079579498e-05, + "loss": 0.5524, + "step": 3900 + }, + { + "epoch": 0.5459762071378587, + "grad_norm": 0.41929973760406747, + "learning_rate": 2.251080383510459e-05, + "loss": 0.5652, + "step": 3901 + }, + { + "epoch": 0.5461161651504549, + "grad_norm": 0.4126809759165879, + "learning_rate": 2.2499527102184744e-05, + "loss": 0.5917, + "step": 3902 + }, + { + "epoch": 0.546256123163051, + "grad_norm": 0.39993684724127465, + "learning_rate": 2.2488250883137445e-05, + "loss": 0.532, + "step": 3903 + }, + { + "epoch": 0.5463960811756473, + "grad_norm": 0.4070252666557728, + "learning_rate": 2.247697518028008e-05, + "loss": 0.5381, + "step": 3904 + }, + { + "epoch": 0.5465360391882436, + "grad_norm": 0.4203749044451171, + "learning_rate": 2.2465699995929916e-05, + "loss": 0.5445, + "step": 3905 + }, + { + "epoch": 0.5466759972008397, + "grad_norm": 0.4313492162077952, + "learning_rate": 2.2454425332404122e-05, + "loss": 0.55, + "step": 3906 + }, + { + "epoch": 0.546815955213436, + "grad_norm": 0.39695375283438905, + "learning_rate": 2.2443151192019735e-05, + "loss": 0.5418, + "step": 3907 + }, + { + "epoch": 0.5469559132260322, + "grad_norm": 0.4051789619978703, + "learning_rate": 2.2431877577093737e-05, + "loss": 0.5059, + "step": 3908 + }, + { + "epoch": 0.5470958712386285, + "grad_norm": 0.4218855232884939, + "learning_rate": 2.2420604489942946e-05, + "loss": 0.5617, + "step": 3909 + }, + { + "epoch": 0.5472358292512246, + "grad_norm": 0.3989929337699537, + "learning_rate": 2.2409331932884108e-05, + "loss": 0.5244, + "step": 3910 + }, + { + "epoch": 0.5473757872638209, + "grad_norm": 0.4010087179047754, + "learning_rate": 2.2398059908233842e-05, + "loss": 0.5371, + "step": 3911 + }, + { + "epoch": 0.5475157452764171, + "grad_norm": 0.4466643997788068, + "learning_rate": 2.238678841830867e-05, + "loss": 0.5934, + "step": 3912 + }, + { + "epoch": 0.5476557032890133, + "grad_norm": 0.39728611887637755, + "learning_rate": 2.237551746542499e-05, + "loss": 0.5587, + "step": 3913 + }, + { + "epoch": 0.5477956613016095, + "grad_norm": 0.41809901084229356, + "learning_rate": 2.236424705189909e-05, + "loss": 0.5865, + "step": 3914 + }, + { + "epoch": 0.5479356193142058, + "grad_norm": 0.40468351305300143, + "learning_rate": 2.2352977180047175e-05, + "loss": 0.5213, + "step": 3915 + }, + { + "epoch": 0.548075577326802, + "grad_norm": 0.4113028432405159, + "learning_rate": 2.2341707852185305e-05, + "loss": 0.5488, + "step": 3916 + }, + { + "epoch": 0.5482155353393982, + "grad_norm": 0.4448481069088446, + "learning_rate": 2.2330439070629448e-05, + "loss": 0.5665, + "step": 3917 + }, + { + "epoch": 0.5483554933519944, + "grad_norm": 0.4360732301218836, + "learning_rate": 2.2319170837695435e-05, + "loss": 0.5586, + "step": 3918 + }, + { + "epoch": 0.5484954513645907, + "grad_norm": 0.4194014787432002, + "learning_rate": 2.2307903155699027e-05, + "loss": 0.5265, + "step": 3919 + }, + { + "epoch": 0.5486354093771868, + "grad_norm": 0.4085819437326063, + "learning_rate": 2.2296636026955835e-05, + "loss": 0.5485, + "step": 3920 + }, + { + "epoch": 0.548775367389783, + "grad_norm": 0.40406354838300534, + "learning_rate": 2.2285369453781364e-05, + "loss": 0.5409, + "step": 3921 + }, + { + "epoch": 0.5489153254023793, + "grad_norm": 0.41085941144789107, + "learning_rate": 2.2274103438491022e-05, + "loss": 0.5228, + "step": 3922 + }, + { + "epoch": 0.5490552834149756, + "grad_norm": 0.44664374072567387, + "learning_rate": 2.2262837983400082e-05, + "loss": 0.5663, + "step": 3923 + }, + { + "epoch": 0.5491952414275717, + "grad_norm": 0.3848765417302251, + "learning_rate": 2.2251573090823706e-05, + "loss": 0.5645, + "step": 3924 + }, + { + "epoch": 0.549335199440168, + "grad_norm": 0.40234337327172925, + "learning_rate": 2.2240308763076935e-05, + "loss": 0.573, + "step": 3925 + }, + { + "epoch": 0.5494751574527642, + "grad_norm": 0.4124065219528931, + "learning_rate": 2.222904500247473e-05, + "loss": 0.5579, + "step": 3926 + }, + { + "epoch": 0.5496151154653603, + "grad_norm": 0.4085451573297374, + "learning_rate": 2.2217781811331885e-05, + "loss": 0.5534, + "step": 3927 + }, + { + "epoch": 0.5497550734779566, + "grad_norm": 0.413762414731123, + "learning_rate": 2.22065191919631e-05, + "loss": 0.5493, + "step": 3928 + }, + { + "epoch": 0.5498950314905529, + "grad_norm": 0.4148155757825788, + "learning_rate": 2.2195257146682975e-05, + "loss": 0.5439, + "step": 3929 + }, + { + "epoch": 0.5500349895031491, + "grad_norm": 0.44638831807610185, + "learning_rate": 2.218399567780597e-05, + "loss": 0.5899, + "step": 3930 + }, + { + "epoch": 0.5501749475157452, + "grad_norm": 0.41295788493334096, + "learning_rate": 2.217273478764642e-05, + "loss": 0.5524, + "step": 3931 + }, + { + "epoch": 0.5503149055283415, + "grad_norm": 0.402717034759649, + "learning_rate": 2.216147447851855e-05, + "loss": 0.5534, + "step": 3932 + }, + { + "epoch": 0.5504548635409378, + "grad_norm": 0.40126403258397014, + "learning_rate": 2.2150214752736488e-05, + "loss": 0.5227, + "step": 3933 + }, + { + "epoch": 0.5505948215535339, + "grad_norm": 0.4081017029720488, + "learning_rate": 2.2138955612614207e-05, + "loss": 0.5316, + "step": 3934 + }, + { + "epoch": 0.5507347795661302, + "grad_norm": 0.3952776691289986, + "learning_rate": 2.2127697060465576e-05, + "loss": 0.54, + "step": 3935 + }, + { + "epoch": 0.5508747375787264, + "grad_norm": 0.4086382501258365, + "learning_rate": 2.211643909860433e-05, + "loss": 0.5376, + "step": 3936 + }, + { + "epoch": 0.5510146955913227, + "grad_norm": 0.44788307946149386, + "learning_rate": 2.210518172934412e-05, + "loss": 0.5916, + "step": 3937 + }, + { + "epoch": 0.5511546536039188, + "grad_norm": 0.4092566815599357, + "learning_rate": 2.2093924954998438e-05, + "loss": 0.5282, + "step": 3938 + }, + { + "epoch": 0.551294611616515, + "grad_norm": 0.43673368860544104, + "learning_rate": 2.2082668777880653e-05, + "loss": 0.5458, + "step": 3939 + }, + { + "epoch": 0.5514345696291113, + "grad_norm": 0.406082695925654, + "learning_rate": 2.2071413200304043e-05, + "loss": 0.5375, + "step": 3940 + }, + { + "epoch": 0.5515745276417074, + "grad_norm": 0.3883615387736086, + "learning_rate": 2.206015822458174e-05, + "loss": 0.5221, + "step": 3941 + }, + { + "epoch": 0.5517144856543037, + "grad_norm": 0.4315662304861121, + "learning_rate": 2.2048903853026745e-05, + "loss": 0.5598, + "step": 3942 + }, + { + "epoch": 0.5518544436669, + "grad_norm": 0.40829821883297823, + "learning_rate": 2.203765008795195e-05, + "loss": 0.5876, + "step": 3943 + }, + { + "epoch": 0.5519944016794962, + "grad_norm": 0.4263164757098248, + "learning_rate": 2.2026396931670125e-05, + "loss": 0.5506, + "step": 3944 + }, + { + "epoch": 0.5521343596920923, + "grad_norm": 0.4109132538637934, + "learning_rate": 2.2015144386493896e-05, + "loss": 0.5435, + "step": 3945 + }, + { + "epoch": 0.5522743177046886, + "grad_norm": 0.40656446743742547, + "learning_rate": 2.2003892454735786e-05, + "loss": 0.5169, + "step": 3946 + }, + { + "epoch": 0.5524142757172849, + "grad_norm": 0.4174968254449107, + "learning_rate": 2.1992641138708166e-05, + "loss": 0.5506, + "step": 3947 + }, + { + "epoch": 0.552554233729881, + "grad_norm": 0.41392476596527283, + "learning_rate": 2.1981390440723316e-05, + "loss": 0.5705, + "step": 3948 + }, + { + "epoch": 0.5526941917424772, + "grad_norm": 0.40768401049876407, + "learning_rate": 2.197014036309336e-05, + "loss": 0.5546, + "step": 3949 + }, + { + "epoch": 0.5528341497550735, + "grad_norm": 0.43659985592955725, + "learning_rate": 2.1958890908130288e-05, + "loss": 0.5448, + "step": 3950 + }, + { + "epoch": 0.5529741077676698, + "grad_norm": 0.4191971534215795, + "learning_rate": 2.1947642078146004e-05, + "loss": 0.5442, + "step": 3951 + }, + { + "epoch": 0.5531140657802659, + "grad_norm": 0.43109773584435357, + "learning_rate": 2.193639387545224e-05, + "loss": 0.5506, + "step": 3952 + }, + { + "epoch": 0.5532540237928621, + "grad_norm": 0.42420382449551325, + "learning_rate": 2.1925146302360625e-05, + "loss": 0.5578, + "step": 3953 + }, + { + "epoch": 0.5533939818054584, + "grad_norm": 0.426621743894938, + "learning_rate": 2.1913899361182632e-05, + "loss": 0.5453, + "step": 3954 + }, + { + "epoch": 0.5535339398180545, + "grad_norm": 0.42952974584048115, + "learning_rate": 2.1902653054229642e-05, + "loss": 0.5514, + "step": 3955 + }, + { + "epoch": 0.5536738978306508, + "grad_norm": 0.42155187844422864, + "learning_rate": 2.189140738381288e-05, + "loss": 0.5622, + "step": 3956 + }, + { + "epoch": 0.553813855843247, + "grad_norm": 0.4260436696061489, + "learning_rate": 2.1880162352243425e-05, + "loss": 0.5845, + "step": 3957 + }, + { + "epoch": 0.5539538138558433, + "grad_norm": 0.4048895896225377, + "learning_rate": 2.1868917961832274e-05, + "loss": 0.5593, + "step": 3958 + }, + { + "epoch": 0.5540937718684394, + "grad_norm": 0.4088135684372084, + "learning_rate": 2.1857674214890254e-05, + "loss": 0.517, + "step": 3959 + }, + { + "epoch": 0.5542337298810357, + "grad_norm": 0.40953844739681516, + "learning_rate": 2.1846431113728064e-05, + "loss": 0.57, + "step": 3960 + }, + { + "epoch": 0.554373687893632, + "grad_norm": 0.41049014674652273, + "learning_rate": 2.1835188660656267e-05, + "loss": 0.582, + "step": 3961 + }, + { + "epoch": 0.5545136459062281, + "grad_norm": 0.44214177894488643, + "learning_rate": 2.1823946857985323e-05, + "loss": 0.5558, + "step": 3962 + }, + { + "epoch": 0.5546536039188243, + "grad_norm": 0.40688749353479037, + "learning_rate": 2.1812705708025526e-05, + "loss": 0.5647, + "step": 3963 + }, + { + "epoch": 0.5547935619314206, + "grad_norm": 0.43619186505830493, + "learning_rate": 2.1801465213087044e-05, + "loss": 0.5819, + "step": 3964 + }, + { + "epoch": 0.5549335199440167, + "grad_norm": 0.3873905618002296, + "learning_rate": 2.179022537547991e-05, + "loss": 0.5015, + "step": 3965 + }, + { + "epoch": 0.555073477956613, + "grad_norm": 0.40366380009710195, + "learning_rate": 2.1778986197514034e-05, + "loss": 0.5364, + "step": 3966 + }, + { + "epoch": 0.5552134359692092, + "grad_norm": 0.41225036663129866, + "learning_rate": 2.1767747681499176e-05, + "loss": 0.5395, + "step": 3967 + }, + { + "epoch": 0.5553533939818055, + "grad_norm": 0.41221185236407193, + "learning_rate": 2.1756509829744956e-05, + "loss": 0.5375, + "step": 3968 + }, + { + "epoch": 0.5554933519944016, + "grad_norm": 0.41637063641888467, + "learning_rate": 2.1745272644560885e-05, + "loss": 0.5695, + "step": 3969 + }, + { + "epoch": 0.5556333100069979, + "grad_norm": 0.4246669886827112, + "learning_rate": 2.173403612825631e-05, + "loss": 0.5226, + "step": 3970 + }, + { + "epoch": 0.5557732680195941, + "grad_norm": 0.4399810838843672, + "learning_rate": 2.172280028314045e-05, + "loss": 0.5637, + "step": 3971 + }, + { + "epoch": 0.5559132260321903, + "grad_norm": 0.42264632884515463, + "learning_rate": 2.1711565111522372e-05, + "loss": 0.5715, + "step": 3972 + }, + { + "epoch": 0.5560531840447865, + "grad_norm": 0.40102465338699234, + "learning_rate": 2.170033061571104e-05, + "loss": 0.5468, + "step": 3973 + }, + { + "epoch": 0.5561931420573828, + "grad_norm": 0.4330504312787022, + "learning_rate": 2.1689096798015247e-05, + "loss": 0.5544, + "step": 3974 + }, + { + "epoch": 0.556333100069979, + "grad_norm": 0.41663293988065675, + "learning_rate": 2.167786366074365e-05, + "loss": 0.5855, + "step": 3975 + }, + { + "epoch": 0.5564730580825752, + "grad_norm": 0.40801980733280413, + "learning_rate": 2.1666631206204786e-05, + "loss": 0.5431, + "step": 3976 + }, + { + "epoch": 0.5566130160951714, + "grad_norm": 0.42663243125510747, + "learning_rate": 2.1655399436707026e-05, + "loss": 0.5574, + "step": 3977 + }, + { + "epoch": 0.5567529741077677, + "grad_norm": 0.4605067091274325, + "learning_rate": 2.164416835455862e-05, + "loss": 0.5592, + "step": 3978 + }, + { + "epoch": 0.5568929321203638, + "grad_norm": 0.42331401492783005, + "learning_rate": 2.1632937962067657e-05, + "loss": 0.5798, + "step": 3979 + }, + { + "epoch": 0.5570328901329601, + "grad_norm": 0.5978199091551641, + "learning_rate": 2.1621708261542116e-05, + "loss": 0.5474, + "step": 3980 + }, + { + "epoch": 0.5571728481455563, + "grad_norm": 0.39934848914020754, + "learning_rate": 2.161047925528981e-05, + "loss": 0.5362, + "step": 3981 + }, + { + "epoch": 0.5573128061581526, + "grad_norm": 0.41963116860550964, + "learning_rate": 2.1599250945618402e-05, + "loss": 0.5506, + "step": 3982 + }, + { + "epoch": 0.5574527641707487, + "grad_norm": 0.41948040388705365, + "learning_rate": 2.158802333483542e-05, + "loss": 0.5723, + "step": 3983 + }, + { + "epoch": 0.557592722183345, + "grad_norm": 0.4188462923542639, + "learning_rate": 2.157679642524828e-05, + "loss": 0.5542, + "step": 3984 + }, + { + "epoch": 0.5577326801959412, + "grad_norm": 0.4356609128042065, + "learning_rate": 2.15655702191642e-05, + "loss": 0.5568, + "step": 3985 + }, + { + "epoch": 0.5578726382085374, + "grad_norm": 0.4295054521048054, + "learning_rate": 2.1554344718890284e-05, + "loss": 0.6145, + "step": 3986 + }, + { + "epoch": 0.5580125962211336, + "grad_norm": 0.41247657463875265, + "learning_rate": 2.1543119926733495e-05, + "loss": 0.5561, + "step": 3987 + }, + { + "epoch": 0.5581525542337299, + "grad_norm": 0.41532771461538054, + "learning_rate": 2.1531895845000637e-05, + "loss": 0.5414, + "step": 3988 + }, + { + "epoch": 0.5582925122463261, + "grad_norm": 0.40582165725579067, + "learning_rate": 2.1520672475998373e-05, + "loss": 0.5509, + "step": 3989 + }, + { + "epoch": 0.5584324702589223, + "grad_norm": 0.4060285622816394, + "learning_rate": 2.1509449822033205e-05, + "loss": 0.5484, + "step": 3990 + }, + { + "epoch": 0.5585724282715185, + "grad_norm": 0.4029440998090135, + "learning_rate": 2.1498227885411526e-05, + "loss": 0.585, + "step": 3991 + }, + { + "epoch": 0.5587123862841148, + "grad_norm": 0.4104149112850885, + "learning_rate": 2.148700666843955e-05, + "loss": 0.52, + "step": 3992 + }, + { + "epoch": 0.5588523442967109, + "grad_norm": 0.4171536565223565, + "learning_rate": 2.1475786173423335e-05, + "loss": 0.5623, + "step": 3993 + }, + { + "epoch": 0.5589923023093072, + "grad_norm": 0.3980539899980082, + "learning_rate": 2.146456640266883e-05, + "loss": 0.5812, + "step": 3994 + }, + { + "epoch": 0.5591322603219034, + "grad_norm": 0.41514890565668344, + "learning_rate": 2.14533473584818e-05, + "loss": 0.5826, + "step": 3995 + }, + { + "epoch": 0.5592722183344997, + "grad_norm": 0.4166753133049829, + "learning_rate": 2.1442129043167874e-05, + "loss": 0.5543, + "step": 3996 + }, + { + "epoch": 0.5594121763470958, + "grad_norm": 0.4114882654130403, + "learning_rate": 2.1430911459032526e-05, + "loss": 0.5741, + "step": 3997 + }, + { + "epoch": 0.5595521343596921, + "grad_norm": 0.4017370253857547, + "learning_rate": 2.1419694608381094e-05, + "loss": 0.5963, + "step": 3998 + }, + { + "epoch": 0.5596920923722883, + "grad_norm": 0.42955031856189274, + "learning_rate": 2.1408478493518742e-05, + "loss": 0.5896, + "step": 3999 + }, + { + "epoch": 0.5598320503848845, + "grad_norm": 0.40378298742736296, + "learning_rate": 2.1397263116750503e-05, + "loss": 0.5657, + "step": 4000 } ], "logging_steps": 1, @@ -7027,7 +28027,7 @@ "attributes": {} } }, - "total_flos": 175094048915456.0, + "total_flos": 700001933443072.0, "train_batch_size": 1, "trial_name": null, "trial_params": null