diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17122 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.994882292732855, + "eval_steps": 500, + "global_step": 2440, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020470829068577278, + "grad_norm": 5.914718943943769, + "learning_rate": 1.639344262295082e-07, + "loss": 0.89, + "step": 1 + }, + { + "epoch": 0.0040941658137154556, + "grad_norm": 5.711215790032282, + "learning_rate": 3.278688524590164e-07, + "loss": 0.8602, + "step": 2 + }, + { + "epoch": 0.006141248720573183, + "grad_norm": 6.070030223088026, + "learning_rate": 4.918032786885246e-07, + "loss": 0.8902, + "step": 3 + }, + { + "epoch": 0.008188331627430911, + "grad_norm": 5.739027819285582, + "learning_rate": 6.557377049180328e-07, + "loss": 0.9194, + "step": 4 + }, + { + "epoch": 0.01023541453428864, + "grad_norm": 5.558991142229951, + "learning_rate": 8.196721311475409e-07, + "loss": 0.8523, + "step": 5 + }, + { + "epoch": 0.012282497441146366, + "grad_norm": 5.622988925862499, + "learning_rate": 9.836065573770493e-07, + "loss": 0.9102, + "step": 6 + }, + { + "epoch": 0.014329580348004094, + "grad_norm": 5.360893456655671, + "learning_rate": 1.1475409836065575e-06, + "loss": 0.8682, + "step": 7 + }, + { + "epoch": 0.016376663254861822, + "grad_norm": 5.095538528802894, + "learning_rate": 1.3114754098360657e-06, + "loss": 0.8689, + "step": 8 + }, + { + "epoch": 0.01842374616171955, + "grad_norm": 4.59455985194371, + "learning_rate": 1.4754098360655739e-06, + "loss": 0.852, + "step": 9 + }, + { + "epoch": 0.02047082906857728, + "grad_norm": 4.139437078984008, + "learning_rate": 1.6393442622950819e-06, + "loss": 0.8102, + "step": 10 + }, + { + "epoch": 0.022517911975435005, + "grad_norm": 4.171848417796509, + "learning_rate": 1.8032786885245903e-06, + "loss": 0.8102, + "step": 11 + }, + { + "epoch": 0.02456499488229273, + "grad_norm": 2.4194756147420646, + "learning_rate": 1.9672131147540985e-06, + "loss": 0.7481, + "step": 12 + }, + { + "epoch": 0.02661207778915046, + "grad_norm": 2.28600629836227, + "learning_rate": 2.1311475409836067e-06, + "loss": 0.7631, + "step": 13 + }, + { + "epoch": 0.028659160696008188, + "grad_norm": 2.075915465619067, + "learning_rate": 2.295081967213115e-06, + "loss": 0.7506, + "step": 14 + }, + { + "epoch": 0.030706243602865915, + "grad_norm": 1.952947905310399, + "learning_rate": 2.459016393442623e-06, + "loss": 0.7781, + "step": 15 + }, + { + "epoch": 0.032753326509723645, + "grad_norm": 3.070659688365803, + "learning_rate": 2.6229508196721314e-06, + "loss": 0.7488, + "step": 16 + }, + { + "epoch": 0.03480040941658137, + "grad_norm": 3.5264876012722652, + "learning_rate": 2.786885245901639e-06, + "loss": 0.7746, + "step": 17 + }, + { + "epoch": 0.0368474923234391, + "grad_norm": 3.580605082704591, + "learning_rate": 2.9508196721311478e-06, + "loss": 0.7618, + "step": 18 + }, + { + "epoch": 0.038894575230296824, + "grad_norm": 3.194230800018389, + "learning_rate": 3.114754098360656e-06, + "loss": 0.7207, + "step": 19 + }, + { + "epoch": 0.04094165813715456, + "grad_norm": 3.116544112683001, + "learning_rate": 3.2786885245901638e-06, + "loss": 0.7352, + "step": 20 + }, + { + "epoch": 0.042988741044012284, + "grad_norm": 2.6314247686078094, + "learning_rate": 3.4426229508196724e-06, + "loss": 0.7432, + "step": 21 + }, + { + "epoch": 0.04503582395087001, + "grad_norm": 2.2137046535068037, + "learning_rate": 3.6065573770491806e-06, + "loss": 0.7444, + "step": 22 + }, + { + "epoch": 0.04708290685772774, + "grad_norm": 1.6649733496725676, + "learning_rate": 3.7704918032786884e-06, + "loss": 0.6646, + "step": 23 + }, + { + "epoch": 0.04912998976458546, + "grad_norm": 1.3074947984675702, + "learning_rate": 3.934426229508197e-06, + "loss": 0.6388, + "step": 24 + }, + { + "epoch": 0.0511770726714432, + "grad_norm": 1.190645815657471, + "learning_rate": 4.098360655737705e-06, + "loss": 0.6575, + "step": 25 + }, + { + "epoch": 0.05322415557830092, + "grad_norm": 1.2707845894729972, + "learning_rate": 4.2622950819672135e-06, + "loss": 0.671, + "step": 26 + }, + { + "epoch": 0.05527123848515865, + "grad_norm": 1.4412640499522065, + "learning_rate": 4.426229508196722e-06, + "loss": 0.6526, + "step": 27 + }, + { + "epoch": 0.057318321392016376, + "grad_norm": 1.3167145329441712, + "learning_rate": 4.59016393442623e-06, + "loss": 0.6533, + "step": 28 + }, + { + "epoch": 0.0593654042988741, + "grad_norm": 1.338399336461372, + "learning_rate": 4.754098360655738e-06, + "loss": 0.6525, + "step": 29 + }, + { + "epoch": 0.06141248720573183, + "grad_norm": 1.1005314901400522, + "learning_rate": 4.918032786885246e-06, + "loss": 0.6513, + "step": 30 + }, + { + "epoch": 0.06345957011258956, + "grad_norm": 1.0341365252216648, + "learning_rate": 5.0819672131147545e-06, + "loss": 0.6517, + "step": 31 + }, + { + "epoch": 0.06550665301944729, + "grad_norm": 0.8191371315934305, + "learning_rate": 5.245901639344263e-06, + "loss": 0.6562, + "step": 32 + }, + { + "epoch": 0.06755373592630501, + "grad_norm": 1.0105078954121915, + "learning_rate": 5.409836065573772e-06, + "loss": 0.6318, + "step": 33 + }, + { + "epoch": 0.06960081883316274, + "grad_norm": 1.083317612523659, + "learning_rate": 5.573770491803278e-06, + "loss": 0.6528, + "step": 34 + }, + { + "epoch": 0.07164790174002048, + "grad_norm": 1.054453013530755, + "learning_rate": 5.737704918032787e-06, + "loss": 0.6233, + "step": 35 + }, + { + "epoch": 0.0736949846468782, + "grad_norm": 0.693669694861969, + "learning_rate": 5.9016393442622956e-06, + "loss": 0.5959, + "step": 36 + }, + { + "epoch": 0.07574206755373593, + "grad_norm": 0.6429960279803676, + "learning_rate": 6.065573770491804e-06, + "loss": 0.6031, + "step": 37 + }, + { + "epoch": 0.07778915046059365, + "grad_norm": 0.8326921183964671, + "learning_rate": 6.229508196721312e-06, + "loss": 0.6252, + "step": 38 + }, + { + "epoch": 0.07983623336745138, + "grad_norm": 0.7826089791231328, + "learning_rate": 6.393442622950821e-06, + "loss": 0.5891, + "step": 39 + }, + { + "epoch": 0.08188331627430911, + "grad_norm": 0.6837753928275052, + "learning_rate": 6.5573770491803276e-06, + "loss": 0.6151, + "step": 40 + }, + { + "epoch": 0.08393039918116683, + "grad_norm": 0.557265922988386, + "learning_rate": 6.721311475409837e-06, + "loss": 0.5832, + "step": 41 + }, + { + "epoch": 0.08597748208802457, + "grad_norm": 0.697548950763872, + "learning_rate": 6.885245901639345e-06, + "loss": 0.629, + "step": 42 + }, + { + "epoch": 0.08802456499488229, + "grad_norm": 0.7262822585766031, + "learning_rate": 7.049180327868853e-06, + "loss": 0.5531, + "step": 43 + }, + { + "epoch": 0.09007164790174002, + "grad_norm": 0.7701295257257081, + "learning_rate": 7.213114754098361e-06, + "loss": 0.612, + "step": 44 + }, + { + "epoch": 0.09211873080859775, + "grad_norm": 0.4681801446847226, + "learning_rate": 7.3770491803278695e-06, + "loss": 0.5675, + "step": 45 + }, + { + "epoch": 0.09416581371545547, + "grad_norm": 0.5678252337287879, + "learning_rate": 7.540983606557377e-06, + "loss": 0.6391, + "step": 46 + }, + { + "epoch": 0.09621289662231321, + "grad_norm": 0.5704556041531674, + "learning_rate": 7.704918032786886e-06, + "loss": 0.5773, + "step": 47 + }, + { + "epoch": 0.09825997952917093, + "grad_norm": 0.5739689196908508, + "learning_rate": 7.868852459016394e-06, + "loss": 0.5928, + "step": 48 + }, + { + "epoch": 0.10030706243602866, + "grad_norm": 0.5604483444656064, + "learning_rate": 8.032786885245902e-06, + "loss": 0.5747, + "step": 49 + }, + { + "epoch": 0.1023541453428864, + "grad_norm": 0.5266605058896955, + "learning_rate": 8.19672131147541e-06, + "loss": 0.5506, + "step": 50 + }, + { + "epoch": 0.10440122824974411, + "grad_norm": 0.6855806410785544, + "learning_rate": 8.360655737704919e-06, + "loss": 0.5974, + "step": 51 + }, + { + "epoch": 0.10644831115660185, + "grad_norm": 0.6492542095140276, + "learning_rate": 8.524590163934427e-06, + "loss": 0.6027, + "step": 52 + }, + { + "epoch": 0.10849539406345957, + "grad_norm": 0.5702781382301559, + "learning_rate": 8.688524590163935e-06, + "loss": 0.5997, + "step": 53 + }, + { + "epoch": 0.1105424769703173, + "grad_norm": 0.5828822136164438, + "learning_rate": 8.852459016393443e-06, + "loss": 0.5633, + "step": 54 + }, + { + "epoch": 0.11258955987717502, + "grad_norm": 0.5923389847320444, + "learning_rate": 9.016393442622952e-06, + "loss": 0.5842, + "step": 55 + }, + { + "epoch": 0.11463664278403275, + "grad_norm": 0.5636209619702331, + "learning_rate": 9.18032786885246e-06, + "loss": 0.5645, + "step": 56 + }, + { + "epoch": 0.11668372569089049, + "grad_norm": 0.4912175098948749, + "learning_rate": 9.344262295081968e-06, + "loss": 0.5635, + "step": 57 + }, + { + "epoch": 0.1187308085977482, + "grad_norm": 0.754994674395481, + "learning_rate": 9.508196721311476e-06, + "loss": 0.5658, + "step": 58 + }, + { + "epoch": 0.12077789150460594, + "grad_norm": 0.5456693243174974, + "learning_rate": 9.672131147540984e-06, + "loss": 0.5559, + "step": 59 + }, + { + "epoch": 0.12282497441146366, + "grad_norm": 0.5877326554557568, + "learning_rate": 9.836065573770493e-06, + "loss": 0.5755, + "step": 60 + }, + { + "epoch": 0.12487205731832139, + "grad_norm": 0.4917195015950769, + "learning_rate": 1e-05, + "loss": 0.5422, + "step": 61 + }, + { + "epoch": 0.1269191402251791, + "grad_norm": 0.5000347167895204, + "learning_rate": 1.0163934426229509e-05, + "loss": 0.5638, + "step": 62 + }, + { + "epoch": 0.12896622313203684, + "grad_norm": 0.5165061332900804, + "learning_rate": 1.0327868852459017e-05, + "loss": 0.5874, + "step": 63 + }, + { + "epoch": 0.13101330603889458, + "grad_norm": 0.5647596683341366, + "learning_rate": 1.0491803278688525e-05, + "loss": 0.5549, + "step": 64 + }, + { + "epoch": 0.1330603889457523, + "grad_norm": 0.556985780595292, + "learning_rate": 1.0655737704918034e-05, + "loss": 0.5287, + "step": 65 + }, + { + "epoch": 0.13510747185261002, + "grad_norm": 0.5870046275772329, + "learning_rate": 1.0819672131147544e-05, + "loss": 0.5726, + "step": 66 + }, + { + "epoch": 0.13715455475946775, + "grad_norm": 0.510016021993097, + "learning_rate": 1.0983606557377052e-05, + "loss": 0.641, + "step": 67 + }, + { + "epoch": 0.13920163766632548, + "grad_norm": 0.5757122682850916, + "learning_rate": 1.1147540983606557e-05, + "loss": 0.5493, + "step": 68 + }, + { + "epoch": 0.14124872057318322, + "grad_norm": 0.5572446165702184, + "learning_rate": 1.1311475409836066e-05, + "loss": 0.5231, + "step": 69 + }, + { + "epoch": 0.14329580348004095, + "grad_norm": 0.5000459199001162, + "learning_rate": 1.1475409836065575e-05, + "loss": 0.5221, + "step": 70 + }, + { + "epoch": 0.14534288638689866, + "grad_norm": 0.49933135856571004, + "learning_rate": 1.1639344262295083e-05, + "loss": 0.5254, + "step": 71 + }, + { + "epoch": 0.1473899692937564, + "grad_norm": 0.542314402205445, + "learning_rate": 1.1803278688524591e-05, + "loss": 0.5855, + "step": 72 + }, + { + "epoch": 0.14943705220061412, + "grad_norm": 0.4908515642706825, + "learning_rate": 1.19672131147541e-05, + "loss": 0.5277, + "step": 73 + }, + { + "epoch": 0.15148413510747186, + "grad_norm": 0.5541036948715271, + "learning_rate": 1.2131147540983608e-05, + "loss": 0.5226, + "step": 74 + }, + { + "epoch": 0.1535312180143296, + "grad_norm": 0.5925205612588014, + "learning_rate": 1.2295081967213116e-05, + "loss": 0.5588, + "step": 75 + }, + { + "epoch": 0.1555783009211873, + "grad_norm": 0.5472033034034951, + "learning_rate": 1.2459016393442624e-05, + "loss": 0.5601, + "step": 76 + }, + { + "epoch": 0.15762538382804503, + "grad_norm": 0.5831899459875634, + "learning_rate": 1.2622950819672132e-05, + "loss": 0.5208, + "step": 77 + }, + { + "epoch": 0.15967246673490276, + "grad_norm": 0.6725600970137904, + "learning_rate": 1.2786885245901642e-05, + "loss": 0.5646, + "step": 78 + }, + { + "epoch": 0.1617195496417605, + "grad_norm": 0.4997581515870628, + "learning_rate": 1.295081967213115e-05, + "loss": 0.5345, + "step": 79 + }, + { + "epoch": 0.16376663254861823, + "grad_norm": 0.6060573387394406, + "learning_rate": 1.3114754098360655e-05, + "loss": 0.5516, + "step": 80 + }, + { + "epoch": 0.16581371545547594, + "grad_norm": 0.5707723342314415, + "learning_rate": 1.3278688524590165e-05, + "loss": 0.5494, + "step": 81 + }, + { + "epoch": 0.16786079836233367, + "grad_norm": 0.5820968760684135, + "learning_rate": 1.3442622950819673e-05, + "loss": 0.5374, + "step": 82 + }, + { + "epoch": 0.1699078812691914, + "grad_norm": 0.801732853136766, + "learning_rate": 1.3606557377049181e-05, + "loss": 0.546, + "step": 83 + }, + { + "epoch": 0.17195496417604914, + "grad_norm": 0.5288994895334571, + "learning_rate": 1.377049180327869e-05, + "loss": 0.5761, + "step": 84 + }, + { + "epoch": 0.17400204708290687, + "grad_norm": 0.7883330207931984, + "learning_rate": 1.3934426229508198e-05, + "loss": 0.5326, + "step": 85 + }, + { + "epoch": 0.17604912998976457, + "grad_norm": 0.4807106883512578, + "learning_rate": 1.4098360655737706e-05, + "loss": 0.5311, + "step": 86 + }, + { + "epoch": 0.1780962128966223, + "grad_norm": 0.6954942427041093, + "learning_rate": 1.4262295081967214e-05, + "loss": 0.5206, + "step": 87 + }, + { + "epoch": 0.18014329580348004, + "grad_norm": 0.5706344929187627, + "learning_rate": 1.4426229508196722e-05, + "loss": 0.5575, + "step": 88 + }, + { + "epoch": 0.18219037871033777, + "grad_norm": 0.5660731771723676, + "learning_rate": 1.459016393442623e-05, + "loss": 0.5243, + "step": 89 + }, + { + "epoch": 0.1842374616171955, + "grad_norm": 0.7077259968257474, + "learning_rate": 1.4754098360655739e-05, + "loss": 0.5375, + "step": 90 + }, + { + "epoch": 0.1862845445240532, + "grad_norm": 0.5640871854454458, + "learning_rate": 1.4918032786885249e-05, + "loss": 0.5678, + "step": 91 + }, + { + "epoch": 0.18833162743091095, + "grad_norm": 0.5686460669441292, + "learning_rate": 1.5081967213114754e-05, + "loss": 0.5317, + "step": 92 + }, + { + "epoch": 0.19037871033776868, + "grad_norm": 0.5667037000506248, + "learning_rate": 1.5245901639344264e-05, + "loss": 0.5208, + "step": 93 + }, + { + "epoch": 0.19242579324462641, + "grad_norm": 0.6528802562895782, + "learning_rate": 1.5409836065573772e-05, + "loss": 0.5538, + "step": 94 + }, + { + "epoch": 0.19447287615148415, + "grad_norm": 0.6104791131678972, + "learning_rate": 1.5573770491803278e-05, + "loss": 0.4945, + "step": 95 + }, + { + "epoch": 0.19651995905834185, + "grad_norm": 0.770297526442582, + "learning_rate": 1.5737704918032788e-05, + "loss": 0.5542, + "step": 96 + }, + { + "epoch": 0.1985670419651996, + "grad_norm": 0.639855439582837, + "learning_rate": 1.5901639344262295e-05, + "loss": 0.5604, + "step": 97 + }, + { + "epoch": 0.20061412487205732, + "grad_norm": 0.6516313518294781, + "learning_rate": 1.6065573770491805e-05, + "loss": 0.492, + "step": 98 + }, + { + "epoch": 0.20266120777891505, + "grad_norm": 0.7323248049827676, + "learning_rate": 1.6229508196721314e-05, + "loss": 0.5561, + "step": 99 + }, + { + "epoch": 0.2047082906857728, + "grad_norm": 0.614575250954447, + "learning_rate": 1.639344262295082e-05, + "loss": 0.5319, + "step": 100 + }, + { + "epoch": 0.2067553735926305, + "grad_norm": 0.6867103983793634, + "learning_rate": 1.655737704918033e-05, + "loss": 0.5087, + "step": 101 + }, + { + "epoch": 0.20880245649948823, + "grad_norm": 0.5125936025327898, + "learning_rate": 1.6721311475409837e-05, + "loss": 0.5187, + "step": 102 + }, + { + "epoch": 0.21084953940634596, + "grad_norm": 0.7791039154279353, + "learning_rate": 1.6885245901639347e-05, + "loss": 0.5831, + "step": 103 + }, + { + "epoch": 0.2128966223132037, + "grad_norm": 0.5610382412593208, + "learning_rate": 1.7049180327868854e-05, + "loss": 0.5979, + "step": 104 + }, + { + "epoch": 0.21494370522006143, + "grad_norm": 0.6872141481595387, + "learning_rate": 1.721311475409836e-05, + "loss": 0.5337, + "step": 105 + }, + { + "epoch": 0.21699078812691913, + "grad_norm": 0.621726895260148, + "learning_rate": 1.737704918032787e-05, + "loss": 0.5014, + "step": 106 + }, + { + "epoch": 0.21903787103377687, + "grad_norm": 0.6593450481678657, + "learning_rate": 1.7540983606557377e-05, + "loss": 0.5298, + "step": 107 + }, + { + "epoch": 0.2210849539406346, + "grad_norm": 0.6085504680048398, + "learning_rate": 1.7704918032786887e-05, + "loss": 0.5597, + "step": 108 + }, + { + "epoch": 0.22313203684749233, + "grad_norm": 0.5818331349760811, + "learning_rate": 1.7868852459016393e-05, + "loss": 0.5671, + "step": 109 + }, + { + "epoch": 0.22517911975435004, + "grad_norm": 0.5705507858388099, + "learning_rate": 1.8032786885245903e-05, + "loss": 0.5608, + "step": 110 + }, + { + "epoch": 0.22722620266120777, + "grad_norm": 0.566454711636141, + "learning_rate": 1.8196721311475413e-05, + "loss": 0.536, + "step": 111 + }, + { + "epoch": 0.2292732855680655, + "grad_norm": 0.6584736492952961, + "learning_rate": 1.836065573770492e-05, + "loss": 0.5107, + "step": 112 + }, + { + "epoch": 0.23132036847492324, + "grad_norm": 0.5431857503415616, + "learning_rate": 1.852459016393443e-05, + "loss": 0.5216, + "step": 113 + }, + { + "epoch": 0.23336745138178097, + "grad_norm": 0.7429344164092465, + "learning_rate": 1.8688524590163936e-05, + "loss": 0.5673, + "step": 114 + }, + { + "epoch": 0.23541453428863868, + "grad_norm": 0.6303478113244917, + "learning_rate": 1.8852459016393446e-05, + "loss": 0.5173, + "step": 115 + }, + { + "epoch": 0.2374616171954964, + "grad_norm": 0.6632339341308846, + "learning_rate": 1.9016393442622952e-05, + "loss": 0.526, + "step": 116 + }, + { + "epoch": 0.23950870010235414, + "grad_norm": 0.7178085764934704, + "learning_rate": 1.918032786885246e-05, + "loss": 0.5501, + "step": 117 + }, + { + "epoch": 0.24155578300921188, + "grad_norm": 0.7261199143030841, + "learning_rate": 1.934426229508197e-05, + "loss": 0.5564, + "step": 118 + }, + { + "epoch": 0.2436028659160696, + "grad_norm": 0.6467257123886485, + "learning_rate": 1.9508196721311475e-05, + "loss": 0.5307, + "step": 119 + }, + { + "epoch": 0.24564994882292732, + "grad_norm": 0.7743559427761539, + "learning_rate": 1.9672131147540985e-05, + "loss": 0.4867, + "step": 120 + }, + { + "epoch": 0.24769703172978505, + "grad_norm": 0.5777069325137312, + "learning_rate": 1.9836065573770492e-05, + "loss": 0.5235, + "step": 121 + }, + { + "epoch": 0.24974411463664278, + "grad_norm": 0.7434807781935519, + "learning_rate": 2e-05, + "loss": 0.524, + "step": 122 + }, + { + "epoch": 0.2517911975435005, + "grad_norm": 0.6635906710416195, + "learning_rate": 2.0163934426229508e-05, + "loss": 0.4759, + "step": 123 + }, + { + "epoch": 0.2538382804503582, + "grad_norm": 0.6164296619684109, + "learning_rate": 2.0327868852459018e-05, + "loss": 0.4925, + "step": 124 + }, + { + "epoch": 0.25588536335721596, + "grad_norm": 0.5800314323412163, + "learning_rate": 2.0491803278688525e-05, + "loss": 0.536, + "step": 125 + }, + { + "epoch": 0.2579324462640737, + "grad_norm": 0.6910504700298034, + "learning_rate": 2.0655737704918034e-05, + "loss": 0.5937, + "step": 126 + }, + { + "epoch": 0.2599795291709314, + "grad_norm": 0.5724201379088258, + "learning_rate": 2.081967213114754e-05, + "loss": 0.5019, + "step": 127 + }, + { + "epoch": 0.26202661207778916, + "grad_norm": 0.7529480886936586, + "learning_rate": 2.098360655737705e-05, + "loss": 0.5643, + "step": 128 + }, + { + "epoch": 0.2640736949846469, + "grad_norm": 0.6170979184951091, + "learning_rate": 2.1147540983606557e-05, + "loss": 0.5449, + "step": 129 + }, + { + "epoch": 0.2661207778915046, + "grad_norm": 0.6541686452612759, + "learning_rate": 2.1311475409836067e-05, + "loss": 0.5377, + "step": 130 + }, + { + "epoch": 0.26816786079836236, + "grad_norm": 0.7744689267947779, + "learning_rate": 2.1475409836065574e-05, + "loss": 0.5846, + "step": 131 + }, + { + "epoch": 0.27021494370522003, + "grad_norm": 0.5945036689673261, + "learning_rate": 2.1639344262295087e-05, + "loss": 0.5283, + "step": 132 + }, + { + "epoch": 0.27226202661207777, + "grad_norm": 0.661953998440321, + "learning_rate": 2.180327868852459e-05, + "loss": 0.557, + "step": 133 + }, + { + "epoch": 0.2743091095189355, + "grad_norm": 0.5763477769339282, + "learning_rate": 2.1967213114754104e-05, + "loss": 0.511, + "step": 134 + }, + { + "epoch": 0.27635619242579323, + "grad_norm": 0.5514346276124723, + "learning_rate": 2.213114754098361e-05, + "loss": 0.5182, + "step": 135 + }, + { + "epoch": 0.27840327533265097, + "grad_norm": 0.5695095404976926, + "learning_rate": 2.2295081967213113e-05, + "loss": 0.5146, + "step": 136 + }, + { + "epoch": 0.2804503582395087, + "grad_norm": 0.5986561008583392, + "learning_rate": 2.2459016393442626e-05, + "loss": 0.5083, + "step": 137 + }, + { + "epoch": 0.28249744114636643, + "grad_norm": 0.5610678921247985, + "learning_rate": 2.2622950819672133e-05, + "loss": 0.5313, + "step": 138 + }, + { + "epoch": 0.28454452405322417, + "grad_norm": 0.6049817345717909, + "learning_rate": 2.2786885245901643e-05, + "loss": 0.5052, + "step": 139 + }, + { + "epoch": 0.2865916069600819, + "grad_norm": 0.5464694050777636, + "learning_rate": 2.295081967213115e-05, + "loss": 0.5371, + "step": 140 + }, + { + "epoch": 0.28863868986693964, + "grad_norm": 0.6288012925738132, + "learning_rate": 2.311475409836066e-05, + "loss": 0.4964, + "step": 141 + }, + { + "epoch": 0.2906857727737973, + "grad_norm": 0.6292583801203022, + "learning_rate": 2.3278688524590166e-05, + "loss": 0.524, + "step": 142 + }, + { + "epoch": 0.29273285568065505, + "grad_norm": 0.5989675634399306, + "learning_rate": 2.3442622950819676e-05, + "loss": 0.5347, + "step": 143 + }, + { + "epoch": 0.2947799385875128, + "grad_norm": 0.711475328293837, + "learning_rate": 2.3606557377049182e-05, + "loss": 0.5212, + "step": 144 + }, + { + "epoch": 0.2968270214943705, + "grad_norm": 0.6147945346885277, + "learning_rate": 2.3770491803278692e-05, + "loss": 0.536, + "step": 145 + }, + { + "epoch": 0.29887410440122825, + "grad_norm": 0.7665801712563535, + "learning_rate": 2.39344262295082e-05, + "loss": 0.506, + "step": 146 + }, + { + "epoch": 0.300921187308086, + "grad_norm": 0.5675791883302023, + "learning_rate": 2.4098360655737705e-05, + "loss": 0.5165, + "step": 147 + }, + { + "epoch": 0.3029682702149437, + "grad_norm": 0.6749613199444409, + "learning_rate": 2.4262295081967215e-05, + "loss": 0.5353, + "step": 148 + }, + { + "epoch": 0.30501535312180145, + "grad_norm": 0.5449017130362618, + "learning_rate": 2.442622950819672e-05, + "loss": 0.5378, + "step": 149 + }, + { + "epoch": 0.3070624360286592, + "grad_norm": 0.6201210295020051, + "learning_rate": 2.459016393442623e-05, + "loss": 0.5094, + "step": 150 + }, + { + "epoch": 0.3091095189355169, + "grad_norm": 0.577151136698421, + "learning_rate": 2.4754098360655738e-05, + "loss": 0.5196, + "step": 151 + }, + { + "epoch": 0.3111566018423746, + "grad_norm": 0.5313033735643246, + "learning_rate": 2.4918032786885248e-05, + "loss": 0.4937, + "step": 152 + }, + { + "epoch": 0.3132036847492323, + "grad_norm": 0.5101509817549174, + "learning_rate": 2.5081967213114754e-05, + "loss": 0.4788, + "step": 153 + }, + { + "epoch": 0.31525076765609006, + "grad_norm": 0.6444658936554309, + "learning_rate": 2.5245901639344264e-05, + "loss": 0.5275, + "step": 154 + }, + { + "epoch": 0.3172978505629478, + "grad_norm": 0.44154622012146943, + "learning_rate": 2.540983606557377e-05, + "loss": 0.4861, + "step": 155 + }, + { + "epoch": 0.3193449334698055, + "grad_norm": 0.608818705758696, + "learning_rate": 2.5573770491803284e-05, + "loss": 0.5071, + "step": 156 + }, + { + "epoch": 0.32139201637666326, + "grad_norm": 0.616456605045568, + "learning_rate": 2.5737704918032787e-05, + "loss": 0.5195, + "step": 157 + }, + { + "epoch": 0.323439099283521, + "grad_norm": 0.67099003512138, + "learning_rate": 2.59016393442623e-05, + "loss": 0.5523, + "step": 158 + }, + { + "epoch": 0.3254861821903787, + "grad_norm": 0.6144728851907352, + "learning_rate": 2.6065573770491807e-05, + "loss": 0.5451, + "step": 159 + }, + { + "epoch": 0.32753326509723646, + "grad_norm": 0.6323429696875369, + "learning_rate": 2.622950819672131e-05, + "loss": 0.5309, + "step": 160 + }, + { + "epoch": 0.3295803480040942, + "grad_norm": 0.5465209069864494, + "learning_rate": 2.6393442622950824e-05, + "loss": 0.5342, + "step": 161 + }, + { + "epoch": 0.33162743091095187, + "grad_norm": 0.7352587986716179, + "learning_rate": 2.655737704918033e-05, + "loss": 0.5181, + "step": 162 + }, + { + "epoch": 0.3336745138178096, + "grad_norm": 0.5130967329382123, + "learning_rate": 2.672131147540984e-05, + "loss": 0.5074, + "step": 163 + }, + { + "epoch": 0.33572159672466734, + "grad_norm": 0.6870511656785425, + "learning_rate": 2.6885245901639346e-05, + "loss": 0.5603, + "step": 164 + }, + { + "epoch": 0.33776867963152507, + "grad_norm": 0.7048410066657461, + "learning_rate": 2.7049180327868856e-05, + "loss": 0.5272, + "step": 165 + }, + { + "epoch": 0.3398157625383828, + "grad_norm": 0.8082743167202432, + "learning_rate": 2.7213114754098363e-05, + "loss": 0.5196, + "step": 166 + }, + { + "epoch": 0.34186284544524054, + "grad_norm": 0.6901700346930703, + "learning_rate": 2.7377049180327873e-05, + "loss": 0.5439, + "step": 167 + }, + { + "epoch": 0.34390992835209827, + "grad_norm": 0.6543817136595013, + "learning_rate": 2.754098360655738e-05, + "loss": 0.5375, + "step": 168 + }, + { + "epoch": 0.345957011258956, + "grad_norm": 0.6015689406714214, + "learning_rate": 2.770491803278689e-05, + "loss": 0.5014, + "step": 169 + }, + { + "epoch": 0.34800409416581374, + "grad_norm": 0.658713714501858, + "learning_rate": 2.7868852459016396e-05, + "loss": 0.5262, + "step": 170 + }, + { + "epoch": 0.3500511770726714, + "grad_norm": 0.5872370736842677, + "learning_rate": 2.8032786885245902e-05, + "loss": 0.5454, + "step": 171 + }, + { + "epoch": 0.35209825997952915, + "grad_norm": 0.5434995669507285, + "learning_rate": 2.8196721311475412e-05, + "loss": 0.562, + "step": 172 + }, + { + "epoch": 0.3541453428863869, + "grad_norm": 0.7617505474169058, + "learning_rate": 2.836065573770492e-05, + "loss": 0.5147, + "step": 173 + }, + { + "epoch": 0.3561924257932446, + "grad_norm": 0.6940517866702995, + "learning_rate": 2.852459016393443e-05, + "loss": 0.561, + "step": 174 + }, + { + "epoch": 0.35823950870010235, + "grad_norm": 0.6026045944898959, + "learning_rate": 2.8688524590163935e-05, + "loss": 0.5294, + "step": 175 + }, + { + "epoch": 0.3602865916069601, + "grad_norm": 0.685003134185584, + "learning_rate": 2.8852459016393445e-05, + "loss": 0.5086, + "step": 176 + }, + { + "epoch": 0.3623336745138178, + "grad_norm": 0.7007920129863516, + "learning_rate": 2.901639344262295e-05, + "loss": 0.5021, + "step": 177 + }, + { + "epoch": 0.36438075742067555, + "grad_norm": 0.7355272145052204, + "learning_rate": 2.918032786885246e-05, + "loss": 0.5007, + "step": 178 + }, + { + "epoch": 0.3664278403275333, + "grad_norm": 0.6487776776109249, + "learning_rate": 2.9344262295081968e-05, + "loss": 0.5524, + "step": 179 + }, + { + "epoch": 0.368474923234391, + "grad_norm": 0.6902433390057133, + "learning_rate": 2.9508196721311478e-05, + "loss": 0.5142, + "step": 180 + }, + { + "epoch": 0.3705220061412487, + "grad_norm": 0.6867948754438059, + "learning_rate": 2.9672131147540984e-05, + "loss": 0.5187, + "step": 181 + }, + { + "epoch": 0.3725690890481064, + "grad_norm": 0.6159722804543436, + "learning_rate": 2.9836065573770498e-05, + "loss": 0.5508, + "step": 182 + }, + { + "epoch": 0.37461617195496416, + "grad_norm": 0.7696599736408124, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.5348, + "step": 183 + }, + { + "epoch": 0.3766632548618219, + "grad_norm": 0.6954090228635682, + "learning_rate": 3.0163934426229507e-05, + "loss": 0.5447, + "step": 184 + }, + { + "epoch": 0.37871033776867963, + "grad_norm": 0.6435121902381328, + "learning_rate": 3.032786885245902e-05, + "loss": 0.5474, + "step": 185 + }, + { + "epoch": 0.38075742067553736, + "grad_norm": 0.8038288638214993, + "learning_rate": 3.0491803278688527e-05, + "loss": 0.5559, + "step": 186 + }, + { + "epoch": 0.3828045035823951, + "grad_norm": 0.688956722367706, + "learning_rate": 3.065573770491804e-05, + "loss": 0.4875, + "step": 187 + }, + { + "epoch": 0.38485158648925283, + "grad_norm": 0.7159386414570686, + "learning_rate": 3.0819672131147544e-05, + "loss": 0.5499, + "step": 188 + }, + { + "epoch": 0.38689866939611056, + "grad_norm": 0.9285947357940207, + "learning_rate": 3.098360655737705e-05, + "loss": 0.5144, + "step": 189 + }, + { + "epoch": 0.3889457523029683, + "grad_norm": 0.6303078015541651, + "learning_rate": 3.1147540983606557e-05, + "loss": 0.6274, + "step": 190 + }, + { + "epoch": 0.390992835209826, + "grad_norm": 0.8868869883770375, + "learning_rate": 3.131147540983607e-05, + "loss": 0.5261, + "step": 191 + }, + { + "epoch": 0.3930399181166837, + "grad_norm": 0.8238279905547772, + "learning_rate": 3.1475409836065576e-05, + "loss": 0.5284, + "step": 192 + }, + { + "epoch": 0.39508700102354144, + "grad_norm": 0.7109735935815616, + "learning_rate": 3.163934426229509e-05, + "loss": 0.4981, + "step": 193 + }, + { + "epoch": 0.3971340839303992, + "grad_norm": 0.8555381934750876, + "learning_rate": 3.180327868852459e-05, + "loss": 0.5184, + "step": 194 + }, + { + "epoch": 0.3991811668372569, + "grad_norm": 0.7950926694758862, + "learning_rate": 3.19672131147541e-05, + "loss": 0.5286, + "step": 195 + }, + { + "epoch": 0.40122824974411464, + "grad_norm": 0.6893785135479137, + "learning_rate": 3.213114754098361e-05, + "loss": 0.5048, + "step": 196 + }, + { + "epoch": 0.4032753326509724, + "grad_norm": 0.8285127939060417, + "learning_rate": 3.2295081967213116e-05, + "loss": 0.5696, + "step": 197 + }, + { + "epoch": 0.4053224155578301, + "grad_norm": 0.6476667354794096, + "learning_rate": 3.245901639344263e-05, + "loss": 0.5175, + "step": 198 + }, + { + "epoch": 0.40736949846468784, + "grad_norm": 0.9133932587744947, + "learning_rate": 3.2622950819672136e-05, + "loss": 0.5426, + "step": 199 + }, + { + "epoch": 0.4094165813715456, + "grad_norm": 0.7532290387705938, + "learning_rate": 3.278688524590164e-05, + "loss": 0.5087, + "step": 200 + }, + { + "epoch": 0.41146366427840325, + "grad_norm": 0.9389363029784095, + "learning_rate": 3.295081967213115e-05, + "loss": 0.509, + "step": 201 + }, + { + "epoch": 0.413510747185261, + "grad_norm": 0.5686252927682621, + "learning_rate": 3.311475409836066e-05, + "loss": 0.5193, + "step": 202 + }, + { + "epoch": 0.4155578300921187, + "grad_norm": 0.8087812079712846, + "learning_rate": 3.327868852459017e-05, + "loss": 0.5197, + "step": 203 + }, + { + "epoch": 0.41760491299897645, + "grad_norm": 0.4881097207653879, + "learning_rate": 3.3442622950819675e-05, + "loss": 0.5044, + "step": 204 + }, + { + "epoch": 0.4196519959058342, + "grad_norm": 0.7274067002262758, + "learning_rate": 3.360655737704918e-05, + "loss": 0.4974, + "step": 205 + }, + { + "epoch": 0.4216990788126919, + "grad_norm": 0.5458714268885722, + "learning_rate": 3.3770491803278695e-05, + "loss": 0.5017, + "step": 206 + }, + { + "epoch": 0.42374616171954965, + "grad_norm": 0.7035504070202385, + "learning_rate": 3.39344262295082e-05, + "loss": 0.4983, + "step": 207 + }, + { + "epoch": 0.4257932446264074, + "grad_norm": 0.5269572768909501, + "learning_rate": 3.409836065573771e-05, + "loss": 0.5484, + "step": 208 + }, + { + "epoch": 0.4278403275332651, + "grad_norm": 0.5550435466705637, + "learning_rate": 3.4262295081967214e-05, + "loss": 0.5152, + "step": 209 + }, + { + "epoch": 0.42988741044012285, + "grad_norm": 0.5658118678239209, + "learning_rate": 3.442622950819672e-05, + "loss": 0.529, + "step": 210 + }, + { + "epoch": 0.43193449334698053, + "grad_norm": 0.4274306239416562, + "learning_rate": 3.4590163934426234e-05, + "loss": 0.547, + "step": 211 + }, + { + "epoch": 0.43398157625383826, + "grad_norm": 0.6498291018145316, + "learning_rate": 3.475409836065574e-05, + "loss": 0.5255, + "step": 212 + }, + { + "epoch": 0.436028659160696, + "grad_norm": 0.4862132754376562, + "learning_rate": 3.491803278688525e-05, + "loss": 0.5407, + "step": 213 + }, + { + "epoch": 0.43807574206755373, + "grad_norm": 0.7345334765463367, + "learning_rate": 3.5081967213114754e-05, + "loss": 0.5467, + "step": 214 + }, + { + "epoch": 0.44012282497441146, + "grad_norm": 0.6199950312514801, + "learning_rate": 3.524590163934427e-05, + "loss": 0.5599, + "step": 215 + }, + { + "epoch": 0.4421699078812692, + "grad_norm": 0.7074691063675598, + "learning_rate": 3.5409836065573773e-05, + "loss": 0.5127, + "step": 216 + }, + { + "epoch": 0.44421699078812693, + "grad_norm": 0.6229358703039948, + "learning_rate": 3.557377049180329e-05, + "loss": 0.5186, + "step": 217 + }, + { + "epoch": 0.44626407369498466, + "grad_norm": 0.6395887726511317, + "learning_rate": 3.5737704918032786e-05, + "loss": 0.4989, + "step": 218 + }, + { + "epoch": 0.4483111566018424, + "grad_norm": 0.5458826332923155, + "learning_rate": 3.59016393442623e-05, + "loss": 0.5008, + "step": 219 + }, + { + "epoch": 0.4503582395087001, + "grad_norm": 0.6729015313127311, + "learning_rate": 3.6065573770491806e-05, + "loss": 0.5122, + "step": 220 + }, + { + "epoch": 0.4524053224155578, + "grad_norm": 0.7752024606600724, + "learning_rate": 3.622950819672131e-05, + "loss": 0.5018, + "step": 221 + }, + { + "epoch": 0.45445240532241554, + "grad_norm": 0.6141567713992134, + "learning_rate": 3.6393442622950826e-05, + "loss": 0.4921, + "step": 222 + }, + { + "epoch": 0.4564994882292733, + "grad_norm": 0.6807359145513986, + "learning_rate": 3.655737704918033e-05, + "loss": 0.5041, + "step": 223 + }, + { + "epoch": 0.458546571136131, + "grad_norm": 0.5856826658124886, + "learning_rate": 3.672131147540984e-05, + "loss": 0.5281, + "step": 224 + }, + { + "epoch": 0.46059365404298874, + "grad_norm": 0.6528386749398997, + "learning_rate": 3.6885245901639346e-05, + "loss": 0.4862, + "step": 225 + }, + { + "epoch": 0.4626407369498465, + "grad_norm": 0.8760949897414592, + "learning_rate": 3.704918032786886e-05, + "loss": 0.5313, + "step": 226 + }, + { + "epoch": 0.4646878198567042, + "grad_norm": 0.7133493187489152, + "learning_rate": 3.7213114754098365e-05, + "loss": 0.5611, + "step": 227 + }, + { + "epoch": 0.46673490276356194, + "grad_norm": 0.6514391258645618, + "learning_rate": 3.737704918032787e-05, + "loss": 0.5017, + "step": 228 + }, + { + "epoch": 0.4687819856704197, + "grad_norm": 0.712386378258888, + "learning_rate": 3.754098360655738e-05, + "loss": 0.4801, + "step": 229 + }, + { + "epoch": 0.47082906857727735, + "grad_norm": 0.7933509165191774, + "learning_rate": 3.770491803278689e-05, + "loss": 0.5265, + "step": 230 + }, + { + "epoch": 0.4728761514841351, + "grad_norm": 0.6393959136262052, + "learning_rate": 3.78688524590164e-05, + "loss": 0.533, + "step": 231 + }, + { + "epoch": 0.4749232343909928, + "grad_norm": 0.8336237173823177, + "learning_rate": 3.8032786885245905e-05, + "loss": 0.5067, + "step": 232 + }, + { + "epoch": 0.47697031729785055, + "grad_norm": 0.7946625026406952, + "learning_rate": 3.819672131147541e-05, + "loss": 0.5572, + "step": 233 + }, + { + "epoch": 0.4790174002047083, + "grad_norm": 0.7448514333498657, + "learning_rate": 3.836065573770492e-05, + "loss": 0.5267, + "step": 234 + }, + { + "epoch": 0.481064483111566, + "grad_norm": 0.6216449013018147, + "learning_rate": 3.852459016393443e-05, + "loss": 0.5007, + "step": 235 + }, + { + "epoch": 0.48311156601842375, + "grad_norm": 0.5847497681971316, + "learning_rate": 3.868852459016394e-05, + "loss": 0.5465, + "step": 236 + }, + { + "epoch": 0.4851586489252815, + "grad_norm": 0.5930045841712915, + "learning_rate": 3.8852459016393444e-05, + "loss": 0.5498, + "step": 237 + }, + { + "epoch": 0.4872057318321392, + "grad_norm": 0.6242247074949386, + "learning_rate": 3.901639344262295e-05, + "loss": 0.5032, + "step": 238 + }, + { + "epoch": 0.48925281473899696, + "grad_norm": 0.5315372089751544, + "learning_rate": 3.9180327868852464e-05, + "loss": 0.5351, + "step": 239 + }, + { + "epoch": 0.49129989764585463, + "grad_norm": 0.5976493830078852, + "learning_rate": 3.934426229508197e-05, + "loss": 0.5511, + "step": 240 + }, + { + "epoch": 0.49334698055271237, + "grad_norm": 0.5263959833035169, + "learning_rate": 3.950819672131148e-05, + "loss": 0.5095, + "step": 241 + }, + { + "epoch": 0.4953940634595701, + "grad_norm": 0.5164961935996712, + "learning_rate": 3.9672131147540983e-05, + "loss": 0.4903, + "step": 242 + }, + { + "epoch": 0.49744114636642783, + "grad_norm": 0.5030461231666816, + "learning_rate": 3.98360655737705e-05, + "loss": 0.5246, + "step": 243 + }, + { + "epoch": 0.49948822927328557, + "grad_norm": 0.5199256237042372, + "learning_rate": 4e-05, + "loss": 0.527, + "step": 244 + }, + { + "epoch": 0.5015353121801432, + "grad_norm": 0.5720072000502151, + "learning_rate": 3.999997953390434e-05, + "loss": 0.4698, + "step": 245 + }, + { + "epoch": 0.503582395087001, + "grad_norm": 0.4723958838410465, + "learning_rate": 3.999991813565924e-05, + "loss": 0.5021, + "step": 246 + }, + { + "epoch": 0.5056294779938587, + "grad_norm": 0.6664032925560375, + "learning_rate": 3.999981580539036e-05, + "loss": 0.5195, + "step": 247 + }, + { + "epoch": 0.5076765609007164, + "grad_norm": 0.46934376217932583, + "learning_rate": 3.999967254330713e-05, + "loss": 0.4915, + "step": 248 + }, + { + "epoch": 0.5097236438075742, + "grad_norm": 0.5471916125552302, + "learning_rate": 3.999948834970275e-05, + "loss": 0.5395, + "step": 249 + }, + { + "epoch": 0.5117707267144319, + "grad_norm": 0.5322293661429813, + "learning_rate": 3.9999263224954204e-05, + "loss": 0.5156, + "step": 250 + }, + { + "epoch": 0.5138178096212896, + "grad_norm": 0.48934414004740173, + "learning_rate": 3.999899716952221e-05, + "loss": 0.505, + "step": 251 + }, + { + "epoch": 0.5158648925281474, + "grad_norm": 0.6841239054987143, + "learning_rate": 3.9998690183951304e-05, + "loss": 0.517, + "step": 252 + }, + { + "epoch": 0.5179119754350051, + "grad_norm": 0.6081072200654224, + "learning_rate": 3.999834226886976e-05, + "loss": 0.5209, + "step": 253 + }, + { + "epoch": 0.5199590583418628, + "grad_norm": 0.591682811543655, + "learning_rate": 3.999795342498961e-05, + "loss": 0.5144, + "step": 254 + }, + { + "epoch": 0.5220061412487206, + "grad_norm": 0.644975243350573, + "learning_rate": 3.999752365310668e-05, + "loss": 0.5285, + "step": 255 + }, + { + "epoch": 0.5240532241555783, + "grad_norm": 0.5648625378625047, + "learning_rate": 3.999705295410054e-05, + "loss": 0.493, + "step": 256 + }, + { + "epoch": 0.526100307062436, + "grad_norm": 0.6130167811037579, + "learning_rate": 3.999654132893453e-05, + "loss": 0.5257, + "step": 257 + }, + { + "epoch": 0.5281473899692938, + "grad_norm": 0.5037937329537826, + "learning_rate": 3.999598877865575e-05, + "loss": 0.4947, + "step": 258 + }, + { + "epoch": 0.5301944728761515, + "grad_norm": 0.6388452684007601, + "learning_rate": 3.999539530439504e-05, + "loss": 0.5319, + "step": 259 + }, + { + "epoch": 0.5322415557830092, + "grad_norm": 0.5304888345319132, + "learning_rate": 3.9994760907367025e-05, + "loss": 0.5239, + "step": 260 + }, + { + "epoch": 0.534288638689867, + "grad_norm": 0.548729152916564, + "learning_rate": 3.999408558887006e-05, + "loss": 0.5182, + "step": 261 + }, + { + "epoch": 0.5363357215967247, + "grad_norm": 0.4946265440615839, + "learning_rate": 3.9993369350286265e-05, + "loss": 0.5211, + "step": 262 + }, + { + "epoch": 0.5383828045035824, + "grad_norm": 0.5513131572470374, + "learning_rate": 3.999261219308149e-05, + "loss": 0.4922, + "step": 263 + }, + { + "epoch": 0.5404298874104401, + "grad_norm": 0.5628821503706624, + "learning_rate": 3.999181411880536e-05, + "loss": 0.4833, + "step": 264 + }, + { + "epoch": 0.5424769703172978, + "grad_norm": 0.6321595406631201, + "learning_rate": 3.99909751290912e-05, + "loss": 0.5156, + "step": 265 + }, + { + "epoch": 0.5445240532241555, + "grad_norm": 0.49452183826279106, + "learning_rate": 3.9990095225656104e-05, + "loss": 0.4918, + "step": 266 + }, + { + "epoch": 0.5465711361310133, + "grad_norm": 0.5983835992691791, + "learning_rate": 3.998917441030089e-05, + "loss": 0.534, + "step": 267 + }, + { + "epoch": 0.548618219037871, + "grad_norm": 0.5392668568126767, + "learning_rate": 3.9988212684910107e-05, + "loss": 0.4919, + "step": 268 + }, + { + "epoch": 0.5506653019447287, + "grad_norm": 0.5271122702751097, + "learning_rate": 3.998721005145204e-05, + "loss": 0.5023, + "step": 269 + }, + { + "epoch": 0.5527123848515865, + "grad_norm": 0.613359179011921, + "learning_rate": 3.998616651197867e-05, + "loss": 0.5123, + "step": 270 + }, + { + "epoch": 0.5547594677584442, + "grad_norm": 0.5885866628158287, + "learning_rate": 3.9985082068625724e-05, + "loss": 0.5192, + "step": 271 + }, + { + "epoch": 0.5568065506653019, + "grad_norm": 0.5458709549332362, + "learning_rate": 3.998395672361264e-05, + "loss": 0.5159, + "step": 272 + }, + { + "epoch": 0.5588536335721597, + "grad_norm": 0.47718089897345783, + "learning_rate": 3.998279047924255e-05, + "loss": 0.4571, + "step": 273 + }, + { + "epoch": 0.5609007164790174, + "grad_norm": 0.5918739989559765, + "learning_rate": 3.998158333790231e-05, + "loss": 0.5093, + "step": 274 + }, + { + "epoch": 0.5629477993858751, + "grad_norm": 0.6171028421078789, + "learning_rate": 3.998033530206246e-05, + "loss": 0.5174, + "step": 275 + }, + { + "epoch": 0.5649948822927329, + "grad_norm": 0.6049929559700232, + "learning_rate": 3.9979046374277246e-05, + "loss": 0.5292, + "step": 276 + }, + { + "epoch": 0.5670419651995906, + "grad_norm": 0.6353516580703451, + "learning_rate": 3.99777165571846e-05, + "loss": 0.5202, + "step": 277 + }, + { + "epoch": 0.5690890481064483, + "grad_norm": 0.542126012081921, + "learning_rate": 3.997634585350614e-05, + "loss": 0.515, + "step": 278 + }, + { + "epoch": 0.5711361310133061, + "grad_norm": 0.6577209989909311, + "learning_rate": 3.997493426604715e-05, + "loss": 0.4827, + "step": 279 + }, + { + "epoch": 0.5731832139201638, + "grad_norm": 0.5786055028419322, + "learning_rate": 3.997348179769661e-05, + "loss": 0.4984, + "step": 280 + }, + { + "epoch": 0.5752302968270215, + "grad_norm": 0.5693454757360409, + "learning_rate": 3.9971988451427155e-05, + "loss": 0.4795, + "step": 281 + }, + { + "epoch": 0.5772773797338793, + "grad_norm": 0.523219696074873, + "learning_rate": 3.997045423029508e-05, + "loss": 0.5288, + "step": 282 + }, + { + "epoch": 0.579324462640737, + "grad_norm": 0.5914665747076296, + "learning_rate": 3.996887913744033e-05, + "loss": 0.5412, + "step": 283 + }, + { + "epoch": 0.5813715455475946, + "grad_norm": 0.504747034534071, + "learning_rate": 3.996726317608652e-05, + "loss": 0.5119, + "step": 284 + }, + { + "epoch": 0.5834186284544524, + "grad_norm": 0.6330512274369553, + "learning_rate": 3.996560634954088e-05, + "loss": 0.5504, + "step": 285 + }, + { + "epoch": 0.5854657113613101, + "grad_norm": 0.5570244494354821, + "learning_rate": 3.9963908661194285e-05, + "loss": 0.5323, + "step": 286 + }, + { + "epoch": 0.5875127942681678, + "grad_norm": 0.7110270240629256, + "learning_rate": 3.9962170114521246e-05, + "loss": 0.5086, + "step": 287 + }, + { + "epoch": 0.5895598771750256, + "grad_norm": 0.5503143307537267, + "learning_rate": 3.996039071307989e-05, + "loss": 0.5451, + "step": 288 + }, + { + "epoch": 0.5916069600818833, + "grad_norm": 0.6551750805922822, + "learning_rate": 3.995857046051196e-05, + "loss": 0.5375, + "step": 289 + }, + { + "epoch": 0.593654042988741, + "grad_norm": 0.625258001738531, + "learning_rate": 3.995670936054279e-05, + "loss": 0.5241, + "step": 290 + }, + { + "epoch": 0.5957011258955988, + "grad_norm": 0.6762376840884802, + "learning_rate": 3.9954807416981335e-05, + "loss": 0.5049, + "step": 291 + }, + { + "epoch": 0.5977482088024565, + "grad_norm": 0.6711303804476403, + "learning_rate": 3.995286463372013e-05, + "loss": 0.5117, + "step": 292 + }, + { + "epoch": 0.5997952917093142, + "grad_norm": 0.5151264552712931, + "learning_rate": 3.9950881014735295e-05, + "loss": 0.5053, + "step": 293 + }, + { + "epoch": 0.601842374616172, + "grad_norm": 0.663266741433607, + "learning_rate": 3.994885656408651e-05, + "loss": 0.4948, + "step": 294 + }, + { + "epoch": 0.6038894575230297, + "grad_norm": 0.567833717196775, + "learning_rate": 3.994679128591706e-05, + "loss": 0.5566, + "step": 295 + }, + { + "epoch": 0.6059365404298874, + "grad_norm": 0.5899154422863364, + "learning_rate": 3.9944685184453746e-05, + "loss": 0.53, + "step": 296 + }, + { + "epoch": 0.6079836233367452, + "grad_norm": 0.5758493539965378, + "learning_rate": 3.994253826400693e-05, + "loss": 0.5314, + "step": 297 + }, + { + "epoch": 0.6100307062436029, + "grad_norm": 0.5296826695926912, + "learning_rate": 3.9940350528970535e-05, + "loss": 0.5116, + "step": 298 + }, + { + "epoch": 0.6120777891504606, + "grad_norm": 0.5290847709355387, + "learning_rate": 3.993812198382199e-05, + "loss": 0.5028, + "step": 299 + }, + { + "epoch": 0.6141248720573184, + "grad_norm": 0.552336105606733, + "learning_rate": 3.993585263312227e-05, + "loss": 0.5202, + "step": 300 + }, + { + "epoch": 0.6161719549641761, + "grad_norm": 0.5293716014410816, + "learning_rate": 3.993354248151583e-05, + "loss": 0.4912, + "step": 301 + }, + { + "epoch": 0.6182190378710338, + "grad_norm": 0.6609476283232325, + "learning_rate": 3.993119153373067e-05, + "loss": 0.5438, + "step": 302 + }, + { + "epoch": 0.6202661207778914, + "grad_norm": 0.5469203904739622, + "learning_rate": 3.992879979457824e-05, + "loss": 0.5123, + "step": 303 + }, + { + "epoch": 0.6223132036847492, + "grad_norm": 0.6036405520109454, + "learning_rate": 3.9926367268953514e-05, + "loss": 0.5047, + "step": 304 + }, + { + "epoch": 0.6243602865916069, + "grad_norm": 0.5523585337922061, + "learning_rate": 3.9923893961834914e-05, + "loss": 0.5126, + "step": 305 + }, + { + "epoch": 0.6264073694984647, + "grad_norm": 0.5060340859748851, + "learning_rate": 3.992137987828434e-05, + "loss": 0.512, + "step": 306 + }, + { + "epoch": 0.6284544524053224, + "grad_norm": 0.4931821446334137, + "learning_rate": 3.991882502344712e-05, + "loss": 0.5086, + "step": 307 + }, + { + "epoch": 0.6305015353121801, + "grad_norm": 0.49318765086607474, + "learning_rate": 3.991622940255208e-05, + "loss": 0.4928, + "step": 308 + }, + { + "epoch": 0.6325486182190379, + "grad_norm": 0.47444503281787836, + "learning_rate": 3.991359302091141e-05, + "loss": 0.4823, + "step": 309 + }, + { + "epoch": 0.6345957011258956, + "grad_norm": 0.5837217244432208, + "learning_rate": 3.991091588392077e-05, + "loss": 0.553, + "step": 310 + }, + { + "epoch": 0.6366427840327533, + "grad_norm": 0.5943267226346076, + "learning_rate": 3.99081979970592e-05, + "loss": 0.5191, + "step": 311 + }, + { + "epoch": 0.638689866939611, + "grad_norm": 0.4692619037725303, + "learning_rate": 3.9905439365889176e-05, + "loss": 0.4833, + "step": 312 + }, + { + "epoch": 0.6407369498464688, + "grad_norm": 0.5017562232149083, + "learning_rate": 3.990263999605652e-05, + "loss": 0.4932, + "step": 313 + }, + { + "epoch": 0.6427840327533265, + "grad_norm": 0.4800449124898955, + "learning_rate": 3.989979989329046e-05, + "loss": 0.5475, + "step": 314 + }, + { + "epoch": 0.6448311156601843, + "grad_norm": 0.7077261651500286, + "learning_rate": 3.9896919063403567e-05, + "loss": 0.5656, + "step": 315 + }, + { + "epoch": 0.646878198567042, + "grad_norm": 0.4705479904386771, + "learning_rate": 3.989399751229179e-05, + "loss": 0.4812, + "step": 316 + }, + { + "epoch": 0.6489252814738997, + "grad_norm": 0.6188237485091818, + "learning_rate": 3.989103524593439e-05, + "loss": 0.5448, + "step": 317 + }, + { + "epoch": 0.6509723643807575, + "grad_norm": 0.5875009833381106, + "learning_rate": 3.9888032270393966e-05, + "loss": 0.5391, + "step": 318 + }, + { + "epoch": 0.6530194472876152, + "grad_norm": 0.6367560756626307, + "learning_rate": 3.988498859181645e-05, + "loss": 0.5857, + "step": 319 + }, + { + "epoch": 0.6550665301944729, + "grad_norm": 0.485432763118475, + "learning_rate": 3.988190421643105e-05, + "loss": 0.4775, + "step": 320 + }, + { + "epoch": 0.6571136131013307, + "grad_norm": 0.6730462808178248, + "learning_rate": 3.9878779150550306e-05, + "loss": 0.4953, + "step": 321 + }, + { + "epoch": 0.6591606960081884, + "grad_norm": 0.6111893546543505, + "learning_rate": 3.9875613400569975e-05, + "loss": 0.4593, + "step": 322 + }, + { + "epoch": 0.661207778915046, + "grad_norm": 0.7191482927871957, + "learning_rate": 3.987240697296912e-05, + "loss": 0.4943, + "step": 323 + }, + { + "epoch": 0.6632548618219037, + "grad_norm": 0.6827606638555512, + "learning_rate": 3.986915987431006e-05, + "loss": 0.5312, + "step": 324 + }, + { + "epoch": 0.6653019447287615, + "grad_norm": 0.6385480452392281, + "learning_rate": 3.986587211123833e-05, + "loss": 0.5066, + "step": 325 + }, + { + "epoch": 0.6673490276356192, + "grad_norm": 0.5586591391727767, + "learning_rate": 3.986254369048268e-05, + "loss": 0.519, + "step": 326 + }, + { + "epoch": 0.6693961105424769, + "grad_norm": 0.7119031753948682, + "learning_rate": 3.985917461885512e-05, + "loss": 0.526, + "step": 327 + }, + { + "epoch": 0.6714431934493347, + "grad_norm": 0.403690119122138, + "learning_rate": 3.98557649032508e-05, + "loss": 0.4941, + "step": 328 + }, + { + "epoch": 0.6734902763561924, + "grad_norm": 0.711228959963807, + "learning_rate": 3.985231455064809e-05, + "loss": 0.5161, + "step": 329 + }, + { + "epoch": 0.6755373592630501, + "grad_norm": 0.6152056684018806, + "learning_rate": 3.9848823568108515e-05, + "loss": 0.5252, + "step": 330 + }, + { + "epoch": 0.6775844421699079, + "grad_norm": 0.6811387482564264, + "learning_rate": 3.984529196277674e-05, + "loss": 0.5153, + "step": 331 + }, + { + "epoch": 0.6796315250767656, + "grad_norm": 0.715472760713951, + "learning_rate": 3.9841719741880583e-05, + "loss": 0.5136, + "step": 332 + }, + { + "epoch": 0.6816786079836233, + "grad_norm": 0.6117081033404244, + "learning_rate": 3.9838106912731e-05, + "loss": 0.5056, + "step": 333 + }, + { + "epoch": 0.6837256908904811, + "grad_norm": 0.5640133468287892, + "learning_rate": 3.983445348272203e-05, + "loss": 0.5022, + "step": 334 + }, + { + "epoch": 0.6857727737973388, + "grad_norm": 0.5708874168642846, + "learning_rate": 3.983075945933083e-05, + "loss": 0.5063, + "step": 335 + }, + { + "epoch": 0.6878198567041965, + "grad_norm": 0.5178591256000558, + "learning_rate": 3.9827024850117606e-05, + "loss": 0.5067, + "step": 336 + }, + { + "epoch": 0.6898669396110543, + "grad_norm": 0.5425146505052523, + "learning_rate": 3.982324966272566e-05, + "loss": 0.5112, + "step": 337 + }, + { + "epoch": 0.691914022517912, + "grad_norm": 0.4545857800127313, + "learning_rate": 3.9819433904881324e-05, + "loss": 0.4987, + "step": 338 + }, + { + "epoch": 0.6939611054247697, + "grad_norm": 0.5538231712121938, + "learning_rate": 3.981557758439396e-05, + "loss": 0.5174, + "step": 339 + }, + { + "epoch": 0.6960081883316275, + "grad_norm": 0.516910525115318, + "learning_rate": 3.981168070915594e-05, + "loss": 0.5205, + "step": 340 + }, + { + "epoch": 0.6980552712384852, + "grad_norm": 0.5341261157883666, + "learning_rate": 3.980774328714267e-05, + "loss": 0.5406, + "step": 341 + }, + { + "epoch": 0.7001023541453428, + "grad_norm": 0.5219326420859508, + "learning_rate": 3.9803765326412506e-05, + "loss": 0.5491, + "step": 342 + }, + { + "epoch": 0.7021494370522006, + "grad_norm": 0.5126045854291318, + "learning_rate": 3.979974683510677e-05, + "loss": 0.5507, + "step": 343 + }, + { + "epoch": 0.7041965199590583, + "grad_norm": 0.5442853681992302, + "learning_rate": 3.9795687821449754e-05, + "loss": 0.5136, + "step": 344 + }, + { + "epoch": 0.706243602865916, + "grad_norm": 0.4580135182917517, + "learning_rate": 3.9791588293748676e-05, + "loss": 0.4908, + "step": 345 + }, + { + "epoch": 0.7082906857727738, + "grad_norm": 0.5632074896697251, + "learning_rate": 3.978744826039366e-05, + "loss": 0.5046, + "step": 346 + }, + { + "epoch": 0.7103377686796315, + "grad_norm": 0.4381200556890962, + "learning_rate": 3.9783267729857756e-05, + "loss": 0.4994, + "step": 347 + }, + { + "epoch": 0.7123848515864892, + "grad_norm": 0.5608491796041225, + "learning_rate": 3.9779046710696854e-05, + "loss": 0.4813, + "step": 348 + }, + { + "epoch": 0.714431934493347, + "grad_norm": 0.4726444333140026, + "learning_rate": 3.977478521154974e-05, + "loss": 0.487, + "step": 349 + }, + { + "epoch": 0.7164790174002047, + "grad_norm": 0.5138973186415984, + "learning_rate": 3.977048324113805e-05, + "loss": 0.5418, + "step": 350 + }, + { + "epoch": 0.7185261003070624, + "grad_norm": 0.48636953697879376, + "learning_rate": 3.976614080826623e-05, + "loss": 0.5313, + "step": 351 + }, + { + "epoch": 0.7205731832139202, + "grad_norm": 0.5627556793214199, + "learning_rate": 3.9761757921821544e-05, + "loss": 0.5136, + "step": 352 + }, + { + "epoch": 0.7226202661207779, + "grad_norm": 0.42092677165552256, + "learning_rate": 3.975733459077405e-05, + "loss": 0.5396, + "step": 353 + }, + { + "epoch": 0.7246673490276356, + "grad_norm": 0.48567860686645814, + "learning_rate": 3.9752870824176585e-05, + "loss": 0.4912, + "step": 354 + }, + { + "epoch": 0.7267144319344934, + "grad_norm": 0.48215074888552417, + "learning_rate": 3.974836663116472e-05, + "loss": 0.5246, + "step": 355 + }, + { + "epoch": 0.7287615148413511, + "grad_norm": 0.4041796943755226, + "learning_rate": 3.97438220209568e-05, + "loss": 0.4697, + "step": 356 + }, + { + "epoch": 0.7308085977482088, + "grad_norm": 0.5033023757557246, + "learning_rate": 3.973923700285386e-05, + "loss": 0.5033, + "step": 357 + }, + { + "epoch": 0.7328556806550666, + "grad_norm": 0.47849562907550447, + "learning_rate": 3.973461158623963e-05, + "loss": 0.51, + "step": 358 + }, + { + "epoch": 0.7349027635619243, + "grad_norm": 0.4728712624318383, + "learning_rate": 3.972994578058055e-05, + "loss": 0.5183, + "step": 359 + }, + { + "epoch": 0.736949846468782, + "grad_norm": 0.46994907616226134, + "learning_rate": 3.972523959542569e-05, + "loss": 0.4791, + "step": 360 + }, + { + "epoch": 0.7389969293756398, + "grad_norm": 0.4716759611852859, + "learning_rate": 3.9720493040406786e-05, + "loss": 0.5053, + "step": 361 + }, + { + "epoch": 0.7410440122824974, + "grad_norm": 0.4739511859859272, + "learning_rate": 3.9715706125238164e-05, + "loss": 0.4902, + "step": 362 + }, + { + "epoch": 0.7430910951893551, + "grad_norm": 0.5524351394525074, + "learning_rate": 3.971087885971679e-05, + "loss": 0.5002, + "step": 363 + }, + { + "epoch": 0.7451381780962129, + "grad_norm": 0.5160577267882994, + "learning_rate": 3.970601125372218e-05, + "loss": 0.5077, + "step": 364 + }, + { + "epoch": 0.7471852610030706, + "grad_norm": 0.6036728391773393, + "learning_rate": 3.970110331721643e-05, + "loss": 0.5152, + "step": 365 + }, + { + "epoch": 0.7492323439099283, + "grad_norm": 0.44920724019503216, + "learning_rate": 3.9696155060244166e-05, + "loss": 0.5016, + "step": 366 + }, + { + "epoch": 0.7512794268167861, + "grad_norm": 0.5748889938625247, + "learning_rate": 3.9691166492932535e-05, + "loss": 0.5484, + "step": 367 + }, + { + "epoch": 0.7533265097236438, + "grad_norm": 0.456775282424986, + "learning_rate": 3.968613762549119e-05, + "loss": 0.4839, + "step": 368 + }, + { + "epoch": 0.7553735926305015, + "grad_norm": 0.5722901317947914, + "learning_rate": 3.968106846821226e-05, + "loss": 0.4961, + "step": 369 + }, + { + "epoch": 0.7574206755373593, + "grad_norm": 0.513431808850321, + "learning_rate": 3.9675959031470336e-05, + "loss": 0.5347, + "step": 370 + }, + { + "epoch": 0.759467758444217, + "grad_norm": 0.4971842267477506, + "learning_rate": 3.9670809325722425e-05, + "loss": 0.5025, + "step": 371 + }, + { + "epoch": 0.7615148413510747, + "grad_norm": 0.5489460962549015, + "learning_rate": 3.966561936150797e-05, + "loss": 0.527, + "step": 372 + }, + { + "epoch": 0.7635619242579325, + "grad_norm": 0.46580059626935816, + "learning_rate": 3.966038914944881e-05, + "loss": 0.5024, + "step": 373 + }, + { + "epoch": 0.7656090071647902, + "grad_norm": 0.48329497303274316, + "learning_rate": 3.9655118700249146e-05, + "loss": 0.4543, + "step": 374 + }, + { + "epoch": 0.7676560900716479, + "grad_norm": 0.4294347901743354, + "learning_rate": 3.964980802469552e-05, + "loss": 0.4918, + "step": 375 + }, + { + "epoch": 0.7697031729785057, + "grad_norm": 0.46898699035993047, + "learning_rate": 3.964445713365682e-05, + "loss": 0.5219, + "step": 376 + }, + { + "epoch": 0.7717502558853634, + "grad_norm": 0.5368649938570568, + "learning_rate": 3.963906603808422e-05, + "loss": 0.5491, + "step": 377 + }, + { + "epoch": 0.7737973387922211, + "grad_norm": 0.4589070525461554, + "learning_rate": 3.96336347490112e-05, + "loss": 0.5193, + "step": 378 + }, + { + "epoch": 0.7758444216990789, + "grad_norm": 0.5198618447541961, + "learning_rate": 3.9628163277553486e-05, + "loss": 0.5059, + "step": 379 + }, + { + "epoch": 0.7778915046059366, + "grad_norm": 0.5201151329391878, + "learning_rate": 3.962265163490903e-05, + "loss": 0.5242, + "step": 380 + }, + { + "epoch": 0.7799385875127943, + "grad_norm": 0.5286097719017859, + "learning_rate": 3.9617099832358035e-05, + "loss": 0.5131, + "step": 381 + }, + { + "epoch": 0.781985670419652, + "grad_norm": 0.4908828849732809, + "learning_rate": 3.961150788126286e-05, + "loss": 0.5247, + "step": 382 + }, + { + "epoch": 0.7840327533265097, + "grad_norm": 0.5105242176184168, + "learning_rate": 3.960587579306805e-05, + "loss": 0.5082, + "step": 383 + }, + { + "epoch": 0.7860798362333674, + "grad_norm": 0.4827529985137035, + "learning_rate": 3.960020357930028e-05, + "loss": 0.4886, + "step": 384 + }, + { + "epoch": 0.7881269191402251, + "grad_norm": 0.5255080063950176, + "learning_rate": 3.9594491251568376e-05, + "loss": 0.5323, + "step": 385 + }, + { + "epoch": 0.7901740020470829, + "grad_norm": 0.42324786024329186, + "learning_rate": 3.958873882156322e-05, + "loss": 0.4913, + "step": 386 + }, + { + "epoch": 0.7922210849539406, + "grad_norm": 0.4856526176027495, + "learning_rate": 3.9582946301057806e-05, + "loss": 0.5083, + "step": 387 + }, + { + "epoch": 0.7942681678607983, + "grad_norm": 0.4869309440084628, + "learning_rate": 3.957711370190716e-05, + "loss": 0.503, + "step": 388 + }, + { + "epoch": 0.7963152507676561, + "grad_norm": 0.5268029487052006, + "learning_rate": 3.957124103604833e-05, + "loss": 0.5082, + "step": 389 + }, + { + "epoch": 0.7983623336745138, + "grad_norm": 0.507634587065165, + "learning_rate": 3.9565328315500375e-05, + "loss": 0.5166, + "step": 390 + }, + { + "epoch": 0.8004094165813715, + "grad_norm": 0.5687757643916819, + "learning_rate": 3.9559375552364325e-05, + "loss": 0.5143, + "step": 391 + }, + { + "epoch": 0.8024564994882293, + "grad_norm": 0.4695087735006217, + "learning_rate": 3.955338275882316e-05, + "loss": 0.4713, + "step": 392 + }, + { + "epoch": 0.804503582395087, + "grad_norm": 0.46001610899852224, + "learning_rate": 3.9547349947141787e-05, + "loss": 0.4701, + "step": 393 + }, + { + "epoch": 0.8065506653019447, + "grad_norm": 0.4942091923144858, + "learning_rate": 3.954127712966702e-05, + "loss": 0.4916, + "step": 394 + }, + { + "epoch": 0.8085977482088025, + "grad_norm": 0.4750963479134321, + "learning_rate": 3.953516431882754e-05, + "loss": 0.535, + "step": 395 + }, + { + "epoch": 0.8106448311156602, + "grad_norm": 0.45303577566833647, + "learning_rate": 3.952901152713389e-05, + "loss": 0.4756, + "step": 396 + }, + { + "epoch": 0.812691914022518, + "grad_norm": 0.45075457713091993, + "learning_rate": 3.952281876717843e-05, + "loss": 0.5013, + "step": 397 + }, + { + "epoch": 0.8147389969293757, + "grad_norm": 0.4866908528841161, + "learning_rate": 3.951658605163533e-05, + "loss": 0.5159, + "step": 398 + }, + { + "epoch": 0.8167860798362334, + "grad_norm": 0.47273974659050305, + "learning_rate": 3.9510313393260507e-05, + "loss": 0.4876, + "step": 399 + }, + { + "epoch": 0.8188331627430911, + "grad_norm": 0.4599999323212464, + "learning_rate": 3.950400080489165e-05, + "loss": 0.486, + "step": 400 + }, + { + "epoch": 0.8208802456499488, + "grad_norm": 0.41263568282343643, + "learning_rate": 3.9497648299448174e-05, + "loss": 0.6514, + "step": 401 + }, + { + "epoch": 0.8229273285568065, + "grad_norm": 0.49637310910399085, + "learning_rate": 3.949125588993117e-05, + "loss": 0.5247, + "step": 402 + }, + { + "epoch": 0.8249744114636642, + "grad_norm": 0.407642872768924, + "learning_rate": 3.94848235894234e-05, + "loss": 0.5023, + "step": 403 + }, + { + "epoch": 0.827021494370522, + "grad_norm": 0.5125499802657086, + "learning_rate": 3.947835141108928e-05, + "loss": 0.5187, + "step": 404 + }, + { + "epoch": 0.8290685772773797, + "grad_norm": 0.46889549178993806, + "learning_rate": 3.947183936817483e-05, + "loss": 0.5089, + "step": 405 + }, + { + "epoch": 0.8311156601842374, + "grad_norm": 0.4793952039316711, + "learning_rate": 3.9465287474007654e-05, + "loss": 0.4946, + "step": 406 + }, + { + "epoch": 0.8331627430910952, + "grad_norm": 0.4616324812446371, + "learning_rate": 3.945869574199693e-05, + "loss": 0.4905, + "step": 407 + }, + { + "epoch": 0.8352098259979529, + "grad_norm": 0.522723518377932, + "learning_rate": 3.9452064185633345e-05, + "loss": 0.4873, + "step": 408 + }, + { + "epoch": 0.8372569089048106, + "grad_norm": 0.4444650907824343, + "learning_rate": 3.944539281848912e-05, + "loss": 0.4995, + "step": 409 + }, + { + "epoch": 0.8393039918116684, + "grad_norm": 0.5394910784111964, + "learning_rate": 3.943868165421793e-05, + "loss": 0.4597, + "step": 410 + }, + { + "epoch": 0.8413510747185261, + "grad_norm": 0.47150833451005714, + "learning_rate": 3.943193070655492e-05, + "loss": 0.4768, + "step": 411 + }, + { + "epoch": 0.8433981576253838, + "grad_norm": 0.5217385743079497, + "learning_rate": 3.942513998931663e-05, + "loss": 0.4936, + "step": 412 + }, + { + "epoch": 0.8454452405322416, + "grad_norm": 0.47838287732742774, + "learning_rate": 3.9418309516401015e-05, + "loss": 0.4998, + "step": 413 + }, + { + "epoch": 0.8474923234390993, + "grad_norm": 0.5219985412359689, + "learning_rate": 3.9411439301787383e-05, + "loss": 0.4922, + "step": 414 + }, + { + "epoch": 0.849539406345957, + "grad_norm": 0.5360559638934609, + "learning_rate": 3.940452935953639e-05, + "loss": 0.4932, + "step": 415 + }, + { + "epoch": 0.8515864892528148, + "grad_norm": 0.45167095248901046, + "learning_rate": 3.939757970378997e-05, + "loss": 0.5325, + "step": 416 + }, + { + "epoch": 0.8536335721596725, + "grad_norm": 0.5392146495002762, + "learning_rate": 3.9390590348771374e-05, + "loss": 0.5161, + "step": 417 + }, + { + "epoch": 0.8556806550665302, + "grad_norm": 0.4694690399172554, + "learning_rate": 3.9383561308785075e-05, + "loss": 0.4872, + "step": 418 + }, + { + "epoch": 0.857727737973388, + "grad_norm": 0.5450902044992034, + "learning_rate": 3.937649259821677e-05, + "loss": 0.5033, + "step": 419 + }, + { + "epoch": 0.8597748208802457, + "grad_norm": 0.4513747246745581, + "learning_rate": 3.9369384231533365e-05, + "loss": 0.5022, + "step": 420 + }, + { + "epoch": 0.8618219037871033, + "grad_norm": 0.47998170745321456, + "learning_rate": 3.9362236223282885e-05, + "loss": 0.488, + "step": 421 + }, + { + "epoch": 0.8638689866939611, + "grad_norm": 0.5703617109539353, + "learning_rate": 3.935504858809454e-05, + "loss": 0.5364, + "step": 422 + }, + { + "epoch": 0.8659160696008188, + "grad_norm": 0.46673348103204154, + "learning_rate": 3.9347821340678597e-05, + "loss": 0.519, + "step": 423 + }, + { + "epoch": 0.8679631525076765, + "grad_norm": 0.5424695733970012, + "learning_rate": 3.934055449582641e-05, + "loss": 0.5568, + "step": 424 + }, + { + "epoch": 0.8700102354145343, + "grad_norm": 0.4765444623240975, + "learning_rate": 3.9333248068410375e-05, + "loss": 0.4743, + "step": 425 + }, + { + "epoch": 0.872057318321392, + "grad_norm": 0.49864798719594966, + "learning_rate": 3.932590207338391e-05, + "loss": 0.4982, + "step": 426 + }, + { + "epoch": 0.8741044012282497, + "grad_norm": 0.4988338996845499, + "learning_rate": 3.931851652578137e-05, + "loss": 0.4963, + "step": 427 + }, + { + "epoch": 0.8761514841351075, + "grad_norm": 0.4432676805703767, + "learning_rate": 3.931109144071811e-05, + "loss": 0.4719, + "step": 428 + }, + { + "epoch": 0.8781985670419652, + "grad_norm": 0.48146006351463866, + "learning_rate": 3.930362683339037e-05, + "loss": 0.4863, + "step": 429 + }, + { + "epoch": 0.8802456499488229, + "grad_norm": 0.44888291145823134, + "learning_rate": 3.92961227190753e-05, + "loss": 0.5167, + "step": 430 + }, + { + "epoch": 0.8822927328556807, + "grad_norm": 0.4602238267927309, + "learning_rate": 3.928857911313088e-05, + "loss": 0.5031, + "step": 431 + }, + { + "epoch": 0.8843398157625384, + "grad_norm": 0.49233832361099084, + "learning_rate": 3.928099603099591e-05, + "loss": 0.5013, + "step": 432 + }, + { + "epoch": 0.8863868986693961, + "grad_norm": 0.4576541292218355, + "learning_rate": 3.9273373488190036e-05, + "loss": 0.5574, + "step": 433 + }, + { + "epoch": 0.8884339815762539, + "grad_norm": 0.4638871631773895, + "learning_rate": 3.92657115003136e-05, + "loss": 0.5253, + "step": 434 + }, + { + "epoch": 0.8904810644831116, + "grad_norm": 0.5178780585024065, + "learning_rate": 3.9258010083047715e-05, + "loss": 0.5485, + "step": 435 + }, + { + "epoch": 0.8925281473899693, + "grad_norm": 0.4598091842235503, + "learning_rate": 3.925026925215417e-05, + "loss": 0.4636, + "step": 436 + }, + { + "epoch": 0.8945752302968271, + "grad_norm": 0.5965207792237605, + "learning_rate": 3.924248902347541e-05, + "loss": 0.5464, + "step": 437 + }, + { + "epoch": 0.8966223132036848, + "grad_norm": 0.49770158650370516, + "learning_rate": 3.9234669412934546e-05, + "loss": 0.5461, + "step": 438 + }, + { + "epoch": 0.8986693961105425, + "grad_norm": 0.5031859956447141, + "learning_rate": 3.922681043653526e-05, + "loss": 0.5146, + "step": 439 + }, + { + "epoch": 0.9007164790174002, + "grad_norm": 0.4561558037016599, + "learning_rate": 3.92189121103618e-05, + "loss": 0.4898, + "step": 440 + }, + { + "epoch": 0.9027635619242579, + "grad_norm": 0.4247526477385047, + "learning_rate": 3.921097445057896e-05, + "loss": 0.4976, + "step": 441 + }, + { + "epoch": 0.9048106448311156, + "grad_norm": 0.523620896917421, + "learning_rate": 3.920299747343204e-05, + "loss": 0.4894, + "step": 442 + }, + { + "epoch": 0.9068577277379734, + "grad_norm": 0.4267971609347695, + "learning_rate": 3.919498119524679e-05, + "loss": 0.5029, + "step": 443 + }, + { + "epoch": 0.9089048106448311, + "grad_norm": 0.5817422656960313, + "learning_rate": 3.9186925632429396e-05, + "loss": 0.5477, + "step": 444 + }, + { + "epoch": 0.9109518935516888, + "grad_norm": 0.3812610872657855, + "learning_rate": 3.9178830801466465e-05, + "loss": 0.5147, + "step": 445 + }, + { + "epoch": 0.9129989764585466, + "grad_norm": 0.5210417707070376, + "learning_rate": 3.917069671892494e-05, + "loss": 0.5052, + "step": 446 + }, + { + "epoch": 0.9150460593654043, + "grad_norm": 0.4195818289044568, + "learning_rate": 3.9162523401452125e-05, + "loss": 0.476, + "step": 447 + }, + { + "epoch": 0.917093142272262, + "grad_norm": 0.4640561681219655, + "learning_rate": 3.915431086577561e-05, + "loss": 0.4811, + "step": 448 + }, + { + "epoch": 0.9191402251791198, + "grad_norm": 0.46049336960827775, + "learning_rate": 3.914605912870324e-05, + "loss": 0.5082, + "step": 449 + }, + { + "epoch": 0.9211873080859775, + "grad_norm": 0.5244765933281279, + "learning_rate": 3.913776820712309e-05, + "loss": 0.4982, + "step": 450 + }, + { + "epoch": 0.9232343909928352, + "grad_norm": 0.4207300716892893, + "learning_rate": 3.912943811800347e-05, + "loss": 0.477, + "step": 451 + }, + { + "epoch": 0.925281473899693, + "grad_norm": 0.4532179957617135, + "learning_rate": 3.912106887839278e-05, + "loss": 0.4953, + "step": 452 + }, + { + "epoch": 0.9273285568065507, + "grad_norm": 0.42360404758505654, + "learning_rate": 3.9112660505419626e-05, + "loss": 0.5248, + "step": 453 + }, + { + "epoch": 0.9293756397134084, + "grad_norm": 0.4345837890396033, + "learning_rate": 3.910421301629264e-05, + "loss": 0.4887, + "step": 454 + }, + { + "epoch": 0.9314227226202662, + "grad_norm": 0.43091075968385034, + "learning_rate": 3.909572642830053e-05, + "loss": 0.4987, + "step": 455 + }, + { + "epoch": 0.9334698055271239, + "grad_norm": 0.4111596649311637, + "learning_rate": 3.9087200758812054e-05, + "loss": 0.5429, + "step": 456 + }, + { + "epoch": 0.9355168884339816, + "grad_norm": 0.4114767207964879, + "learning_rate": 3.9078636025275904e-05, + "loss": 0.4868, + "step": 457 + }, + { + "epoch": 0.9375639713408394, + "grad_norm": 0.4511575697019446, + "learning_rate": 3.907003224522075e-05, + "loss": 0.5151, + "step": 458 + }, + { + "epoch": 0.9396110542476971, + "grad_norm": 0.47837137672489904, + "learning_rate": 3.906138943625519e-05, + "loss": 0.4812, + "step": 459 + }, + { + "epoch": 0.9416581371545547, + "grad_norm": 0.394915101475589, + "learning_rate": 3.9052707616067654e-05, + "loss": 0.5405, + "step": 460 + }, + { + "epoch": 0.9437052200614124, + "grad_norm": 0.5432929178652014, + "learning_rate": 3.9043986802426453e-05, + "loss": 0.4816, + "step": 461 + }, + { + "epoch": 0.9457523029682702, + "grad_norm": 0.3617390720179231, + "learning_rate": 3.903522701317968e-05, + "loss": 0.4864, + "step": 462 + }, + { + "epoch": 0.9477993858751279, + "grad_norm": 0.5239560704020834, + "learning_rate": 3.9026428266255205e-05, + "loss": 0.4979, + "step": 463 + }, + { + "epoch": 0.9498464687819856, + "grad_norm": 0.39348645216036965, + "learning_rate": 3.901759057966064e-05, + "loss": 0.4741, + "step": 464 + }, + { + "epoch": 0.9518935516888434, + "grad_norm": 0.47748699997048394, + "learning_rate": 3.9008713971483257e-05, + "loss": 0.4729, + "step": 465 + }, + { + "epoch": 0.9539406345957011, + "grad_norm": 0.4339281681486082, + "learning_rate": 3.899979845989003e-05, + "loss": 0.5072, + "step": 466 + }, + { + "epoch": 0.9559877175025588, + "grad_norm": 0.4360289845901765, + "learning_rate": 3.899084406312751e-05, + "loss": 0.4844, + "step": 467 + }, + { + "epoch": 0.9580348004094166, + "grad_norm": 0.49803606731717964, + "learning_rate": 3.8981850799521856e-05, + "loss": 0.5021, + "step": 468 + }, + { + "epoch": 0.9600818833162743, + "grad_norm": 0.4633434375152389, + "learning_rate": 3.897281868747878e-05, + "loss": 0.5003, + "step": 469 + }, + { + "epoch": 0.962128966223132, + "grad_norm": 0.4887467757571554, + "learning_rate": 3.896374774548348e-05, + "loss": 0.5054, + "step": 470 + }, + { + "epoch": 0.9641760491299898, + "grad_norm": 0.44664870237749776, + "learning_rate": 3.895463799210063e-05, + "loss": 0.5186, + "step": 471 + }, + { + "epoch": 0.9662231320368475, + "grad_norm": 0.5203094189929391, + "learning_rate": 3.894548944597434e-05, + "loss": 0.5227, + "step": 472 + }, + { + "epoch": 0.9682702149437052, + "grad_norm": 0.37329019363463944, + "learning_rate": 3.8936302125828114e-05, + "loss": 0.4836, + "step": 473 + }, + { + "epoch": 0.970317297850563, + "grad_norm": 0.46252269585391337, + "learning_rate": 3.892707605046482e-05, + "loss": 0.465, + "step": 474 + }, + { + "epoch": 0.9723643807574207, + "grad_norm": 0.4466031989756924, + "learning_rate": 3.8917811238766606e-05, + "loss": 0.5103, + "step": 475 + }, + { + "epoch": 0.9744114636642784, + "grad_norm": 0.49858281748974287, + "learning_rate": 3.8908507709694945e-05, + "loss": 0.4857, + "step": 476 + }, + { + "epoch": 0.9764585465711362, + "grad_norm": 0.5999408245173112, + "learning_rate": 3.8899165482290524e-05, + "loss": 0.5135, + "step": 477 + }, + { + "epoch": 0.9785056294779939, + "grad_norm": 0.44390557227435046, + "learning_rate": 3.888978457567323e-05, + "loss": 0.4826, + "step": 478 + }, + { + "epoch": 0.9805527123848515, + "grad_norm": 0.6043545995456223, + "learning_rate": 3.888036500904212e-05, + "loss": 0.5154, + "step": 479 + }, + { + "epoch": 0.9825997952917093, + "grad_norm": 0.4818312706910111, + "learning_rate": 3.887090680167537e-05, + "loss": 0.5087, + "step": 480 + }, + { + "epoch": 0.984646878198567, + "grad_norm": 0.6429627680182052, + "learning_rate": 3.886140997293024e-05, + "loss": 0.5033, + "step": 481 + }, + { + "epoch": 0.9866939611054247, + "grad_norm": 0.46009297651512515, + "learning_rate": 3.8851874542243024e-05, + "loss": 0.4737, + "step": 482 + }, + { + "epoch": 0.9887410440122825, + "grad_norm": 0.5257256010671767, + "learning_rate": 3.8842300529129026e-05, + "loss": 0.4922, + "step": 483 + }, + { + "epoch": 0.9907881269191402, + "grad_norm": 0.4952147402880594, + "learning_rate": 3.883268795318252e-05, + "loss": 0.5093, + "step": 484 + }, + { + "epoch": 0.9928352098259979, + "grad_norm": 0.40718187065435846, + "learning_rate": 3.882303683407669e-05, + "loss": 0.4898, + "step": 485 + }, + { + "epoch": 0.9948822927328557, + "grad_norm": 0.6342262964756523, + "learning_rate": 3.8813347191563615e-05, + "loss": 0.4809, + "step": 486 + }, + { + "epoch": 0.9969293756397134, + "grad_norm": 0.36521858199635876, + "learning_rate": 3.88036190454742e-05, + "loss": 0.4792, + "step": 487 + }, + { + "epoch": 0.9989764585465711, + "grad_norm": 0.49465426774165494, + "learning_rate": 3.879385241571817e-05, + "loss": 0.4945, + "step": 488 + }, + { + "epoch": 1.0010235414534288, + "grad_norm": 0.6823091472624835, + "learning_rate": 3.8784047322284e-05, + "loss": 0.7219, + "step": 489 + }, + { + "epoch": 1.0030706243602865, + "grad_norm": 0.5576204983958845, + "learning_rate": 3.8774203785238886e-05, + "loss": 0.5356, + "step": 490 + }, + { + "epoch": 1.0051177072671442, + "grad_norm": 0.4049517959021661, + "learning_rate": 3.8764321824728715e-05, + "loss": 0.3433, + "step": 491 + }, + { + "epoch": 1.007164790174002, + "grad_norm": 0.6379294758690475, + "learning_rate": 3.875440146097798e-05, + "loss": 0.4658, + "step": 492 + }, + { + "epoch": 1.0092118730808597, + "grad_norm": 0.48729117155574453, + "learning_rate": 3.8744442714289816e-05, + "loss": 0.423, + "step": 493 + }, + { + "epoch": 1.0112589559877174, + "grad_norm": 0.6905667947564962, + "learning_rate": 3.873444560504588e-05, + "loss": 0.4378, + "step": 494 + }, + { + "epoch": 1.0133060388945752, + "grad_norm": 0.5821283432486261, + "learning_rate": 3.872441015370635e-05, + "loss": 0.4592, + "step": 495 + }, + { + "epoch": 1.015353121801433, + "grad_norm": 0.6506243911845556, + "learning_rate": 3.8714336380809874e-05, + "loss": 0.4401, + "step": 496 + }, + { + "epoch": 1.0174002047082906, + "grad_norm": 0.5026499066925773, + "learning_rate": 3.870422430697354e-05, + "loss": 0.4082, + "step": 497 + }, + { + "epoch": 1.0194472876151484, + "grad_norm": 0.7706353109115344, + "learning_rate": 3.869407395289281e-05, + "loss": 0.4851, + "step": 498 + }, + { + "epoch": 1.021494370522006, + "grad_norm": 0.40094293966181355, + "learning_rate": 3.86838853393415e-05, + "loss": 0.3689, + "step": 499 + }, + { + "epoch": 1.0235414534288638, + "grad_norm": 0.7044751532962212, + "learning_rate": 3.867365848717171e-05, + "loss": 0.4298, + "step": 500 + }, + { + "epoch": 1.0255885363357216, + "grad_norm": 0.5123771630826552, + "learning_rate": 3.866339341731384e-05, + "loss": 0.4313, + "step": 501 + }, + { + "epoch": 1.0276356192425793, + "grad_norm": 0.44854290574972205, + "learning_rate": 3.865309015077645e-05, + "loss": 0.3686, + "step": 502 + }, + { + "epoch": 1.029682702149437, + "grad_norm": 0.5101255948957101, + "learning_rate": 3.8642748708646324e-05, + "loss": 0.4471, + "step": 503 + }, + { + "epoch": 1.0317297850562948, + "grad_norm": 0.44907848191294825, + "learning_rate": 3.863236911208835e-05, + "loss": 0.342, + "step": 504 + }, + { + "epoch": 1.0337768679631525, + "grad_norm": 0.5031368605626068, + "learning_rate": 3.862195138234551e-05, + "loss": 0.4214, + "step": 505 + }, + { + "epoch": 1.0358239508700102, + "grad_norm": 0.5833077927402516, + "learning_rate": 3.8611495540738835e-05, + "loss": 0.4858, + "step": 506 + }, + { + "epoch": 1.037871033776868, + "grad_norm": 0.41256204125595797, + "learning_rate": 3.860100160866733e-05, + "loss": 0.4171, + "step": 507 + }, + { + "epoch": 1.0399181166837257, + "grad_norm": 0.5124022037131871, + "learning_rate": 3.859046960760801e-05, + "loss": 0.4517, + "step": 508 + }, + { + "epoch": 1.0419651995905834, + "grad_norm": 0.5225640900487728, + "learning_rate": 3.857989955911574e-05, + "loss": 0.4341, + "step": 509 + }, + { + "epoch": 1.0440122824974412, + "grad_norm": 0.38566215827923084, + "learning_rate": 3.85692914848233e-05, + "loss": 0.3913, + "step": 510 + }, + { + "epoch": 1.046059365404299, + "grad_norm": 0.4633891939003197, + "learning_rate": 3.855864540644126e-05, + "loss": 0.397, + "step": 511 + }, + { + "epoch": 1.0481064483111566, + "grad_norm": 0.47380208701961085, + "learning_rate": 3.8547961345758e-05, + "loss": 0.4476, + "step": 512 + }, + { + "epoch": 1.0501535312180144, + "grad_norm": 0.5577403954270109, + "learning_rate": 3.853723932463962e-05, + "loss": 0.4559, + "step": 513 + }, + { + "epoch": 1.052200614124872, + "grad_norm": 0.5358423915305728, + "learning_rate": 3.8526479365029906e-05, + "loss": 0.4322, + "step": 514 + }, + { + "epoch": 1.0542476970317298, + "grad_norm": 0.463025872995783, + "learning_rate": 3.8515681488950286e-05, + "loss": 0.4725, + "step": 515 + }, + { + "epoch": 1.0562947799385876, + "grad_norm": 0.4349348376778485, + "learning_rate": 3.850484571849982e-05, + "loss": 0.3743, + "step": 516 + }, + { + "epoch": 1.0583418628454453, + "grad_norm": 0.49093997411971185, + "learning_rate": 3.849397207585508e-05, + "loss": 0.4704, + "step": 517 + }, + { + "epoch": 1.060388945752303, + "grad_norm": 0.5752884046292331, + "learning_rate": 3.848306058327016e-05, + "loss": 0.4772, + "step": 518 + }, + { + "epoch": 1.0624360286591608, + "grad_norm": 0.36504458066426737, + "learning_rate": 3.847211126307666e-05, + "loss": 0.4277, + "step": 519 + }, + { + "epoch": 1.0644831115660185, + "grad_norm": 0.5134952699143726, + "learning_rate": 3.846112413768353e-05, + "loss": 0.4094, + "step": 520 + }, + { + "epoch": 1.0665301944728762, + "grad_norm": 0.42686550744637813, + "learning_rate": 3.845009922957713e-05, + "loss": 0.3999, + "step": 521 + }, + { + "epoch": 1.068577277379734, + "grad_norm": 0.45121329177961284, + "learning_rate": 3.843903656132116e-05, + "loss": 0.4782, + "step": 522 + }, + { + "epoch": 1.0706243602865917, + "grad_norm": 0.4799180332218181, + "learning_rate": 3.842793615555657e-05, + "loss": 0.4344, + "step": 523 + }, + { + "epoch": 1.0726714431934494, + "grad_norm": 0.4906577800070559, + "learning_rate": 3.8416798035001545e-05, + "loss": 0.3999, + "step": 524 + }, + { + "epoch": 1.0747185261003072, + "grad_norm": 0.4381457628189835, + "learning_rate": 3.8405622222451496e-05, + "loss": 0.4867, + "step": 525 + }, + { + "epoch": 1.076765609007165, + "grad_norm": 0.4802227337203006, + "learning_rate": 3.8394408740778934e-05, + "loss": 0.4067, + "step": 526 + }, + { + "epoch": 1.0788126919140226, + "grad_norm": 0.4490101903364367, + "learning_rate": 3.838315761293348e-05, + "loss": 0.4346, + "step": 527 + }, + { + "epoch": 1.0808597748208801, + "grad_norm": 0.4633513621013708, + "learning_rate": 3.8371868861941795e-05, + "loss": 0.4177, + "step": 528 + }, + { + "epoch": 1.0829068577277379, + "grad_norm": 0.38946131051973665, + "learning_rate": 3.836054251090755e-05, + "loss": 0.3554, + "step": 529 + }, + { + "epoch": 1.0849539406345956, + "grad_norm": 0.4671283243776551, + "learning_rate": 3.8349178583011356e-05, + "loss": 0.4613, + "step": 530 + }, + { + "epoch": 1.0870010235414533, + "grad_norm": 0.42447494229810095, + "learning_rate": 3.833777710151075e-05, + "loss": 0.4056, + "step": 531 + }, + { + "epoch": 1.089048106448311, + "grad_norm": 0.42416375790936595, + "learning_rate": 3.83263380897401e-05, + "loss": 0.3751, + "step": 532 + }, + { + "epoch": 1.0910951893551688, + "grad_norm": 0.4073852074265662, + "learning_rate": 3.8314861571110604e-05, + "loss": 0.4637, + "step": 533 + }, + { + "epoch": 1.0931422722620265, + "grad_norm": 0.4548496033565697, + "learning_rate": 3.830334756911021e-05, + "loss": 0.4299, + "step": 534 + }, + { + "epoch": 1.0951893551688843, + "grad_norm": 0.376272980920323, + "learning_rate": 3.829179610730359e-05, + "loss": 0.408, + "step": 535 + }, + { + "epoch": 1.097236438075742, + "grad_norm": 0.427450221785291, + "learning_rate": 3.828020720933207e-05, + "loss": 0.4095, + "step": 536 + }, + { + "epoch": 1.0992835209825997, + "grad_norm": 0.440976279211047, + "learning_rate": 3.826858089891361e-05, + "loss": 0.494, + "step": 537 + }, + { + "epoch": 1.1013306038894575, + "grad_norm": 0.4209945160038491, + "learning_rate": 3.8256917199842715e-05, + "loss": 0.4586, + "step": 538 + }, + { + "epoch": 1.1033776867963152, + "grad_norm": 0.3962229334467954, + "learning_rate": 3.824521613599043e-05, + "loss": 0.405, + "step": 539 + }, + { + "epoch": 1.105424769703173, + "grad_norm": 0.40571779964971894, + "learning_rate": 3.823347773130427e-05, + "loss": 0.4275, + "step": 540 + }, + { + "epoch": 1.1074718526100307, + "grad_norm": 0.3618547256384093, + "learning_rate": 3.822170200980815e-05, + "loss": 0.3858, + "step": 541 + }, + { + "epoch": 1.1095189355168884, + "grad_norm": 0.4080407212884249, + "learning_rate": 3.820988899560239e-05, + "loss": 0.4645, + "step": 542 + }, + { + "epoch": 1.1115660184237461, + "grad_norm": 0.3661867427781835, + "learning_rate": 3.819803871286361e-05, + "loss": 0.4359, + "step": 543 + }, + { + "epoch": 1.1136131013306039, + "grad_norm": 0.3672357843320578, + "learning_rate": 3.818615118584472e-05, + "loss": 0.4266, + "step": 544 + }, + { + "epoch": 1.1156601842374616, + "grad_norm": 0.42948959266814246, + "learning_rate": 3.817422643887484e-05, + "loss": 0.3994, + "step": 545 + }, + { + "epoch": 1.1177072671443193, + "grad_norm": 0.4671751141730952, + "learning_rate": 3.816226449635927e-05, + "loss": 0.4408, + "step": 546 + }, + { + "epoch": 1.119754350051177, + "grad_norm": 0.41461516441504426, + "learning_rate": 3.815026538277943e-05, + "loss": 0.3956, + "step": 547 + }, + { + "epoch": 1.1218014329580348, + "grad_norm": 0.5835686601708205, + "learning_rate": 3.813822912269284e-05, + "loss": 0.4505, + "step": 548 + }, + { + "epoch": 1.1238485158648925, + "grad_norm": 0.4319406426533076, + "learning_rate": 3.812615574073301e-05, + "loss": 0.3822, + "step": 549 + }, + { + "epoch": 1.1258955987717503, + "grad_norm": 0.4516791429478303, + "learning_rate": 3.811404526160943e-05, + "loss": 0.4144, + "step": 550 + }, + { + "epoch": 1.127942681678608, + "grad_norm": 0.5360130807078863, + "learning_rate": 3.810189771010755e-05, + "loss": 0.4786, + "step": 551 + }, + { + "epoch": 1.1299897645854657, + "grad_norm": 0.4708744253041833, + "learning_rate": 3.808971311108865e-05, + "loss": 0.4241, + "step": 552 + }, + { + "epoch": 1.1320368474923235, + "grad_norm": 0.43422914565208964, + "learning_rate": 3.8077491489489835e-05, + "loss": 0.4204, + "step": 553 + }, + { + "epoch": 1.1340839303991812, + "grad_norm": 0.4721868771642081, + "learning_rate": 3.806523287032401e-05, + "loss": 0.3909, + "step": 554 + }, + { + "epoch": 1.136131013306039, + "grad_norm": 0.4196801932672472, + "learning_rate": 3.805293727867978e-05, + "loss": 0.4427, + "step": 555 + }, + { + "epoch": 1.1381780962128967, + "grad_norm": 0.43191530611285084, + "learning_rate": 3.8040604739721415e-05, + "loss": 0.4247, + "step": 556 + }, + { + "epoch": 1.1402251791197544, + "grad_norm": 0.4098386090267591, + "learning_rate": 3.8028235278688814e-05, + "loss": 0.4136, + "step": 557 + }, + { + "epoch": 1.1422722620266121, + "grad_norm": 0.42481716204053743, + "learning_rate": 3.8015828920897425e-05, + "loss": 0.425, + "step": 558 + }, + { + "epoch": 1.1443193449334699, + "grad_norm": 0.3947721096519682, + "learning_rate": 3.8003385691738227e-05, + "loss": 0.4169, + "step": 559 + }, + { + "epoch": 1.1463664278403276, + "grad_norm": 0.4434721832169034, + "learning_rate": 3.7990905616677644e-05, + "loss": 0.4804, + "step": 560 + }, + { + "epoch": 1.1484135107471853, + "grad_norm": 0.44596996028550917, + "learning_rate": 3.797838872125752e-05, + "loss": 0.4587, + "step": 561 + }, + { + "epoch": 1.150460593654043, + "grad_norm": 0.40781008639677546, + "learning_rate": 3.7965835031095065e-05, + "loss": 0.4614, + "step": 562 + }, + { + "epoch": 1.1525076765609008, + "grad_norm": 0.4036300851691864, + "learning_rate": 3.795324457188276e-05, + "loss": 0.3913, + "step": 563 + }, + { + "epoch": 1.1545547594677585, + "grad_norm": 0.3583696642752529, + "learning_rate": 3.794061736938837e-05, + "loss": 0.3828, + "step": 564 + }, + { + "epoch": 1.156601842374616, + "grad_norm": 0.5138599201629513, + "learning_rate": 3.792795344945485e-05, + "loss": 0.4861, + "step": 565 + }, + { + "epoch": 1.158648925281474, + "grad_norm": 0.40729637695646664, + "learning_rate": 3.79152528380003e-05, + "loss": 0.4136, + "step": 566 + }, + { + "epoch": 1.1606960081883315, + "grad_norm": 0.4688730059593128, + "learning_rate": 3.790251556101791e-05, + "loss": 0.3991, + "step": 567 + }, + { + "epoch": 1.1627430910951895, + "grad_norm": 0.41555333567521613, + "learning_rate": 3.7889741644575914e-05, + "loss": 0.4859, + "step": 568 + }, + { + "epoch": 1.164790174002047, + "grad_norm": 0.368128010733192, + "learning_rate": 3.787693111481753e-05, + "loss": 0.412, + "step": 569 + }, + { + "epoch": 1.1668372569089047, + "grad_norm": 0.40448293578209554, + "learning_rate": 3.786408399796091e-05, + "loss": 0.4812, + "step": 570 + }, + { + "epoch": 1.1688843398157625, + "grad_norm": 0.35706355299321096, + "learning_rate": 3.78512003202991e-05, + "loss": 0.4098, + "step": 571 + }, + { + "epoch": 1.1709314227226202, + "grad_norm": 0.3794649872224116, + "learning_rate": 3.783828010819993e-05, + "loss": 0.4184, + "step": 572 + }, + { + "epoch": 1.172978505629478, + "grad_norm": 0.4084126099380887, + "learning_rate": 3.782532338810605e-05, + "loss": 0.4279, + "step": 573 + }, + { + "epoch": 1.1750255885363357, + "grad_norm": 0.40248846798503674, + "learning_rate": 3.7812330186534815e-05, + "loss": 0.374, + "step": 574 + }, + { + "epoch": 1.1770726714431934, + "grad_norm": 0.42082880622868263, + "learning_rate": 3.779930053007821e-05, + "loss": 0.4294, + "step": 575 + }, + { + "epoch": 1.1791197543500511, + "grad_norm": 0.4741074023730061, + "learning_rate": 3.778623444540287e-05, + "loss": 0.4655, + "step": 576 + }, + { + "epoch": 1.1811668372569089, + "grad_norm": 0.40443469906849566, + "learning_rate": 3.777313195924998e-05, + "loss": 0.4313, + "step": 577 + }, + { + "epoch": 1.1832139201637666, + "grad_norm": 0.5301327875202605, + "learning_rate": 3.775999309843519e-05, + "loss": 0.4872, + "step": 578 + }, + { + "epoch": 1.1852610030706243, + "grad_norm": 0.4031400337502115, + "learning_rate": 3.774681788984863e-05, + "loss": 0.44, + "step": 579 + }, + { + "epoch": 1.187308085977482, + "grad_norm": 0.32666462235868937, + "learning_rate": 3.773360636045481e-05, + "loss": 0.3297, + "step": 580 + }, + { + "epoch": 1.1893551688843398, + "grad_norm": 0.48740976479218406, + "learning_rate": 3.7720358537292566e-05, + "loss": 0.502, + "step": 581 + }, + { + "epoch": 1.1914022517911975, + "grad_norm": 0.4129431656165589, + "learning_rate": 3.770707444747502e-05, + "loss": 0.3901, + "step": 582 + }, + { + "epoch": 1.1934493346980553, + "grad_norm": 0.37634568229429827, + "learning_rate": 3.7693754118189525e-05, + "loss": 0.406, + "step": 583 + }, + { + "epoch": 1.195496417604913, + "grad_norm": 0.413600210538619, + "learning_rate": 3.768039757669759e-05, + "loss": 0.4389, + "step": 584 + }, + { + "epoch": 1.1975435005117707, + "grad_norm": 0.41522152600174966, + "learning_rate": 3.766700485033484e-05, + "loss": 0.4213, + "step": 585 + }, + { + "epoch": 1.1995905834186285, + "grad_norm": 0.3994741162417736, + "learning_rate": 3.765357596651095e-05, + "loss": 0.4192, + "step": 586 + }, + { + "epoch": 1.2016376663254862, + "grad_norm": 0.44961799715640177, + "learning_rate": 3.764011095270962e-05, + "loss": 0.4448, + "step": 587 + }, + { + "epoch": 1.203684749232344, + "grad_norm": 0.4405333877210868, + "learning_rate": 3.762660983648846e-05, + "loss": 0.4425, + "step": 588 + }, + { + "epoch": 1.2057318321392017, + "grad_norm": 0.3647533882869788, + "learning_rate": 3.761307264547899e-05, + "loss": 0.3798, + "step": 589 + }, + { + "epoch": 1.2077789150460594, + "grad_norm": 0.5134897309426143, + "learning_rate": 3.759949940738655e-05, + "loss": 0.4862, + "step": 590 + }, + { + "epoch": 1.2098259979529171, + "grad_norm": 0.42730289018233486, + "learning_rate": 3.7585890149990265e-05, + "loss": 0.3887, + "step": 591 + }, + { + "epoch": 1.2118730808597749, + "grad_norm": 0.4053414794034375, + "learning_rate": 3.757224490114297e-05, + "loss": 0.4327, + "step": 592 + }, + { + "epoch": 1.2139201637666326, + "grad_norm": 0.4351870322953961, + "learning_rate": 3.755856368877116e-05, + "loss": 0.379, + "step": 593 + }, + { + "epoch": 1.2159672466734903, + "grad_norm": 0.4319979039337228, + "learning_rate": 3.7544846540874934e-05, + "loss": 0.45, + "step": 594 + }, + { + "epoch": 1.218014329580348, + "grad_norm": 0.3683791838302071, + "learning_rate": 3.7531093485527943e-05, + "loss": 0.4189, + "step": 595 + }, + { + "epoch": 1.2200614124872058, + "grad_norm": 0.47932608544530164, + "learning_rate": 3.7517304550877315e-05, + "loss": 0.4327, + "step": 596 + }, + { + "epoch": 1.2221084953940635, + "grad_norm": 0.46656194426712394, + "learning_rate": 3.750347976514362e-05, + "loss": 0.4774, + "step": 597 + }, + { + "epoch": 1.2241555783009213, + "grad_norm": 0.3933033098379041, + "learning_rate": 3.7489619156620796e-05, + "loss": 0.4224, + "step": 598 + }, + { + "epoch": 1.226202661207779, + "grad_norm": 0.40838601128257357, + "learning_rate": 3.74757227536761e-05, + "loss": 0.4361, + "step": 599 + }, + { + "epoch": 1.2282497441146367, + "grad_norm": 0.4011956657411684, + "learning_rate": 3.7461790584750036e-05, + "loss": 0.373, + "step": 600 + }, + { + "epoch": 1.2302968270214945, + "grad_norm": 0.4783960660544416, + "learning_rate": 3.744782267835632e-05, + "loss": 0.4497, + "step": 601 + }, + { + "epoch": 1.2323439099283522, + "grad_norm": 0.4426728718136347, + "learning_rate": 3.74338190630818e-05, + "loss": 0.4396, + "step": 602 + }, + { + "epoch": 1.23439099283521, + "grad_norm": 0.4507239891416186, + "learning_rate": 3.7419779767586406e-05, + "loss": 0.4312, + "step": 603 + }, + { + "epoch": 1.2364380757420674, + "grad_norm": 0.45738216770554924, + "learning_rate": 3.740570482060311e-05, + "loss": 0.4186, + "step": 604 + }, + { + "epoch": 1.2384851586489254, + "grad_norm": 0.4112804317839281, + "learning_rate": 3.7391594250937813e-05, + "loss": 0.4075, + "step": 605 + }, + { + "epoch": 1.240532241555783, + "grad_norm": 0.3910005461879264, + "learning_rate": 3.737744808746935e-05, + "loss": 0.4063, + "step": 606 + }, + { + "epoch": 1.2425793244626409, + "grad_norm": 0.37923695698945686, + "learning_rate": 3.73632663591494e-05, + "loss": 0.3753, + "step": 607 + }, + { + "epoch": 1.2446264073694984, + "grad_norm": 0.3993349892342946, + "learning_rate": 3.7349049095002414e-05, + "loss": 0.415, + "step": 608 + }, + { + "epoch": 1.246673490276356, + "grad_norm": 0.49340730052744397, + "learning_rate": 3.733479632412559e-05, + "loss": 0.4137, + "step": 609 + }, + { + "epoch": 1.2487205731832138, + "grad_norm": 0.4033663663707718, + "learning_rate": 3.732050807568878e-05, + "loss": 0.4078, + "step": 610 + }, + { + "epoch": 1.2507676560900716, + "grad_norm": 0.4885064983059752, + "learning_rate": 3.730618437893444e-05, + "loss": 0.479, + "step": 611 + }, + { + "epoch": 1.2528147389969293, + "grad_norm": 0.43160732650352407, + "learning_rate": 3.729182526317761e-05, + "loss": 0.455, + "step": 612 + }, + { + "epoch": 1.254861821903787, + "grad_norm": 0.45988754618419736, + "learning_rate": 3.727743075780578e-05, + "loss": 0.3783, + "step": 613 + }, + { + "epoch": 1.2569089048106448, + "grad_norm": 0.44846875859935137, + "learning_rate": 3.726300089227887e-05, + "loss": 0.4124, + "step": 614 + }, + { + "epoch": 1.2589559877175025, + "grad_norm": 0.5009012146609582, + "learning_rate": 3.72485356961292e-05, + "loss": 0.4256, + "step": 615 + }, + { + "epoch": 1.2610030706243602, + "grad_norm": 0.40882732802753774, + "learning_rate": 3.723403519896136e-05, + "loss": 0.3877, + "step": 616 + }, + { + "epoch": 1.263050153531218, + "grad_norm": 0.509277319060574, + "learning_rate": 3.721949943045223e-05, + "loss": 0.4603, + "step": 617 + }, + { + "epoch": 1.2650972364380757, + "grad_norm": 0.4550675308253649, + "learning_rate": 3.720492842035084e-05, + "loss": 0.4958, + "step": 618 + }, + { + "epoch": 1.2671443193449334, + "grad_norm": 0.5039050269537974, + "learning_rate": 3.7190322198478355e-05, + "loss": 0.5296, + "step": 619 + }, + { + "epoch": 1.2691914022517912, + "grad_norm": 0.42034576410985675, + "learning_rate": 3.7175680794728015e-05, + "loss": 0.4171, + "step": 620 + }, + { + "epoch": 1.271238485158649, + "grad_norm": 0.6194106838370393, + "learning_rate": 3.716100423906505e-05, + "loss": 0.524, + "step": 621 + }, + { + "epoch": 1.2732855680655066, + "grad_norm": 0.3686755041504862, + "learning_rate": 3.7146292561526654e-05, + "loss": 0.3836, + "step": 622 + }, + { + "epoch": 1.2753326509723644, + "grad_norm": 0.5059037426173506, + "learning_rate": 3.7131545792221864e-05, + "loss": 0.485, + "step": 623 + }, + { + "epoch": 1.277379733879222, + "grad_norm": 0.40419199499915215, + "learning_rate": 3.711676396133158e-05, + "loss": 0.3979, + "step": 624 + }, + { + "epoch": 1.2794268167860798, + "grad_norm": 0.48892649990931586, + "learning_rate": 3.7101947099108425e-05, + "loss": 0.5072, + "step": 625 + }, + { + "epoch": 1.2814738996929376, + "grad_norm": 0.4545220525315462, + "learning_rate": 3.708709523587674e-05, + "loss": 0.4275, + "step": 626 + }, + { + "epoch": 1.2835209825997953, + "grad_norm": 0.40189284591007923, + "learning_rate": 3.707220840203249e-05, + "loss": 0.3881, + "step": 627 + }, + { + "epoch": 1.285568065506653, + "grad_norm": 0.43194988622551983, + "learning_rate": 3.70572866280432e-05, + "loss": 0.4747, + "step": 628 + }, + { + "epoch": 1.2876151484135108, + "grad_norm": 0.44678464462286466, + "learning_rate": 3.7042329944447925e-05, + "loss": 0.391, + "step": 629 + }, + { + "epoch": 1.2896622313203685, + "grad_norm": 0.4623510609405332, + "learning_rate": 3.702733838185716e-05, + "loss": 0.423, + "step": 630 + }, + { + "epoch": 1.2917093142272262, + "grad_norm": 0.4314971051525567, + "learning_rate": 3.701231197095277e-05, + "loss": 0.4285, + "step": 631 + }, + { + "epoch": 1.293756397134084, + "grad_norm": 0.39807090722917615, + "learning_rate": 3.6997250742487955e-05, + "loss": 0.3975, + "step": 632 + }, + { + "epoch": 1.2958034800409417, + "grad_norm": 0.4538564539433799, + "learning_rate": 3.698215472728718e-05, + "loss": 0.4566, + "step": 633 + }, + { + "epoch": 1.2978505629477994, + "grad_norm": 0.387005647622149, + "learning_rate": 3.696702395624608e-05, + "loss": 0.4376, + "step": 634 + }, + { + "epoch": 1.2998976458546572, + "grad_norm": 0.4089593023659839, + "learning_rate": 3.6951858460331446e-05, + "loss": 0.4297, + "step": 635 + }, + { + "epoch": 1.301944728761515, + "grad_norm": 0.423980436557396, + "learning_rate": 3.693665827058111e-05, + "loss": 0.4407, + "step": 636 + }, + { + "epoch": 1.3039918116683726, + "grad_norm": 0.3939340668431382, + "learning_rate": 3.692142341810395e-05, + "loss": 0.3762, + "step": 637 + }, + { + "epoch": 1.3060388945752304, + "grad_norm": 0.3791993183647926, + "learning_rate": 3.690615393407975e-05, + "loss": 0.412, + "step": 638 + }, + { + "epoch": 1.308085977482088, + "grad_norm": 0.44464627005705176, + "learning_rate": 3.689084984975918e-05, + "loss": 0.3946, + "step": 639 + }, + { + "epoch": 1.3101330603889458, + "grad_norm": 0.4356188955070128, + "learning_rate": 3.6875511196463715e-05, + "loss": 0.4583, + "step": 640 + }, + { + "epoch": 1.3121801432958033, + "grad_norm": 0.43588824508846175, + "learning_rate": 3.686013800558561e-05, + "loss": 0.4674, + "step": 641 + }, + { + "epoch": 1.3142272262026613, + "grad_norm": 0.5156297864126925, + "learning_rate": 3.6844730308587776e-05, + "loss": 0.4052, + "step": 642 + }, + { + "epoch": 1.3162743091095188, + "grad_norm": 0.40684908578032264, + "learning_rate": 3.682928813700375e-05, + "loss": 0.4329, + "step": 643 + }, + { + "epoch": 1.3183213920163768, + "grad_norm": 0.42989019879872664, + "learning_rate": 3.681381152243763e-05, + "loss": 0.4264, + "step": 644 + }, + { + "epoch": 1.3203684749232343, + "grad_norm": 0.37812994433882907, + "learning_rate": 3.6798300496564e-05, + "loss": 0.386, + "step": 645 + }, + { + "epoch": 1.3224155578300922, + "grad_norm": 0.4191444772159957, + "learning_rate": 3.678275509112788e-05, + "loss": 0.4587, + "step": 646 + }, + { + "epoch": 1.3244626407369497, + "grad_norm": 0.34057419650855425, + "learning_rate": 3.6767175337944646e-05, + "loss": 0.4069, + "step": 647 + }, + { + "epoch": 1.3265097236438077, + "grad_norm": 0.4210596650083103, + "learning_rate": 3.675156126889996e-05, + "loss": 0.4614, + "step": 648 + }, + { + "epoch": 1.3285568065506652, + "grad_norm": 0.380672360295006, + "learning_rate": 3.6735912915949745e-05, + "loss": 0.4919, + "step": 649 + }, + { + "epoch": 1.330603889457523, + "grad_norm": 0.46378857138786805, + "learning_rate": 3.672023031112005e-05, + "loss": 0.4398, + "step": 650 + }, + { + "epoch": 1.3326509723643807, + "grad_norm": 0.3731856457734196, + "learning_rate": 3.670451348650705e-05, + "loss": 0.3786, + "step": 651 + }, + { + "epoch": 1.3346980552712384, + "grad_norm": 0.4439716102781381, + "learning_rate": 3.6688762474276945e-05, + "loss": 0.4175, + "step": 652 + }, + { + "epoch": 1.3367451381780961, + "grad_norm": 0.5191394349356976, + "learning_rate": 3.667297730666592e-05, + "loss": 0.5042, + "step": 653 + }, + { + "epoch": 1.3387922210849539, + "grad_norm": 0.41690574322071583, + "learning_rate": 3.665715801598004e-05, + "loss": 0.4098, + "step": 654 + }, + { + "epoch": 1.3408393039918116, + "grad_norm": 0.42158819550590015, + "learning_rate": 3.6641304634595216e-05, + "loss": 0.4271, + "step": 655 + }, + { + "epoch": 1.3428863868986693, + "grad_norm": 0.4596791286598413, + "learning_rate": 3.662541719495714e-05, + "loss": 0.4609, + "step": 656 + }, + { + "epoch": 1.344933469805527, + "grad_norm": 0.4858116688501969, + "learning_rate": 3.6609495729581186e-05, + "loss": 0.4909, + "step": 657 + }, + { + "epoch": 1.3469805527123848, + "grad_norm": 0.3811091658108338, + "learning_rate": 3.659354027105238e-05, + "loss": 0.4224, + "step": 658 + }, + { + "epoch": 1.3490276356192425, + "grad_norm": 0.40653713538788, + "learning_rate": 3.657755085202532e-05, + "loss": 0.4144, + "step": 659 + }, + { + "epoch": 1.3510747185261003, + "grad_norm": 0.4900473769452504, + "learning_rate": 3.6561527505224104e-05, + "loss": 0.4582, + "step": 660 + }, + { + "epoch": 1.353121801432958, + "grad_norm": 0.45088745908937394, + "learning_rate": 3.6545470263442265e-05, + "loss": 0.4345, + "step": 661 + }, + { + "epoch": 1.3551688843398157, + "grad_norm": 0.44409465950161797, + "learning_rate": 3.65293791595427e-05, + "loss": 0.4867, + "step": 662 + }, + { + "epoch": 1.3572159672466735, + "grad_norm": 0.3857032013105052, + "learning_rate": 3.651325422645763e-05, + "loss": 0.4072, + "step": 663 + }, + { + "epoch": 1.3592630501535312, + "grad_norm": 0.4196509296621015, + "learning_rate": 3.649709549718849e-05, + "loss": 0.4109, + "step": 664 + }, + { + "epoch": 1.361310133060389, + "grad_norm": 0.3711033202233833, + "learning_rate": 3.648090300480589e-05, + "loss": 0.4498, + "step": 665 + }, + { + "epoch": 1.3633572159672467, + "grad_norm": 0.4738363974307569, + "learning_rate": 3.646467678244954e-05, + "loss": 0.4268, + "step": 666 + }, + { + "epoch": 1.3654042988741044, + "grad_norm": 0.4028954103082967, + "learning_rate": 3.6448416863328186e-05, + "loss": 0.4346, + "step": 667 + }, + { + "epoch": 1.3674513817809621, + "grad_norm": 0.4635908886319793, + "learning_rate": 3.643212328071953e-05, + "loss": 0.4453, + "step": 668 + }, + { + "epoch": 1.3694984646878199, + "grad_norm": 0.38319546452013337, + "learning_rate": 3.641579606797017e-05, + "loss": 0.4054, + "step": 669 + }, + { + "epoch": 1.3715455475946776, + "grad_norm": 0.3700643765726001, + "learning_rate": 3.639943525849555e-05, + "loss": 0.4692, + "step": 670 + }, + { + "epoch": 1.3735926305015353, + "grad_norm": 0.34905600329178454, + "learning_rate": 3.638304088577984e-05, + "loss": 0.4131, + "step": 671 + }, + { + "epoch": 1.375639713408393, + "grad_norm": 0.37865539989560054, + "learning_rate": 3.6366612983375936e-05, + "loss": 0.4696, + "step": 672 + }, + { + "epoch": 1.3776867963152508, + "grad_norm": 0.34634916188478093, + "learning_rate": 3.635015158490533e-05, + "loss": 0.4206, + "step": 673 + }, + { + "epoch": 1.3797338792221086, + "grad_norm": 0.3790168529129947, + "learning_rate": 3.6333656724058075e-05, + "loss": 0.4311, + "step": 674 + }, + { + "epoch": 1.3817809621289663, + "grad_norm": 0.39245574822155366, + "learning_rate": 3.6317128434592725e-05, + "loss": 0.453, + "step": 675 + }, + { + "epoch": 1.383828045035824, + "grad_norm": 0.3723108820988336, + "learning_rate": 3.6300566750336225e-05, + "loss": 0.4055, + "step": 676 + }, + { + "epoch": 1.3858751279426818, + "grad_norm": 0.37587879714211964, + "learning_rate": 3.6283971705183884e-05, + "loss": 0.4468, + "step": 677 + }, + { + "epoch": 1.3879222108495395, + "grad_norm": 0.3410504468464227, + "learning_rate": 3.626734333309927e-05, + "loss": 0.3782, + "step": 678 + }, + { + "epoch": 1.3899692937563972, + "grad_norm": 0.4332726859166056, + "learning_rate": 3.625068166811418e-05, + "loss": 0.4419, + "step": 679 + }, + { + "epoch": 1.3920163766632547, + "grad_norm": 0.3869800586992885, + "learning_rate": 3.623398674432853e-05, + "loss": 0.4211, + "step": 680 + }, + { + "epoch": 1.3940634595701127, + "grad_norm": 0.38121810828451136, + "learning_rate": 3.621725859591031e-05, + "loss": 0.4592, + "step": 681 + }, + { + "epoch": 1.3961105424769702, + "grad_norm": 0.39222808854449187, + "learning_rate": 3.6200497257095504e-05, + "loss": 0.4664, + "step": 682 + }, + { + "epoch": 1.3981576253838282, + "grad_norm": 0.3565067379505055, + "learning_rate": 3.6183702762188045e-05, + "loss": 0.4218, + "step": 683 + }, + { + "epoch": 1.4002047082906857, + "grad_norm": 0.3730858903859956, + "learning_rate": 3.6166875145559684e-05, + "loss": 0.4338, + "step": 684 + }, + { + "epoch": 1.4022517911975436, + "grad_norm": 0.36655424256782704, + "learning_rate": 3.615001444165001e-05, + "loss": 0.4372, + "step": 685 + }, + { + "epoch": 1.4042988741044011, + "grad_norm": 0.4487119389407511, + "learning_rate": 3.613312068496627e-05, + "loss": 0.462, + "step": 686 + }, + { + "epoch": 1.406345957011259, + "grad_norm": 0.3515839553061697, + "learning_rate": 3.611619391008341e-05, + "loss": 0.3974, + "step": 687 + }, + { + "epoch": 1.4083930399181166, + "grad_norm": 0.3677425891572493, + "learning_rate": 3.6099234151643924e-05, + "loss": 0.4634, + "step": 688 + }, + { + "epoch": 1.4104401228249743, + "grad_norm": 0.3826249798074036, + "learning_rate": 3.608224144435781e-05, + "loss": 0.4338, + "step": 689 + }, + { + "epoch": 1.412487205731832, + "grad_norm": 0.37018522079183763, + "learning_rate": 3.606521582300252e-05, + "loss": 0.4089, + "step": 690 + }, + { + "epoch": 1.4145342886386898, + "grad_norm": 0.39718163395641504, + "learning_rate": 3.604815732242283e-05, + "loss": 0.4481, + "step": 691 + }, + { + "epoch": 1.4165813715455475, + "grad_norm": 0.4177328745486512, + "learning_rate": 3.6031065977530854e-05, + "loss": 0.4061, + "step": 692 + }, + { + "epoch": 1.4186284544524053, + "grad_norm": 0.43120876742640946, + "learning_rate": 3.6013941823305884e-05, + "loss": 0.4105, + "step": 693 + }, + { + "epoch": 1.420675537359263, + "grad_norm": 0.4330368740835022, + "learning_rate": 3.5996784894794394e-05, + "loss": 0.4329, + "step": 694 + }, + { + "epoch": 1.4227226202661207, + "grad_norm": 0.42455998302592957, + "learning_rate": 3.5979595227109906e-05, + "loss": 0.4045, + "step": 695 + }, + { + "epoch": 1.4247697031729785, + "grad_norm": 0.4072147719505155, + "learning_rate": 3.5962372855432956e-05, + "loss": 0.452, + "step": 696 + }, + { + "epoch": 1.4268167860798362, + "grad_norm": 0.4188081245089065, + "learning_rate": 3.594511781501103e-05, + "loss": 0.4199, + "step": 697 + }, + { + "epoch": 1.428863868986694, + "grad_norm": 0.38251894827311167, + "learning_rate": 3.592783014115845e-05, + "loss": 0.3955, + "step": 698 + }, + { + "epoch": 1.4309109518935517, + "grad_norm": 0.4441091071016442, + "learning_rate": 3.5910509869256326e-05, + "loss": 0.4398, + "step": 699 + }, + { + "epoch": 1.4329580348004094, + "grad_norm": 0.38058511976389736, + "learning_rate": 3.58931570347525e-05, + "loss": 0.4019, + "step": 700 + }, + { + "epoch": 1.4350051177072671, + "grad_norm": 0.3617807382285515, + "learning_rate": 3.587577167316146e-05, + "loss": 0.4363, + "step": 701 + }, + { + "epoch": 1.4370522006141249, + "grad_norm": 0.43987641158494933, + "learning_rate": 3.585835382006424e-05, + "loss": 0.4328, + "step": 702 + }, + { + "epoch": 1.4390992835209826, + "grad_norm": 0.3609144606393506, + "learning_rate": 3.584090351110838e-05, + "loss": 0.4104, + "step": 703 + }, + { + "epoch": 1.4411463664278403, + "grad_norm": 0.4091808525809948, + "learning_rate": 3.582342078200786e-05, + "loss": 0.4138, + "step": 704 + }, + { + "epoch": 1.443193449334698, + "grad_norm": 0.41531638710969015, + "learning_rate": 3.5805905668543e-05, + "loss": 0.4697, + "step": 705 + }, + { + "epoch": 1.4452405322415558, + "grad_norm": 0.4154315464780426, + "learning_rate": 3.57883582065604e-05, + "loss": 0.4844, + "step": 706 + }, + { + "epoch": 1.4472876151484135, + "grad_norm": 0.426486587951999, + "learning_rate": 3.577077843197285e-05, + "loss": 0.4088, + "step": 707 + }, + { + "epoch": 1.4493346980552713, + "grad_norm": 0.4965404868194584, + "learning_rate": 3.57531663807593e-05, + "loss": 0.4369, + "step": 708 + }, + { + "epoch": 1.451381780962129, + "grad_norm": 0.4695819928605211, + "learning_rate": 3.573552208896474e-05, + "loss": 0.4382, + "step": 709 + }, + { + "epoch": 1.4534288638689867, + "grad_norm": 0.46954254869967305, + "learning_rate": 3.571784559270014e-05, + "loss": 0.4456, + "step": 710 + }, + { + "epoch": 1.4554759467758445, + "grad_norm": 0.4760159462479475, + "learning_rate": 3.570013692814239e-05, + "loss": 0.4477, + "step": 711 + }, + { + "epoch": 1.4575230296827022, + "grad_norm": 0.3992032479219894, + "learning_rate": 3.568239613153421e-05, + "loss": 0.396, + "step": 712 + }, + { + "epoch": 1.45957011258956, + "grad_norm": 0.5000911965095803, + "learning_rate": 3.566462323918409e-05, + "loss": 0.4535, + "step": 713 + }, + { + "epoch": 1.4616171954964177, + "grad_norm": 0.43927830807739277, + "learning_rate": 3.564681828746619e-05, + "loss": 0.4579, + "step": 714 + }, + { + "epoch": 1.4636642784032754, + "grad_norm": 0.3840584961797973, + "learning_rate": 3.5628981312820315e-05, + "loss": 0.3861, + "step": 715 + }, + { + "epoch": 1.4657113613101331, + "grad_norm": 0.46545692391278676, + "learning_rate": 3.561111235175177e-05, + "loss": 0.4142, + "step": 716 + }, + { + "epoch": 1.4677584442169909, + "grad_norm": 0.32920533353338466, + "learning_rate": 3.5593211440831345e-05, + "loss": 0.3932, + "step": 717 + }, + { + "epoch": 1.4698055271238486, + "grad_norm": 0.5879408329975259, + "learning_rate": 3.557527861669522e-05, + "loss": 0.4518, + "step": 718 + }, + { + "epoch": 1.471852610030706, + "grad_norm": 0.3711931671196681, + "learning_rate": 3.555731391604488e-05, + "loss": 0.4135, + "step": 719 + }, + { + "epoch": 1.473899692937564, + "grad_norm": 0.451950604877235, + "learning_rate": 3.553931737564705e-05, + "loss": 0.3778, + "step": 720 + }, + { + "epoch": 1.4759467758444216, + "grad_norm": 0.3802803607853056, + "learning_rate": 3.552128903233363e-05, + "loss": 0.4528, + "step": 721 + }, + { + "epoch": 1.4779938587512795, + "grad_norm": 0.5401764017146911, + "learning_rate": 3.55032289230016e-05, + "loss": 0.4431, + "step": 722 + }, + { + "epoch": 1.480040941658137, + "grad_norm": 0.39166017929307784, + "learning_rate": 3.5485137084612945e-05, + "loss": 0.4384, + "step": 723 + }, + { + "epoch": 1.482088024564995, + "grad_norm": 0.44680536201066046, + "learning_rate": 3.54670135541946e-05, + "loss": 0.4017, + "step": 724 + }, + { + "epoch": 1.4841351074718525, + "grad_norm": 0.4126986303099099, + "learning_rate": 3.544885836883836e-05, + "loss": 0.4267, + "step": 725 + }, + { + "epoch": 1.4861821903787105, + "grad_norm": 0.49002764481757854, + "learning_rate": 3.5430671565700786e-05, + "loss": 0.4451, + "step": 726 + }, + { + "epoch": 1.488229273285568, + "grad_norm": 0.43094421524683524, + "learning_rate": 3.541245318200318e-05, + "loss": 0.4157, + "step": 727 + }, + { + "epoch": 1.4902763561924257, + "grad_norm": 0.4077891821232257, + "learning_rate": 3.5394203255031445e-05, + "loss": 0.4184, + "step": 728 + }, + { + "epoch": 1.4923234390992834, + "grad_norm": 0.4214423887733812, + "learning_rate": 3.537592182213607e-05, + "loss": 0.4404, + "step": 729 + }, + { + "epoch": 1.4943705220061412, + "grad_norm": 0.39779568039250524, + "learning_rate": 3.5357608920732e-05, + "loss": 0.3862, + "step": 730 + }, + { + "epoch": 1.496417604912999, + "grad_norm": 0.3868845645927947, + "learning_rate": 3.5339264588298606e-05, + "loss": 0.4859, + "step": 731 + }, + { + "epoch": 1.4984646878198566, + "grad_norm": 0.401554933480186, + "learning_rate": 3.532088886237956e-05, + "loss": 0.4605, + "step": 732 + }, + { + "epoch": 1.5005117707267144, + "grad_norm": 0.39625088885583626, + "learning_rate": 3.530248178058282e-05, + "loss": 0.4371, + "step": 733 + }, + { + "epoch": 1.5025588536335721, + "grad_norm": 0.3433544773175932, + "learning_rate": 3.528404338058046e-05, + "loss": 0.359, + "step": 734 + }, + { + "epoch": 1.5046059365404298, + "grad_norm": 0.3790208864361706, + "learning_rate": 3.526557370010872e-05, + "loss": 0.4668, + "step": 735 + }, + { + "epoch": 1.5066530194472876, + "grad_norm": 0.3463058143561582, + "learning_rate": 3.5247072776967805e-05, + "loss": 0.428, + "step": 736 + }, + { + "epoch": 1.5087001023541453, + "grad_norm": 0.4092673650520593, + "learning_rate": 3.522854064902189e-05, + "loss": 0.4787, + "step": 737 + }, + { + "epoch": 1.510747185261003, + "grad_norm": 0.3701835656091217, + "learning_rate": 3.520997735419901e-05, + "loss": 0.4335, + "step": 738 + }, + { + "epoch": 1.5127942681678608, + "grad_norm": 0.4036995525874622, + "learning_rate": 3.519138293049097e-05, + "loss": 0.4371, + "step": 739 + }, + { + "epoch": 1.5148413510747185, + "grad_norm": 0.3520959626432674, + "learning_rate": 3.51727574159533e-05, + "loss": 0.442, + "step": 740 + }, + { + "epoch": 1.5168884339815762, + "grad_norm": 0.3474665477504301, + "learning_rate": 3.515410084870516e-05, + "loss": 0.3833, + "step": 741 + }, + { + "epoch": 1.518935516888434, + "grad_norm": 0.37700117041246284, + "learning_rate": 3.513541326692925e-05, + "loss": 0.421, + "step": 742 + }, + { + "epoch": 1.5209825997952917, + "grad_norm": 0.3354789534984794, + "learning_rate": 3.511669470887177e-05, + "loss": 0.3646, + "step": 743 + }, + { + "epoch": 1.5230296827021494, + "grad_norm": 0.36336403413161944, + "learning_rate": 3.509794521284228e-05, + "loss": 0.4479, + "step": 744 + }, + { + "epoch": 1.5250767656090072, + "grad_norm": 0.3640226182102435, + "learning_rate": 3.5079164817213684e-05, + "loss": 0.3994, + "step": 745 + }, + { + "epoch": 1.527123848515865, + "grad_norm": 0.39095484712326395, + "learning_rate": 3.5060353560422137e-05, + "loss": 0.4906, + "step": 746 + }, + { + "epoch": 1.5291709314227226, + "grad_norm": 0.42138462191208187, + "learning_rate": 3.504151148096691e-05, + "loss": 0.4876, + "step": 747 + }, + { + "epoch": 1.5312180143295804, + "grad_norm": 0.36356347432593744, + "learning_rate": 3.5022638617410396e-05, + "loss": 0.4171, + "step": 748 + }, + { + "epoch": 1.5332650972364381, + "grad_norm": 0.402770920410115, + "learning_rate": 3.500373500837799e-05, + "loss": 0.4427, + "step": 749 + }, + { + "epoch": 1.5353121801432958, + "grad_norm": 0.35789791445115465, + "learning_rate": 3.4984800692557974e-05, + "loss": 0.4306, + "step": 750 + }, + { + "epoch": 1.5373592630501536, + "grad_norm": 0.3511318223180355, + "learning_rate": 3.496583570870152e-05, + "loss": 0.4051, + "step": 751 + }, + { + "epoch": 1.5394063459570113, + "grad_norm": 0.415831551492267, + "learning_rate": 3.494684009562254e-05, + "loss": 0.4738, + "step": 752 + }, + { + "epoch": 1.541453428863869, + "grad_norm": 0.3774452021561776, + "learning_rate": 3.492781389219763e-05, + "loss": 0.472, + "step": 753 + }, + { + "epoch": 1.5435005117707266, + "grad_norm": 0.38222406103253237, + "learning_rate": 3.4908757137366006e-05, + "loss": 0.3984, + "step": 754 + }, + { + "epoch": 1.5455475946775845, + "grad_norm": 0.4097364486307627, + "learning_rate": 3.488966987012941e-05, + "loss": 0.4436, + "step": 755 + }, + { + "epoch": 1.547594677584442, + "grad_norm": 0.3733136430359242, + "learning_rate": 3.487055212955201e-05, + "loss": 0.4102, + "step": 756 + }, + { + "epoch": 1.5496417604913, + "grad_norm": 0.49818552627556745, + "learning_rate": 3.485140395476038e-05, + "loss": 0.485, + "step": 757 + }, + { + "epoch": 1.5516888433981575, + "grad_norm": 0.40160664320174677, + "learning_rate": 3.4832225384943335e-05, + "loss": 0.4662, + "step": 758 + }, + { + "epoch": 1.5537359263050154, + "grad_norm": 0.36914648750243184, + "learning_rate": 3.481301645935193e-05, + "loss": 0.4173, + "step": 759 + }, + { + "epoch": 1.555783009211873, + "grad_norm": 0.40716553946286355, + "learning_rate": 3.4793777217299346e-05, + "loss": 0.417, + "step": 760 + }, + { + "epoch": 1.557830092118731, + "grad_norm": 0.42779403061287685, + "learning_rate": 3.477450769816077e-05, + "loss": 0.4848, + "step": 761 + }, + { + "epoch": 1.5598771750255884, + "grad_norm": 0.3422957268296126, + "learning_rate": 3.475520794137341e-05, + "loss": 0.4039, + "step": 762 + }, + { + "epoch": 1.5619242579324464, + "grad_norm": 0.4032099351305834, + "learning_rate": 3.473587798643633e-05, + "loss": 0.4378, + "step": 763 + }, + { + "epoch": 1.563971340839304, + "grad_norm": 0.4010793632381822, + "learning_rate": 3.4716517872910405e-05, + "loss": 0.4461, + "step": 764 + }, + { + "epoch": 1.5660184237461618, + "grad_norm": 0.41602009577806304, + "learning_rate": 3.4697127640418204e-05, + "loss": 0.5198, + "step": 765 + }, + { + "epoch": 1.5680655066530194, + "grad_norm": 0.3840607306535795, + "learning_rate": 3.467770732864399e-05, + "loss": 0.4072, + "step": 766 + }, + { + "epoch": 1.5701125895598773, + "grad_norm": 0.45544304460334517, + "learning_rate": 3.4658256977333536e-05, + "loss": 0.4263, + "step": 767 + }, + { + "epoch": 1.5721596724667348, + "grad_norm": 0.4489200382867973, + "learning_rate": 3.4638776626294134e-05, + "loss": 0.4189, + "step": 768 + }, + { + "epoch": 1.5742067553735928, + "grad_norm": 0.40708118209847305, + "learning_rate": 3.461926631539445e-05, + "loss": 0.4344, + "step": 769 + }, + { + "epoch": 1.5762538382804503, + "grad_norm": 0.47396385986382383, + "learning_rate": 3.459972608456448e-05, + "loss": 0.4318, + "step": 770 + }, + { + "epoch": 1.5783009211873082, + "grad_norm": 0.44275905450120784, + "learning_rate": 3.4580155973795434e-05, + "loss": 0.4024, + "step": 771 + }, + { + "epoch": 1.5803480040941658, + "grad_norm": 0.37512733188763275, + "learning_rate": 3.4560556023139695e-05, + "loss": 0.4646, + "step": 772 + }, + { + "epoch": 1.5823950870010235, + "grad_norm": 0.4718852007159965, + "learning_rate": 3.454092627271072e-05, + "loss": 0.4386, + "step": 773 + }, + { + "epoch": 1.5844421699078812, + "grad_norm": 0.36977406003434726, + "learning_rate": 3.4521266762682924e-05, + "loss": 0.4429, + "step": 774 + }, + { + "epoch": 1.586489252814739, + "grad_norm": 0.36917719793736264, + "learning_rate": 3.450157753329166e-05, + "loss": 0.4753, + "step": 775 + }, + { + "epoch": 1.5885363357215967, + "grad_norm": 0.466057931195573, + "learning_rate": 3.448185862483309e-05, + "loss": 0.4542, + "step": 776 + }, + { + "epoch": 1.5905834186284544, + "grad_norm": 0.34642675856704536, + "learning_rate": 3.446211007766412e-05, + "loss": 0.3954, + "step": 777 + }, + { + "epoch": 1.5926305015353122, + "grad_norm": 0.4330447451756299, + "learning_rate": 3.4442331932202326e-05, + "loss": 0.4278, + "step": 778 + }, + { + "epoch": 1.59467758444217, + "grad_norm": 0.3730822908863033, + "learning_rate": 3.4422524228925836e-05, + "loss": 0.4061, + "step": 779 + }, + { + "epoch": 1.5967246673490276, + "grad_norm": 0.37973943757397915, + "learning_rate": 3.440268700837329e-05, + "loss": 0.394, + "step": 780 + }, + { + "epoch": 1.5987717502558854, + "grad_norm": 0.49428851704529125, + "learning_rate": 3.438282031114374e-05, + "loss": 0.4486, + "step": 781 + }, + { + "epoch": 1.600818833162743, + "grad_norm": 0.41805483229206175, + "learning_rate": 3.4362924177896545e-05, + "loss": 0.4393, + "step": 782 + }, + { + "epoch": 1.6028659160696008, + "grad_norm": 0.47689234144727466, + "learning_rate": 3.434299864935133e-05, + "loss": 0.4354, + "step": 783 + }, + { + "epoch": 1.6049129989764586, + "grad_norm": 0.42155977412917267, + "learning_rate": 3.432304376628787e-05, + "loss": 0.4602, + "step": 784 + }, + { + "epoch": 1.6069600818833163, + "grad_norm": 0.3865188335568589, + "learning_rate": 3.430305956954602e-05, + "loss": 0.4152, + "step": 785 + }, + { + "epoch": 1.609007164790174, + "grad_norm": 0.4271955733661172, + "learning_rate": 3.428304610002563e-05, + "loss": 0.4408, + "step": 786 + }, + { + "epoch": 1.6110542476970318, + "grad_norm": 0.44963813575609907, + "learning_rate": 3.4263003398686464e-05, + "loss": 0.4958, + "step": 787 + }, + { + "epoch": 1.6131013306038895, + "grad_norm": 0.3733726588143654, + "learning_rate": 3.424293150654809e-05, + "loss": 0.4287, + "step": 788 + }, + { + "epoch": 1.6151484135107472, + "grad_norm": 0.3812115032343838, + "learning_rate": 3.422283046468985e-05, + "loss": 0.405, + "step": 789 + }, + { + "epoch": 1.617195496417605, + "grad_norm": 0.37005120300102184, + "learning_rate": 3.420270031425072e-05, + "loss": 0.4516, + "step": 790 + }, + { + "epoch": 1.6192425793244627, + "grad_norm": 0.45666000218850494, + "learning_rate": 3.4182541096429265e-05, + "loss": 0.4523, + "step": 791 + }, + { + "epoch": 1.6212896622313204, + "grad_norm": 0.34898311295474405, + "learning_rate": 3.416235285248352e-05, + "loss": 0.4007, + "step": 792 + }, + { + "epoch": 1.623336745138178, + "grad_norm": 0.3839127190696324, + "learning_rate": 3.4142135623730954e-05, + "loss": 0.4617, + "step": 793 + }, + { + "epoch": 1.625383828045036, + "grad_norm": 0.4387438648695345, + "learning_rate": 3.412188945154833e-05, + "loss": 0.4723, + "step": 794 + }, + { + "epoch": 1.6274309109518934, + "grad_norm": 0.3332803619102528, + "learning_rate": 3.410161437737166e-05, + "loss": 0.362, + "step": 795 + }, + { + "epoch": 1.6294779938587514, + "grad_norm": 0.3807927986402262, + "learning_rate": 3.4081310442696114e-05, + "loss": 0.4339, + "step": 796 + }, + { + "epoch": 1.6315250767656089, + "grad_norm": 0.3937568675037694, + "learning_rate": 3.4060977689075914e-05, + "loss": 0.4184, + "step": 797 + }, + { + "epoch": 1.6335721596724668, + "grad_norm": 0.40435872407490453, + "learning_rate": 3.404061615812425e-05, + "loss": 0.4906, + "step": 798 + }, + { + "epoch": 1.6356192425793243, + "grad_norm": 0.37907378547306464, + "learning_rate": 3.402022589151325e-05, + "loss": 0.4273, + "step": 799 + }, + { + "epoch": 1.6376663254861823, + "grad_norm": 0.3703864227511636, + "learning_rate": 3.399980693097383e-05, + "loss": 0.4154, + "step": 800 + }, + { + "epoch": 1.6397134083930398, + "grad_norm": 0.4191952674222777, + "learning_rate": 3.3979359318295605e-05, + "loss": 0.3581, + "step": 801 + }, + { + "epoch": 1.6417604912998978, + "grad_norm": 0.3966331751054016, + "learning_rate": 3.395888309532687e-05, + "loss": 0.4669, + "step": 802 + }, + { + "epoch": 1.6438075742067553, + "grad_norm": 0.445008569764165, + "learning_rate": 3.393837830397446e-05, + "loss": 0.4267, + "step": 803 + }, + { + "epoch": 1.6458546571136132, + "grad_norm": 0.46681180810571704, + "learning_rate": 3.391784498620369e-05, + "loss": 0.4895, + "step": 804 + }, + { + "epoch": 1.6479017400204707, + "grad_norm": 0.3385167810574236, + "learning_rate": 3.3897283184038215e-05, + "loss": 0.404, + "step": 805 + }, + { + "epoch": 1.6499488229273287, + "grad_norm": 0.5219129958909896, + "learning_rate": 3.387669293956003e-05, + "loss": 0.4305, + "step": 806 + }, + { + "epoch": 1.6519959058341862, + "grad_norm": 0.3668653050867526, + "learning_rate": 3.385607429490934e-05, + "loss": 0.3746, + "step": 807 + }, + { + "epoch": 1.6540429887410442, + "grad_norm": 0.5158311002393402, + "learning_rate": 3.3835427292284445e-05, + "loss": 0.486, + "step": 808 + }, + { + "epoch": 1.6560900716479017, + "grad_norm": 0.36978068539278974, + "learning_rate": 3.38147519739417e-05, + "loss": 0.4487, + "step": 809 + }, + { + "epoch": 1.6581371545547596, + "grad_norm": 0.3772021211105363, + "learning_rate": 3.37940483821954e-05, + "loss": 0.4017, + "step": 810 + }, + { + "epoch": 1.6601842374616171, + "grad_norm": 0.41401208686983515, + "learning_rate": 3.3773316559417734e-05, + "loss": 0.491, + "step": 811 + }, + { + "epoch": 1.6622313203684749, + "grad_norm": 0.3524038676017696, + "learning_rate": 3.375255654803864e-05, + "loss": 0.3938, + "step": 812 + }, + { + "epoch": 1.6642784032753326, + "grad_norm": 0.4096084241414413, + "learning_rate": 3.373176839054576e-05, + "loss": 0.4157, + "step": 813 + }, + { + "epoch": 1.6663254861821903, + "grad_norm": 0.4186774343886114, + "learning_rate": 3.371095212948431e-05, + "loss": 0.4681, + "step": 814 + }, + { + "epoch": 1.668372569089048, + "grad_norm": 0.40515797591205743, + "learning_rate": 3.3690107807457085e-05, + "loss": 0.4459, + "step": 815 + }, + { + "epoch": 1.6704196519959058, + "grad_norm": 0.32867520587713756, + "learning_rate": 3.366923546712426e-05, + "loss": 0.3888, + "step": 816 + }, + { + "epoch": 1.6724667349027635, + "grad_norm": 0.347589875409667, + "learning_rate": 3.364833515120336e-05, + "loss": 0.4083, + "step": 817 + }, + { + "epoch": 1.6745138178096213, + "grad_norm": 0.3591052938957919, + "learning_rate": 3.362740690246918e-05, + "loss": 0.3938, + "step": 818 + }, + { + "epoch": 1.676560900716479, + "grad_norm": 0.35854368626658006, + "learning_rate": 3.360645076375368e-05, + "loss": 0.4217, + "step": 819 + }, + { + "epoch": 1.6786079836233367, + "grad_norm": 0.44210711469086067, + "learning_rate": 3.358546677794586e-05, + "loss": 0.4752, + "step": 820 + }, + { + "epoch": 1.6806550665301945, + "grad_norm": 0.3418341105649144, + "learning_rate": 3.356445498799179e-05, + "loss": 0.3828, + "step": 821 + }, + { + "epoch": 1.6827021494370522, + "grad_norm": 0.40574252833368174, + "learning_rate": 3.354341543689438e-05, + "loss": 0.4138, + "step": 822 + }, + { + "epoch": 1.68474923234391, + "grad_norm": 0.41618079643123923, + "learning_rate": 3.352234816771337e-05, + "loss": 0.4559, + "step": 823 + }, + { + "epoch": 1.6867963152507677, + "grad_norm": 0.42761098792857727, + "learning_rate": 3.350125322356525e-05, + "loss": 0.4466, + "step": 824 + }, + { + "epoch": 1.6888433981576254, + "grad_norm": 0.33049589320031353, + "learning_rate": 3.348013064762312e-05, + "loss": 0.3768, + "step": 825 + }, + { + "epoch": 1.6908904810644831, + "grad_norm": 0.3768698742983625, + "learning_rate": 3.3458980483116664e-05, + "loss": 0.4278, + "step": 826 + }, + { + "epoch": 1.6929375639713409, + "grad_norm": 0.3675413592176129, + "learning_rate": 3.343780277333199e-05, + "loss": 0.4573, + "step": 827 + }, + { + "epoch": 1.6949846468781986, + "grad_norm": 0.39935534592316574, + "learning_rate": 3.3416597561611616e-05, + "loss": 0.4607, + "step": 828 + }, + { + "epoch": 1.6970317297850563, + "grad_norm": 0.3422867133996467, + "learning_rate": 3.3395364891354316e-05, + "loss": 0.3925, + "step": 829 + }, + { + "epoch": 1.699078812691914, + "grad_norm": 0.4421095089073154, + "learning_rate": 3.33741048060151e-05, + "loss": 0.5186, + "step": 830 + }, + { + "epoch": 1.7011258955987718, + "grad_norm": 0.40513609641554227, + "learning_rate": 3.3352817349105046e-05, + "loss": 0.4388, + "step": 831 + }, + { + "epoch": 1.7031729785056293, + "grad_norm": 0.47309198797804985, + "learning_rate": 3.333150256419127e-05, + "loss": 0.5152, + "step": 832 + }, + { + "epoch": 1.7052200614124873, + "grad_norm": 0.37396555660516145, + "learning_rate": 3.331016049489681e-05, + "loss": 0.3778, + "step": 833 + }, + { + "epoch": 1.7072671443193448, + "grad_norm": 0.46343295183867955, + "learning_rate": 3.328879118490055e-05, + "loss": 0.4181, + "step": 834 + }, + { + "epoch": 1.7093142272262027, + "grad_norm": 0.36985219342112263, + "learning_rate": 3.3267394677937134e-05, + "loss": 0.4519, + "step": 835 + }, + { + "epoch": 1.7113613101330603, + "grad_norm": 0.47168961292482453, + "learning_rate": 3.3245971017796854e-05, + "loss": 0.4754, + "step": 836 + }, + { + "epoch": 1.7134083930399182, + "grad_norm": 0.45545209130289294, + "learning_rate": 3.322452024832557e-05, + "loss": 0.4054, + "step": 837 + }, + { + "epoch": 1.7154554759467757, + "grad_norm": 0.44300545126776186, + "learning_rate": 3.320304241342464e-05, + "loss": 0.4707, + "step": 838 + }, + { + "epoch": 1.7175025588536337, + "grad_norm": 0.40914423787668386, + "learning_rate": 3.31815375570508e-05, + "loss": 0.4008, + "step": 839 + }, + { + "epoch": 1.7195496417604912, + "grad_norm": 0.42788343961512754, + "learning_rate": 3.3160005723216105e-05, + "loss": 0.4063, + "step": 840 + }, + { + "epoch": 1.7215967246673491, + "grad_norm": 0.3836547816282569, + "learning_rate": 3.31384469559878e-05, + "loss": 0.4471, + "step": 841 + }, + { + "epoch": 1.7236438075742067, + "grad_norm": 0.3712650340512891, + "learning_rate": 3.311686129948827e-05, + "loss": 0.3813, + "step": 842 + }, + { + "epoch": 1.7256908904810646, + "grad_norm": 0.3661541112584301, + "learning_rate": 3.3095248797894925e-05, + "loss": 0.4373, + "step": 843 + }, + { + "epoch": 1.7277379733879221, + "grad_norm": 0.40769069558582477, + "learning_rate": 3.307360949544012e-05, + "loss": 0.433, + "step": 844 + }, + { + "epoch": 1.72978505629478, + "grad_norm": 0.4384390897805754, + "learning_rate": 3.305194343641106e-05, + "loss": 0.4414, + "step": 845 + }, + { + "epoch": 1.7318321392016376, + "grad_norm": 0.39419405263041707, + "learning_rate": 3.30302506651497e-05, + "loss": 0.3903, + "step": 846 + }, + { + "epoch": 1.7338792221084955, + "grad_norm": 0.5193517899563321, + "learning_rate": 3.300853122605268e-05, + "loss": 0.5126, + "step": 847 + }, + { + "epoch": 1.735926305015353, + "grad_norm": 0.37470876760021676, + "learning_rate": 3.2986785163571216e-05, + "loss": 0.4088, + "step": 848 + }, + { + "epoch": 1.737973387922211, + "grad_norm": 0.3911340678248973, + "learning_rate": 3.2965012522211e-05, + "loss": 0.4231, + "step": 849 + }, + { + "epoch": 1.7400204708290685, + "grad_norm": 0.3718531256618771, + "learning_rate": 3.294321334653213e-05, + "loss": 0.4087, + "step": 850 + }, + { + "epoch": 1.7420675537359263, + "grad_norm": 0.44651626014948276, + "learning_rate": 3.2921387681149e-05, + "loss": 0.4623, + "step": 851 + }, + { + "epoch": 1.744114636642784, + "grad_norm": 0.37467196684362686, + "learning_rate": 3.289953557073024e-05, + "loss": 0.4148, + "step": 852 + }, + { + "epoch": 1.7461617195496417, + "grad_norm": 0.42577176527999205, + "learning_rate": 3.2877657059998584e-05, + "loss": 0.4103, + "step": 853 + }, + { + "epoch": 1.7482088024564995, + "grad_norm": 0.3833747846094608, + "learning_rate": 3.285575219373079e-05, + "loss": 0.4432, + "step": 854 + }, + { + "epoch": 1.7502558853633572, + "grad_norm": 0.4116483874002876, + "learning_rate": 3.2833821016757586e-05, + "loss": 0.4203, + "step": 855 + }, + { + "epoch": 1.752302968270215, + "grad_norm": 0.330063754223896, + "learning_rate": 3.281186357396351e-05, + "loss": 0.3895, + "step": 856 + }, + { + "epoch": 1.7543500511770727, + "grad_norm": 0.398313432569639, + "learning_rate": 3.278987991028688e-05, + "loss": 0.4367, + "step": 857 + }, + { + "epoch": 1.7563971340839304, + "grad_norm": 0.3618880492014595, + "learning_rate": 3.276787007071968e-05, + "loss": 0.3917, + "step": 858 + }, + { + "epoch": 1.7584442169907881, + "grad_norm": 0.4604401121085883, + "learning_rate": 3.274583410030745e-05, + "loss": 0.4577, + "step": 859 + }, + { + "epoch": 1.7604912998976459, + "grad_norm": 0.33679127692517397, + "learning_rate": 3.2723772044149224e-05, + "loss": 0.4072, + "step": 860 + }, + { + "epoch": 1.7625383828045036, + "grad_norm": 0.3624660321439512, + "learning_rate": 3.270168394739741e-05, + "loss": 0.4098, + "step": 861 + }, + { + "epoch": 1.7645854657113613, + "grad_norm": 0.343429438960415, + "learning_rate": 3.267956985525774e-05, + "loss": 0.4262, + "step": 862 + }, + { + "epoch": 1.766632548618219, + "grad_norm": 0.3866589432412809, + "learning_rate": 3.26574298129891e-05, + "loss": 0.4311, + "step": 863 + }, + { + "epoch": 1.7686796315250768, + "grad_norm": 0.3484380413267947, + "learning_rate": 3.263526386590351e-05, + "loss": 0.4265, + "step": 864 + }, + { + "epoch": 1.7707267144319345, + "grad_norm": 0.3727255169733587, + "learning_rate": 3.261307205936603e-05, + "loss": 0.4936, + "step": 865 + }, + { + "epoch": 1.7727737973387923, + "grad_norm": 0.35604276467584445, + "learning_rate": 3.2590854438794604e-05, + "loss": 0.3855, + "step": 866 + }, + { + "epoch": 1.77482088024565, + "grad_norm": 0.37701838111479336, + "learning_rate": 3.2568611049660046e-05, + "loss": 0.4308, + "step": 867 + }, + { + "epoch": 1.7768679631525077, + "grad_norm": 0.3530904170276449, + "learning_rate": 3.2546341937485884e-05, + "loss": 0.4198, + "step": 868 + }, + { + "epoch": 1.7789150460593655, + "grad_norm": 0.37564040686995553, + "learning_rate": 3.2524047147848284e-05, + "loss": 0.3702, + "step": 869 + }, + { + "epoch": 1.7809621289662232, + "grad_norm": 0.366044908240742, + "learning_rate": 3.250172672637598e-05, + "loss": 0.4561, + "step": 870 + }, + { + "epoch": 1.7830092118730807, + "grad_norm": 0.3683215486290299, + "learning_rate": 3.247938071875017e-05, + "loss": 0.4467, + "step": 871 + }, + { + "epoch": 1.7850562947799387, + "grad_norm": 0.38716018556798415, + "learning_rate": 3.24570091707044e-05, + "loss": 0.4151, + "step": 872 + }, + { + "epoch": 1.7871033776867962, + "grad_norm": 0.7367692001662189, + "learning_rate": 3.24346121280245e-05, + "loss": 0.438, + "step": 873 + }, + { + "epoch": 1.7891504605936541, + "grad_norm": 0.41338889728660405, + "learning_rate": 3.2412189636548456e-05, + "loss": 0.4629, + "step": 874 + }, + { + "epoch": 1.7911975435005116, + "grad_norm": 0.35357448274069847, + "learning_rate": 3.238974174216637e-05, + "loss": 0.3559, + "step": 875 + }, + { + "epoch": 1.7932446264073696, + "grad_norm": 0.3796163326385031, + "learning_rate": 3.236726849082032e-05, + "loss": 0.4281, + "step": 876 + }, + { + "epoch": 1.795291709314227, + "grad_norm": 0.3944707372507435, + "learning_rate": 3.234476992850425e-05, + "loss": 0.4537, + "step": 877 + }, + { + "epoch": 1.797338792221085, + "grad_norm": 0.3318296802336291, + "learning_rate": 3.232224610126396e-05, + "loss": 0.3985, + "step": 878 + }, + { + "epoch": 1.7993858751279426, + "grad_norm": 0.40919288715091856, + "learning_rate": 3.229969705519693e-05, + "loss": 0.4616, + "step": 879 + }, + { + "epoch": 1.8014329580348005, + "grad_norm": 0.39543379019412783, + "learning_rate": 3.227712283645224e-05, + "loss": 0.4883, + "step": 880 + }, + { + "epoch": 1.803480040941658, + "grad_norm": 0.3624564100802025, + "learning_rate": 3.225452349123051e-05, + "loss": 0.4264, + "step": 881 + }, + { + "epoch": 1.805527123848516, + "grad_norm": 0.35250394717776745, + "learning_rate": 3.2231899065783766e-05, + "loss": 0.3975, + "step": 882 + }, + { + "epoch": 1.8075742067553735, + "grad_norm": 0.41454217155818623, + "learning_rate": 3.2209249606415394e-05, + "loss": 0.4668, + "step": 883 + }, + { + "epoch": 1.8096212896622315, + "grad_norm": 0.33248312085973525, + "learning_rate": 3.2186575159479966e-05, + "loss": 0.3853, + "step": 884 + }, + { + "epoch": 1.811668372569089, + "grad_norm": 0.41602712353842625, + "learning_rate": 3.2163875771383246e-05, + "loss": 0.4615, + "step": 885 + }, + { + "epoch": 1.813715455475947, + "grad_norm": 0.3941084429766098, + "learning_rate": 3.214115148858201e-05, + "loss": 0.5111, + "step": 886 + }, + { + "epoch": 1.8157625383828044, + "grad_norm": 0.34750451682083505, + "learning_rate": 3.211840235758399e-05, + "loss": 0.4055, + "step": 887 + }, + { + "epoch": 1.8178096212896624, + "grad_norm": 0.45072414952251894, + "learning_rate": 3.209562842494778e-05, + "loss": 0.4673, + "step": 888 + }, + { + "epoch": 1.81985670419652, + "grad_norm": 0.36513950471383816, + "learning_rate": 3.207282973728273e-05, + "loss": 0.4526, + "step": 889 + }, + { + "epoch": 1.8219037871033776, + "grad_norm": 0.3898093803880879, + "learning_rate": 3.205000634124884e-05, + "loss": 0.4045, + "step": 890 + }, + { + "epoch": 1.8239508700102354, + "grad_norm": 0.36982373792147444, + "learning_rate": 3.20271582835567e-05, + "loss": 0.4079, + "step": 891 + }, + { + "epoch": 1.825997952917093, + "grad_norm": 0.35532947918298324, + "learning_rate": 3.200428561096737e-05, + "loss": 0.444, + "step": 892 + }, + { + "epoch": 1.8280450358239508, + "grad_norm": 0.3777203606733099, + "learning_rate": 3.198138837029227e-05, + "loss": 0.46, + "step": 893 + }, + { + "epoch": 1.8300921187308086, + "grad_norm": 0.3489126911275309, + "learning_rate": 3.195846660839311e-05, + "loss": 0.3887, + "step": 894 + }, + { + "epoch": 1.8321392016376663, + "grad_norm": 0.3633804424526499, + "learning_rate": 3.193552037218179e-05, + "loss": 0.5416, + "step": 895 + }, + { + "epoch": 1.834186284544524, + "grad_norm": 0.38704595278722626, + "learning_rate": 3.1912549708620314e-05, + "loss": 0.47, + "step": 896 + }, + { + "epoch": 1.8362333674513818, + "grad_norm": 0.36865192383072315, + "learning_rate": 3.188955466472063e-05, + "loss": 0.4084, + "step": 897 + }, + { + "epoch": 1.8382804503582395, + "grad_norm": 0.42348322893722706, + "learning_rate": 3.186653528754464e-05, + "loss": 0.4354, + "step": 898 + }, + { + "epoch": 1.8403275332650972, + "grad_norm": 0.37712189200316865, + "learning_rate": 3.184349162420401e-05, + "loss": 0.4011, + "step": 899 + }, + { + "epoch": 1.842374616171955, + "grad_norm": 0.40519216196971364, + "learning_rate": 3.182042372186013e-05, + "loss": 0.4523, + "step": 900 + }, + { + "epoch": 1.8444216990788127, + "grad_norm": 0.3270302439955843, + "learning_rate": 3.179733162772398e-05, + "loss": 0.3863, + "step": 901 + }, + { + "epoch": 1.8464687819856704, + "grad_norm": 0.36562347535937434, + "learning_rate": 3.177421538905606e-05, + "loss": 0.414, + "step": 902 + }, + { + "epoch": 1.8485158648925282, + "grad_norm": 0.34059679657441694, + "learning_rate": 3.17510750531663e-05, + "loss": 0.4319, + "step": 903 + }, + { + "epoch": 1.850562947799386, + "grad_norm": 0.3971233949934265, + "learning_rate": 3.172791066741392e-05, + "loss": 0.4051, + "step": 904 + }, + { + "epoch": 1.8526100307062436, + "grad_norm": 0.37489566812190156, + "learning_rate": 3.170472227920737e-05, + "loss": 0.4488, + "step": 905 + }, + { + "epoch": 1.8546571136131014, + "grad_norm": 0.3619651650704685, + "learning_rate": 3.168150993600424e-05, + "loss": 0.4097, + "step": 906 + }, + { + "epoch": 1.856704196519959, + "grad_norm": 0.4144444046604467, + "learning_rate": 3.165827368531113e-05, + "loss": 0.4838, + "step": 907 + }, + { + "epoch": 1.8587512794268168, + "grad_norm": 0.34162161820675707, + "learning_rate": 3.1635013574683564e-05, + "loss": 0.403, + "step": 908 + }, + { + "epoch": 1.8607983623336746, + "grad_norm": 0.3876122467545283, + "learning_rate": 3.161172965172591e-05, + "loss": 0.4564, + "step": 909 + }, + { + "epoch": 1.862845445240532, + "grad_norm": 0.34340552862244217, + "learning_rate": 3.1588421964091276e-05, + "loss": 0.397, + "step": 910 + }, + { + "epoch": 1.86489252814739, + "grad_norm": 0.383808899187042, + "learning_rate": 3.1565090559481396e-05, + "loss": 0.4265, + "step": 911 + }, + { + "epoch": 1.8669396110542475, + "grad_norm": 0.31690081389294966, + "learning_rate": 3.1541735485646536e-05, + "loss": 0.4047, + "step": 912 + }, + { + "epoch": 1.8689866939611055, + "grad_norm": 0.3992971993499514, + "learning_rate": 3.151835679038542e-05, + "loss": 0.439, + "step": 913 + }, + { + "epoch": 1.871033776867963, + "grad_norm": 0.31545614214320866, + "learning_rate": 3.149495452154512e-05, + "loss": 0.3986, + "step": 914 + }, + { + "epoch": 1.873080859774821, + "grad_norm": 0.33181454924376286, + "learning_rate": 3.147152872702092e-05, + "loss": 0.4364, + "step": 915 + }, + { + "epoch": 1.8751279426816785, + "grad_norm": 0.38596107764783966, + "learning_rate": 3.14480794547563e-05, + "loss": 0.4666, + "step": 916 + }, + { + "epoch": 1.8771750255885364, + "grad_norm": 0.3743307900880147, + "learning_rate": 3.142460675274275e-05, + "loss": 0.4136, + "step": 917 + }, + { + "epoch": 1.879222108495394, + "grad_norm": 0.39267653351820997, + "learning_rate": 3.1401110669019724e-05, + "loss": 0.4308, + "step": 918 + }, + { + "epoch": 1.881269191402252, + "grad_norm": 0.3865400215305747, + "learning_rate": 3.137759125167455e-05, + "loss": 0.4663, + "step": 919 + }, + { + "epoch": 1.8833162743091094, + "grad_norm": 0.36025373309076514, + "learning_rate": 3.135404854884226e-05, + "loss": 0.4202, + "step": 920 + }, + { + "epoch": 1.8853633572159674, + "grad_norm": 0.3774704107912035, + "learning_rate": 3.133048260870561e-05, + "loss": 0.4047, + "step": 921 + }, + { + "epoch": 1.8874104401228249, + "grad_norm": 0.3281213257339426, + "learning_rate": 3.130689347949486e-05, + "loss": 0.4088, + "step": 922 + }, + { + "epoch": 1.8894575230296828, + "grad_norm": 0.3593342361422244, + "learning_rate": 3.1283281209487755e-05, + "loss": 0.4475, + "step": 923 + }, + { + "epoch": 1.8915046059365404, + "grad_norm": 0.348849346649699, + "learning_rate": 3.1259645847009384e-05, + "loss": 0.4133, + "step": 924 + }, + { + "epoch": 1.8935516888433983, + "grad_norm": 0.3661633848236013, + "learning_rate": 3.123598744043211e-05, + "loss": 0.4345, + "step": 925 + }, + { + "epoch": 1.8955987717502558, + "grad_norm": 0.39376453806042766, + "learning_rate": 3.121230603817545e-05, + "loss": 0.4802, + "step": 926 + }, + { + "epoch": 1.8976458546571138, + "grad_norm": 0.4054577347062655, + "learning_rate": 3.1188601688706e-05, + "loss": 0.4861, + "step": 927 + }, + { + "epoch": 1.8996929375639713, + "grad_norm": 0.3244003846213942, + "learning_rate": 3.1164874440537295e-05, + "loss": 0.3988, + "step": 928 + }, + { + "epoch": 1.901740020470829, + "grad_norm": 0.3598304561011502, + "learning_rate": 3.114112434222976e-05, + "loss": 0.4083, + "step": 929 + }, + { + "epoch": 1.9037871033776868, + "grad_norm": 0.3818801716604425, + "learning_rate": 3.111735144239057e-05, + "loss": 0.424, + "step": 930 + }, + { + "epoch": 1.9058341862845445, + "grad_norm": 0.351513366570262, + "learning_rate": 3.109355578967356e-05, + "loss": 0.4529, + "step": 931 + }, + { + "epoch": 1.9078812691914022, + "grad_norm": 0.3240275031967729, + "learning_rate": 3.106973743277916e-05, + "loss": 0.4211, + "step": 932 + }, + { + "epoch": 1.90992835209826, + "grad_norm": 0.361341366033979, + "learning_rate": 3.104589642045422e-05, + "loss": 0.4776, + "step": 933 + }, + { + "epoch": 1.9119754350051177, + "grad_norm": 0.29231193321874205, + "learning_rate": 3.1022032801492e-05, + "loss": 0.3741, + "step": 934 + }, + { + "epoch": 1.9140225179119754, + "grad_norm": 0.3401516128458725, + "learning_rate": 3.099814662473202e-05, + "loss": 0.411, + "step": 935 + }, + { + "epoch": 1.9160696008188332, + "grad_norm": 0.3813395041273339, + "learning_rate": 3.0974237939059947e-05, + "loss": 0.4652, + "step": 936 + }, + { + "epoch": 1.9181166837256909, + "grad_norm": 0.33396626631772636, + "learning_rate": 3.095030679340751e-05, + "loss": 0.438, + "step": 937 + }, + { + "epoch": 1.9201637666325486, + "grad_norm": 0.27154874124436096, + "learning_rate": 3.092635323675245e-05, + "loss": 0.3297, + "step": 938 + }, + { + "epoch": 1.9222108495394064, + "grad_norm": 0.3729869848757238, + "learning_rate": 3.0902377318118336e-05, + "loss": 0.3925, + "step": 939 + }, + { + "epoch": 1.924257932446264, + "grad_norm": 0.3729082740901169, + "learning_rate": 3.0878379086574494e-05, + "loss": 0.4632, + "step": 940 + }, + { + "epoch": 1.9263050153531218, + "grad_norm": 0.33442817830860083, + "learning_rate": 3.085435859123596e-05, + "loss": 0.4246, + "step": 941 + }, + { + "epoch": 1.9283520982599796, + "grad_norm": 0.3455333402015141, + "learning_rate": 3.083031588126329e-05, + "loss": 0.4291, + "step": 942 + }, + { + "epoch": 1.9303991811668373, + "grad_norm": 0.33030857885870823, + "learning_rate": 3.0806251005862535e-05, + "loss": 0.4293, + "step": 943 + }, + { + "epoch": 1.932446264073695, + "grad_norm": 0.3313566999610316, + "learning_rate": 3.07821640142851e-05, + "loss": 0.4691, + "step": 944 + }, + { + "epoch": 1.9344933469805528, + "grad_norm": 0.3371774148631356, + "learning_rate": 3.0758054955827655e-05, + "loss": 0.4283, + "step": 945 + }, + { + "epoch": 1.9365404298874105, + "grad_norm": 0.35055569911779827, + "learning_rate": 3.073392387983202e-05, + "loss": 0.4157, + "step": 946 + }, + { + "epoch": 1.9385875127942682, + "grad_norm": 0.3713512962937003, + "learning_rate": 3.070977083568508e-05, + "loss": 0.4709, + "step": 947 + }, + { + "epoch": 1.940634595701126, + "grad_norm": 0.3121880967121788, + "learning_rate": 3.06855958728187e-05, + "loss": 0.3584, + "step": 948 + }, + { + "epoch": 1.9426816786079835, + "grad_norm": 0.3923276029776799, + "learning_rate": 3.0661399040709584e-05, + "loss": 0.4273, + "step": 949 + }, + { + "epoch": 1.9447287615148414, + "grad_norm": 0.39501548252234137, + "learning_rate": 3.0637180388879207e-05, + "loss": 0.4292, + "step": 950 + }, + { + "epoch": 1.946775844421699, + "grad_norm": 0.3918727723912462, + "learning_rate": 3.061293996689369e-05, + "loss": 0.4422, + "step": 951 + }, + { + "epoch": 1.9488229273285569, + "grad_norm": 0.3650853100344803, + "learning_rate": 3.05886778243637e-05, + "loss": 0.4355, + "step": 952 + }, + { + "epoch": 1.9508700102354144, + "grad_norm": 0.40314024207731314, + "learning_rate": 3.0564394010944396e-05, + "loss": 0.3964, + "step": 953 + }, + { + "epoch": 1.9529170931422724, + "grad_norm": 0.3321329959720213, + "learning_rate": 3.054008857633524e-05, + "loss": 0.3802, + "step": 954 + }, + { + "epoch": 1.9549641760491299, + "grad_norm": 0.3578935619719161, + "learning_rate": 3.051576157027998e-05, + "loss": 0.4187, + "step": 955 + }, + { + "epoch": 1.9570112589559878, + "grad_norm": 0.36008706778157745, + "learning_rate": 3.0491413042566492e-05, + "loss": 0.421, + "step": 956 + }, + { + "epoch": 1.9590583418628453, + "grad_norm": 0.35653968726696394, + "learning_rate": 3.0467043043026705e-05, + "loss": 0.3773, + "step": 957 + }, + { + "epoch": 1.9611054247697033, + "grad_norm": 0.398663136011159, + "learning_rate": 3.0442651621536502e-05, + "loss": 0.4867, + "step": 958 + }, + { + "epoch": 1.9631525076765608, + "grad_norm": 0.39803480683406417, + "learning_rate": 3.041823882801559e-05, + "loss": 0.4572, + "step": 959 + }, + { + "epoch": 1.9651995905834188, + "grad_norm": 0.31571236419876386, + "learning_rate": 3.039380471242743e-05, + "loss": 0.3953, + "step": 960 + }, + { + "epoch": 1.9672466734902763, + "grad_norm": 0.356704934998217, + "learning_rate": 3.0369349324779115e-05, + "loss": 0.4116, + "step": 961 + }, + { + "epoch": 1.9692937563971342, + "grad_norm": 0.38456016779817315, + "learning_rate": 3.0344872715121276e-05, + "loss": 0.4181, + "step": 962 + }, + { + "epoch": 1.9713408393039917, + "grad_norm": 0.42725505346001535, + "learning_rate": 3.0320374933547982e-05, + "loss": 0.4509, + "step": 963 + }, + { + "epoch": 1.9733879222108497, + "grad_norm": 0.3124541183454085, + "learning_rate": 3.0295856030196618e-05, + "loss": 0.3635, + "step": 964 + }, + { + "epoch": 1.9754350051177072, + "grad_norm": 0.38135606484411233, + "learning_rate": 3.0271316055247812e-05, + "loss": 0.4322, + "step": 965 + }, + { + "epoch": 1.9774820880245652, + "grad_norm": 0.3564421243258828, + "learning_rate": 3.024675505892531e-05, + "loss": 0.4706, + "step": 966 + }, + { + "epoch": 1.9795291709314227, + "grad_norm": 0.33614285440354186, + "learning_rate": 3.022217309149588e-05, + "loss": 0.3916, + "step": 967 + }, + { + "epoch": 1.9815762538382804, + "grad_norm": 0.3642272706560808, + "learning_rate": 3.019757020326921e-05, + "loss": 0.4001, + "step": 968 + }, + { + "epoch": 1.9836233367451381, + "grad_norm": 0.3677389037382802, + "learning_rate": 3.017294644459782e-05, + "loss": 0.4067, + "step": 969 + }, + { + "epoch": 1.9856704196519959, + "grad_norm": 0.38491158269193576, + "learning_rate": 3.0148301865876913e-05, + "loss": 0.4266, + "step": 970 + }, + { + "epoch": 1.9877175025588536, + "grad_norm": 0.4027798698020095, + "learning_rate": 3.0123636517544326e-05, + "loss": 0.5046, + "step": 971 + }, + { + "epoch": 1.9897645854657113, + "grad_norm": 0.36103993797318307, + "learning_rate": 3.0098950450080404e-05, + "loss": 0.3863, + "step": 972 + }, + { + "epoch": 1.991811668372569, + "grad_norm": 0.3202323840951822, + "learning_rate": 3.0074243714007875e-05, + "loss": 0.3562, + "step": 973 + }, + { + "epoch": 1.9938587512794268, + "grad_norm": 0.3949919429980765, + "learning_rate": 3.004951635989179e-05, + "loss": 0.4732, + "step": 974 + }, + { + "epoch": 1.9959058341862845, + "grad_norm": 0.3475495093601558, + "learning_rate": 3.0024768438339388e-05, + "loss": 0.391, + "step": 975 + }, + { + "epoch": 1.9979529170931423, + "grad_norm": 0.39216689067501626, + "learning_rate": 3.0000000000000004e-05, + "loss": 0.4564, + "step": 976 + }, + { + "epoch": 2.0, + "grad_norm": 0.6533349441460323, + "learning_rate": 2.9975211095564955e-05, + "loss": 0.5882, + "step": 977 + }, + { + "epoch": 2.0020470829068575, + "grad_norm": 0.4211988149390974, + "learning_rate": 2.995040177576745e-05, + "loss": 0.3174, + "step": 978 + }, + { + "epoch": 2.0040941658137155, + "grad_norm": 0.6108187632741519, + "learning_rate": 2.992557209138249e-05, + "loss": 0.3367, + "step": 979 + }, + { + "epoch": 2.006141248720573, + "grad_norm": 0.38089369292387826, + "learning_rate": 2.9900722093226737e-05, + "loss": 0.3232, + "step": 980 + }, + { + "epoch": 2.008188331627431, + "grad_norm": 0.47508227914348694, + "learning_rate": 2.9875851832158428e-05, + "loss": 0.3522, + "step": 981 + }, + { + "epoch": 2.0102354145342884, + "grad_norm": 0.4326113199153507, + "learning_rate": 2.9850961359077293e-05, + "loss": 0.2981, + "step": 982 + }, + { + "epoch": 2.0122824974411464, + "grad_norm": 0.42932267247803857, + "learning_rate": 2.98260507249244e-05, + "loss": 0.3202, + "step": 983 + }, + { + "epoch": 2.014329580348004, + "grad_norm": 0.3951716605003508, + "learning_rate": 2.9801119980682095e-05, + "loss": 0.3509, + "step": 984 + }, + { + "epoch": 2.016376663254862, + "grad_norm": 0.35789828953920616, + "learning_rate": 2.977616917737388e-05, + "loss": 0.3281, + "step": 985 + }, + { + "epoch": 2.0184237461617194, + "grad_norm": 0.3460413679418126, + "learning_rate": 2.9751198366064304e-05, + "loss": 0.3161, + "step": 986 + }, + { + "epoch": 2.0204708290685773, + "grad_norm": 0.3703844711305289, + "learning_rate": 2.9726207597858872e-05, + "loss": 0.3251, + "step": 987 + }, + { + "epoch": 2.022517911975435, + "grad_norm": 0.39238883821417286, + "learning_rate": 2.9701196923903927e-05, + "loss": 0.3391, + "step": 988 + }, + { + "epoch": 2.024564994882293, + "grad_norm": 0.34303403649979675, + "learning_rate": 2.9676166395386553e-05, + "loss": 0.361, + "step": 989 + }, + { + "epoch": 2.0266120777891503, + "grad_norm": 0.38788579378291205, + "learning_rate": 2.965111606353447e-05, + "loss": 0.312, + "step": 990 + }, + { + "epoch": 2.0286591606960083, + "grad_norm": 0.4013869555066414, + "learning_rate": 2.9626045979615928e-05, + "loss": 0.3209, + "step": 991 + }, + { + "epoch": 2.030706243602866, + "grad_norm": 0.37089151639897344, + "learning_rate": 2.9600956194939598e-05, + "loss": 0.3498, + "step": 992 + }, + { + "epoch": 2.0327533265097237, + "grad_norm": 0.371332657755311, + "learning_rate": 2.957584676085447e-05, + "loss": 0.3422, + "step": 993 + }, + { + "epoch": 2.0348004094165812, + "grad_norm": 0.3762690812523185, + "learning_rate": 2.9550717728749768e-05, + "loss": 0.3199, + "step": 994 + }, + { + "epoch": 2.036847492323439, + "grad_norm": 0.3431217144511242, + "learning_rate": 2.9525569150054796e-05, + "loss": 0.2871, + "step": 995 + }, + { + "epoch": 2.0388945752302967, + "grad_norm": 0.42824685102967736, + "learning_rate": 2.950040107623887e-05, + "loss": 0.3268, + "step": 996 + }, + { + "epoch": 2.0409416581371547, + "grad_norm": 0.3459406390165573, + "learning_rate": 2.947521355881122e-05, + "loss": 0.3268, + "step": 997 + }, + { + "epoch": 2.042988741044012, + "grad_norm": 0.3484085391017404, + "learning_rate": 2.9450006649320862e-05, + "loss": 0.325, + "step": 998 + }, + { + "epoch": 2.04503582395087, + "grad_norm": 0.3537579300316069, + "learning_rate": 2.9424780399356497e-05, + "loss": 0.3144, + "step": 999 + }, + { + "epoch": 2.0470829068577276, + "grad_norm": 0.3540499619721327, + "learning_rate": 2.9399534860546404e-05, + "loss": 0.3324, + "step": 1000 + }, + { + "epoch": 2.0491299897645856, + "grad_norm": 0.3146082628297793, + "learning_rate": 2.937427008455835e-05, + "loss": 0.3458, + "step": 1001 + }, + { + "epoch": 2.051177072671443, + "grad_norm": 0.41424472646515875, + "learning_rate": 2.9348986123099462e-05, + "loss": 0.3638, + "step": 1002 + }, + { + "epoch": 2.053224155578301, + "grad_norm": 0.3640670691803175, + "learning_rate": 2.932368302791614e-05, + "loss": 0.3596, + "step": 1003 + }, + { + "epoch": 2.0552712384851586, + "grad_norm": 0.38223463417283743, + "learning_rate": 2.9298360850793944e-05, + "loss": 0.3048, + "step": 1004 + }, + { + "epoch": 2.0573183213920165, + "grad_norm": 0.35066959498537137, + "learning_rate": 2.9273019643557474e-05, + "loss": 0.3154, + "step": 1005 + }, + { + "epoch": 2.059365404298874, + "grad_norm": 0.3912112568074411, + "learning_rate": 2.92476594580703e-05, + "loss": 0.3397, + "step": 1006 + }, + { + "epoch": 2.061412487205732, + "grad_norm": 0.40688373618091567, + "learning_rate": 2.9222280346234816e-05, + "loss": 0.3315, + "step": 1007 + }, + { + "epoch": 2.0634595701125895, + "grad_norm": 0.3408605767984647, + "learning_rate": 2.919688235999215e-05, + "loss": 0.3334, + "step": 1008 + }, + { + "epoch": 2.0655066530194475, + "grad_norm": 0.35811936384341014, + "learning_rate": 2.917146555132206e-05, + "loss": 0.3386, + "step": 1009 + }, + { + "epoch": 2.067553735926305, + "grad_norm": 0.3364760604746556, + "learning_rate": 2.914602997224285e-05, + "loss": 0.3199, + "step": 1010 + }, + { + "epoch": 2.069600818833163, + "grad_norm": 0.4118145924928188, + "learning_rate": 2.912057567481119e-05, + "loss": 0.3507, + "step": 1011 + }, + { + "epoch": 2.0716479017400204, + "grad_norm": 0.3612606023218023, + "learning_rate": 2.909510271112212e-05, + "loss": 0.3462, + "step": 1012 + }, + { + "epoch": 2.0736949846468784, + "grad_norm": 0.3222144557445549, + "learning_rate": 2.906961113330883e-05, + "loss": 0.3271, + "step": 1013 + }, + { + "epoch": 2.075742067553736, + "grad_norm": 0.4762234619066632, + "learning_rate": 2.904410099354263e-05, + "loss": 0.311, + "step": 1014 + }, + { + "epoch": 2.0777891504605934, + "grad_norm": 0.3407988463597387, + "learning_rate": 2.9018572344032823e-05, + "loss": 0.3242, + "step": 1015 + }, + { + "epoch": 2.0798362333674514, + "grad_norm": 0.38477863373629423, + "learning_rate": 2.8993025237026578e-05, + "loss": 0.3472, + "step": 1016 + }, + { + "epoch": 2.0818833162743093, + "grad_norm": 0.4212662322514699, + "learning_rate": 2.8967459724808856e-05, + "loss": 0.3055, + "step": 1017 + }, + { + "epoch": 2.083930399181167, + "grad_norm": 0.34644422983431267, + "learning_rate": 2.8941875859702283e-05, + "loss": 0.3099, + "step": 1018 + }, + { + "epoch": 2.0859774820880244, + "grad_norm": 0.38660401266521843, + "learning_rate": 2.891627369406703e-05, + "loss": 0.3301, + "step": 1019 + }, + { + "epoch": 2.0880245649948823, + "grad_norm": 0.33570902053280804, + "learning_rate": 2.889065328030074e-05, + "loss": 0.3559, + "step": 1020 + }, + { + "epoch": 2.09007164790174, + "grad_norm": 0.3938573985227862, + "learning_rate": 2.88650146708384e-05, + "loss": 0.3301, + "step": 1021 + }, + { + "epoch": 2.092118730808598, + "grad_norm": 0.37756710702648894, + "learning_rate": 2.883935791815222e-05, + "loss": 0.3413, + "step": 1022 + }, + { + "epoch": 2.0941658137154553, + "grad_norm": 0.36055751260111174, + "learning_rate": 2.8813683074751578e-05, + "loss": 0.3469, + "step": 1023 + }, + { + "epoch": 2.0962128966223132, + "grad_norm": 0.34337336038390814, + "learning_rate": 2.878799019318283e-05, + "loss": 0.3339, + "step": 1024 + }, + { + "epoch": 2.0982599795291708, + "grad_norm": 0.37844750997650106, + "learning_rate": 2.8762279326029293e-05, + "loss": 0.3278, + "step": 1025 + }, + { + "epoch": 2.1003070624360287, + "grad_norm": 0.3641190698552649, + "learning_rate": 2.8736550525911066e-05, + "loss": 0.3548, + "step": 1026 + }, + { + "epoch": 2.1023541453428862, + "grad_norm": 0.36462562710150886, + "learning_rate": 2.8710803845484955e-05, + "loss": 0.321, + "step": 1027 + }, + { + "epoch": 2.104401228249744, + "grad_norm": 0.3706428526021544, + "learning_rate": 2.8685039337444368e-05, + "loss": 0.3174, + "step": 1028 + }, + { + "epoch": 2.1064483111566017, + "grad_norm": 0.3276491863527004, + "learning_rate": 2.8659257054519182e-05, + "loss": 0.3046, + "step": 1029 + }, + { + "epoch": 2.1084953940634596, + "grad_norm": 0.3247212720041268, + "learning_rate": 2.8633457049475678e-05, + "loss": 0.3526, + "step": 1030 + }, + { + "epoch": 2.110542476970317, + "grad_norm": 0.3326121138866842, + "learning_rate": 2.8607639375116388e-05, + "loss": 0.3179, + "step": 1031 + }, + { + "epoch": 2.112589559877175, + "grad_norm": 0.36702351353785473, + "learning_rate": 2.858180408428001e-05, + "loss": 0.3393, + "step": 1032 + }, + { + "epoch": 2.1146366427840326, + "grad_norm": 0.31780331970447684, + "learning_rate": 2.855595122984129e-05, + "loss": 0.3248, + "step": 1033 + }, + { + "epoch": 2.1166837256908906, + "grad_norm": 0.32446381302593813, + "learning_rate": 2.853008086471094e-05, + "loss": 0.3283, + "step": 1034 + }, + { + "epoch": 2.118730808597748, + "grad_norm": 0.3428912531231067, + "learning_rate": 2.8504193041835497e-05, + "loss": 0.3048, + "step": 1035 + }, + { + "epoch": 2.120777891504606, + "grad_norm": 0.4004823401099236, + "learning_rate": 2.847828781419722e-05, + "loss": 0.3667, + "step": 1036 + }, + { + "epoch": 2.1228249744114636, + "grad_norm": 0.36467750504991164, + "learning_rate": 2.8452365234813992e-05, + "loss": 0.3601, + "step": 1037 + }, + { + "epoch": 2.1248720573183215, + "grad_norm": 0.3853260717775913, + "learning_rate": 2.842642535673922e-05, + "loss": 0.3289, + "step": 1038 + }, + { + "epoch": 2.126919140225179, + "grad_norm": 0.3693312153767782, + "learning_rate": 2.8400468233061708e-05, + "loss": 0.3147, + "step": 1039 + }, + { + "epoch": 2.128966223132037, + "grad_norm": 0.38807123819436246, + "learning_rate": 2.8374493916905544e-05, + "loss": 0.3269, + "step": 1040 + }, + { + "epoch": 2.1310133060388945, + "grad_norm": 0.3324036292794174, + "learning_rate": 2.834850246143002e-05, + "loss": 0.3076, + "step": 1041 + }, + { + "epoch": 2.1330603889457525, + "grad_norm": 0.36633069839093463, + "learning_rate": 2.832249391982949e-05, + "loss": 0.3315, + "step": 1042 + }, + { + "epoch": 2.13510747185261, + "grad_norm": 0.348911620524954, + "learning_rate": 2.8296468345333298e-05, + "loss": 0.2945, + "step": 1043 + }, + { + "epoch": 2.137154554759468, + "grad_norm": 0.4313978670199707, + "learning_rate": 2.827042579120562e-05, + "loss": 0.3556, + "step": 1044 + }, + { + "epoch": 2.1392016376663254, + "grad_norm": 0.33673070921204956, + "learning_rate": 2.8244366310745398e-05, + "loss": 0.3301, + "step": 1045 + }, + { + "epoch": 2.1412487205731834, + "grad_norm": 0.4048102626926484, + "learning_rate": 2.8218289957286226e-05, + "loss": 0.3672, + "step": 1046 + }, + { + "epoch": 2.143295803480041, + "grad_norm": 0.41846672934257156, + "learning_rate": 2.8192196784196198e-05, + "loss": 0.3148, + "step": 1047 + }, + { + "epoch": 2.145342886386899, + "grad_norm": 0.3744830737938391, + "learning_rate": 2.816608684487787e-05, + "loss": 0.3252, + "step": 1048 + }, + { + "epoch": 2.1473899692937564, + "grad_norm": 0.40525534049659034, + "learning_rate": 2.813996019276809e-05, + "loss": 0.3354, + "step": 1049 + }, + { + "epoch": 2.1494370522006143, + "grad_norm": 0.3733330999629505, + "learning_rate": 2.8113816881337902e-05, + "loss": 0.3146, + "step": 1050 + }, + { + "epoch": 2.151484135107472, + "grad_norm": 0.39634148645678874, + "learning_rate": 2.8087656964092472e-05, + "loss": 0.3041, + "step": 1051 + }, + { + "epoch": 2.15353121801433, + "grad_norm": 0.3852615546043453, + "learning_rate": 2.806148049457093e-05, + "loss": 0.3639, + "step": 1052 + }, + { + "epoch": 2.1555783009211873, + "grad_norm": 0.3896444126363705, + "learning_rate": 2.803528752634629e-05, + "loss": 0.3317, + "step": 1053 + }, + { + "epoch": 2.1576253838280453, + "grad_norm": 0.38474286304985633, + "learning_rate": 2.8009078113025335e-05, + "loss": 0.3363, + "step": 1054 + }, + { + "epoch": 2.1596724667349028, + "grad_norm": 0.36285165391849056, + "learning_rate": 2.798285230824849e-05, + "loss": 0.3088, + "step": 1055 + }, + { + "epoch": 2.1617195496417603, + "grad_norm": 0.39881869082842497, + "learning_rate": 2.795661016568975e-05, + "loss": 0.3472, + "step": 1056 + }, + { + "epoch": 2.1637666325486182, + "grad_norm": 0.36428100096066146, + "learning_rate": 2.7930351739056533e-05, + "loss": 0.347, + "step": 1057 + }, + { + "epoch": 2.1658137154554757, + "grad_norm": 0.4335731332830444, + "learning_rate": 2.7904077082089574e-05, + "loss": 0.325, + "step": 1058 + }, + { + "epoch": 2.1678607983623337, + "grad_norm": 0.3984251614786555, + "learning_rate": 2.787778624856286e-05, + "loss": 0.3066, + "step": 1059 + }, + { + "epoch": 2.169907881269191, + "grad_norm": 0.42180327285845043, + "learning_rate": 2.7851479292283442e-05, + "loss": 0.3415, + "step": 1060 + }, + { + "epoch": 2.171954964176049, + "grad_norm": 0.41842751411141604, + "learning_rate": 2.782515626709139e-05, + "loss": 0.3498, + "step": 1061 + }, + { + "epoch": 2.1740020470829067, + "grad_norm": 0.3982687240967601, + "learning_rate": 2.7798817226859678e-05, + "loss": 0.3311, + "step": 1062 + }, + { + "epoch": 2.1760491299897646, + "grad_norm": 0.3653496091806209, + "learning_rate": 2.7772462225494013e-05, + "loss": 0.3393, + "step": 1063 + }, + { + "epoch": 2.178096212896622, + "grad_norm": 0.35694917670185916, + "learning_rate": 2.7746091316932807e-05, + "loss": 0.2938, + "step": 1064 + }, + { + "epoch": 2.18014329580348, + "grad_norm": 0.36076433028006755, + "learning_rate": 2.7719704555147012e-05, + "loss": 0.3176, + "step": 1065 + }, + { + "epoch": 2.1821903787103376, + "grad_norm": 0.35991273790931844, + "learning_rate": 2.7693301994140026e-05, + "loss": 0.3369, + "step": 1066 + }, + { + "epoch": 2.1842374616171956, + "grad_norm": 0.3605864665846731, + "learning_rate": 2.7666883687947588e-05, + "loss": 0.308, + "step": 1067 + }, + { + "epoch": 2.186284544524053, + "grad_norm": 0.3868682601764087, + "learning_rate": 2.7640449690637642e-05, + "loss": 0.335, + "step": 1068 + }, + { + "epoch": 2.188331627430911, + "grad_norm": 0.37080898315589017, + "learning_rate": 2.761400005631028e-05, + "loss": 0.3339, + "step": 1069 + }, + { + "epoch": 2.1903787103377685, + "grad_norm": 0.35560785487130986, + "learning_rate": 2.7587534839097556e-05, + "loss": 0.3348, + "step": 1070 + }, + { + "epoch": 2.1924257932446265, + "grad_norm": 0.36780049563494116, + "learning_rate": 2.756105409316345e-05, + "loss": 0.3042, + "step": 1071 + }, + { + "epoch": 2.194472876151484, + "grad_norm": 0.3750046262445992, + "learning_rate": 2.7534557872703705e-05, + "loss": 0.3286, + "step": 1072 + }, + { + "epoch": 2.196519959058342, + "grad_norm": 0.3643852622155405, + "learning_rate": 2.750804623194574e-05, + "loss": 0.3202, + "step": 1073 + }, + { + "epoch": 2.1985670419651995, + "grad_norm": 0.35562711733903674, + "learning_rate": 2.7481519225148537e-05, + "loss": 0.3158, + "step": 1074 + }, + { + "epoch": 2.2006141248720574, + "grad_norm": 0.34699724820587735, + "learning_rate": 2.7454976906602513e-05, + "loss": 0.3635, + "step": 1075 + }, + { + "epoch": 2.202661207778915, + "grad_norm": 0.3632869672814209, + "learning_rate": 2.742841933062944e-05, + "loss": 0.3378, + "step": 1076 + }, + { + "epoch": 2.204708290685773, + "grad_norm": 0.35456528308188723, + "learning_rate": 2.7401846551582304e-05, + "loss": 0.3078, + "step": 1077 + }, + { + "epoch": 2.2067553735926304, + "grad_norm": 0.4081395245932041, + "learning_rate": 2.7375258623845207e-05, + "loss": 0.3429, + "step": 1078 + }, + { + "epoch": 2.2088024564994884, + "grad_norm": 0.34135550119349445, + "learning_rate": 2.7348655601833255e-05, + "loss": 0.3338, + "step": 1079 + }, + { + "epoch": 2.210849539406346, + "grad_norm": 0.42118938931480804, + "learning_rate": 2.7322037539992457e-05, + "loss": 0.3327, + "step": 1080 + }, + { + "epoch": 2.212896622313204, + "grad_norm": 0.37577320629275823, + "learning_rate": 2.7295404492799575e-05, + "loss": 0.3002, + "step": 1081 + }, + { + "epoch": 2.2149437052200613, + "grad_norm": 0.3303091254072807, + "learning_rate": 2.726875651476207e-05, + "loss": 0.3106, + "step": 1082 + }, + { + "epoch": 2.2169907881269193, + "grad_norm": 0.35777928959386923, + "learning_rate": 2.7242093660417954e-05, + "loss": 0.364, + "step": 1083 + }, + { + "epoch": 2.219037871033777, + "grad_norm": 0.3963714636770776, + "learning_rate": 2.721541598433567e-05, + "loss": 0.2969, + "step": 1084 + }, + { + "epoch": 2.2210849539406348, + "grad_norm": 0.3409183879753277, + "learning_rate": 2.718872354111401e-05, + "loss": 0.3346, + "step": 1085 + }, + { + "epoch": 2.2231320368474923, + "grad_norm": 0.35338149712684697, + "learning_rate": 2.7162016385381975e-05, + "loss": 0.3648, + "step": 1086 + }, + { + "epoch": 2.2251791197543502, + "grad_norm": 0.3315730951687613, + "learning_rate": 2.7135294571798706e-05, + "loss": 0.3063, + "step": 1087 + }, + { + "epoch": 2.2272262026612077, + "grad_norm": 0.3449763315274245, + "learning_rate": 2.7108558155053296e-05, + "loss": 0.3403, + "step": 1088 + }, + { + "epoch": 2.2292732855680657, + "grad_norm": 0.37034261955083203, + "learning_rate": 2.7081807189864764e-05, + "loss": 0.3583, + "step": 1089 + }, + { + "epoch": 2.231320368474923, + "grad_norm": 0.3621281223181069, + "learning_rate": 2.70550417309819e-05, + "loss": 0.3144, + "step": 1090 + }, + { + "epoch": 2.233367451381781, + "grad_norm": 0.39746428647523574, + "learning_rate": 2.7028261833183132e-05, + "loss": 0.3376, + "step": 1091 + }, + { + "epoch": 2.2354145342886387, + "grad_norm": 0.34536466760948237, + "learning_rate": 2.7001467551276464e-05, + "loss": 0.2973, + "step": 1092 + }, + { + "epoch": 2.237461617195496, + "grad_norm": 0.3255654437979655, + "learning_rate": 2.6974658940099337e-05, + "loss": 0.3222, + "step": 1093 + }, + { + "epoch": 2.239508700102354, + "grad_norm": 0.34570256484501904, + "learning_rate": 2.6947836054518484e-05, + "loss": 0.3585, + "step": 1094 + }, + { + "epoch": 2.241555783009212, + "grad_norm": 0.3325137281496525, + "learning_rate": 2.6920998949429913e-05, + "loss": 0.309, + "step": 1095 + }, + { + "epoch": 2.2436028659160696, + "grad_norm": 0.34364314604723273, + "learning_rate": 2.6894147679758678e-05, + "loss": 0.332, + "step": 1096 + }, + { + "epoch": 2.245649948822927, + "grad_norm": 0.3323431139976392, + "learning_rate": 2.6867282300458853e-05, + "loss": 0.3365, + "step": 1097 + }, + { + "epoch": 2.247697031729785, + "grad_norm": 0.36326221718241064, + "learning_rate": 2.684040286651338e-05, + "loss": 0.3361, + "step": 1098 + }, + { + "epoch": 2.2497441146366426, + "grad_norm": 0.3309544898142775, + "learning_rate": 2.6813509432933957e-05, + "loss": 0.3142, + "step": 1099 + }, + { + "epoch": 2.2517911975435005, + "grad_norm": 0.3686682735992276, + "learning_rate": 2.6786602054760952e-05, + "loss": 0.3078, + "step": 1100 + }, + { + "epoch": 2.253838280450358, + "grad_norm": 0.3515712454533351, + "learning_rate": 2.675968078706326e-05, + "loss": 0.3151, + "step": 1101 + }, + { + "epoch": 2.255885363357216, + "grad_norm": 0.3495717010136385, + "learning_rate": 2.673274568493821e-05, + "loss": 0.3243, + "step": 1102 + }, + { + "epoch": 2.2579324462640735, + "grad_norm": 0.34663490722766044, + "learning_rate": 2.670579680351143e-05, + "loss": 0.3284, + "step": 1103 + }, + { + "epoch": 2.2599795291709315, + "grad_norm": 0.34744444318737083, + "learning_rate": 2.667883419793676e-05, + "loss": 0.313, + "step": 1104 + }, + { + "epoch": 2.262026612077789, + "grad_norm": 0.38465408810908674, + "learning_rate": 2.6651857923396132e-05, + "loss": 0.3759, + "step": 1105 + }, + { + "epoch": 2.264073694984647, + "grad_norm": 0.3414863910860875, + "learning_rate": 2.6624868035099445e-05, + "loss": 0.3336, + "step": 1106 + }, + { + "epoch": 2.2661207778915045, + "grad_norm": 0.33861934140219296, + "learning_rate": 2.659786458828446e-05, + "loss": 0.3587, + "step": 1107 + }, + { + "epoch": 2.2681678607983624, + "grad_norm": 0.32698315646381576, + "learning_rate": 2.6570847638216698e-05, + "loss": 0.3506, + "step": 1108 + }, + { + "epoch": 2.27021494370522, + "grad_norm": 0.3303135891963801, + "learning_rate": 2.65438172401893e-05, + "loss": 0.3451, + "step": 1109 + }, + { + "epoch": 2.272262026612078, + "grad_norm": 0.3769317246092125, + "learning_rate": 2.6516773449522936e-05, + "loss": 0.3257, + "step": 1110 + }, + { + "epoch": 2.2743091095189354, + "grad_norm": 0.3511492114236981, + "learning_rate": 2.648971632156569e-05, + "loss": 0.3147, + "step": 1111 + }, + { + "epoch": 2.2763561924257933, + "grad_norm": 0.3675620800860699, + "learning_rate": 2.6462645911692938e-05, + "loss": 0.2979, + "step": 1112 + }, + { + "epoch": 2.278403275332651, + "grad_norm": 0.3792036243843211, + "learning_rate": 2.643556227530724e-05, + "loss": 0.3065, + "step": 1113 + }, + { + "epoch": 2.280450358239509, + "grad_norm": 0.32407850095411667, + "learning_rate": 2.6408465467838225e-05, + "loss": 0.332, + "step": 1114 + }, + { + "epoch": 2.2824974411463663, + "grad_norm": 0.31664427206527734, + "learning_rate": 2.6381355544742482e-05, + "loss": 0.3963, + "step": 1115 + }, + { + "epoch": 2.2845445240532243, + "grad_norm": 0.34686617702001726, + "learning_rate": 2.6354232561503433e-05, + "loss": 0.3357, + "step": 1116 + }, + { + "epoch": 2.286591606960082, + "grad_norm": 0.33621332629180944, + "learning_rate": 2.632709657363124e-05, + "loss": 0.3089, + "step": 1117 + }, + { + "epoch": 2.2886386898669397, + "grad_norm": 0.33153383111460555, + "learning_rate": 2.6299947636662673e-05, + "loss": 0.3054, + "step": 1118 + }, + { + "epoch": 2.2906857727737973, + "grad_norm": 0.3416358925904056, + "learning_rate": 2.6272785806161005e-05, + "loss": 0.3278, + "step": 1119 + }, + { + "epoch": 2.292732855680655, + "grad_norm": 0.40808146160514075, + "learning_rate": 2.6245611137715897e-05, + "loss": 0.3519, + "step": 1120 + }, + { + "epoch": 2.2947799385875127, + "grad_norm": 0.32524026497753233, + "learning_rate": 2.621842368694329e-05, + "loss": 0.3239, + "step": 1121 + }, + { + "epoch": 2.2968270214943707, + "grad_norm": 0.3658777033819554, + "learning_rate": 2.6191223509485273e-05, + "loss": 0.3286, + "step": 1122 + }, + { + "epoch": 2.298874104401228, + "grad_norm": 0.36322923087428066, + "learning_rate": 2.6164010661010007e-05, + "loss": 0.3364, + "step": 1123 + }, + { + "epoch": 2.300921187308086, + "grad_norm": 0.3278681613056945, + "learning_rate": 2.613678519721155e-05, + "loss": 0.3086, + "step": 1124 + }, + { + "epoch": 2.3029682702149437, + "grad_norm": 0.3547085800078984, + "learning_rate": 2.61095471738098e-05, + "loss": 0.3215, + "step": 1125 + }, + { + "epoch": 2.3050153531218016, + "grad_norm": 0.31393691776472127, + "learning_rate": 2.6082296646550364e-05, + "loss": 0.3114, + "step": 1126 + }, + { + "epoch": 2.307062436028659, + "grad_norm": 0.35347094972995313, + "learning_rate": 2.605503367120442e-05, + "loss": 0.3621, + "step": 1127 + }, + { + "epoch": 2.309109518935517, + "grad_norm": 0.3314912682101731, + "learning_rate": 2.6027758303568643e-05, + "loss": 0.3196, + "step": 1128 + }, + { + "epoch": 2.3111566018423746, + "grad_norm": 0.30778697872934085, + "learning_rate": 2.6000470599465065e-05, + "loss": 0.3068, + "step": 1129 + }, + { + "epoch": 2.313203684749232, + "grad_norm": 0.37088270704077186, + "learning_rate": 2.5973170614740946e-05, + "loss": 0.373, + "step": 1130 + }, + { + "epoch": 2.31525076765609, + "grad_norm": 0.34268176472285344, + "learning_rate": 2.5945858405268714e-05, + "loss": 0.3413, + "step": 1131 + }, + { + "epoch": 2.317297850562948, + "grad_norm": 0.3248279403276539, + "learning_rate": 2.5918534026945787e-05, + "loss": 0.3559, + "step": 1132 + }, + { + "epoch": 2.3193449334698055, + "grad_norm": 0.30464581495860377, + "learning_rate": 2.5891197535694507e-05, + "loss": 0.3367, + "step": 1133 + }, + { + "epoch": 2.321392016376663, + "grad_norm": 0.35318148922449216, + "learning_rate": 2.5863848987461993e-05, + "loss": 0.3529, + "step": 1134 + }, + { + "epoch": 2.323439099283521, + "grad_norm": 0.37807352066757405, + "learning_rate": 2.5836488438220044e-05, + "loss": 0.3347, + "step": 1135 + }, + { + "epoch": 2.325486182190379, + "grad_norm": 0.3608071303513835, + "learning_rate": 2.5809115943965027e-05, + "loss": 0.3366, + "step": 1136 + }, + { + "epoch": 2.3275332650972365, + "grad_norm": 0.3499879431406946, + "learning_rate": 2.5781731560717745e-05, + "loss": 0.3106, + "step": 1137 + }, + { + "epoch": 2.329580348004094, + "grad_norm": 0.3591909142140177, + "learning_rate": 2.575433534452334e-05, + "loss": 0.3396, + "step": 1138 + }, + { + "epoch": 2.331627430910952, + "grad_norm": 0.33212351658248346, + "learning_rate": 2.5726927351451178e-05, + "loss": 0.3439, + "step": 1139 + }, + { + "epoch": 2.3336745138178094, + "grad_norm": 0.36284882910248367, + "learning_rate": 2.5699507637594706e-05, + "loss": 0.304, + "step": 1140 + }, + { + "epoch": 2.3357215967246674, + "grad_norm": 0.34302795456823765, + "learning_rate": 2.5672076259071385e-05, + "loss": 0.3276, + "step": 1141 + }, + { + "epoch": 2.337768679631525, + "grad_norm": 0.36338586044821625, + "learning_rate": 2.5644633272022536e-05, + "loss": 0.36, + "step": 1142 + }, + { + "epoch": 2.339815762538383, + "grad_norm": 0.3428090299047305, + "learning_rate": 2.561717873261323e-05, + "loss": 0.3062, + "step": 1143 + }, + { + "epoch": 2.3418628454452404, + "grad_norm": 0.38038284417246715, + "learning_rate": 2.558971269703219e-05, + "loss": 0.3555, + "step": 1144 + }, + { + "epoch": 2.3439099283520983, + "grad_norm": 0.35045801763083695, + "learning_rate": 2.556223522149168e-05, + "loss": 0.3433, + "step": 1145 + }, + { + "epoch": 2.345957011258956, + "grad_norm": 0.36682906880387844, + "learning_rate": 2.5534746362227355e-05, + "loss": 0.3507, + "step": 1146 + }, + { + "epoch": 2.348004094165814, + "grad_norm": 0.34210491558572537, + "learning_rate": 2.5507246175498174e-05, + "loss": 0.3178, + "step": 1147 + }, + { + "epoch": 2.3500511770726713, + "grad_norm": 0.3421821678613336, + "learning_rate": 2.5479734717586285e-05, + "loss": 0.3124, + "step": 1148 + }, + { + "epoch": 2.3520982599795293, + "grad_norm": 0.37370205112447163, + "learning_rate": 2.5452212044796912e-05, + "loss": 0.3179, + "step": 1149 + }, + { + "epoch": 2.3541453428863868, + "grad_norm": 0.3575054301368409, + "learning_rate": 2.5424678213458202e-05, + "loss": 0.2982, + "step": 1150 + }, + { + "epoch": 2.3561924257932447, + "grad_norm": 0.4005011767035321, + "learning_rate": 2.539713327992117e-05, + "loss": 0.329, + "step": 1151 + }, + { + "epoch": 2.3582395087001022, + "grad_norm": 0.3796694210246127, + "learning_rate": 2.5369577300559544e-05, + "loss": 0.3495, + "step": 1152 + }, + { + "epoch": 2.36028659160696, + "grad_norm": 0.38531974828237286, + "learning_rate": 2.5342010331769635e-05, + "loss": 0.3218, + "step": 1153 + }, + { + "epoch": 2.3623336745138177, + "grad_norm": 0.4150417947081143, + "learning_rate": 2.531443242997029e-05, + "loss": 0.3714, + "step": 1154 + }, + { + "epoch": 2.3643807574206757, + "grad_norm": 0.3665969184427945, + "learning_rate": 2.5286843651602688e-05, + "loss": 0.33, + "step": 1155 + }, + { + "epoch": 2.366427840327533, + "grad_norm": 0.3668439390982461, + "learning_rate": 2.5259244053130295e-05, + "loss": 0.3338, + "step": 1156 + }, + { + "epoch": 2.368474923234391, + "grad_norm": 0.4105022406361347, + "learning_rate": 2.5231633691038716e-05, + "loss": 0.3303, + "step": 1157 + }, + { + "epoch": 2.3705220061412486, + "grad_norm": 0.3610884307782202, + "learning_rate": 2.5204012621835575e-05, + "loss": 0.3108, + "step": 1158 + }, + { + "epoch": 2.3725690890481066, + "grad_norm": 0.39634461411533756, + "learning_rate": 2.5176380902050418e-05, + "loss": 0.3398, + "step": 1159 + }, + { + "epoch": 2.374616171954964, + "grad_norm": 0.3956297902185592, + "learning_rate": 2.5148738588234593e-05, + "loss": 0.3199, + "step": 1160 + }, + { + "epoch": 2.376663254861822, + "grad_norm": 0.33517128768421744, + "learning_rate": 2.5121085736961112e-05, + "loss": 0.3288, + "step": 1161 + }, + { + "epoch": 2.3787103377686796, + "grad_norm": 0.3543225737911076, + "learning_rate": 2.5093422404824574e-05, + "loss": 0.3483, + "step": 1162 + }, + { + "epoch": 2.3807574206755375, + "grad_norm": 0.35329611481723555, + "learning_rate": 2.506574864844102e-05, + "loss": 0.3439, + "step": 1163 + }, + { + "epoch": 2.382804503582395, + "grad_norm": 0.3415016772188138, + "learning_rate": 2.5038064524447827e-05, + "loss": 0.3461, + "step": 1164 + }, + { + "epoch": 2.384851586489253, + "grad_norm": 0.3618115220444992, + "learning_rate": 2.5010370089503578e-05, + "loss": 0.3243, + "step": 1165 + }, + { + "epoch": 2.3868986693961105, + "grad_norm": 0.36205472648695425, + "learning_rate": 2.4982665400287972e-05, + "loss": 0.3411, + "step": 1166 + }, + { + "epoch": 2.3889457523029685, + "grad_norm": 0.43026672506995006, + "learning_rate": 2.4954950513501697e-05, + "loss": 0.3319, + "step": 1167 + }, + { + "epoch": 2.390992835209826, + "grad_norm": 0.350243563017003, + "learning_rate": 2.4927225485866297e-05, + "loss": 0.3479, + "step": 1168 + }, + { + "epoch": 2.393039918116684, + "grad_norm": 0.3746353739097793, + "learning_rate": 2.4899490374124085e-05, + "loss": 0.3429, + "step": 1169 + }, + { + "epoch": 2.3950870010235414, + "grad_norm": 0.3260898872775567, + "learning_rate": 2.4871745235038006e-05, + "loss": 0.3472, + "step": 1170 + }, + { + "epoch": 2.397134083930399, + "grad_norm": 0.33558457363741073, + "learning_rate": 2.4843990125391516e-05, + "loss": 0.328, + "step": 1171 + }, + { + "epoch": 2.399181166837257, + "grad_norm": 0.38996942202586005, + "learning_rate": 2.4816225101988506e-05, + "loss": 0.3391, + "step": 1172 + }, + { + "epoch": 2.401228249744115, + "grad_norm": 0.3566387434855101, + "learning_rate": 2.478845022165313e-05, + "loss": 0.3667, + "step": 1173 + }, + { + "epoch": 2.4032753326509724, + "grad_norm": 0.3444970227104489, + "learning_rate": 2.4760665541229712e-05, + "loss": 0.3301, + "step": 1174 + }, + { + "epoch": 2.40532241555783, + "grad_norm": 0.3092553635632143, + "learning_rate": 2.473287111758267e-05, + "loss": 0.3401, + "step": 1175 + }, + { + "epoch": 2.407369498464688, + "grad_norm": 0.3394352190867357, + "learning_rate": 2.470506700759631e-05, + "loss": 0.3218, + "step": 1176 + }, + { + "epoch": 2.409416581371546, + "grad_norm": 0.3393116279567721, + "learning_rate": 2.467725326817481e-05, + "loss": 0.3113, + "step": 1177 + }, + { + "epoch": 2.4114636642784033, + "grad_norm": 0.399667522842199, + "learning_rate": 2.464942995624203e-05, + "loss": 0.3269, + "step": 1178 + }, + { + "epoch": 2.413510747185261, + "grad_norm": 0.3505124387280659, + "learning_rate": 2.462159712874142e-05, + "loss": 0.3411, + "step": 1179 + }, + { + "epoch": 2.4155578300921188, + "grad_norm": 0.3519791912196212, + "learning_rate": 2.4593754842635917e-05, + "loss": 0.3036, + "step": 1180 + }, + { + "epoch": 2.4176049129989763, + "grad_norm": 0.36680184492885926, + "learning_rate": 2.4565903154907807e-05, + "loss": 0.3898, + "step": 1181 + }, + { + "epoch": 2.4196519959058342, + "grad_norm": 0.3778349635443584, + "learning_rate": 2.453804212255862e-05, + "loss": 0.3427, + "step": 1182 + }, + { + "epoch": 2.4216990788126918, + "grad_norm": 0.3389030241399457, + "learning_rate": 2.451017180260902e-05, + "loss": 0.3271, + "step": 1183 + }, + { + "epoch": 2.4237461617195497, + "grad_norm": 0.42513595322376974, + "learning_rate": 2.448229225209865e-05, + "loss": 0.3298, + "step": 1184 + }, + { + "epoch": 2.425793244626407, + "grad_norm": 0.3712368776548631, + "learning_rate": 2.4454403528086088e-05, + "loss": 0.3323, + "step": 1185 + }, + { + "epoch": 2.427840327533265, + "grad_norm": 0.3200914239556184, + "learning_rate": 2.4426505687648653e-05, + "loss": 0.3387, + "step": 1186 + }, + { + "epoch": 2.4298874104401227, + "grad_norm": 0.346315254552554, + "learning_rate": 2.4398598787882334e-05, + "loss": 0.3449, + "step": 1187 + }, + { + "epoch": 2.4319344933469806, + "grad_norm": 0.3184464070741258, + "learning_rate": 2.4370682885901657e-05, + "loss": 0.3006, + "step": 1188 + }, + { + "epoch": 2.433981576253838, + "grad_norm": 0.34720894372009287, + "learning_rate": 2.4342758038839573e-05, + "loss": 0.3354, + "step": 1189 + }, + { + "epoch": 2.436028659160696, + "grad_norm": 0.3963098146391974, + "learning_rate": 2.4314824303847342e-05, + "loss": 0.3273, + "step": 1190 + }, + { + "epoch": 2.4380757420675536, + "grad_norm": 0.3419114471543989, + "learning_rate": 2.4286881738094418e-05, + "loss": 0.3334, + "step": 1191 + }, + { + "epoch": 2.4401228249744116, + "grad_norm": 0.3476204838650686, + "learning_rate": 2.4258930398768317e-05, + "loss": 0.3405, + "step": 1192 + }, + { + "epoch": 2.442169907881269, + "grad_norm": 0.3398409708251329, + "learning_rate": 2.423097034307452e-05, + "loss": 0.2964, + "step": 1193 + }, + { + "epoch": 2.444216990788127, + "grad_norm": 0.33597865186885006, + "learning_rate": 2.4203001628236346e-05, + "loss": 0.3122, + "step": 1194 + }, + { + "epoch": 2.4462640736949846, + "grad_norm": 0.7573334739644073, + "learning_rate": 2.4175024311494835e-05, + "loss": 0.37, + "step": 1195 + }, + { + "epoch": 2.4483111566018425, + "grad_norm": 0.3096397452376292, + "learning_rate": 2.4147038450108627e-05, + "loss": 0.3462, + "step": 1196 + }, + { + "epoch": 2.4503582395087, + "grad_norm": 0.32363631370662416, + "learning_rate": 2.4119044101353853e-05, + "loss": 0.3089, + "step": 1197 + }, + { + "epoch": 2.452405322415558, + "grad_norm": 0.44476946502851955, + "learning_rate": 2.4091041322524023e-05, + "loss": 0.3891, + "step": 1198 + }, + { + "epoch": 2.4544524053224155, + "grad_norm": 0.34282272355962545, + "learning_rate": 2.406303017092988e-05, + "loss": 0.3672, + "step": 1199 + }, + { + "epoch": 2.4564994882292734, + "grad_norm": 0.3458362218172388, + "learning_rate": 2.403501070389932e-05, + "loss": 0.3446, + "step": 1200 + }, + { + "epoch": 2.458546571136131, + "grad_norm": 0.330522133423883, + "learning_rate": 2.4006982978777263e-05, + "loss": 0.3547, + "step": 1201 + }, + { + "epoch": 2.460593654042989, + "grad_norm": 0.33086606944472513, + "learning_rate": 2.39789470529255e-05, + "loss": 0.3145, + "step": 1202 + }, + { + "epoch": 2.4626407369498464, + "grad_norm": 0.3611588885012983, + "learning_rate": 2.3950902983722645e-05, + "loss": 0.3512, + "step": 1203 + }, + { + "epoch": 2.4646878198567044, + "grad_norm": 0.35160175468657195, + "learning_rate": 2.392285082856394e-05, + "loss": 0.331, + "step": 1204 + }, + { + "epoch": 2.466734902763562, + "grad_norm": 0.3329347867861998, + "learning_rate": 2.389479064486121e-05, + "loss": 0.3697, + "step": 1205 + }, + { + "epoch": 2.46878198567042, + "grad_norm": 0.35641819087178006, + "learning_rate": 2.3866722490042685e-05, + "loss": 0.3606, + "step": 1206 + }, + { + "epoch": 2.4708290685772774, + "grad_norm": 0.3604571656733162, + "learning_rate": 2.3838646421552917e-05, + "loss": 0.3377, + "step": 1207 + }, + { + "epoch": 2.472876151484135, + "grad_norm": 0.3352126815168747, + "learning_rate": 2.3810562496852666e-05, + "loss": 0.3262, + "step": 1208 + }, + { + "epoch": 2.474923234390993, + "grad_norm": 0.35666331619036534, + "learning_rate": 2.3782470773418756e-05, + "loss": 0.339, + "step": 1209 + }, + { + "epoch": 2.4769703172978508, + "grad_norm": 0.39672935648110513, + "learning_rate": 2.3754371308743975e-05, + "loss": 0.33, + "step": 1210 + }, + { + "epoch": 2.4790174002047083, + "grad_norm": 0.3591230233614195, + "learning_rate": 2.372626416033696e-05, + "loss": 0.3209, + "step": 1211 + }, + { + "epoch": 2.481064483111566, + "grad_norm": 0.34331298511904995, + "learning_rate": 2.3698149385722067e-05, + "loss": 0.3376, + "step": 1212 + }, + { + "epoch": 2.4831115660184238, + "grad_norm": 0.3934047338427704, + "learning_rate": 2.367002704243927e-05, + "loss": 0.3198, + "step": 1213 + }, + { + "epoch": 2.4851586489252817, + "grad_norm": 0.34587155423784893, + "learning_rate": 2.3641897188044018e-05, + "loss": 0.3442, + "step": 1214 + }, + { + "epoch": 2.487205731832139, + "grad_norm": 0.3681467355523078, + "learning_rate": 2.3613759880107133e-05, + "loss": 0.343, + "step": 1215 + }, + { + "epoch": 2.4892528147389967, + "grad_norm": 0.3249289712237619, + "learning_rate": 2.3585615176214716e-05, + "loss": 0.3066, + "step": 1216 + }, + { + "epoch": 2.4912998976458547, + "grad_norm": 0.37477832639872893, + "learning_rate": 2.3557463133967976e-05, + "loss": 0.3325, + "step": 1217 + }, + { + "epoch": 2.493346980552712, + "grad_norm": 0.36178361521501945, + "learning_rate": 2.3529303810983154e-05, + "loss": 0.3127, + "step": 1218 + }, + { + "epoch": 2.49539406345957, + "grad_norm": 0.32831527326831483, + "learning_rate": 2.3501137264891396e-05, + "loss": 0.3248, + "step": 1219 + }, + { + "epoch": 2.4974411463664277, + "grad_norm": 0.4027644606883135, + "learning_rate": 2.3472963553338614e-05, + "loss": 0.3023, + "step": 1220 + }, + { + "epoch": 2.4994882292732856, + "grad_norm": 0.3439537192088776, + "learning_rate": 2.3444782733985396e-05, + "loss": 0.3252, + "step": 1221 + }, + { + "epoch": 2.501535312180143, + "grad_norm": 0.32953719599686015, + "learning_rate": 2.3416594864506887e-05, + "loss": 0.3043, + "step": 1222 + }, + { + "epoch": 2.503582395087001, + "grad_norm": 0.3490669906957581, + "learning_rate": 2.338840000259264e-05, + "loss": 0.3133, + "step": 1223 + }, + { + "epoch": 2.5056294779938586, + "grad_norm": 0.36299216934576684, + "learning_rate": 2.3360198205946542e-05, + "loss": 0.3141, + "step": 1224 + }, + { + "epoch": 2.5076765609007166, + "grad_norm": 0.3180197591727672, + "learning_rate": 2.333198953228664e-05, + "loss": 0.3535, + "step": 1225 + }, + { + "epoch": 2.509723643807574, + "grad_norm": 0.5171123946314492, + "learning_rate": 2.3303774039345098e-05, + "loss": 0.3367, + "step": 1226 + }, + { + "epoch": 2.511770726714432, + "grad_norm": 0.3840513193856834, + "learning_rate": 2.3275551784867997e-05, + "loss": 0.3188, + "step": 1227 + }, + { + "epoch": 2.5138178096212895, + "grad_norm": 0.3346463848714001, + "learning_rate": 2.3247322826615276e-05, + "loss": 0.3596, + "step": 1228 + }, + { + "epoch": 2.5158648925281475, + "grad_norm": 0.3505848734275376, + "learning_rate": 2.3219087222360603e-05, + "loss": 0.3395, + "step": 1229 + }, + { + "epoch": 2.517911975435005, + "grad_norm": 0.3415846729855278, + "learning_rate": 2.3190845029891218e-05, + "loss": 0.3326, + "step": 1230 + }, + { + "epoch": 2.519959058341863, + "grad_norm": 0.3273579902423157, + "learning_rate": 2.316259630700787e-05, + "loss": 0.3344, + "step": 1231 + }, + { + "epoch": 2.5220061412487205, + "grad_norm": 0.3247531510829717, + "learning_rate": 2.313434111152467e-05, + "loss": 0.3346, + "step": 1232 + }, + { + "epoch": 2.5240532241555784, + "grad_norm": 0.3503893886731608, + "learning_rate": 2.310607950126896e-05, + "loss": 0.3448, + "step": 1233 + }, + { + "epoch": 2.526100307062436, + "grad_norm": 0.35713820818797615, + "learning_rate": 2.307781153408124e-05, + "loss": 0.3359, + "step": 1234 + }, + { + "epoch": 2.528147389969294, + "grad_norm": 0.34251280908459203, + "learning_rate": 2.3049537267814984e-05, + "loss": 0.3431, + "step": 1235 + }, + { + "epoch": 2.5301944728761514, + "grad_norm": 0.34661535982225733, + "learning_rate": 2.3021256760336583e-05, + "loss": 0.3604, + "step": 1236 + }, + { + "epoch": 2.5322415557830094, + "grad_norm": 0.3630834914867728, + "learning_rate": 2.2992970069525202e-05, + "loss": 0.3472, + "step": 1237 + }, + { + "epoch": 2.534288638689867, + "grad_norm": 0.318836844861738, + "learning_rate": 2.296467725327264e-05, + "loss": 0.3174, + "step": 1238 + }, + { + "epoch": 2.536335721596725, + "grad_norm": 0.3365825133835808, + "learning_rate": 2.293637836948325e-05, + "loss": 0.3093, + "step": 1239 + }, + { + "epoch": 2.5383828045035823, + "grad_norm": 0.35403182852053533, + "learning_rate": 2.29080734760738e-05, + "loss": 0.3382, + "step": 1240 + }, + { + "epoch": 2.54042988741044, + "grad_norm": 0.3562520797769209, + "learning_rate": 2.2879762630973355e-05, + "loss": 0.3315, + "step": 1241 + }, + { + "epoch": 2.542476970317298, + "grad_norm": 0.3188066365019869, + "learning_rate": 2.285144589212316e-05, + "loss": 0.3079, + "step": 1242 + }, + { + "epoch": 2.5445240532241558, + "grad_norm": 0.3556195009937043, + "learning_rate": 2.2823123317476522e-05, + "loss": 0.3422, + "step": 1243 + }, + { + "epoch": 2.5465711361310133, + "grad_norm": 0.3474070822067162, + "learning_rate": 2.2794794964998705e-05, + "loss": 0.3049, + "step": 1244 + }, + { + "epoch": 2.548618219037871, + "grad_norm": 0.3464039443053579, + "learning_rate": 2.276646089266677e-05, + "loss": 0.2992, + "step": 1245 + }, + { + "epoch": 2.5506653019447287, + "grad_norm": 0.3767709540930306, + "learning_rate": 2.273812115846951e-05, + "loss": 0.3726, + "step": 1246 + }, + { + "epoch": 2.5527123848515867, + "grad_norm": 0.3206160362666913, + "learning_rate": 2.2709775820407292e-05, + "loss": 0.2909, + "step": 1247 + }, + { + "epoch": 2.554759467758444, + "grad_norm": 0.3673509781890066, + "learning_rate": 2.2681424936491954e-05, + "loss": 0.3669, + "step": 1248 + }, + { + "epoch": 2.5568065506653017, + "grad_norm": 0.3401943174394405, + "learning_rate": 2.2653068564746692e-05, + "loss": 0.3403, + "step": 1249 + }, + { + "epoch": 2.5588536335721597, + "grad_norm": 0.3490447254588359, + "learning_rate": 2.2624706763205935e-05, + "loss": 0.3603, + "step": 1250 + }, + { + "epoch": 2.5609007164790176, + "grad_norm": 0.3482076297056933, + "learning_rate": 2.2596339589915197e-05, + "loss": 0.3554, + "step": 1251 + }, + { + "epoch": 2.562947799385875, + "grad_norm": 0.3115817757213185, + "learning_rate": 2.2567967102931025e-05, + "loss": 0.3136, + "step": 1252 + }, + { + "epoch": 2.5649948822927326, + "grad_norm": 0.32599919557680007, + "learning_rate": 2.2539589360320802e-05, + "loss": 0.3256, + "step": 1253 + }, + { + "epoch": 2.5670419651995906, + "grad_norm": 0.3039164892771023, + "learning_rate": 2.2511206420162716e-05, + "loss": 0.3414, + "step": 1254 + }, + { + "epoch": 2.5690890481064486, + "grad_norm": 0.31157751749513196, + "learning_rate": 2.2482818340545534e-05, + "loss": 0.3102, + "step": 1255 + }, + { + "epoch": 2.571136131013306, + "grad_norm": 0.33852273483094864, + "learning_rate": 2.2454425179568594e-05, + "loss": 0.3434, + "step": 1256 + }, + { + "epoch": 2.5731832139201636, + "grad_norm": 0.30672315678878886, + "learning_rate": 2.2426026995341602e-05, + "loss": 0.318, + "step": 1257 + }, + { + "epoch": 2.5752302968270215, + "grad_norm": 0.31206788352544473, + "learning_rate": 2.2397623845984548e-05, + "loss": 0.3749, + "step": 1258 + }, + { + "epoch": 2.5772773797338795, + "grad_norm": 0.30841479866018223, + "learning_rate": 2.2369215789627593e-05, + "loss": 0.298, + "step": 1259 + }, + { + "epoch": 2.579324462640737, + "grad_norm": 0.32527208718455825, + "learning_rate": 2.234080288441095e-05, + "loss": 0.3419, + "step": 1260 + }, + { + "epoch": 2.5813715455475945, + "grad_norm": 0.3124189339174951, + "learning_rate": 2.2312385188484718e-05, + "loss": 0.3501, + "step": 1261 + }, + { + "epoch": 2.5834186284544525, + "grad_norm": 0.3352181407147001, + "learning_rate": 2.2283962760008845e-05, + "loss": 0.339, + "step": 1262 + }, + { + "epoch": 2.58546571136131, + "grad_norm": 0.3548208328356842, + "learning_rate": 2.225553565715294e-05, + "loss": 0.3334, + "step": 1263 + }, + { + "epoch": 2.587512794268168, + "grad_norm": 0.32210075431253954, + "learning_rate": 2.2227103938096176e-05, + "loss": 0.3139, + "step": 1264 + }, + { + "epoch": 2.5895598771750254, + "grad_norm": 0.3342742670935017, + "learning_rate": 2.2198667661027193e-05, + "loss": 0.3232, + "step": 1265 + }, + { + "epoch": 2.5916069600818834, + "grad_norm": 0.3332141930024874, + "learning_rate": 2.2170226884143942e-05, + "loss": 0.3089, + "step": 1266 + }, + { + "epoch": 2.593654042988741, + "grad_norm": 0.3259468299127148, + "learning_rate": 2.2141781665653584e-05, + "loss": 0.3069, + "step": 1267 + }, + { + "epoch": 2.595701125895599, + "grad_norm": 0.3756435234426335, + "learning_rate": 2.2113332063772387e-05, + "loss": 0.3343, + "step": 1268 + }, + { + "epoch": 2.5977482088024564, + "grad_norm": 0.38118415339444334, + "learning_rate": 2.208487813672557e-05, + "loss": 0.3246, + "step": 1269 + }, + { + "epoch": 2.5997952917093143, + "grad_norm": 0.33475197046166133, + "learning_rate": 2.205641994274721e-05, + "loss": 0.3543, + "step": 1270 + }, + { + "epoch": 2.601842374616172, + "grad_norm": 0.3609114565434103, + "learning_rate": 2.2027957540080125e-05, + "loss": 0.3412, + "step": 1271 + }, + { + "epoch": 2.60388945752303, + "grad_norm": 0.35268666756991185, + "learning_rate": 2.199949098697574e-05, + "loss": 0.321, + "step": 1272 + }, + { + "epoch": 2.6059365404298873, + "grad_norm": 0.3218634099552252, + "learning_rate": 2.1971020341693973e-05, + "loss": 0.321, + "step": 1273 + }, + { + "epoch": 2.6079836233367453, + "grad_norm": 0.3886771490605891, + "learning_rate": 2.1942545662503115e-05, + "loss": 0.3366, + "step": 1274 + }, + { + "epoch": 2.610030706243603, + "grad_norm": 0.3542254238953694, + "learning_rate": 2.1914067007679733e-05, + "loss": 0.346, + "step": 1275 + }, + { + "epoch": 2.6120777891504607, + "grad_norm": 0.33442059420132036, + "learning_rate": 2.188558443550849e-05, + "loss": 0.3471, + "step": 1276 + }, + { + "epoch": 2.6141248720573182, + "grad_norm": 0.31543081486502833, + "learning_rate": 2.185709800428211e-05, + "loss": 0.3523, + "step": 1277 + }, + { + "epoch": 2.616171954964176, + "grad_norm": 0.32475295410492505, + "learning_rate": 2.1828607772301187e-05, + "loss": 0.3456, + "step": 1278 + }, + { + "epoch": 2.6182190378710337, + "grad_norm": 0.35139374027023634, + "learning_rate": 2.180011379787411e-05, + "loss": 0.3309, + "step": 1279 + }, + { + "epoch": 2.6202661207778917, + "grad_norm": 0.3057519944170325, + "learning_rate": 2.1771616139316903e-05, + "loss": 0.3351, + "step": 1280 + }, + { + "epoch": 2.622313203684749, + "grad_norm": 0.363923325870162, + "learning_rate": 2.174311485495317e-05, + "loss": 0.3046, + "step": 1281 + }, + { + "epoch": 2.6243602865916067, + "grad_norm": 0.3291114320223632, + "learning_rate": 2.1714610003113887e-05, + "loss": 0.303, + "step": 1282 + }, + { + "epoch": 2.6264073694984647, + "grad_norm": 0.3289632021661567, + "learning_rate": 2.168610164213738e-05, + "loss": 0.3213, + "step": 1283 + }, + { + "epoch": 2.6284544524053226, + "grad_norm": 0.33306777417969263, + "learning_rate": 2.1657589830369113e-05, + "loss": 0.351, + "step": 1284 + }, + { + "epoch": 2.63050153531218, + "grad_norm": 0.3612382108372884, + "learning_rate": 2.1629074626161647e-05, + "loss": 0.3868, + "step": 1285 + }, + { + "epoch": 2.6325486182190376, + "grad_norm": 0.3580608267992191, + "learning_rate": 2.1600556087874472e-05, + "loss": 0.3175, + "step": 1286 + }, + { + "epoch": 2.6345957011258956, + "grad_norm": 0.31536303959925943, + "learning_rate": 2.1572034273873893e-05, + "loss": 0.3262, + "step": 1287 + }, + { + "epoch": 2.6366427840327535, + "grad_norm": 0.3578677663211092, + "learning_rate": 2.1543509242532932e-05, + "loss": 0.3716, + "step": 1288 + }, + { + "epoch": 2.638689866939611, + "grad_norm": 0.31607653936815944, + "learning_rate": 2.1514981052231187e-05, + "loss": 0.3166, + "step": 1289 + }, + { + "epoch": 2.6407369498464686, + "grad_norm": 0.35206333188454375, + "learning_rate": 2.1486449761354727e-05, + "loss": 0.3315, + "step": 1290 + }, + { + "epoch": 2.6427840327533265, + "grad_norm": 0.34365776192029646, + "learning_rate": 2.145791542829597e-05, + "loss": 0.3225, + "step": 1291 + }, + { + "epoch": 2.6448311156601845, + "grad_norm": 0.40791136566579844, + "learning_rate": 2.142937811145354e-05, + "loss": 0.3839, + "step": 1292 + }, + { + "epoch": 2.646878198567042, + "grad_norm": 0.3334669459335626, + "learning_rate": 2.140083786923221e-05, + "loss": 0.3277, + "step": 1293 + }, + { + "epoch": 2.6489252814738995, + "grad_norm": 0.33721029481105136, + "learning_rate": 2.1372294760042686e-05, + "loss": 0.3396, + "step": 1294 + }, + { + "epoch": 2.6509723643807575, + "grad_norm": 0.33022887238565724, + "learning_rate": 2.1343748842301575e-05, + "loss": 0.3199, + "step": 1295 + }, + { + "epoch": 2.6530194472876154, + "grad_norm": 0.3578342406147216, + "learning_rate": 2.1315200174431235e-05, + "loss": 0.3264, + "step": 1296 + }, + { + "epoch": 2.655066530194473, + "grad_norm": 0.2843605578164525, + "learning_rate": 2.1286648814859636e-05, + "loss": 0.3196, + "step": 1297 + }, + { + "epoch": 2.6571136131013304, + "grad_norm": 0.3320141991189249, + "learning_rate": 2.1258094822020263e-05, + "loss": 0.3132, + "step": 1298 + }, + { + "epoch": 2.6591606960081884, + "grad_norm": 0.37866233560469814, + "learning_rate": 2.1229538254351995e-05, + "loss": 0.3238, + "step": 1299 + }, + { + "epoch": 2.661207778915046, + "grad_norm": 0.327131535806631, + "learning_rate": 2.120097917029897e-05, + "loss": 0.3843, + "step": 1300 + }, + { + "epoch": 2.663254861821904, + "grad_norm": 0.3036315122393342, + "learning_rate": 2.1172417628310487e-05, + "loss": 0.3292, + "step": 1301 + }, + { + "epoch": 2.6653019447287614, + "grad_norm": 0.3515024908888374, + "learning_rate": 2.1143853686840874e-05, + "loss": 0.3102, + "step": 1302 + }, + { + "epoch": 2.6673490276356193, + "grad_norm": 0.34581222342499085, + "learning_rate": 2.1115287404349357e-05, + "loss": 0.3156, + "step": 1303 + }, + { + "epoch": 2.669396110542477, + "grad_norm": 0.35858989267221897, + "learning_rate": 2.1086718839299972e-05, + "loss": 0.3461, + "step": 1304 + }, + { + "epoch": 2.671443193449335, + "grad_norm": 0.3475194620828264, + "learning_rate": 2.1058148050161412e-05, + "loss": 0.3357, + "step": 1305 + }, + { + "epoch": 2.6734902763561923, + "grad_norm": 0.3585594367622285, + "learning_rate": 2.1029575095406933e-05, + "loss": 0.3454, + "step": 1306 + }, + { + "epoch": 2.6755373592630503, + "grad_norm": 0.3384161065196781, + "learning_rate": 2.1001000033514215e-05, + "loss": 0.3403, + "step": 1307 + }, + { + "epoch": 2.6775844421699078, + "grad_norm": 0.33004176854360945, + "learning_rate": 2.097242292296525e-05, + "loss": 0.3643, + "step": 1308 + }, + { + "epoch": 2.6796315250767657, + "grad_norm": 0.33026393179643815, + "learning_rate": 2.0943843822246234e-05, + "loss": 0.3224, + "step": 1309 + }, + { + "epoch": 2.6816786079836232, + "grad_norm": 0.34661171805934476, + "learning_rate": 2.0915262789847414e-05, + "loss": 0.3368, + "step": 1310 + }, + { + "epoch": 2.683725690890481, + "grad_norm": 0.3157977388993276, + "learning_rate": 2.088667988426302e-05, + "loss": 0.311, + "step": 1311 + }, + { + "epoch": 2.6857727737973387, + "grad_norm": 0.32541710739849666, + "learning_rate": 2.0858095163991094e-05, + "loss": 0.3145, + "step": 1312 + }, + { + "epoch": 2.6878198567041967, + "grad_norm": 0.3317095591959358, + "learning_rate": 2.0829508687533387e-05, + "loss": 0.343, + "step": 1313 + }, + { + "epoch": 2.689866939611054, + "grad_norm": 0.33508091050613437, + "learning_rate": 2.0800920513395276e-05, + "loss": 0.3102, + "step": 1314 + }, + { + "epoch": 2.691914022517912, + "grad_norm": 0.3092939306017529, + "learning_rate": 2.077233070008557e-05, + "loss": 0.3628, + "step": 1315 + }, + { + "epoch": 2.6939611054247696, + "grad_norm": 0.3644518282154973, + "learning_rate": 2.074373930611647e-05, + "loss": 0.3611, + "step": 1316 + }, + { + "epoch": 2.6960081883316276, + "grad_norm": 0.3268481968948613, + "learning_rate": 2.0715146390003395e-05, + "loss": 0.4382, + "step": 1317 + }, + { + "epoch": 2.698055271238485, + "grad_norm": 0.32366874488936354, + "learning_rate": 2.0686552010264872e-05, + "loss": 0.3316, + "step": 1318 + }, + { + "epoch": 2.7001023541453426, + "grad_norm": 0.3655397054326707, + "learning_rate": 2.0657956225422438e-05, + "loss": 0.3241, + "step": 1319 + }, + { + "epoch": 2.7021494370522006, + "grad_norm": 0.3511986539192374, + "learning_rate": 2.0629359094000502e-05, + "loss": 0.3634, + "step": 1320 + }, + { + "epoch": 2.7041965199590585, + "grad_norm": 0.33590806443682913, + "learning_rate": 2.060076067452622e-05, + "loss": 0.3434, + "step": 1321 + }, + { + "epoch": 2.706243602865916, + "grad_norm": 0.34940605685458204, + "learning_rate": 2.0572161025529396e-05, + "loss": 0.3592, + "step": 1322 + }, + { + "epoch": 2.7082906857727735, + "grad_norm": 0.33025728634487234, + "learning_rate": 2.0543560205542338e-05, + "loss": 0.3273, + "step": 1323 + }, + { + "epoch": 2.7103377686796315, + "grad_norm": 0.3111565817907451, + "learning_rate": 2.0514958273099778e-05, + "loss": 0.3528, + "step": 1324 + }, + { + "epoch": 2.7123848515864895, + "grad_norm": 0.34384380691800237, + "learning_rate": 2.0486355286738675e-05, + "loss": 0.3279, + "step": 1325 + }, + { + "epoch": 2.714431934493347, + "grad_norm": 0.34483761426361903, + "learning_rate": 2.0457751304998196e-05, + "loss": 0.3154, + "step": 1326 + }, + { + "epoch": 2.7164790174002045, + "grad_norm": 0.34739319472868174, + "learning_rate": 2.042914638641952e-05, + "loss": 0.3122, + "step": 1327 + }, + { + "epoch": 2.7185261003070624, + "grad_norm": 0.34204787000879766, + "learning_rate": 2.0400540589545738e-05, + "loss": 0.2987, + "step": 1328 + }, + { + "epoch": 2.7205731832139204, + "grad_norm": 0.31149494456320415, + "learning_rate": 2.0371933972921756e-05, + "loss": 0.3651, + "step": 1329 + }, + { + "epoch": 2.722620266120778, + "grad_norm": 0.3206059172994117, + "learning_rate": 2.0343326595094154e-05, + "loss": 0.3056, + "step": 1330 + }, + { + "epoch": 2.7246673490276354, + "grad_norm": 0.3460394891552501, + "learning_rate": 2.031471851461105e-05, + "loss": 0.3078, + "step": 1331 + }, + { + "epoch": 2.7267144319344934, + "grad_norm": 0.31434160088392427, + "learning_rate": 2.0286109790022023e-05, + "loss": 0.3019, + "step": 1332 + }, + { + "epoch": 2.7287615148413513, + "grad_norm": 0.30988899965990013, + "learning_rate": 2.0257500479877965e-05, + "loss": 0.3606, + "step": 1333 + }, + { + "epoch": 2.730808597748209, + "grad_norm": 0.3038902310146715, + "learning_rate": 2.0228890642730967e-05, + "loss": 0.3188, + "step": 1334 + }, + { + "epoch": 2.7328556806550663, + "grad_norm": 0.33615481049383383, + "learning_rate": 2.020028033713418e-05, + "loss": 0.3233, + "step": 1335 + }, + { + "epoch": 2.7349027635619243, + "grad_norm": 0.3416566631514737, + "learning_rate": 2.0171669621641743e-05, + "loss": 0.3563, + "step": 1336 + }, + { + "epoch": 2.7369498464687823, + "grad_norm": 0.33625693354138464, + "learning_rate": 2.0143058554808622e-05, + "loss": 0.3107, + "step": 1337 + }, + { + "epoch": 2.7389969293756398, + "grad_norm": 0.32608262090854195, + "learning_rate": 2.0114447195190486e-05, + "loss": 0.3445, + "step": 1338 + }, + { + "epoch": 2.7410440122824973, + "grad_norm": 0.3157455564515132, + "learning_rate": 2.0085835601343627e-05, + "loss": 0.3426, + "step": 1339 + }, + { + "epoch": 2.7430910951893552, + "grad_norm": 0.3108979356348658, + "learning_rate": 2.005722383182481e-05, + "loss": 0.3216, + "step": 1340 + }, + { + "epoch": 2.7451381780962127, + "grad_norm": 0.31525583618025826, + "learning_rate": 2.002861194519114e-05, + "loss": 0.3888, + "step": 1341 + }, + { + "epoch": 2.7471852610030707, + "grad_norm": 0.35286260637825495, + "learning_rate": 2e-05, + "loss": 0.3448, + "step": 1342 + }, + { + "epoch": 2.749232343909928, + "grad_norm": 0.33612012963227933, + "learning_rate": 1.9971388054808863e-05, + "loss": 0.3303, + "step": 1343 + }, + { + "epoch": 2.751279426816786, + "grad_norm": 0.3293365486688113, + "learning_rate": 1.99427761681752e-05, + "loss": 0.3378, + "step": 1344 + }, + { + "epoch": 2.7533265097236437, + "grad_norm": 0.3702026149273537, + "learning_rate": 1.9914164398656383e-05, + "loss": 0.3204, + "step": 1345 + }, + { + "epoch": 2.7553735926305016, + "grad_norm": 0.3008607535549162, + "learning_rate": 1.988555280480952e-05, + "loss": 0.3245, + "step": 1346 + }, + { + "epoch": 2.757420675537359, + "grad_norm": 0.3346226482861992, + "learning_rate": 1.9856941445191388e-05, + "loss": 0.2973, + "step": 1347 + }, + { + "epoch": 2.759467758444217, + "grad_norm": 0.333107377703825, + "learning_rate": 1.9828330378358264e-05, + "loss": 0.3462, + "step": 1348 + }, + { + "epoch": 2.7615148413510746, + "grad_norm": 0.33602464320692405, + "learning_rate": 1.9799719662865828e-05, + "loss": 0.3348, + "step": 1349 + }, + { + "epoch": 2.7635619242579326, + "grad_norm": 0.3460744761311549, + "learning_rate": 1.9771109357269047e-05, + "loss": 0.3041, + "step": 1350 + }, + { + "epoch": 2.76560900716479, + "grad_norm": 0.3398214327127035, + "learning_rate": 1.974249952012204e-05, + "loss": 0.3442, + "step": 1351 + }, + { + "epoch": 2.767656090071648, + "grad_norm": 0.32323094867372437, + "learning_rate": 1.9713890209977977e-05, + "loss": 0.3133, + "step": 1352 + }, + { + "epoch": 2.7697031729785055, + "grad_norm": 0.3439688117561741, + "learning_rate": 1.9685281485388955e-05, + "loss": 0.3091, + "step": 1353 + }, + { + "epoch": 2.7717502558853635, + "grad_norm": 0.29746767666002316, + "learning_rate": 1.9656673404905852e-05, + "loss": 0.2957, + "step": 1354 + }, + { + "epoch": 2.773797338792221, + "grad_norm": 0.32765073618687846, + "learning_rate": 1.9628066027078247e-05, + "loss": 0.3413, + "step": 1355 + }, + { + "epoch": 2.775844421699079, + "grad_norm": 0.3473220628635939, + "learning_rate": 1.9599459410454266e-05, + "loss": 0.2997, + "step": 1356 + }, + { + "epoch": 2.7778915046059365, + "grad_norm": 0.34784120186994494, + "learning_rate": 1.957085361358049e-05, + "loss": 0.3397, + "step": 1357 + }, + { + "epoch": 2.7799385875127944, + "grad_norm": 0.32647960906928786, + "learning_rate": 1.9542248695001808e-05, + "loss": 0.3269, + "step": 1358 + }, + { + "epoch": 2.781985670419652, + "grad_norm": 0.3386091012629272, + "learning_rate": 1.9513644713261328e-05, + "loss": 0.3398, + "step": 1359 + }, + { + "epoch": 2.7840327533265095, + "grad_norm": 0.31112512902574707, + "learning_rate": 1.9485041726900232e-05, + "loss": 0.3247, + "step": 1360 + }, + { + "epoch": 2.7860798362333674, + "grad_norm": 0.3267971495507797, + "learning_rate": 1.9456439794457665e-05, + "loss": 0.3699, + "step": 1361 + }, + { + "epoch": 2.7881269191402254, + "grad_norm": 0.34461638842492454, + "learning_rate": 1.942783897447061e-05, + "loss": 0.3315, + "step": 1362 + }, + { + "epoch": 2.790174002047083, + "grad_norm": 0.30853412340590924, + "learning_rate": 1.939923932547379e-05, + "loss": 0.3325, + "step": 1363 + }, + { + "epoch": 2.7922210849539404, + "grad_norm": 0.32960376681405235, + "learning_rate": 1.93706409059995e-05, + "loss": 0.3048, + "step": 1364 + }, + { + "epoch": 2.7942681678607983, + "grad_norm": 0.3216608733849923, + "learning_rate": 1.9342043774577562e-05, + "loss": 0.3478, + "step": 1365 + }, + { + "epoch": 2.7963152507676563, + "grad_norm": 0.369356903841249, + "learning_rate": 1.931344798973513e-05, + "loss": 0.3428, + "step": 1366 + }, + { + "epoch": 2.798362333674514, + "grad_norm": 0.3021279849899095, + "learning_rate": 1.928485360999661e-05, + "loss": 0.3641, + "step": 1367 + }, + { + "epoch": 2.8004094165813713, + "grad_norm": 0.3573666945556479, + "learning_rate": 1.9256260693883534e-05, + "loss": 0.2993, + "step": 1368 + }, + { + "epoch": 2.8024564994882293, + "grad_norm": 0.31061810018877584, + "learning_rate": 1.922766929991443e-05, + "loss": 0.3298, + "step": 1369 + }, + { + "epoch": 2.8045035823950872, + "grad_norm": 0.32086151227350046, + "learning_rate": 1.9199079486604727e-05, + "loss": 0.293, + "step": 1370 + }, + { + "epoch": 2.8065506653019447, + "grad_norm": 0.3339285911403012, + "learning_rate": 1.9170491312466616e-05, + "loss": 0.3239, + "step": 1371 + }, + { + "epoch": 2.8085977482088023, + "grad_norm": 0.3273867205248011, + "learning_rate": 1.914190483600891e-05, + "loss": 0.3502, + "step": 1372 + }, + { + "epoch": 2.81064483111566, + "grad_norm": 0.3707945367816578, + "learning_rate": 1.9113320115736986e-05, + "loss": 0.3357, + "step": 1373 + }, + { + "epoch": 2.812691914022518, + "grad_norm": 0.34848249764459066, + "learning_rate": 1.9084737210152593e-05, + "loss": 0.3185, + "step": 1374 + }, + { + "epoch": 2.8147389969293757, + "grad_norm": 0.32760861423269866, + "learning_rate": 1.9056156177753776e-05, + "loss": 0.3228, + "step": 1375 + }, + { + "epoch": 2.816786079836233, + "grad_norm": 0.3391917065894771, + "learning_rate": 1.902757707703475e-05, + "loss": 0.3475, + "step": 1376 + }, + { + "epoch": 2.818833162743091, + "grad_norm": 0.3257314300037398, + "learning_rate": 1.899899996648579e-05, + "loss": 0.3325, + "step": 1377 + }, + { + "epoch": 2.8208802456499487, + "grad_norm": 0.3453324225261829, + "learning_rate": 1.897042490459307e-05, + "loss": 0.3301, + "step": 1378 + }, + { + "epoch": 2.8229273285568066, + "grad_norm": 0.35448623951366837, + "learning_rate": 1.8941851949838595e-05, + "loss": 0.3261, + "step": 1379 + }, + { + "epoch": 2.824974411463664, + "grad_norm": 0.322963697569429, + "learning_rate": 1.8913281160700038e-05, + "loss": 0.3602, + "step": 1380 + }, + { + "epoch": 2.827021494370522, + "grad_norm": 0.36959087755198944, + "learning_rate": 1.8884712595650653e-05, + "loss": 0.3173, + "step": 1381 + }, + { + "epoch": 2.8290685772773796, + "grad_norm": 0.3404440796977021, + "learning_rate": 1.885614631315914e-05, + "loss": 0.3549, + "step": 1382 + }, + { + "epoch": 2.8311156601842375, + "grad_norm": 0.31042921976093146, + "learning_rate": 1.8827582371689516e-05, + "loss": 0.3202, + "step": 1383 + }, + { + "epoch": 2.833162743091095, + "grad_norm": 0.3733330139685621, + "learning_rate": 1.8799020829701036e-05, + "loss": 0.3704, + "step": 1384 + }, + { + "epoch": 2.835209825997953, + "grad_norm": 0.35311166747172257, + "learning_rate": 1.8770461745648012e-05, + "loss": 0.3159, + "step": 1385 + }, + { + "epoch": 2.8372569089048105, + "grad_norm": 0.30009262162818595, + "learning_rate": 1.8741905177979743e-05, + "loss": 0.347, + "step": 1386 + }, + { + "epoch": 2.8393039918116685, + "grad_norm": 0.3431225449790101, + "learning_rate": 1.871335118514037e-05, + "loss": 0.3259, + "step": 1387 + }, + { + "epoch": 2.841351074718526, + "grad_norm": 0.3379879693204325, + "learning_rate": 1.8684799825568775e-05, + "loss": 0.3354, + "step": 1388 + }, + { + "epoch": 2.843398157625384, + "grad_norm": 0.32284443139667096, + "learning_rate": 1.8656251157698425e-05, + "loss": 0.3332, + "step": 1389 + }, + { + "epoch": 2.8454452405322415, + "grad_norm": 0.3366299858528211, + "learning_rate": 1.862770523995732e-05, + "loss": 0.3379, + "step": 1390 + }, + { + "epoch": 2.8474923234390994, + "grad_norm": 0.3306626186125099, + "learning_rate": 1.85991621307678e-05, + "loss": 0.3304, + "step": 1391 + }, + { + "epoch": 2.849539406345957, + "grad_norm": 0.3154237186664138, + "learning_rate": 1.8570621888546464e-05, + "loss": 0.3162, + "step": 1392 + }, + { + "epoch": 2.851586489252815, + "grad_norm": 0.3070396971440444, + "learning_rate": 1.854208457170404e-05, + "loss": 0.3341, + "step": 1393 + }, + { + "epoch": 2.8536335721596724, + "grad_norm": 0.31391792776294475, + "learning_rate": 1.8513550238645283e-05, + "loss": 0.3385, + "step": 1394 + }, + { + "epoch": 2.8556806550665303, + "grad_norm": 0.3604543553956966, + "learning_rate": 1.8485018947768817e-05, + "loss": 0.3242, + "step": 1395 + }, + { + "epoch": 2.857727737973388, + "grad_norm": 0.33119690178793554, + "learning_rate": 1.8456490757467075e-05, + "loss": 0.3172, + "step": 1396 + }, + { + "epoch": 2.859774820880246, + "grad_norm": 0.3634627791673887, + "learning_rate": 1.8427965726126114e-05, + "loss": 0.3256, + "step": 1397 + }, + { + "epoch": 2.8618219037871033, + "grad_norm": 0.30347193226396807, + "learning_rate": 1.839944391212553e-05, + "loss": 0.3375, + "step": 1398 + }, + { + "epoch": 2.8638689866939613, + "grad_norm": 0.3273120420881456, + "learning_rate": 1.8370925373838356e-05, + "loss": 0.3388, + "step": 1399 + }, + { + "epoch": 2.865916069600819, + "grad_norm": 0.31525363609472923, + "learning_rate": 1.834241016963089e-05, + "loss": 0.361, + "step": 1400 + }, + { + "epoch": 2.8679631525076763, + "grad_norm": 0.32941174535440465, + "learning_rate": 1.8313898357862623e-05, + "loss": 0.3292, + "step": 1401 + }, + { + "epoch": 2.8700102354145343, + "grad_norm": 0.33198094596750977, + "learning_rate": 1.8285389996886113e-05, + "loss": 0.3239, + "step": 1402 + }, + { + "epoch": 2.872057318321392, + "grad_norm": 0.31365437902540855, + "learning_rate": 1.8256885145046837e-05, + "loss": 0.3442, + "step": 1403 + }, + { + "epoch": 2.8741044012282497, + "grad_norm": 0.30554897801238856, + "learning_rate": 1.82283838606831e-05, + "loss": 0.3387, + "step": 1404 + }, + { + "epoch": 2.8761514841351072, + "grad_norm": 0.32230864173441504, + "learning_rate": 1.8199886202125897e-05, + "loss": 0.364, + "step": 1405 + }, + { + "epoch": 2.878198567041965, + "grad_norm": 0.3283203936597496, + "learning_rate": 1.817139222769882e-05, + "loss": 0.3135, + "step": 1406 + }, + { + "epoch": 2.880245649948823, + "grad_norm": 0.3352622595322451, + "learning_rate": 1.8142901995717894e-05, + "loss": 0.3293, + "step": 1407 + }, + { + "epoch": 2.8822927328556807, + "grad_norm": 0.3199512613271915, + "learning_rate": 1.8114415564491513e-05, + "loss": 0.3382, + "step": 1408 + }, + { + "epoch": 2.884339815762538, + "grad_norm": 0.3243396061475919, + "learning_rate": 1.8085932992320273e-05, + "loss": 0.3346, + "step": 1409 + }, + { + "epoch": 2.886386898669396, + "grad_norm": 0.3177758192621551, + "learning_rate": 1.805745433749689e-05, + "loss": 0.3193, + "step": 1410 + }, + { + "epoch": 2.888433981576254, + "grad_norm": 0.36128950608945015, + "learning_rate": 1.8028979658306033e-05, + "loss": 0.3352, + "step": 1411 + }, + { + "epoch": 2.8904810644831116, + "grad_norm": 0.34067398643062763, + "learning_rate": 1.8000509013024266e-05, + "loss": 0.3704, + "step": 1412 + }, + { + "epoch": 2.892528147389969, + "grad_norm": 0.36758322105150537, + "learning_rate": 1.7972042459919878e-05, + "loss": 0.3548, + "step": 1413 + }, + { + "epoch": 2.894575230296827, + "grad_norm": 0.3446360571011566, + "learning_rate": 1.794358005725279e-05, + "loss": 0.3379, + "step": 1414 + }, + { + "epoch": 2.896622313203685, + "grad_norm": 0.33967697044665596, + "learning_rate": 1.791512186327444e-05, + "loss": 0.3685, + "step": 1415 + }, + { + "epoch": 2.8986693961105425, + "grad_norm": 0.32938434012320356, + "learning_rate": 1.7886667936227616e-05, + "loss": 0.3224, + "step": 1416 + }, + { + "epoch": 2.9007164790174, + "grad_norm": 0.3492413938668141, + "learning_rate": 1.785821833434642e-05, + "loss": 0.3234, + "step": 1417 + }, + { + "epoch": 2.902763561924258, + "grad_norm": 0.3556331967459014, + "learning_rate": 1.7829773115856065e-05, + "loss": 0.305, + "step": 1418 + }, + { + "epoch": 2.9048106448311155, + "grad_norm": 0.3322076227286167, + "learning_rate": 1.7801332338972813e-05, + "loss": 0.3463, + "step": 1419 + }, + { + "epoch": 2.9068577277379735, + "grad_norm": 0.44401272792536567, + "learning_rate": 1.7772896061903824e-05, + "loss": 0.3441, + "step": 1420 + }, + { + "epoch": 2.908904810644831, + "grad_norm": 0.3346724756218441, + "learning_rate": 1.7744464342847062e-05, + "loss": 0.3243, + "step": 1421 + }, + { + "epoch": 2.910951893551689, + "grad_norm": 0.37288189845084224, + "learning_rate": 1.771603723999116e-05, + "loss": 0.328, + "step": 1422 + }, + { + "epoch": 2.9129989764585464, + "grad_norm": 0.3543864660540506, + "learning_rate": 1.768761481151529e-05, + "loss": 0.3239, + "step": 1423 + }, + { + "epoch": 2.9150460593654044, + "grad_norm": 0.33338686726894085, + "learning_rate": 1.765919711558906e-05, + "loss": 0.3373, + "step": 1424 + }, + { + "epoch": 2.917093142272262, + "grad_norm": 0.32542482524106037, + "learning_rate": 1.7630784210372413e-05, + "loss": 0.3473, + "step": 1425 + }, + { + "epoch": 2.91914022517912, + "grad_norm": 0.333470225286717, + "learning_rate": 1.7602376154015456e-05, + "loss": 0.3285, + "step": 1426 + }, + { + "epoch": 2.9211873080859774, + "grad_norm": 0.33573772915244177, + "learning_rate": 1.7573973004658404e-05, + "loss": 0.3024, + "step": 1427 + }, + { + "epoch": 2.9232343909928353, + "grad_norm": 0.34259215508028346, + "learning_rate": 1.7545574820431412e-05, + "loss": 0.3315, + "step": 1428 + }, + { + "epoch": 2.925281473899693, + "grad_norm": 0.33102210911596164, + "learning_rate": 1.751718165945447e-05, + "loss": 0.3519, + "step": 1429 + }, + { + "epoch": 2.927328556806551, + "grad_norm": 0.32436438467262657, + "learning_rate": 1.7488793579837297e-05, + "loss": 0.3661, + "step": 1430 + }, + { + "epoch": 2.9293756397134083, + "grad_norm": 0.31331782808616476, + "learning_rate": 1.74604106396792e-05, + "loss": 0.3141, + "step": 1431 + }, + { + "epoch": 2.9314227226202663, + "grad_norm": 0.4141533412404601, + "learning_rate": 1.743203289706898e-05, + "loss": 0.3557, + "step": 1432 + }, + { + "epoch": 2.9334698055271238, + "grad_norm": 0.4622088658579696, + "learning_rate": 1.7403660410084806e-05, + "loss": 0.399, + "step": 1433 + }, + { + "epoch": 2.9355168884339817, + "grad_norm": 0.3236744034505386, + "learning_rate": 1.737529323679407e-05, + "loss": 0.3286, + "step": 1434 + }, + { + "epoch": 2.9375639713408392, + "grad_norm": 0.2993041210404907, + "learning_rate": 1.734693143525331e-05, + "loss": 0.3259, + "step": 1435 + }, + { + "epoch": 2.939611054247697, + "grad_norm": 0.31919342750079005, + "learning_rate": 1.731857506350805e-05, + "loss": 0.3438, + "step": 1436 + }, + { + "epoch": 2.9416581371545547, + "grad_norm": 0.3479704658589382, + "learning_rate": 1.7290224179592718e-05, + "loss": 0.3561, + "step": 1437 + }, + { + "epoch": 2.943705220061412, + "grad_norm": 0.32794236732374565, + "learning_rate": 1.7261878841530494e-05, + "loss": 0.2956, + "step": 1438 + }, + { + "epoch": 2.94575230296827, + "grad_norm": 0.3348567062695728, + "learning_rate": 1.7233539107333234e-05, + "loss": 0.3207, + "step": 1439 + }, + { + "epoch": 2.947799385875128, + "grad_norm": 0.3107019732230144, + "learning_rate": 1.72052050350013e-05, + "loss": 0.3324, + "step": 1440 + }, + { + "epoch": 2.9498464687819856, + "grad_norm": 0.334794121032537, + "learning_rate": 1.717687668252348e-05, + "loss": 0.3296, + "step": 1441 + }, + { + "epoch": 2.951893551688843, + "grad_norm": 0.34219772695899153, + "learning_rate": 1.7148554107876847e-05, + "loss": 0.3504, + "step": 1442 + }, + { + "epoch": 2.953940634595701, + "grad_norm": 0.33351281779847475, + "learning_rate": 1.7120237369026655e-05, + "loss": 0.378, + "step": 1443 + }, + { + "epoch": 2.955987717502559, + "grad_norm": 0.3475392701501788, + "learning_rate": 1.7091926523926205e-05, + "loss": 0.3437, + "step": 1444 + }, + { + "epoch": 2.9580348004094166, + "grad_norm": 0.33100597785259966, + "learning_rate": 1.7063621630516755e-05, + "loss": 0.3289, + "step": 1445 + }, + { + "epoch": 2.960081883316274, + "grad_norm": 0.3615110501855717, + "learning_rate": 1.7035322746727366e-05, + "loss": 0.3148, + "step": 1446 + }, + { + "epoch": 2.962128966223132, + "grad_norm": 0.31694424871677895, + "learning_rate": 1.7007029930474804e-05, + "loss": 0.3389, + "step": 1447 + }, + { + "epoch": 2.96417604912999, + "grad_norm": 0.3428230537054886, + "learning_rate": 1.697874323966342e-05, + "loss": 0.3286, + "step": 1448 + }, + { + "epoch": 2.9662231320368475, + "grad_norm": 0.33741669479723163, + "learning_rate": 1.6950462732185023e-05, + "loss": 0.3197, + "step": 1449 + }, + { + "epoch": 2.968270214943705, + "grad_norm": 0.3203471822470365, + "learning_rate": 1.6922188465918763e-05, + "loss": 0.3297, + "step": 1450 + }, + { + "epoch": 2.970317297850563, + "grad_norm": 0.34824872554820474, + "learning_rate": 1.689392049873104e-05, + "loss": 0.3577, + "step": 1451 + }, + { + "epoch": 2.972364380757421, + "grad_norm": 0.33484691097376085, + "learning_rate": 1.6865658888475334e-05, + "loss": 0.3252, + "step": 1452 + }, + { + "epoch": 2.9744114636642784, + "grad_norm": 0.3184369996466899, + "learning_rate": 1.6837403692992136e-05, + "loss": 0.3267, + "step": 1453 + }, + { + "epoch": 2.976458546571136, + "grad_norm": 0.34524241797219873, + "learning_rate": 1.680915497010879e-05, + "loss": 0.362, + "step": 1454 + }, + { + "epoch": 2.978505629477994, + "grad_norm": 0.3517522038568365, + "learning_rate": 1.6780912777639407e-05, + "loss": 0.3455, + "step": 1455 + }, + { + "epoch": 2.9805527123848514, + "grad_norm": 0.35084133720055133, + "learning_rate": 1.6752677173384734e-05, + "loss": 0.3476, + "step": 1456 + }, + { + "epoch": 2.9825997952917094, + "grad_norm": 0.33510673606401725, + "learning_rate": 1.6724448215132006e-05, + "loss": 0.373, + "step": 1457 + }, + { + "epoch": 2.984646878198567, + "grad_norm": 0.32173623731436524, + "learning_rate": 1.669622596065491e-05, + "loss": 0.3416, + "step": 1458 + }, + { + "epoch": 2.986693961105425, + "grad_norm": 0.3405913760888214, + "learning_rate": 1.6668010467713363e-05, + "loss": 0.3336, + "step": 1459 + }, + { + "epoch": 2.9887410440122824, + "grad_norm": 0.33253196052599165, + "learning_rate": 1.6639801794053468e-05, + "loss": 0.3281, + "step": 1460 + }, + { + "epoch": 2.9907881269191403, + "grad_norm": 0.3127274815254226, + "learning_rate": 1.6611599997407366e-05, + "loss": 0.3219, + "step": 1461 + }, + { + "epoch": 2.992835209825998, + "grad_norm": 0.35589359153787264, + "learning_rate": 1.658340513549312e-05, + "loss": 0.3253, + "step": 1462 + }, + { + "epoch": 2.9948822927328558, + "grad_norm": 0.33592684199714334, + "learning_rate": 1.6555217266014604e-05, + "loss": 0.3679, + "step": 1463 + }, + { + "epoch": 2.9969293756397133, + "grad_norm": 0.3293658658288109, + "learning_rate": 1.6527036446661396e-05, + "loss": 0.3107, + "step": 1464 + }, + { + "epoch": 2.9989764585465712, + "grad_norm": 0.32358581638845413, + "learning_rate": 1.649886273510861e-05, + "loss": 0.3537, + "step": 1465 + }, + { + "epoch": 3.0010235414534288, + "grad_norm": 0.6530375339161546, + "learning_rate": 1.6470696189016853e-05, + "loss": 0.4219, + "step": 1466 + }, + { + "epoch": 3.0030706243602867, + "grad_norm": 0.41349403660281325, + "learning_rate": 1.6442536866032027e-05, + "loss": 0.2489, + "step": 1467 + }, + { + "epoch": 3.0051177072671442, + "grad_norm": 0.49742564287488156, + "learning_rate": 1.641438482378529e-05, + "loss": 0.2308, + "step": 1468 + }, + { + "epoch": 3.007164790174002, + "grad_norm": 0.5603085497770336, + "learning_rate": 1.6386240119892867e-05, + "loss": 0.2283, + "step": 1469 + }, + { + "epoch": 3.0092118730808597, + "grad_norm": 0.37112551523365644, + "learning_rate": 1.6358102811955985e-05, + "loss": 0.2255, + "step": 1470 + }, + { + "epoch": 3.0112589559877176, + "grad_norm": 0.4964687759956586, + "learning_rate": 1.6329972957560736e-05, + "loss": 0.2499, + "step": 1471 + }, + { + "epoch": 3.013306038894575, + "grad_norm": 0.5159224089011558, + "learning_rate": 1.6301850614277936e-05, + "loss": 0.2899, + "step": 1472 + }, + { + "epoch": 3.015353121801433, + "grad_norm": 0.3865653836943384, + "learning_rate": 1.6273735839663044e-05, + "loss": 0.23, + "step": 1473 + }, + { + "epoch": 3.0174002047082906, + "grad_norm": 0.36109737314160684, + "learning_rate": 1.6245628691256032e-05, + "loss": 0.2087, + "step": 1474 + }, + { + "epoch": 3.0194472876151486, + "grad_norm": 0.44209392274842507, + "learning_rate": 1.6217529226581247e-05, + "loss": 0.2523, + "step": 1475 + }, + { + "epoch": 3.021494370522006, + "grad_norm": 0.4472352934034512, + "learning_rate": 1.6189437503147338e-05, + "loss": 0.284, + "step": 1476 + }, + { + "epoch": 3.023541453428864, + "grad_norm": 0.3324235418829143, + "learning_rate": 1.616135357844709e-05, + "loss": 0.203, + "step": 1477 + }, + { + "epoch": 3.0255885363357216, + "grad_norm": 0.36268455499664337, + "learning_rate": 1.613327750995732e-05, + "loss": 0.2223, + "step": 1478 + }, + { + "epoch": 3.0276356192425795, + "grad_norm": 0.38026408150755925, + "learning_rate": 1.61052093551388e-05, + "loss": 0.2122, + "step": 1479 + }, + { + "epoch": 3.029682702149437, + "grad_norm": 0.3766577760390122, + "learning_rate": 1.6077149171436063e-05, + "loss": 0.263, + "step": 1480 + }, + { + "epoch": 3.031729785056295, + "grad_norm": 0.36393521520091426, + "learning_rate": 1.6049097016277358e-05, + "loss": 0.2729, + "step": 1481 + }, + { + "epoch": 3.0337768679631525, + "grad_norm": 0.36192400255610996, + "learning_rate": 1.60210529470745e-05, + "loss": 0.2318, + "step": 1482 + }, + { + "epoch": 3.0358239508700104, + "grad_norm": 0.35713778733325363, + "learning_rate": 1.599301702122274e-05, + "loss": 0.2356, + "step": 1483 + }, + { + "epoch": 3.037871033776868, + "grad_norm": 0.3518563642429611, + "learning_rate": 1.5964989296100682e-05, + "loss": 0.2367, + "step": 1484 + }, + { + "epoch": 3.039918116683726, + "grad_norm": 0.40298228736514674, + "learning_rate": 1.5936969829070125e-05, + "loss": 0.3027, + "step": 1485 + }, + { + "epoch": 3.0419651995905834, + "grad_norm": 0.31807307761105635, + "learning_rate": 1.590895867747599e-05, + "loss": 0.2541, + "step": 1486 + }, + { + "epoch": 3.044012282497441, + "grad_norm": 0.3409445938787892, + "learning_rate": 1.588095589864615e-05, + "loss": 0.2289, + "step": 1487 + }, + { + "epoch": 3.046059365404299, + "grad_norm": 0.3600773888625388, + "learning_rate": 1.5852961549891376e-05, + "loss": 0.2341, + "step": 1488 + }, + { + "epoch": 3.0481064483111564, + "grad_norm": 0.32453736981408465, + "learning_rate": 1.582497568850517e-05, + "loss": 0.2797, + "step": 1489 + }, + { + "epoch": 3.0501535312180144, + "grad_norm": 0.3331181801930424, + "learning_rate": 1.579699837176366e-05, + "loss": 0.2036, + "step": 1490 + }, + { + "epoch": 3.052200614124872, + "grad_norm": 0.4136442196087828, + "learning_rate": 1.5769029656925486e-05, + "loss": 0.2599, + "step": 1491 + }, + { + "epoch": 3.05424769703173, + "grad_norm": 0.35863640915527445, + "learning_rate": 1.574106960123169e-05, + "loss": 0.2526, + "step": 1492 + }, + { + "epoch": 3.0562947799385873, + "grad_norm": 0.35564688451602927, + "learning_rate": 1.571311826190559e-05, + "loss": 0.2503, + "step": 1493 + }, + { + "epoch": 3.0583418628454453, + "grad_norm": 0.33302867243814094, + "learning_rate": 1.5685175696152657e-05, + "loss": 0.2283, + "step": 1494 + }, + { + "epoch": 3.060388945752303, + "grad_norm": 0.36946655881677004, + "learning_rate": 1.5657241961160434e-05, + "loss": 0.272, + "step": 1495 + }, + { + "epoch": 3.0624360286591608, + "grad_norm": 0.33870469772598516, + "learning_rate": 1.562931711409835e-05, + "loss": 0.2527, + "step": 1496 + }, + { + "epoch": 3.0644831115660183, + "grad_norm": 0.29057383945914955, + "learning_rate": 1.5601401212117676e-05, + "loss": 0.2468, + "step": 1497 + }, + { + "epoch": 3.0665301944728762, + "grad_norm": 0.3538607726440218, + "learning_rate": 1.557349431235135e-05, + "loss": 0.2527, + "step": 1498 + }, + { + "epoch": 3.0685772773797337, + "grad_norm": 0.36881305364134004, + "learning_rate": 1.554559647191392e-05, + "loss": 0.2572, + "step": 1499 + }, + { + "epoch": 3.0706243602865917, + "grad_norm": 0.31303360312121764, + "learning_rate": 1.5517707747901352e-05, + "loss": 0.2015, + "step": 1500 + }, + { + "epoch": 3.072671443193449, + "grad_norm": 0.3468523227469292, + "learning_rate": 1.5489828197390988e-05, + "loss": 0.2522, + "step": 1501 + }, + { + "epoch": 3.074718526100307, + "grad_norm": 0.3189025943422717, + "learning_rate": 1.5461957877441387e-05, + "loss": 0.2467, + "step": 1502 + }, + { + "epoch": 3.0767656090071647, + "grad_norm": 0.36693872460390387, + "learning_rate": 1.5434096845092203e-05, + "loss": 0.2266, + "step": 1503 + }, + { + "epoch": 3.0788126919140226, + "grad_norm": 0.33990261356941326, + "learning_rate": 1.5406245157364093e-05, + "loss": 0.2511, + "step": 1504 + }, + { + "epoch": 3.08085977482088, + "grad_norm": 0.3224675841478402, + "learning_rate": 1.537840287125859e-05, + "loss": 0.2013, + "step": 1505 + }, + { + "epoch": 3.082906857727738, + "grad_norm": 0.3392490287207665, + "learning_rate": 1.5350570043757976e-05, + "loss": 0.2222, + "step": 1506 + }, + { + "epoch": 3.0849539406345956, + "grad_norm": 0.33119892853448313, + "learning_rate": 1.5322746731825195e-05, + "loss": 0.2403, + "step": 1507 + }, + { + "epoch": 3.0870010235414536, + "grad_norm": 0.3189197704594123, + "learning_rate": 1.5294932992403695e-05, + "loss": 0.2156, + "step": 1508 + }, + { + "epoch": 3.089048106448311, + "grad_norm": 0.33202442707430363, + "learning_rate": 1.526712888241734e-05, + "loss": 0.2449, + "step": 1509 + }, + { + "epoch": 3.091095189355169, + "grad_norm": 0.3126714573879982, + "learning_rate": 1.5239334458770291e-05, + "loss": 0.2345, + "step": 1510 + }, + { + "epoch": 3.0931422722620265, + "grad_norm": 0.3199987863370429, + "learning_rate": 1.5211549778346882e-05, + "loss": 0.2547, + "step": 1511 + }, + { + "epoch": 3.0951893551688845, + "grad_norm": 0.2865206932540882, + "learning_rate": 1.5183774898011496e-05, + "loss": 0.2262, + "step": 1512 + }, + { + "epoch": 3.097236438075742, + "grad_norm": 0.2978122286180525, + "learning_rate": 1.5156009874608484e-05, + "loss": 0.2454, + "step": 1513 + }, + { + "epoch": 3.0992835209826, + "grad_norm": 0.33009627047245504, + "learning_rate": 1.5128254764962e-05, + "loss": 0.2762, + "step": 1514 + }, + { + "epoch": 3.1013306038894575, + "grad_norm": 0.328880261572203, + "learning_rate": 1.5100509625875921e-05, + "loss": 0.3284, + "step": 1515 + }, + { + "epoch": 3.1033776867963154, + "grad_norm": 0.31916303688606795, + "learning_rate": 1.5072774514133708e-05, + "loss": 0.2299, + "step": 1516 + }, + { + "epoch": 3.105424769703173, + "grad_norm": 0.3226893313357512, + "learning_rate": 1.5045049486498311e-05, + "loss": 0.2338, + "step": 1517 + }, + { + "epoch": 3.107471852610031, + "grad_norm": 0.31484899101790875, + "learning_rate": 1.5017334599712028e-05, + "loss": 0.2039, + "step": 1518 + }, + { + "epoch": 3.1095189355168884, + "grad_norm": 0.32998837828193894, + "learning_rate": 1.4989629910496424e-05, + "loss": 0.2345, + "step": 1519 + }, + { + "epoch": 3.1115660184237464, + "grad_norm": 0.3027257775351982, + "learning_rate": 1.4961935475552178e-05, + "loss": 0.2285, + "step": 1520 + }, + { + "epoch": 3.113613101330604, + "grad_norm": 0.3071518323366728, + "learning_rate": 1.4934251351558983e-05, + "loss": 0.2384, + "step": 1521 + }, + { + "epoch": 3.115660184237462, + "grad_norm": 0.30152633341709373, + "learning_rate": 1.4906577595175428e-05, + "loss": 0.2201, + "step": 1522 + }, + { + "epoch": 3.1177072671443193, + "grad_norm": 0.33663282454591203, + "learning_rate": 1.4878914263038895e-05, + "loss": 0.2324, + "step": 1523 + }, + { + "epoch": 3.119754350051177, + "grad_norm": 0.33606866012096387, + "learning_rate": 1.4851261411765414e-05, + "loss": 0.2629, + "step": 1524 + }, + { + "epoch": 3.121801432958035, + "grad_norm": 0.32073469626766654, + "learning_rate": 1.4823619097949584e-05, + "loss": 0.2309, + "step": 1525 + }, + { + "epoch": 3.1238485158648923, + "grad_norm": 0.34825982979835035, + "learning_rate": 1.4795987378164432e-05, + "loss": 0.2361, + "step": 1526 + }, + { + "epoch": 3.1258955987717503, + "grad_norm": 0.32245272558823557, + "learning_rate": 1.4768366308961288e-05, + "loss": 0.2281, + "step": 1527 + }, + { + "epoch": 3.127942681678608, + "grad_norm": 0.35652651564540333, + "learning_rate": 1.4740755946869708e-05, + "loss": 0.2508, + "step": 1528 + }, + { + "epoch": 3.1299897645854657, + "grad_norm": 0.3215049499878307, + "learning_rate": 1.4713156348397317e-05, + "loss": 0.2144, + "step": 1529 + }, + { + "epoch": 3.1320368474923233, + "grad_norm": 0.33346781365441597, + "learning_rate": 1.468556757002972e-05, + "loss": 0.2425, + "step": 1530 + }, + { + "epoch": 3.134083930399181, + "grad_norm": 0.35639968613552386, + "learning_rate": 1.4657989668230363e-05, + "loss": 0.2227, + "step": 1531 + }, + { + "epoch": 3.1361310133060387, + "grad_norm": 0.3010242077787357, + "learning_rate": 1.4630422699440461e-05, + "loss": 0.2742, + "step": 1532 + }, + { + "epoch": 3.1381780962128967, + "grad_norm": 0.36016875940221305, + "learning_rate": 1.4602866720078832e-05, + "loss": 0.2747, + "step": 1533 + }, + { + "epoch": 3.140225179119754, + "grad_norm": 0.35237386827329403, + "learning_rate": 1.4575321786541801e-05, + "loss": 0.2408, + "step": 1534 + }, + { + "epoch": 3.142272262026612, + "grad_norm": 0.319112209857982, + "learning_rate": 1.45477879552031e-05, + "loss": 0.2488, + "step": 1535 + }, + { + "epoch": 3.1443193449334697, + "grad_norm": 0.2967734684654477, + "learning_rate": 1.4520265282413722e-05, + "loss": 0.213, + "step": 1536 + }, + { + "epoch": 3.1463664278403276, + "grad_norm": 0.3524504353601547, + "learning_rate": 1.4492753824501833e-05, + "loss": 0.222, + "step": 1537 + }, + { + "epoch": 3.148413510747185, + "grad_norm": 0.33500821048449647, + "learning_rate": 1.4465253637772651e-05, + "loss": 0.2513, + "step": 1538 + }, + { + "epoch": 3.150460593654043, + "grad_norm": 0.34077100422476553, + "learning_rate": 1.443776477850833e-05, + "loss": 0.2701, + "step": 1539 + }, + { + "epoch": 3.1525076765609006, + "grad_norm": 0.34275143758140053, + "learning_rate": 1.4410287302967813e-05, + "loss": 0.3137, + "step": 1540 + }, + { + "epoch": 3.1545547594677585, + "grad_norm": 0.31464152797503897, + "learning_rate": 1.4382821267386781e-05, + "loss": 0.2329, + "step": 1541 + }, + { + "epoch": 3.156601842374616, + "grad_norm": 0.29997611145802033, + "learning_rate": 1.4355366727977473e-05, + "loss": 0.2355, + "step": 1542 + }, + { + "epoch": 3.158648925281474, + "grad_norm": 0.3098740828854044, + "learning_rate": 1.4327923740928613e-05, + "loss": 0.2364, + "step": 1543 + }, + { + "epoch": 3.1606960081883315, + "grad_norm": 0.3182081076670239, + "learning_rate": 1.4300492362405296e-05, + "loss": 0.2307, + "step": 1544 + }, + { + "epoch": 3.1627430910951895, + "grad_norm": 0.33289523643049107, + "learning_rate": 1.4273072648548827e-05, + "loss": 0.2658, + "step": 1545 + }, + { + "epoch": 3.164790174002047, + "grad_norm": 0.3483349343296588, + "learning_rate": 1.4245664655476663e-05, + "loss": 0.239, + "step": 1546 + }, + { + "epoch": 3.166837256908905, + "grad_norm": 0.3344228654090122, + "learning_rate": 1.4218268439282259e-05, + "loss": 0.2136, + "step": 1547 + }, + { + "epoch": 3.1688843398157625, + "grad_norm": 0.31443022141500176, + "learning_rate": 1.4190884056034983e-05, + "loss": 0.2642, + "step": 1548 + }, + { + "epoch": 3.1709314227226204, + "grad_norm": 0.32146571298562293, + "learning_rate": 1.4163511561779956e-05, + "loss": 0.2532, + "step": 1549 + }, + { + "epoch": 3.172978505629478, + "grad_norm": 0.3550608808689066, + "learning_rate": 1.4136151012538008e-05, + "loss": 0.2358, + "step": 1550 + }, + { + "epoch": 3.175025588536336, + "grad_norm": 0.3166912892735251, + "learning_rate": 1.4108802464305496e-05, + "loss": 0.2128, + "step": 1551 + }, + { + "epoch": 3.1770726714431934, + "grad_norm": 0.3332651741225492, + "learning_rate": 1.4081465973054216e-05, + "loss": 0.2423, + "step": 1552 + }, + { + "epoch": 3.1791197543500513, + "grad_norm": 0.3369578277421136, + "learning_rate": 1.4054141594731289e-05, + "loss": 0.205, + "step": 1553 + }, + { + "epoch": 3.181166837256909, + "grad_norm": 0.3549410163798208, + "learning_rate": 1.402682938525906e-05, + "loss": 0.2587, + "step": 1554 + }, + { + "epoch": 3.183213920163767, + "grad_norm": 0.31908575274366446, + "learning_rate": 1.3999529400534941e-05, + "loss": 0.2669, + "step": 1555 + }, + { + "epoch": 3.1852610030706243, + "grad_norm": 0.33778438982482334, + "learning_rate": 1.3972241696431357e-05, + "loss": 0.244, + "step": 1556 + }, + { + "epoch": 3.1873080859774823, + "grad_norm": 0.3536190099748012, + "learning_rate": 1.3944966328795584e-05, + "loss": 0.243, + "step": 1557 + }, + { + "epoch": 3.18935516888434, + "grad_norm": 0.3395539666321202, + "learning_rate": 1.3917703353449646e-05, + "loss": 0.2231, + "step": 1558 + }, + { + "epoch": 3.1914022517911977, + "grad_norm": 0.3107794197012843, + "learning_rate": 1.3890452826190208e-05, + "loss": 0.203, + "step": 1559 + }, + { + "epoch": 3.1934493346980553, + "grad_norm": 0.33383730315910054, + "learning_rate": 1.3863214802788459e-05, + "loss": 0.2239, + "step": 1560 + }, + { + "epoch": 3.1954964176049128, + "grad_norm": 0.3236027308800084, + "learning_rate": 1.3835989338989996e-05, + "loss": 0.2602, + "step": 1561 + }, + { + "epoch": 3.1975435005117707, + "grad_norm": 0.3383450416141259, + "learning_rate": 1.3808776490514727e-05, + "loss": 0.2775, + "step": 1562 + }, + { + "epoch": 3.1995905834186287, + "grad_norm": 0.29556678254047786, + "learning_rate": 1.3781576313056713e-05, + "loss": 0.2305, + "step": 1563 + }, + { + "epoch": 3.201637666325486, + "grad_norm": 0.34240014338764224, + "learning_rate": 1.375438886228411e-05, + "loss": 0.2284, + "step": 1564 + }, + { + "epoch": 3.2036847492323437, + "grad_norm": 0.35927171229864285, + "learning_rate": 1.3727214193839002e-05, + "loss": 0.2669, + "step": 1565 + }, + { + "epoch": 3.2057318321392017, + "grad_norm": 0.295141784625082, + "learning_rate": 1.3700052363337337e-05, + "loss": 0.2237, + "step": 1566 + }, + { + "epoch": 3.207778915046059, + "grad_norm": 0.31352782892999664, + "learning_rate": 1.3672903426368773e-05, + "loss": 0.2105, + "step": 1567 + }, + { + "epoch": 3.209825997952917, + "grad_norm": 0.31282387528552047, + "learning_rate": 1.3645767438496567e-05, + "loss": 0.2252, + "step": 1568 + }, + { + "epoch": 3.2118730808597746, + "grad_norm": 0.3487407042028981, + "learning_rate": 1.3618644455257521e-05, + "loss": 0.2717, + "step": 1569 + }, + { + "epoch": 3.2139201637666326, + "grad_norm": 0.34749040489450855, + "learning_rate": 1.3591534532161781e-05, + "loss": 0.2463, + "step": 1570 + }, + { + "epoch": 3.21596724667349, + "grad_norm": 0.32899297696851715, + "learning_rate": 1.3564437724692766e-05, + "loss": 0.2275, + "step": 1571 + }, + { + "epoch": 3.218014329580348, + "grad_norm": 0.2725890794790335, + "learning_rate": 1.353735408830707e-05, + "loss": 0.2153, + "step": 1572 + }, + { + "epoch": 3.2200614124872056, + "grad_norm": 0.37569994527165246, + "learning_rate": 1.3510283678434317e-05, + "loss": 0.2445, + "step": 1573 + }, + { + "epoch": 3.2221084953940635, + "grad_norm": 0.3527821656094053, + "learning_rate": 1.348322655047707e-05, + "loss": 0.2088, + "step": 1574 + }, + { + "epoch": 3.224155578300921, + "grad_norm": 0.3319266141186732, + "learning_rate": 1.3456182759810708e-05, + "loss": 0.2336, + "step": 1575 + }, + { + "epoch": 3.226202661207779, + "grad_norm": 0.3545486115689844, + "learning_rate": 1.3429152361783307e-05, + "loss": 0.2681, + "step": 1576 + }, + { + "epoch": 3.2282497441146365, + "grad_norm": 0.3385783041962988, + "learning_rate": 1.3402135411715545e-05, + "loss": 0.2315, + "step": 1577 + }, + { + "epoch": 3.2302968270214945, + "grad_norm": 0.32938060023769156, + "learning_rate": 1.337513196490056e-05, + "loss": 0.2498, + "step": 1578 + }, + { + "epoch": 3.232343909928352, + "grad_norm": 0.30316278108042816, + "learning_rate": 1.3348142076603876e-05, + "loss": 0.1928, + "step": 1579 + }, + { + "epoch": 3.23439099283521, + "grad_norm": 0.34764955355779054, + "learning_rate": 1.3321165802063243e-05, + "loss": 0.28, + "step": 1580 + }, + { + "epoch": 3.2364380757420674, + "grad_norm": 0.3268398726034169, + "learning_rate": 1.3294203196488576e-05, + "loss": 0.2804, + "step": 1581 + }, + { + "epoch": 3.2384851586489254, + "grad_norm": 0.31999282008130525, + "learning_rate": 1.3267254315061797e-05, + "loss": 0.26, + "step": 1582 + }, + { + "epoch": 3.240532241555783, + "grad_norm": 0.31712302189517827, + "learning_rate": 1.324031921293674e-05, + "loss": 0.23, + "step": 1583 + }, + { + "epoch": 3.242579324462641, + "grad_norm": 0.3137795221764201, + "learning_rate": 1.3213397945239053e-05, + "loss": 0.243, + "step": 1584 + }, + { + "epoch": 3.2446264073694984, + "grad_norm": 0.3236833391733087, + "learning_rate": 1.318649056706605e-05, + "loss": 0.2621, + "step": 1585 + }, + { + "epoch": 3.2466734902763563, + "grad_norm": 0.28341768420158997, + "learning_rate": 1.3159597133486628e-05, + "loss": 0.2105, + "step": 1586 + }, + { + "epoch": 3.248720573183214, + "grad_norm": 0.34778024409649644, + "learning_rate": 1.313271769954115e-05, + "loss": 0.2899, + "step": 1587 + }, + { + "epoch": 3.250767656090072, + "grad_norm": 0.3072274493404741, + "learning_rate": 1.3105852320241326e-05, + "loss": 0.2141, + "step": 1588 + }, + { + "epoch": 3.2528147389969293, + "grad_norm": 0.32380150432128973, + "learning_rate": 1.307900105057009e-05, + "loss": 0.2218, + "step": 1589 + }, + { + "epoch": 3.2548618219037873, + "grad_norm": 0.3234313545922506, + "learning_rate": 1.3052163945481517e-05, + "loss": 0.2301, + "step": 1590 + }, + { + "epoch": 3.2569089048106448, + "grad_norm": 0.3409588063960833, + "learning_rate": 1.3025341059900675e-05, + "loss": 0.2331, + "step": 1591 + }, + { + "epoch": 3.2589559877175027, + "grad_norm": 0.33214628078621744, + "learning_rate": 1.2998532448723536e-05, + "loss": 0.2253, + "step": 1592 + }, + { + "epoch": 3.2610030706243602, + "grad_norm": 0.3495327746489498, + "learning_rate": 1.2971738166816871e-05, + "loss": 0.2369, + "step": 1593 + }, + { + "epoch": 3.263050153531218, + "grad_norm": 0.35093939156017323, + "learning_rate": 1.2944958269018103e-05, + "loss": 0.2329, + "step": 1594 + }, + { + "epoch": 3.2650972364380757, + "grad_norm": 0.3204843846779589, + "learning_rate": 1.291819281013524e-05, + "loss": 0.2144, + "step": 1595 + }, + { + "epoch": 3.2671443193449337, + "grad_norm": 0.3484251329111784, + "learning_rate": 1.289144184494671e-05, + "loss": 0.2531, + "step": 1596 + }, + { + "epoch": 3.269191402251791, + "grad_norm": 0.29945051356789365, + "learning_rate": 1.2864705428201307e-05, + "loss": 0.2293, + "step": 1597 + }, + { + "epoch": 3.2712384851586487, + "grad_norm": 0.3521856225970276, + "learning_rate": 1.2837983614618023e-05, + "loss": 0.3006, + "step": 1598 + }, + { + "epoch": 3.2732855680655066, + "grad_norm": 0.3221884922752649, + "learning_rate": 1.2811276458885993e-05, + "loss": 0.2331, + "step": 1599 + }, + { + "epoch": 3.2753326509723646, + "grad_norm": 0.30627973138617387, + "learning_rate": 1.2784584015664337e-05, + "loss": 0.222, + "step": 1600 + }, + { + "epoch": 3.277379733879222, + "grad_norm": 0.3650912580402862, + "learning_rate": 1.2757906339582053e-05, + "loss": 0.269, + "step": 1601 + }, + { + "epoch": 3.2794268167860796, + "grad_norm": 0.2847330580470038, + "learning_rate": 1.2731243485237932e-05, + "loss": 0.2161, + "step": 1602 + }, + { + "epoch": 3.2814738996929376, + "grad_norm": 0.3140163871014674, + "learning_rate": 1.2704595507200435e-05, + "loss": 0.271, + "step": 1603 + }, + { + "epoch": 3.2835209825997955, + "grad_norm": 0.324180451733573, + "learning_rate": 1.2677962460007555e-05, + "loss": 0.2124, + "step": 1604 + }, + { + "epoch": 3.285568065506653, + "grad_norm": 0.32841827310726324, + "learning_rate": 1.2651344398166745e-05, + "loss": 0.2905, + "step": 1605 + }, + { + "epoch": 3.2876151484135105, + "grad_norm": 0.2992524116524214, + "learning_rate": 1.26247413761548e-05, + "loss": 0.1951, + "step": 1606 + }, + { + "epoch": 3.2896622313203685, + "grad_norm": 0.3536568449695457, + "learning_rate": 1.2598153448417701e-05, + "loss": 0.3008, + "step": 1607 + }, + { + "epoch": 3.291709314227226, + "grad_norm": 0.2745017989746043, + "learning_rate": 1.2571580669370565e-05, + "loss": 0.218, + "step": 1608 + }, + { + "epoch": 3.293756397134084, + "grad_norm": 0.336785618203528, + "learning_rate": 1.254502309339749e-05, + "loss": 0.2163, + "step": 1609 + }, + { + "epoch": 3.2958034800409415, + "grad_norm": 0.29711300837975474, + "learning_rate": 1.2518480774851472e-05, + "loss": 0.2217, + "step": 1610 + }, + { + "epoch": 3.2978505629477994, + "grad_norm": 0.32116672616384045, + "learning_rate": 1.2491953768054263e-05, + "loss": 0.2595, + "step": 1611 + }, + { + "epoch": 3.299897645854657, + "grad_norm": 0.3367433516635412, + "learning_rate": 1.2465442127296297e-05, + "loss": 0.2293, + "step": 1612 + }, + { + "epoch": 3.301944728761515, + "grad_norm": 0.3300984778131218, + "learning_rate": 1.2438945906836557e-05, + "loss": 0.2443, + "step": 1613 + }, + { + "epoch": 3.3039918116683724, + "grad_norm": 0.31907195158009316, + "learning_rate": 1.241246516090245e-05, + "loss": 0.2321, + "step": 1614 + }, + { + "epoch": 3.3060388945752304, + "grad_norm": 0.3096668811954514, + "learning_rate": 1.2385999943689732e-05, + "loss": 0.2216, + "step": 1615 + }, + { + "epoch": 3.308085977482088, + "grad_norm": 0.3776462069541184, + "learning_rate": 1.2359550309362368e-05, + "loss": 0.2104, + "step": 1616 + }, + { + "epoch": 3.310133060388946, + "grad_norm": 0.3196294164019087, + "learning_rate": 1.2333116312052416e-05, + "loss": 0.2403, + "step": 1617 + }, + { + "epoch": 3.3121801432958033, + "grad_norm": 0.34176563178349245, + "learning_rate": 1.2306698005859975e-05, + "loss": 0.2409, + "step": 1618 + }, + { + "epoch": 3.3142272262026613, + "grad_norm": 0.34417655045127915, + "learning_rate": 1.2280295444852994e-05, + "loss": 0.2899, + "step": 1619 + }, + { + "epoch": 3.316274309109519, + "grad_norm": 0.34668338984896196, + "learning_rate": 1.22539086830672e-05, + "loss": 0.216, + "step": 1620 + }, + { + "epoch": 3.3183213920163768, + "grad_norm": 0.307563527231833, + "learning_rate": 1.2227537774505996e-05, + "loss": 0.204, + "step": 1621 + }, + { + "epoch": 3.3203684749232343, + "grad_norm": 0.36360020699523277, + "learning_rate": 1.2201182773140334e-05, + "loss": 0.2321, + "step": 1622 + }, + { + "epoch": 3.3224155578300922, + "grad_norm": 0.34959023754084967, + "learning_rate": 1.2174843732908609e-05, + "loss": 0.2417, + "step": 1623 + }, + { + "epoch": 3.3244626407369497, + "grad_norm": 0.35791334099991956, + "learning_rate": 1.2148520707716567e-05, + "loss": 0.267, + "step": 1624 + }, + { + "epoch": 3.3265097236438077, + "grad_norm": 0.31882901909973, + "learning_rate": 1.2122213751437147e-05, + "loss": 0.2368, + "step": 1625 + }, + { + "epoch": 3.328556806550665, + "grad_norm": 0.31734552594196086, + "learning_rate": 1.2095922917910427e-05, + "loss": 0.2437, + "step": 1626 + }, + { + "epoch": 3.330603889457523, + "grad_norm": 0.3303738558908603, + "learning_rate": 1.2069648260943473e-05, + "loss": 0.2326, + "step": 1627 + }, + { + "epoch": 3.3326509723643807, + "grad_norm": 0.32163501618546503, + "learning_rate": 1.2043389834310257e-05, + "loss": 0.2579, + "step": 1628 + }, + { + "epoch": 3.3346980552712386, + "grad_norm": 0.33400441651552865, + "learning_rate": 1.2017147691751512e-05, + "loss": 0.2572, + "step": 1629 + }, + { + "epoch": 3.336745138178096, + "grad_norm": 0.3062361134778084, + "learning_rate": 1.1990921886974669e-05, + "loss": 0.1971, + "step": 1630 + }, + { + "epoch": 3.338792221084954, + "grad_norm": 0.3617880140065343, + "learning_rate": 1.1964712473653713e-05, + "loss": 0.2752, + "step": 1631 + }, + { + "epoch": 3.3408393039918116, + "grad_norm": 0.31208903084798195, + "learning_rate": 1.1938519505429072e-05, + "loss": 0.1891, + "step": 1632 + }, + { + "epoch": 3.3428863868986696, + "grad_norm": 0.3846438154154162, + "learning_rate": 1.1912343035907535e-05, + "loss": 0.2663, + "step": 1633 + }, + { + "epoch": 3.344933469805527, + "grad_norm": 0.3360119347175398, + "learning_rate": 1.1886183118662108e-05, + "loss": 0.225, + "step": 1634 + }, + { + "epoch": 3.346980552712385, + "grad_norm": 0.34434286466245134, + "learning_rate": 1.1860039807231923e-05, + "loss": 0.2311, + "step": 1635 + }, + { + "epoch": 3.3490276356192425, + "grad_norm": 0.3193089695495996, + "learning_rate": 1.1833913155122132e-05, + "loss": 0.1958, + "step": 1636 + }, + { + "epoch": 3.3510747185261005, + "grad_norm": 0.34254339904952036, + "learning_rate": 1.1807803215803806e-05, + "loss": 0.2301, + "step": 1637 + }, + { + "epoch": 3.353121801432958, + "grad_norm": 0.37284073286468844, + "learning_rate": 1.1781710042713783e-05, + "loss": 0.2321, + "step": 1638 + }, + { + "epoch": 3.3551688843398155, + "grad_norm": 0.3400024012806843, + "learning_rate": 1.1755633689254609e-05, + "loss": 0.2481, + "step": 1639 + }, + { + "epoch": 3.3572159672466735, + "grad_norm": 0.3381525204885461, + "learning_rate": 1.1729574208794388e-05, + "loss": 0.2486, + "step": 1640 + }, + { + "epoch": 3.3592630501535314, + "grad_norm": 0.3197266149899627, + "learning_rate": 1.1703531654666714e-05, + "loss": 0.2368, + "step": 1641 + }, + { + "epoch": 3.361310133060389, + "grad_norm": 0.3130217545123078, + "learning_rate": 1.1677506080170512e-05, + "loss": 0.2342, + "step": 1642 + }, + { + "epoch": 3.3633572159672465, + "grad_norm": 0.32947434661394787, + "learning_rate": 1.1651497538569984e-05, + "loss": 0.2124, + "step": 1643 + }, + { + "epoch": 3.3654042988741044, + "grad_norm": 0.344686988795317, + "learning_rate": 1.162550608309446e-05, + "loss": 0.2464, + "step": 1644 + }, + { + "epoch": 3.3674513817809624, + "grad_norm": 0.30351992821717133, + "learning_rate": 1.1599531766938306e-05, + "loss": 0.2378, + "step": 1645 + }, + { + "epoch": 3.36949846468782, + "grad_norm": 0.31537723127663564, + "learning_rate": 1.1573574643260787e-05, + "loss": 0.2683, + "step": 1646 + }, + { + "epoch": 3.3715455475946774, + "grad_norm": 0.3179109407008773, + "learning_rate": 1.1547634765186016e-05, + "loss": 0.21, + "step": 1647 + }, + { + "epoch": 3.3735926305015353, + "grad_norm": 0.3414400925992681, + "learning_rate": 1.1521712185802789e-05, + "loss": 0.2467, + "step": 1648 + }, + { + "epoch": 3.375639713408393, + "grad_norm": 0.3015513958093698, + "learning_rate": 1.1495806958164508e-05, + "loss": 0.2333, + "step": 1649 + }, + { + "epoch": 3.377686796315251, + "grad_norm": 0.3221671987582906, + "learning_rate": 1.1469919135289058e-05, + "loss": 0.2697, + "step": 1650 + }, + { + "epoch": 3.3797338792221083, + "grad_norm": 0.29450678032670125, + "learning_rate": 1.1444048770158718e-05, + "loss": 0.2255, + "step": 1651 + }, + { + "epoch": 3.3817809621289663, + "grad_norm": 0.39235810212847905, + "learning_rate": 1.1418195915720002e-05, + "loss": 0.2599, + "step": 1652 + }, + { + "epoch": 3.383828045035824, + "grad_norm": 0.2945867088296247, + "learning_rate": 1.139236062488362e-05, + "loss": 0.2336, + "step": 1653 + }, + { + "epoch": 3.3858751279426818, + "grad_norm": 0.3178722215887932, + "learning_rate": 1.136654295052433e-05, + "loss": 0.266, + "step": 1654 + }, + { + "epoch": 3.3879222108495393, + "grad_norm": 0.32262451442066525, + "learning_rate": 1.134074294548082e-05, + "loss": 0.2282, + "step": 1655 + }, + { + "epoch": 3.389969293756397, + "grad_norm": 0.3260714484305257, + "learning_rate": 1.1314960662555639e-05, + "loss": 0.2294, + "step": 1656 + }, + { + "epoch": 3.3920163766632547, + "grad_norm": 0.29552572043468256, + "learning_rate": 1.1289196154515048e-05, + "loss": 0.1852, + "step": 1657 + }, + { + "epoch": 3.3940634595701127, + "grad_norm": 0.32257518359378057, + "learning_rate": 1.1263449474088944e-05, + "loss": 0.2122, + "step": 1658 + }, + { + "epoch": 3.39611054247697, + "grad_norm": 0.31678272789529893, + "learning_rate": 1.1237720673970713e-05, + "loss": 0.2391, + "step": 1659 + }, + { + "epoch": 3.398157625383828, + "grad_norm": 0.3294063880106706, + "learning_rate": 1.1212009806817163e-05, + "loss": 0.2582, + "step": 1660 + }, + { + "epoch": 3.4002047082906857, + "grad_norm": 0.33445038877415256, + "learning_rate": 1.118631692524843e-05, + "loss": 0.2325, + "step": 1661 + }, + { + "epoch": 3.4022517911975436, + "grad_norm": 0.31780769894415345, + "learning_rate": 1.1160642081847782e-05, + "loss": 0.2114, + "step": 1662 + }, + { + "epoch": 3.404298874104401, + "grad_norm": 0.34936980398469303, + "learning_rate": 1.1134985329161608e-05, + "loss": 0.2633, + "step": 1663 + }, + { + "epoch": 3.406345957011259, + "grad_norm": 0.25267194298381324, + "learning_rate": 1.1109346719699263e-05, + "loss": 0.1672, + "step": 1664 + }, + { + "epoch": 3.4083930399181166, + "grad_norm": 0.39414125375432546, + "learning_rate": 1.108372630593298e-05, + "loss": 0.3388, + "step": 1665 + }, + { + "epoch": 3.4104401228249746, + "grad_norm": 0.31231427401371586, + "learning_rate": 1.1058124140297718e-05, + "loss": 0.247, + "step": 1666 + }, + { + "epoch": 3.412487205731832, + "grad_norm": 0.2819687858861257, + "learning_rate": 1.1032540275191148e-05, + "loss": 0.2171, + "step": 1667 + }, + { + "epoch": 3.41453428863869, + "grad_norm": 0.32854044476275013, + "learning_rate": 1.1006974762973425e-05, + "loss": 0.2265, + "step": 1668 + }, + { + "epoch": 3.4165813715455475, + "grad_norm": 0.34353435192327664, + "learning_rate": 1.0981427655967183e-05, + "loss": 0.2469, + "step": 1669 + }, + { + "epoch": 3.4186284544524055, + "grad_norm": 0.32710285779281467, + "learning_rate": 1.0955899006457373e-05, + "loss": 0.2437, + "step": 1670 + }, + { + "epoch": 3.420675537359263, + "grad_norm": 0.3122882795616273, + "learning_rate": 1.0930388866691181e-05, + "loss": 0.2433, + "step": 1671 + }, + { + "epoch": 3.422722620266121, + "grad_norm": 0.29841602691342367, + "learning_rate": 1.0904897288877891e-05, + "loss": 0.2373, + "step": 1672 + }, + { + "epoch": 3.4247697031729785, + "grad_norm": 0.32318146749811455, + "learning_rate": 1.0879424325188805e-05, + "loss": 0.2477, + "step": 1673 + }, + { + "epoch": 3.4268167860798364, + "grad_norm": 0.32962691544440564, + "learning_rate": 1.085397002775716e-05, + "loss": 0.2577, + "step": 1674 + }, + { + "epoch": 3.428863868986694, + "grad_norm": 0.3028821115102944, + "learning_rate": 1.0828534448677942e-05, + "loss": 0.256, + "step": 1675 + }, + { + "epoch": 3.4309109518935514, + "grad_norm": 0.31035238566507123, + "learning_rate": 1.080311764000786e-05, + "loss": 0.223, + "step": 1676 + }, + { + "epoch": 3.4329580348004094, + "grad_norm": 0.30168173543643445, + "learning_rate": 1.0777719653765191e-05, + "loss": 0.2389, + "step": 1677 + }, + { + "epoch": 3.4350051177072674, + "grad_norm": 0.28500495929546144, + "learning_rate": 1.0752340541929711e-05, + "loss": 0.226, + "step": 1678 + }, + { + "epoch": 3.437052200614125, + "grad_norm": 0.3141146687901098, + "learning_rate": 1.0726980356442524e-05, + "loss": 0.2684, + "step": 1679 + }, + { + "epoch": 3.4390992835209824, + "grad_norm": 0.4682196146944551, + "learning_rate": 1.0701639149206061e-05, + "loss": 0.2647, + "step": 1680 + }, + { + "epoch": 3.4411463664278403, + "grad_norm": 0.3365183925944955, + "learning_rate": 1.0676316972083867e-05, + "loss": 0.2432, + "step": 1681 + }, + { + "epoch": 3.4431934493346983, + "grad_norm": 0.28313743388646095, + "learning_rate": 1.0651013876900546e-05, + "loss": 0.2099, + "step": 1682 + }, + { + "epoch": 3.445240532241556, + "grad_norm": 0.34192081983514505, + "learning_rate": 1.0625729915441659e-05, + "loss": 0.2482, + "step": 1683 + }, + { + "epoch": 3.4472876151484133, + "grad_norm": 0.31305659538484126, + "learning_rate": 1.060046513945361e-05, + "loss": 0.2137, + "step": 1684 + }, + { + "epoch": 3.4493346980552713, + "grad_norm": 0.3191034853933784, + "learning_rate": 1.0575219600643508e-05, + "loss": 0.2329, + "step": 1685 + }, + { + "epoch": 3.4513817809621288, + "grad_norm": 0.3245403446485034, + "learning_rate": 1.0549993350679138e-05, + "loss": 0.235, + "step": 1686 + }, + { + "epoch": 3.4534288638689867, + "grad_norm": 0.3088632467059862, + "learning_rate": 1.0524786441188786e-05, + "loss": 0.2155, + "step": 1687 + }, + { + "epoch": 3.4554759467758442, + "grad_norm": 0.3354339197261192, + "learning_rate": 1.0499598923761139e-05, + "loss": 0.2341, + "step": 1688 + }, + { + "epoch": 3.457523029682702, + "grad_norm": 0.3090008978999402, + "learning_rate": 1.0474430849945214e-05, + "loss": 0.2081, + "step": 1689 + }, + { + "epoch": 3.4595701125895597, + "grad_norm": 0.30640868139315103, + "learning_rate": 1.0449282271250239e-05, + "loss": 0.2258, + "step": 1690 + }, + { + "epoch": 3.4616171954964177, + "grad_norm": 0.325313731285426, + "learning_rate": 1.0424153239145527e-05, + "loss": 0.267, + "step": 1691 + }, + { + "epoch": 3.463664278403275, + "grad_norm": 0.3226758771977029, + "learning_rate": 1.0399043805060406e-05, + "loss": 0.2761, + "step": 1692 + }, + { + "epoch": 3.465711361310133, + "grad_norm": 0.2751548117016905, + "learning_rate": 1.0373954020384073e-05, + "loss": 0.2102, + "step": 1693 + }, + { + "epoch": 3.4677584442169906, + "grad_norm": 0.3197825524327039, + "learning_rate": 1.0348883936465537e-05, + "loss": 0.2558, + "step": 1694 + }, + { + "epoch": 3.4698055271238486, + "grad_norm": 0.3009633405603159, + "learning_rate": 1.0323833604613454e-05, + "loss": 0.2214, + "step": 1695 + }, + { + "epoch": 3.471852610030706, + "grad_norm": 0.3104955641697045, + "learning_rate": 1.029880307609608e-05, + "loss": 0.2686, + "step": 1696 + }, + { + "epoch": 3.473899692937564, + "grad_norm": 0.2692226200882447, + "learning_rate": 1.0273792402141134e-05, + "loss": 0.2312, + "step": 1697 + }, + { + "epoch": 3.4759467758444216, + "grad_norm": 0.31761762398562665, + "learning_rate": 1.0248801633935699e-05, + "loss": 0.2536, + "step": 1698 + }, + { + "epoch": 3.4779938587512795, + "grad_norm": 0.3196706676343915, + "learning_rate": 1.0223830822626124e-05, + "loss": 0.2556, + "step": 1699 + }, + { + "epoch": 3.480040941658137, + "grad_norm": 0.2983026635325646, + "learning_rate": 1.0198880019317913e-05, + "loss": 0.2468, + "step": 1700 + }, + { + "epoch": 3.482088024564995, + "grad_norm": 0.304844430252478, + "learning_rate": 1.017394927507561e-05, + "loss": 0.2252, + "step": 1701 + }, + { + "epoch": 3.4841351074718525, + "grad_norm": 0.32219494329823706, + "learning_rate": 1.0149038640922715e-05, + "loss": 0.2576, + "step": 1702 + }, + { + "epoch": 3.4861821903787105, + "grad_norm": 0.27799343218915756, + "learning_rate": 1.0124148167841577e-05, + "loss": 0.1703, + "step": 1703 + }, + { + "epoch": 3.488229273285568, + "grad_norm": 0.32497007511792664, + "learning_rate": 1.009927790677327e-05, + "loss": 0.2366, + "step": 1704 + }, + { + "epoch": 3.490276356192426, + "grad_norm": 0.3095778446432627, + "learning_rate": 1.0074427908617515e-05, + "loss": 0.2806, + "step": 1705 + }, + { + "epoch": 3.4923234390992834, + "grad_norm": 0.3249340961496839, + "learning_rate": 1.004959822423255e-05, + "loss": 0.2406, + "step": 1706 + }, + { + "epoch": 3.4943705220061414, + "grad_norm": 0.3092490415262631, + "learning_rate": 1.0024788904435054e-05, + "loss": 0.225, + "step": 1707 + }, + { + "epoch": 3.496417604912999, + "grad_norm": 0.3376859697600405, + "learning_rate": 1.0000000000000006e-05, + "loss": 0.2859, + "step": 1708 + }, + { + "epoch": 3.498464687819857, + "grad_norm": 0.2986922022588596, + "learning_rate": 9.975231561660617e-06, + "loss": 0.2241, + "step": 1709 + }, + { + "epoch": 3.5005117707267144, + "grad_norm": 0.3304776655435591, + "learning_rate": 9.950483640108215e-06, + "loss": 0.2865, + "step": 1710 + }, + { + "epoch": 3.5025588536335723, + "grad_norm": 0.30674425880777934, + "learning_rate": 9.92575628599213e-06, + "loss": 0.204, + "step": 1711 + }, + { + "epoch": 3.50460593654043, + "grad_norm": 0.3372001770402179, + "learning_rate": 9.901049549919601e-06, + "loss": 0.2705, + "step": 1712 + }, + { + "epoch": 3.5066530194472874, + "grad_norm": 0.28957256516814167, + "learning_rate": 9.876363482455675e-06, + "loss": 0.215, + "step": 1713 + }, + { + "epoch": 3.5087001023541453, + "grad_norm": 0.36096631037593485, + "learning_rate": 9.851698134123095e-06, + "loss": 0.2522, + "step": 1714 + }, + { + "epoch": 3.5107471852610033, + "grad_norm": 0.3128276504087451, + "learning_rate": 9.827053555402191e-06, + "loss": 0.2661, + "step": 1715 + }, + { + "epoch": 3.512794268167861, + "grad_norm": 0.3126033507485691, + "learning_rate": 9.802429796730792e-06, + "loss": 0.2329, + "step": 1716 + }, + { + "epoch": 3.5148413510747183, + "grad_norm": 0.33709412127432403, + "learning_rate": 9.777826908504126e-06, + "loss": 0.2673, + "step": 1717 + }, + { + "epoch": 3.5168884339815762, + "grad_norm": 0.31504566512999976, + "learning_rate": 9.753244941074696e-06, + "loss": 0.1942, + "step": 1718 + }, + { + "epoch": 3.518935516888434, + "grad_norm": 0.3081794499055162, + "learning_rate": 9.728683944752193e-06, + "loss": 0.23, + "step": 1719 + }, + { + "epoch": 3.5209825997952917, + "grad_norm": 0.3205819253659646, + "learning_rate": 9.704143969803392e-06, + "loss": 0.2599, + "step": 1720 + }, + { + "epoch": 3.5230296827021492, + "grad_norm": 0.3179566007920735, + "learning_rate": 9.679625066452028e-06, + "loss": 0.2468, + "step": 1721 + }, + { + "epoch": 3.525076765609007, + "grad_norm": 0.33546341224030785, + "learning_rate": 9.655127284878723e-06, + "loss": 0.2285, + "step": 1722 + }, + { + "epoch": 3.527123848515865, + "grad_norm": 0.3406356093850999, + "learning_rate": 9.630650675220892e-06, + "loss": 0.2277, + "step": 1723 + }, + { + "epoch": 3.5291709314227226, + "grad_norm": 0.297528564327505, + "learning_rate": 9.606195287572577e-06, + "loss": 0.199, + "step": 1724 + }, + { + "epoch": 3.53121801432958, + "grad_norm": 0.3221532376937218, + "learning_rate": 9.581761171984416e-06, + "loss": 0.2157, + "step": 1725 + }, + { + "epoch": 3.533265097236438, + "grad_norm": 0.35339763123663803, + "learning_rate": 9.557348378463503e-06, + "loss": 0.2377, + "step": 1726 + }, + { + "epoch": 3.535312180143296, + "grad_norm": 0.3011217732360328, + "learning_rate": 9.532956956973302e-06, + "loss": 0.2316, + "step": 1727 + }, + { + "epoch": 3.5373592630501536, + "grad_norm": 0.31883004437994317, + "learning_rate": 9.50858695743351e-06, + "loss": 0.2358, + "step": 1728 + }, + { + "epoch": 3.539406345957011, + "grad_norm": 0.4148367793712331, + "learning_rate": 9.484238429720018e-06, + "loss": 0.2412, + "step": 1729 + }, + { + "epoch": 3.541453428863869, + "grad_norm": 0.3651190389759453, + "learning_rate": 9.459911423664763e-06, + "loss": 0.2496, + "step": 1730 + }, + { + "epoch": 3.5435005117707266, + "grad_norm": 0.2782188953479953, + "learning_rate": 9.435605989055607e-06, + "loss": 0.2028, + "step": 1731 + }, + { + "epoch": 3.5455475946775845, + "grad_norm": 0.31391987453704123, + "learning_rate": 9.411322175636298e-06, + "loss": 0.2561, + "step": 1732 + }, + { + "epoch": 3.547594677584442, + "grad_norm": 0.3158415427773136, + "learning_rate": 9.387060033106321e-06, + "loss": 0.2956, + "step": 1733 + }, + { + "epoch": 3.5496417604913, + "grad_norm": 0.3175891657245625, + "learning_rate": 9.362819611120793e-06, + "loss": 0.2566, + "step": 1734 + }, + { + "epoch": 3.5516888433981575, + "grad_norm": 0.3312219072193364, + "learning_rate": 9.338600959290414e-06, + "loss": 0.2317, + "step": 1735 + }, + { + "epoch": 3.5537359263050154, + "grad_norm": 0.3341176350083145, + "learning_rate": 9.314404127181307e-06, + "loss": 0.293, + "step": 1736 + }, + { + "epoch": 3.555783009211873, + "grad_norm": 0.29940835888683603, + "learning_rate": 9.290229164314928e-06, + "loss": 0.2221, + "step": 1737 + }, + { + "epoch": 3.557830092118731, + "grad_norm": 0.3046910176714613, + "learning_rate": 9.266076120167992e-06, + "loss": 0.2472, + "step": 1738 + }, + { + "epoch": 3.5598771750255884, + "grad_norm": 0.2841913350596404, + "learning_rate": 9.241945044172353e-06, + "loss": 0.2277, + "step": 1739 + }, + { + "epoch": 3.5619242579324464, + "grad_norm": 0.323279705340557, + "learning_rate": 9.217835985714898e-06, + "loss": 0.2709, + "step": 1740 + }, + { + "epoch": 3.563971340839304, + "grad_norm": 0.29421984064136286, + "learning_rate": 9.193748994137462e-06, + "loss": 0.2064, + "step": 1741 + }, + { + "epoch": 3.566018423746162, + "grad_norm": 0.3431113829919363, + "learning_rate": 9.169684118736708e-06, + "loss": 0.2581, + "step": 1742 + }, + { + "epoch": 3.5680655066530194, + "grad_norm": 0.31454166876772843, + "learning_rate": 9.145641408764048e-06, + "loss": 0.2135, + "step": 1743 + }, + { + "epoch": 3.5701125895598773, + "grad_norm": 0.3294672393878416, + "learning_rate": 9.121620913425508e-06, + "loss": 0.2607, + "step": 1744 + }, + { + "epoch": 3.572159672466735, + "grad_norm": 0.29027976941333605, + "learning_rate": 9.097622681881673e-06, + "loss": 0.1969, + "step": 1745 + }, + { + "epoch": 3.574206755373593, + "grad_norm": 0.3177811895638415, + "learning_rate": 9.073646763247558e-06, + "loss": 0.2103, + "step": 1746 + }, + { + "epoch": 3.5762538382804503, + "grad_norm": 0.3196047544970432, + "learning_rate": 9.04969320659249e-06, + "loss": 0.239, + "step": 1747 + }, + { + "epoch": 3.5783009211873082, + "grad_norm": 0.2925767290546702, + "learning_rate": 9.025762060940062e-06, + "loss": 0.198, + "step": 1748 + }, + { + "epoch": 3.5803480040941658, + "grad_norm": 0.33053764466506, + "learning_rate": 9.001853375267989e-06, + "loss": 0.2366, + "step": 1749 + }, + { + "epoch": 3.5823950870010233, + "grad_norm": 0.3174304755008507, + "learning_rate": 8.977967198508001e-06, + "loss": 0.2256, + "step": 1750 + }, + { + "epoch": 3.5844421699078812, + "grad_norm": 0.343170859287156, + "learning_rate": 8.954103579545785e-06, + "loss": 0.2341, + "step": 1751 + }, + { + "epoch": 3.586489252814739, + "grad_norm": 0.3215320683527329, + "learning_rate": 8.93026256722085e-06, + "loss": 0.2045, + "step": 1752 + }, + { + "epoch": 3.5885363357215967, + "grad_norm": 0.345488281284008, + "learning_rate": 8.906444210326441e-06, + "loss": 0.2708, + "step": 1753 + }, + { + "epoch": 3.590583418628454, + "grad_norm": 0.29374650212717546, + "learning_rate": 8.882648557609434e-06, + "loss": 0.2144, + "step": 1754 + }, + { + "epoch": 3.592630501535312, + "grad_norm": 0.30791903954132194, + "learning_rate": 8.858875657770241e-06, + "loss": 0.2196, + "step": 1755 + }, + { + "epoch": 3.59467758444217, + "grad_norm": 0.37604604250081547, + "learning_rate": 8.83512555946271e-06, + "loss": 0.2842, + "step": 1756 + }, + { + "epoch": 3.5967246673490276, + "grad_norm": 0.2815977468791065, + "learning_rate": 8.811398311294008e-06, + "loss": 0.2128, + "step": 1757 + }, + { + "epoch": 3.598771750255885, + "grad_norm": 0.32463396760239493, + "learning_rate": 8.787693961824555e-06, + "loss": 0.2635, + "step": 1758 + }, + { + "epoch": 3.600818833162743, + "grad_norm": 0.3178563263559145, + "learning_rate": 8.764012559567899e-06, + "loss": 0.2749, + "step": 1759 + }, + { + "epoch": 3.602865916069601, + "grad_norm": 0.32193828000242514, + "learning_rate": 8.740354152990624e-06, + "loss": 0.232, + "step": 1760 + }, + { + "epoch": 3.6049129989764586, + "grad_norm": 0.35115161370306397, + "learning_rate": 8.716718790512251e-06, + "loss": 0.2421, + "step": 1761 + }, + { + "epoch": 3.606960081883316, + "grad_norm": 0.3150133989578825, + "learning_rate": 8.693106520505147e-06, + "loss": 0.2078, + "step": 1762 + }, + { + "epoch": 3.609007164790174, + "grad_norm": 0.34022251620199256, + "learning_rate": 8.669517391294397e-06, + "loss": 0.248, + "step": 1763 + }, + { + "epoch": 3.611054247697032, + "grad_norm": 0.3039508397036727, + "learning_rate": 8.645951451157741e-06, + "loss": 0.187, + "step": 1764 + }, + { + "epoch": 3.6131013306038895, + "grad_norm": 0.3287212950737981, + "learning_rate": 8.622408748325461e-06, + "loss": 0.2774, + "step": 1765 + }, + { + "epoch": 3.615148413510747, + "grad_norm": 0.3159497126746659, + "learning_rate": 8.598889330980277e-06, + "loss": 0.2251, + "step": 1766 + }, + { + "epoch": 3.617195496417605, + "grad_norm": 0.2931231286120822, + "learning_rate": 8.575393247257256e-06, + "loss": 0.2267, + "step": 1767 + }, + { + "epoch": 3.619242579324463, + "grad_norm": 0.3287345655736662, + "learning_rate": 8.551920545243704e-06, + "loss": 0.24, + "step": 1768 + }, + { + "epoch": 3.6212896622313204, + "grad_norm": 0.30400959536179484, + "learning_rate": 8.528471272979083e-06, + "loss": 0.2133, + "step": 1769 + }, + { + "epoch": 3.623336745138178, + "grad_norm": 0.31203010646854484, + "learning_rate": 8.50504547845489e-06, + "loss": 0.2404, + "step": 1770 + }, + { + "epoch": 3.625383828045036, + "grad_norm": 0.30015537192427894, + "learning_rate": 8.481643209614576e-06, + "loss": 0.2059, + "step": 1771 + }, + { + "epoch": 3.6274309109518934, + "grad_norm": 0.295949459736467, + "learning_rate": 8.45826451435347e-06, + "loss": 0.2209, + "step": 1772 + }, + { + "epoch": 3.6294779938587514, + "grad_norm": 0.32651774554006335, + "learning_rate": 8.434909440518613e-06, + "loss": 0.258, + "step": 1773 + }, + { + "epoch": 3.631525076765609, + "grad_norm": 0.28180464315900705, + "learning_rate": 8.411578035908728e-06, + "loss": 0.205, + "step": 1774 + }, + { + "epoch": 3.633572159672467, + "grad_norm": 0.3413634143023636, + "learning_rate": 8.388270348274092e-06, + "loss": 0.2769, + "step": 1775 + }, + { + "epoch": 3.6356192425793243, + "grad_norm": 0.33236322758981973, + "learning_rate": 8.364986425316448e-06, + "loss": 0.2234, + "step": 1776 + }, + { + "epoch": 3.6376663254861823, + "grad_norm": 0.3020595253520059, + "learning_rate": 8.341726314688875e-06, + "loss": 0.2509, + "step": 1777 + }, + { + "epoch": 3.63971340839304, + "grad_norm": 0.31852165056270737, + "learning_rate": 8.318490063995761e-06, + "loss": 0.2537, + "step": 1778 + }, + { + "epoch": 3.6417604912998978, + "grad_norm": 0.2944306330035562, + "learning_rate": 8.295277720792634e-06, + "loss": 0.2222, + "step": 1779 + }, + { + "epoch": 3.6438075742067553, + "grad_norm": 0.3191896831407699, + "learning_rate": 8.272089332586089e-06, + "loss": 0.2437, + "step": 1780 + }, + { + "epoch": 3.6458546571136132, + "grad_norm": 0.3341784697234189, + "learning_rate": 8.248924946833705e-06, + "loss": 0.25, + "step": 1781 + }, + { + "epoch": 3.6479017400204707, + "grad_norm": 0.28526708897000397, + "learning_rate": 8.225784610943948e-06, + "loss": 0.2586, + "step": 1782 + }, + { + "epoch": 3.6499488229273287, + "grad_norm": 0.30819644727048756, + "learning_rate": 8.20266837227603e-06, + "loss": 0.2482, + "step": 1783 + }, + { + "epoch": 3.651995905834186, + "grad_norm": 0.3391130155927512, + "learning_rate": 8.179576278139872e-06, + "loss": 0.2703, + "step": 1784 + }, + { + "epoch": 3.654042988741044, + "grad_norm": 0.3457772155494689, + "learning_rate": 8.156508375795995e-06, + "loss": 0.2138, + "step": 1785 + }, + { + "epoch": 3.6560900716479017, + "grad_norm": 0.3545988817947354, + "learning_rate": 8.133464712455364e-06, + "loss": 0.2381, + "step": 1786 + }, + { + "epoch": 3.6581371545547596, + "grad_norm": 0.3051355101331329, + "learning_rate": 8.11044533527937e-06, + "loss": 0.2212, + "step": 1787 + }, + { + "epoch": 3.660184237461617, + "grad_norm": 0.31093454380049773, + "learning_rate": 8.087450291379693e-06, + "loss": 0.2782, + "step": 1788 + }, + { + "epoch": 3.662231320368475, + "grad_norm": 0.3076008520431847, + "learning_rate": 8.064479627818213e-06, + "loss": 0.2563, + "step": 1789 + }, + { + "epoch": 3.6642784032753326, + "grad_norm": 0.2914759795466477, + "learning_rate": 8.041533391606892e-06, + "loss": 0.237, + "step": 1790 + }, + { + "epoch": 3.66632548618219, + "grad_norm": 0.32811153076900396, + "learning_rate": 8.018611629707735e-06, + "loss": 0.2192, + "step": 1791 + }, + { + "epoch": 3.668372569089048, + "grad_norm": 0.3349979236041854, + "learning_rate": 7.995714389032638e-06, + "loss": 0.2544, + "step": 1792 + }, + { + "epoch": 3.670419651995906, + "grad_norm": 0.30481283300379364, + "learning_rate": 7.972841716443304e-06, + "loss": 0.2021, + "step": 1793 + }, + { + "epoch": 3.6724667349027635, + "grad_norm": 0.31038423151434946, + "learning_rate": 7.949993658751168e-06, + "loss": 0.2714, + "step": 1794 + }, + { + "epoch": 3.674513817809621, + "grad_norm": 0.3046182836563986, + "learning_rate": 7.927170262717284e-06, + "loss": 0.2486, + "step": 1795 + }, + { + "epoch": 3.676560900716479, + "grad_norm": 0.321493522566335, + "learning_rate": 7.904371575052224e-06, + "loss": 0.257, + "step": 1796 + }, + { + "epoch": 3.678607983623337, + "grad_norm": 0.32116628196410996, + "learning_rate": 7.881597642416012e-06, + "loss": 0.2351, + "step": 1797 + }, + { + "epoch": 3.6806550665301945, + "grad_norm": 0.3225460384671046, + "learning_rate": 7.858848511417998e-06, + "loss": 0.2787, + "step": 1798 + }, + { + "epoch": 3.682702149437052, + "grad_norm": 0.3065477845223782, + "learning_rate": 7.836124228616762e-06, + "loss": 0.2059, + "step": 1799 + }, + { + "epoch": 3.68474923234391, + "grad_norm": 0.2820736922840727, + "learning_rate": 7.81342484052004e-06, + "loss": 0.2065, + "step": 1800 + }, + { + "epoch": 3.686796315250768, + "grad_norm": 0.3199543711113544, + "learning_rate": 7.790750393584616e-06, + "loss": 0.2482, + "step": 1801 + }, + { + "epoch": 3.6888433981576254, + "grad_norm": 0.3041948190106927, + "learning_rate": 7.768100934216234e-06, + "loss": 0.2278, + "step": 1802 + }, + { + "epoch": 3.690890481064483, + "grad_norm": 0.3053390429608892, + "learning_rate": 7.745476508769494e-06, + "loss": 0.2356, + "step": 1803 + }, + { + "epoch": 3.692937563971341, + "grad_norm": 0.3110750944178115, + "learning_rate": 7.72287716354776e-06, + "loss": 0.2402, + "step": 1804 + }, + { + "epoch": 3.694984646878199, + "grad_norm": 0.28367962348790254, + "learning_rate": 7.700302944803076e-06, + "loss": 0.1827, + "step": 1805 + }, + { + "epoch": 3.6970317297850563, + "grad_norm": 0.33323381054790285, + "learning_rate": 7.67775389873604e-06, + "loss": 0.2293, + "step": 1806 + }, + { + "epoch": 3.699078812691914, + "grad_norm": 0.3521970989279458, + "learning_rate": 7.65523007149575e-06, + "loss": 0.2657, + "step": 1807 + }, + { + "epoch": 3.701125895598772, + "grad_norm": 0.30699833196534315, + "learning_rate": 7.63273150917969e-06, + "loss": 0.2421, + "step": 1808 + }, + { + "epoch": 3.7031729785056293, + "grad_norm": 0.31416536601357237, + "learning_rate": 7.6102582578336315e-06, + "loss": 0.1997, + "step": 1809 + }, + { + "epoch": 3.7052200614124873, + "grad_norm": 0.3803922818493654, + "learning_rate": 7.587810363451544e-06, + "loss": 0.2428, + "step": 1810 + }, + { + "epoch": 3.707267144319345, + "grad_norm": 0.32978409370696515, + "learning_rate": 7.565387871975511e-06, + "loss": 0.2037, + "step": 1811 + }, + { + "epoch": 3.7093142272262027, + "grad_norm": 0.33553313939376517, + "learning_rate": 7.5429908292956045e-06, + "loss": 0.2675, + "step": 1812 + }, + { + "epoch": 3.7113613101330603, + "grad_norm": 0.3164776677964802, + "learning_rate": 7.5206192812498345e-06, + "loss": 0.262, + "step": 1813 + }, + { + "epoch": 3.713408393039918, + "grad_norm": 0.29867646878594656, + "learning_rate": 7.498273273624022e-06, + "loss": 0.2468, + "step": 1814 + }, + { + "epoch": 3.7154554759467757, + "grad_norm": 0.31216831365580633, + "learning_rate": 7.475952852151722e-06, + "loss": 0.2225, + "step": 1815 + }, + { + "epoch": 3.7175025588536337, + "grad_norm": 0.2854940367151503, + "learning_rate": 7.4536580625141244e-06, + "loss": 0.2302, + "step": 1816 + }, + { + "epoch": 3.719549641760491, + "grad_norm": 0.3133497229188554, + "learning_rate": 7.431388950339955e-06, + "loss": 0.2188, + "step": 1817 + }, + { + "epoch": 3.721596724667349, + "grad_norm": 0.3115695729233575, + "learning_rate": 7.409145561205402e-06, + "loss": 0.2251, + "step": 1818 + }, + { + "epoch": 3.7236438075742067, + "grad_norm": 0.3211680619051747, + "learning_rate": 7.386927940633981e-06, + "loss": 0.244, + "step": 1819 + }, + { + "epoch": 3.7256908904810646, + "grad_norm": 0.3048950617764936, + "learning_rate": 7.364736134096497e-06, + "loss": 0.264, + "step": 1820 + }, + { + "epoch": 3.727737973387922, + "grad_norm": 0.2939363289247235, + "learning_rate": 7.342570187010913e-06, + "loss": 0.235, + "step": 1821 + }, + { + "epoch": 3.72978505629478, + "grad_norm": 0.3233466134307953, + "learning_rate": 7.32043014474227e-06, + "loss": 0.306, + "step": 1822 + }, + { + "epoch": 3.7318321392016376, + "grad_norm": 0.30575032489620757, + "learning_rate": 7.2983160526025854e-06, + "loss": 0.2719, + "step": 1823 + }, + { + "epoch": 3.7338792221084955, + "grad_norm": 0.297249466993302, + "learning_rate": 7.276227955850774e-06, + "loss": 0.235, + "step": 1824 + }, + { + "epoch": 3.735926305015353, + "grad_norm": 0.29688130457735584, + "learning_rate": 7.254165899692554e-06, + "loss": 0.2313, + "step": 1825 + }, + { + "epoch": 3.737973387922211, + "grad_norm": 0.315867334334272, + "learning_rate": 7.2321299292803275e-06, + "loss": 0.2554, + "step": 1826 + }, + { + "epoch": 3.7400204708290685, + "grad_norm": 0.35865826681363316, + "learning_rate": 7.210120089713117e-06, + "loss": 0.2657, + "step": 1827 + }, + { + "epoch": 3.742067553735926, + "grad_norm": 0.29556390069528465, + "learning_rate": 7.188136426036498e-06, + "loss": 0.2309, + "step": 1828 + }, + { + "epoch": 3.744114636642784, + "grad_norm": 0.3062750470379352, + "learning_rate": 7.166178983242425e-06, + "loss": 0.25, + "step": 1829 + }, + { + "epoch": 3.746161719549642, + "grad_norm": 0.3388760658555746, + "learning_rate": 7.1442478062692135e-06, + "loss": 0.2837, + "step": 1830 + }, + { + "epoch": 3.7482088024564995, + "grad_norm": 0.3198475086242932, + "learning_rate": 7.12234294000143e-06, + "loss": 0.2334, + "step": 1831 + }, + { + "epoch": 3.750255885363357, + "grad_norm": 0.3089175153993069, + "learning_rate": 7.100464429269769e-06, + "loss": 0.2647, + "step": 1832 + }, + { + "epoch": 3.752302968270215, + "grad_norm": 0.28196699295142263, + "learning_rate": 7.078612318850999e-06, + "loss": 0.2168, + "step": 1833 + }, + { + "epoch": 3.754350051177073, + "grad_norm": 0.297420997662474, + "learning_rate": 7.056786653467882e-06, + "loss": 0.2028, + "step": 1834 + }, + { + "epoch": 3.7563971340839304, + "grad_norm": 0.28045674846802615, + "learning_rate": 7.034987477789008e-06, + "loss": 0.1939, + "step": 1835 + }, + { + "epoch": 3.758444216990788, + "grad_norm": 0.311062703079177, + "learning_rate": 7.01321483642879e-06, + "loss": 0.2611, + "step": 1836 + }, + { + "epoch": 3.760491299897646, + "grad_norm": 0.297760397539936, + "learning_rate": 6.991468773947321e-06, + "loss": 0.2575, + "step": 1837 + }, + { + "epoch": 3.762538382804504, + "grad_norm": 0.2972100075369938, + "learning_rate": 6.969749334850308e-06, + "loss": 0.2088, + "step": 1838 + }, + { + "epoch": 3.7645854657113613, + "grad_norm": 0.3155194168733677, + "learning_rate": 6.948056563588943e-06, + "loss": 0.2469, + "step": 1839 + }, + { + "epoch": 3.766632548618219, + "grad_norm": 0.33253411719664255, + "learning_rate": 6.926390504559879e-06, + "loss": 0.2066, + "step": 1840 + }, + { + "epoch": 3.768679631525077, + "grad_norm": 0.31443745809454954, + "learning_rate": 6.90475120210508e-06, + "loss": 0.2462, + "step": 1841 + }, + { + "epoch": 3.7707267144319347, + "grad_norm": 0.2916854017686706, + "learning_rate": 6.883138700511735e-06, + "loss": 0.206, + "step": 1842 + }, + { + "epoch": 3.7727737973387923, + "grad_norm": 0.29896805378427554, + "learning_rate": 6.861553044012206e-06, + "loss": 0.2458, + "step": 1843 + }, + { + "epoch": 3.7748208802456498, + "grad_norm": 0.3070231834047609, + "learning_rate": 6.8399942767839075e-06, + "loss": 0.2375, + "step": 1844 + }, + { + "epoch": 3.7768679631525077, + "grad_norm": 0.35269614289266477, + "learning_rate": 6.818462442949203e-06, + "loss": 0.2354, + "step": 1845 + }, + { + "epoch": 3.7789150460593657, + "grad_norm": 0.2885923676811397, + "learning_rate": 6.796957586575364e-06, + "loss": 0.2669, + "step": 1846 + }, + { + "epoch": 3.780962128966223, + "grad_norm": 0.31946737683114335, + "learning_rate": 6.775479751674439e-06, + "loss": 0.2292, + "step": 1847 + }, + { + "epoch": 3.7830092118730807, + "grad_norm": 0.31027789747129964, + "learning_rate": 6.754028982203154e-06, + "loss": 0.2204, + "step": 1848 + }, + { + "epoch": 3.7850562947799387, + "grad_norm": 0.28163894937364203, + "learning_rate": 6.732605322062869e-06, + "loss": 0.2131, + "step": 1849 + }, + { + "epoch": 3.787103377686796, + "grad_norm": 0.32382848241213846, + "learning_rate": 6.711208815099451e-06, + "loss": 0.2349, + "step": 1850 + }, + { + "epoch": 3.789150460593654, + "grad_norm": 0.3138159392771698, + "learning_rate": 6.689839505103195e-06, + "loss": 0.2214, + "step": 1851 + }, + { + "epoch": 3.7911975435005116, + "grad_norm": 0.3336295131707369, + "learning_rate": 6.668497435808736e-06, + "loss": 0.2318, + "step": 1852 + }, + { + "epoch": 3.7932446264073696, + "grad_norm": 0.2805986894444167, + "learning_rate": 6.647182650894956e-06, + "loss": 0.1898, + "step": 1853 + }, + { + "epoch": 3.795291709314227, + "grad_norm": 0.311652087268774, + "learning_rate": 6.6258951939849055e-06, + "loss": 0.2244, + "step": 1854 + }, + { + "epoch": 3.797338792221085, + "grad_norm": 0.33194907539363816, + "learning_rate": 6.604635108645683e-06, + "loss": 0.272, + "step": 1855 + }, + { + "epoch": 3.7993858751279426, + "grad_norm": 0.2974799213005746, + "learning_rate": 6.583402438388391e-06, + "loss": 0.2967, + "step": 1856 + }, + { + "epoch": 3.8014329580348005, + "grad_norm": 0.31133998039092214, + "learning_rate": 6.562197226668015e-06, + "loss": 0.2587, + "step": 1857 + }, + { + "epoch": 3.803480040941658, + "grad_norm": 0.30821960779378943, + "learning_rate": 6.5410195168833425e-06, + "loss": 0.242, + "step": 1858 + }, + { + "epoch": 3.805527123848516, + "grad_norm": 0.30511264324428783, + "learning_rate": 6.519869352376878e-06, + "loss": 0.2318, + "step": 1859 + }, + { + "epoch": 3.8075742067553735, + "grad_norm": 0.2885137726399915, + "learning_rate": 6.498746776434759e-06, + "loss": 0.2412, + "step": 1860 + }, + { + "epoch": 3.8096212896622315, + "grad_norm": 0.30572665106073443, + "learning_rate": 6.477651832286633e-06, + "loss": 0.2714, + "step": 1861 + }, + { + "epoch": 3.811668372569089, + "grad_norm": 0.2999528878346436, + "learning_rate": 6.456584563105628e-06, + "loss": 0.225, + "step": 1862 + }, + { + "epoch": 3.813715455475947, + "grad_norm": 0.306996947567515, + "learning_rate": 6.435545012008213e-06, + "loss": 0.2228, + "step": 1863 + }, + { + "epoch": 3.8157625383828044, + "grad_norm": 0.30156526722849053, + "learning_rate": 6.414533222054138e-06, + "loss": 0.2271, + "step": 1864 + }, + { + "epoch": 3.8178096212896624, + "grad_norm": 0.3027861977559771, + "learning_rate": 6.393549236246333e-06, + "loss": 0.2358, + "step": 1865 + }, + { + "epoch": 3.81985670419652, + "grad_norm": 0.3100676988224775, + "learning_rate": 6.372593097530822e-06, + "loss": 0.2224, + "step": 1866 + }, + { + "epoch": 3.821903787103378, + "grad_norm": 0.30881142890916535, + "learning_rate": 6.3516648487966456e-06, + "loss": 0.212, + "step": 1867 + }, + { + "epoch": 3.8239508700102354, + "grad_norm": 0.3074430788928016, + "learning_rate": 6.330764532875748e-06, + "loss": 0.2559, + "step": 1868 + }, + { + "epoch": 3.825997952917093, + "grad_norm": 0.29993534561417257, + "learning_rate": 6.309892192542919e-06, + "loss": 0.2414, + "step": 1869 + }, + { + "epoch": 3.828045035823951, + "grad_norm": 0.2948226082874635, + "learning_rate": 6.289047870515692e-06, + "loss": 0.2634, + "step": 1870 + }, + { + "epoch": 3.830092118730809, + "grad_norm": 0.35876524638729923, + "learning_rate": 6.268231609454254e-06, + "loss": 0.3221, + "step": 1871 + }, + { + "epoch": 3.8321392016376663, + "grad_norm": 0.29964527729201973, + "learning_rate": 6.247443451961366e-06, + "loss": 0.2046, + "step": 1872 + }, + { + "epoch": 3.834186284544524, + "grad_norm": 0.32365319216721733, + "learning_rate": 6.226683440582268e-06, + "loss": 0.2233, + "step": 1873 + }, + { + "epoch": 3.8362333674513818, + "grad_norm": 0.3064343796948702, + "learning_rate": 6.2059516178046064e-06, + "loss": 0.2548, + "step": 1874 + }, + { + "epoch": 3.8382804503582397, + "grad_norm": 0.31074022748353647, + "learning_rate": 6.185248026058312e-06, + "loss": 0.2682, + "step": 1875 + }, + { + "epoch": 3.8403275332650972, + "grad_norm": 0.3001350448200465, + "learning_rate": 6.164572707715564e-06, + "loss": 0.2287, + "step": 1876 + }, + { + "epoch": 3.8423746161719547, + "grad_norm": 0.3230753170136733, + "learning_rate": 6.143925705090666e-06, + "loss": 0.3244, + "step": 1877 + }, + { + "epoch": 3.8444216990788127, + "grad_norm": 0.2998343099705012, + "learning_rate": 6.123307060439967e-06, + "loss": 0.2163, + "step": 1878 + }, + { + "epoch": 3.8464687819856707, + "grad_norm": 0.3171615427794975, + "learning_rate": 6.102716815961787e-06, + "loss": 0.2126, + "step": 1879 + }, + { + "epoch": 3.848515864892528, + "grad_norm": 0.30038516037265045, + "learning_rate": 6.082155013796323e-06, + "loss": 0.2197, + "step": 1880 + }, + { + "epoch": 3.8505629477993857, + "grad_norm": 0.2706760721698934, + "learning_rate": 6.061621696025539e-06, + "loss": 0.203, + "step": 1881 + }, + { + "epoch": 3.8526100307062436, + "grad_norm": 0.3019695460717632, + "learning_rate": 6.041116904673125e-06, + "loss": 0.243, + "step": 1882 + }, + { + "epoch": 3.8546571136131016, + "grad_norm": 0.3238748626818905, + "learning_rate": 6.020640681704402e-06, + "loss": 0.2387, + "step": 1883 + }, + { + "epoch": 3.856704196519959, + "grad_norm": 0.2845226343722909, + "learning_rate": 6.000193069026181e-06, + "loss": 0.1939, + "step": 1884 + }, + { + "epoch": 3.8587512794268166, + "grad_norm": 0.30062466940994315, + "learning_rate": 5.979774108486751e-06, + "loss": 0.2417, + "step": 1885 + }, + { + "epoch": 3.8607983623336746, + "grad_norm": 0.29657229767791343, + "learning_rate": 5.95938384187575e-06, + "loss": 0.2513, + "step": 1886 + }, + { + "epoch": 3.862845445240532, + "grad_norm": 0.2965578800271511, + "learning_rate": 5.939022310924099e-06, + "loss": 0.238, + "step": 1887 + }, + { + "epoch": 3.86489252814739, + "grad_norm": 0.28944984642689814, + "learning_rate": 5.918689557303885e-06, + "loss": 0.233, + "step": 1888 + }, + { + "epoch": 3.8669396110542475, + "grad_norm": 0.3405343709910588, + "learning_rate": 5.898385622628336e-06, + "loss": 0.2568, + "step": 1889 + }, + { + "epoch": 3.8689866939611055, + "grad_norm": 0.31192910196034185, + "learning_rate": 5.878110548451675e-06, + "loss": 0.255, + "step": 1890 + }, + { + "epoch": 3.871033776867963, + "grad_norm": 0.3090737941594832, + "learning_rate": 5.857864376269051e-06, + "loss": 0.2356, + "step": 1891 + }, + { + "epoch": 3.873080859774821, + "grad_norm": 0.27438554656264486, + "learning_rate": 5.837647147516483e-06, + "loss": 0.19, + "step": 1892 + }, + { + "epoch": 3.8751279426816785, + "grad_norm": 0.3021169809249445, + "learning_rate": 5.817458903570747e-06, + "loss": 0.2248, + "step": 1893 + }, + { + "epoch": 3.8771750255885364, + "grad_norm": 0.3213447653550202, + "learning_rate": 5.7972996857492896e-06, + "loss": 0.2269, + "step": 1894 + }, + { + "epoch": 3.879222108495394, + "grad_norm": 0.33058912837962084, + "learning_rate": 5.777169535310152e-06, + "loss": 0.2359, + "step": 1895 + }, + { + "epoch": 3.881269191402252, + "grad_norm": 0.28572806654091426, + "learning_rate": 5.7570684934519135e-06, + "loss": 0.2147, + "step": 1896 + }, + { + "epoch": 3.8833162743091094, + "grad_norm": 0.32405838839318063, + "learning_rate": 5.736996601313545e-06, + "loss": 0.25, + "step": 1897 + }, + { + "epoch": 3.8853633572159674, + "grad_norm": 0.31407112083239963, + "learning_rate": 5.716953899974371e-06, + "loss": 0.2628, + "step": 1898 + }, + { + "epoch": 3.887410440122825, + "grad_norm": 0.24848541316900655, + "learning_rate": 5.696940430453981e-06, + "loss": 0.1664, + "step": 1899 + }, + { + "epoch": 3.889457523029683, + "grad_norm": 0.3315987651736212, + "learning_rate": 5.676956233712139e-06, + "loss": 0.2637, + "step": 1900 + }, + { + "epoch": 3.8915046059365404, + "grad_norm": 0.2855594139469349, + "learning_rate": 5.657001350648674e-06, + "loss": 0.207, + "step": 1901 + }, + { + "epoch": 3.8935516888433983, + "grad_norm": 0.3282465345015928, + "learning_rate": 5.6370758221034595e-06, + "loss": 0.2535, + "step": 1902 + }, + { + "epoch": 3.895598771750256, + "grad_norm": 0.30475560401896684, + "learning_rate": 5.617179688856271e-06, + "loss": 0.2432, + "step": 1903 + }, + { + "epoch": 3.8976458546571138, + "grad_norm": 0.30871123340361434, + "learning_rate": 5.597312991626713e-06, + "loss": 0.2134, + "step": 1904 + }, + { + "epoch": 3.8996929375639713, + "grad_norm": 0.33995724036296693, + "learning_rate": 5.577475771074168e-06, + "loss": 0.2485, + "step": 1905 + }, + { + "epoch": 3.901740020470829, + "grad_norm": 0.2811551664838514, + "learning_rate": 5.557668067797677e-06, + "loss": 0.2453, + "step": 1906 + }, + { + "epoch": 3.9037871033776868, + "grad_norm": 0.3085482770411968, + "learning_rate": 5.537889922335877e-06, + "loss": 0.2841, + "step": 1907 + }, + { + "epoch": 3.9058341862845447, + "grad_norm": 0.291552651807712, + "learning_rate": 5.5181413751669125e-06, + "loss": 0.2519, + "step": 1908 + }, + { + "epoch": 3.907881269191402, + "grad_norm": 0.301795002796971, + "learning_rate": 5.498422466708349e-06, + "loss": 0.2157, + "step": 1909 + }, + { + "epoch": 3.9099283520982597, + "grad_norm": 0.33010524936332025, + "learning_rate": 5.478733237317084e-06, + "loss": 0.2272, + "step": 1910 + }, + { + "epoch": 3.9119754350051177, + "grad_norm": 0.30279565810368547, + "learning_rate": 5.459073727289291e-06, + "loss": 0.2462, + "step": 1911 + }, + { + "epoch": 3.9140225179119756, + "grad_norm": 0.3180781810526128, + "learning_rate": 5.439443976860306e-06, + "loss": 0.2358, + "step": 1912 + }, + { + "epoch": 3.916069600818833, + "grad_norm": 0.3040038712809124, + "learning_rate": 5.419844026204568e-06, + "loss": 0.2266, + "step": 1913 + }, + { + "epoch": 3.9181166837256907, + "grad_norm": 0.31115682004986517, + "learning_rate": 5.400273915435526e-06, + "loss": 0.2706, + "step": 1914 + }, + { + "epoch": 3.9201637666325486, + "grad_norm": 0.2723072282278328, + "learning_rate": 5.38073368460555e-06, + "loss": 0.2172, + "step": 1915 + }, + { + "epoch": 3.9222108495394066, + "grad_norm": 0.3236989253933042, + "learning_rate": 5.361223373705873e-06, + "loss": 0.2671, + "step": 1916 + }, + { + "epoch": 3.924257932446264, + "grad_norm": 0.2949286813519759, + "learning_rate": 5.341743022666468e-06, + "loss": 0.2575, + "step": 1917 + }, + { + "epoch": 3.9263050153531216, + "grad_norm": 0.2929617720522718, + "learning_rate": 5.32229267135602e-06, + "loss": 0.2477, + "step": 1918 + }, + { + "epoch": 3.9283520982599796, + "grad_norm": 0.29944445931066044, + "learning_rate": 5.302872359581799e-06, + "loss": 0.3016, + "step": 1919 + }, + { + "epoch": 3.9303991811668375, + "grad_norm": 0.2792530693391496, + "learning_rate": 5.283482127089603e-06, + "loss": 0.2116, + "step": 1920 + }, + { + "epoch": 3.932446264073695, + "grad_norm": 0.3148045265972349, + "learning_rate": 5.2641220135636685e-06, + "loss": 0.251, + "step": 1921 + }, + { + "epoch": 3.9344933469805525, + "grad_norm": 0.29519836172160036, + "learning_rate": 5.244792058626587e-06, + "loss": 0.2379, + "step": 1922 + }, + { + "epoch": 3.9365404298874105, + "grad_norm": 0.27640580348684785, + "learning_rate": 5.2254923018392344e-06, + "loss": 0.2094, + "step": 1923 + }, + { + "epoch": 3.9385875127942684, + "grad_norm": 0.3216119033954627, + "learning_rate": 5.206222782700667e-06, + "loss": 0.2226, + "step": 1924 + }, + { + "epoch": 3.940634595701126, + "grad_norm": 0.28724430058767236, + "learning_rate": 5.186983540648074e-06, + "loss": 0.229, + "step": 1925 + }, + { + "epoch": 3.9426816786079835, + "grad_norm": 0.30838073757141554, + "learning_rate": 5.167774615056669e-06, + "loss": 0.2471, + "step": 1926 + }, + { + "epoch": 3.9447287615148414, + "grad_norm": 0.3050237811038075, + "learning_rate": 5.1485960452396266e-06, + "loss": 0.2516, + "step": 1927 + }, + { + "epoch": 3.946775844421699, + "grad_norm": 0.3269582811451654, + "learning_rate": 5.1294478704479896e-06, + "loss": 0.2757, + "step": 1928 + }, + { + "epoch": 3.948822927328557, + "grad_norm": 0.32559352766006666, + "learning_rate": 5.1103301298705995e-06, + "loss": 0.2234, + "step": 1929 + }, + { + "epoch": 3.9508700102354144, + "grad_norm": 0.3018696607635989, + "learning_rate": 5.091242862634e-06, + "loss": 0.2112, + "step": 1930 + }, + { + "epoch": 3.9529170931422724, + "grad_norm": 0.28807273157869273, + "learning_rate": 5.072186107802377e-06, + "loss": 0.2698, + "step": 1931 + }, + { + "epoch": 3.95496417604913, + "grad_norm": 0.324456956245341, + "learning_rate": 5.05315990437747e-06, + "loss": 0.2451, + "step": 1932 + }, + { + "epoch": 3.957011258955988, + "grad_norm": 0.314429989537044, + "learning_rate": 5.0341642912984844e-06, + "loss": 0.2311, + "step": 1933 + }, + { + "epoch": 3.9590583418628453, + "grad_norm": 0.2926963524885184, + "learning_rate": 5.015199307442027e-06, + "loss": 0.2418, + "step": 1934 + }, + { + "epoch": 3.9611054247697033, + "grad_norm": 0.30442791024762783, + "learning_rate": 4.996264991622015e-06, + "loss": 0.2513, + "step": 1935 + }, + { + "epoch": 3.963152507676561, + "grad_norm": 0.30348062095693085, + "learning_rate": 4.977361382589607e-06, + "loss": 0.217, + "step": 1936 + }, + { + "epoch": 3.9651995905834188, + "grad_norm": 0.2876460419928197, + "learning_rate": 4.958488519033096e-06, + "loss": 0.2143, + "step": 1937 + }, + { + "epoch": 3.9672466734902763, + "grad_norm": 0.3075426518738084, + "learning_rate": 4.939646439577868e-06, + "loss": 0.2275, + "step": 1938 + }, + { + "epoch": 3.969293756397134, + "grad_norm": 0.3060349427123557, + "learning_rate": 4.920835182786316e-06, + "loss": 0.2371, + "step": 1939 + }, + { + "epoch": 3.9713408393039917, + "grad_norm": 0.31280082710142615, + "learning_rate": 4.9020547871577265e-06, + "loss": 0.2234, + "step": 1940 + }, + { + "epoch": 3.9733879222108497, + "grad_norm": 0.3134880106721245, + "learning_rate": 4.8833052911282375e-06, + "loss": 0.2492, + "step": 1941 + }, + { + "epoch": 3.975435005117707, + "grad_norm": 0.2953871360194199, + "learning_rate": 4.864586733070755e-06, + "loss": 0.221, + "step": 1942 + }, + { + "epoch": 3.977482088024565, + "grad_norm": 0.31671009607399275, + "learning_rate": 4.845899151294848e-06, + "loss": 0.2335, + "step": 1943 + }, + { + "epoch": 3.9795291709314227, + "grad_norm": 0.2908681144449216, + "learning_rate": 4.827242584046698e-06, + "loss": 0.2767, + "step": 1944 + }, + { + "epoch": 3.9815762538382806, + "grad_norm": 0.2931420599842891, + "learning_rate": 4.808617069509034e-06, + "loss": 0.2475, + "step": 1945 + }, + { + "epoch": 3.983623336745138, + "grad_norm": 0.2955583206381109, + "learning_rate": 4.790022645800994e-06, + "loss": 0.2737, + "step": 1946 + }, + { + "epoch": 3.9856704196519956, + "grad_norm": 0.2873161139594403, + "learning_rate": 4.77145935097811e-06, + "loss": 0.2116, + "step": 1947 + }, + { + "epoch": 3.9877175025588536, + "grad_norm": 0.30419560839821447, + "learning_rate": 4.752927223032196e-06, + "loss": 0.2261, + "step": 1948 + }, + { + "epoch": 3.9897645854657116, + "grad_norm": 0.3199774705798064, + "learning_rate": 4.7344262998912885e-06, + "loss": 0.2478, + "step": 1949 + }, + { + "epoch": 3.991811668372569, + "grad_norm": 0.31331370811795206, + "learning_rate": 4.715956619419539e-06, + "loss": 0.2427, + "step": 1950 + }, + { + "epoch": 3.9938587512794266, + "grad_norm": 0.3035495734347721, + "learning_rate": 4.697518219417188e-06, + "loss": 0.2412, + "step": 1951 + }, + { + "epoch": 3.9959058341862845, + "grad_norm": 0.2835191629580971, + "learning_rate": 4.679111137620442e-06, + "loss": 0.2054, + "step": 1952 + }, + { + "epoch": 3.9979529170931425, + "grad_norm": 0.29245602269669696, + "learning_rate": 4.660735411701398e-06, + "loss": 0.2047, + "step": 1953 + }, + { + "epoch": 4.0, + "grad_norm": 0.7074262659570464, + "learning_rate": 4.6423910792680005e-06, + "loss": 0.2856, + "step": 1954 + }, + { + "epoch": 4.0020470829068575, + "grad_norm": 0.4712597575605154, + "learning_rate": 4.62407817786394e-06, + "loss": 0.1814, + "step": 1955 + }, + { + "epoch": 4.004094165813715, + "grad_norm": 0.38151984418088236, + "learning_rate": 4.605796744968556e-06, + "loss": 0.1724, + "step": 1956 + }, + { + "epoch": 4.006141248720573, + "grad_norm": 0.34020967642434874, + "learning_rate": 4.587546817996826e-06, + "loss": 0.149, + "step": 1957 + }, + { + "epoch": 4.008188331627431, + "grad_norm": 0.31262949437875953, + "learning_rate": 4.56932843429922e-06, + "loss": 0.161, + "step": 1958 + }, + { + "epoch": 4.0102354145342884, + "grad_norm": 0.4303302435498056, + "learning_rate": 4.551141631161651e-06, + "loss": 0.1656, + "step": 1959 + }, + { + "epoch": 4.012282497441146, + "grad_norm": 0.4285520812331891, + "learning_rate": 4.532986445805405e-06, + "loss": 0.1627, + "step": 1960 + }, + { + "epoch": 4.014329580348004, + "grad_norm": 0.4192066755161973, + "learning_rate": 4.514862915387059e-06, + "loss": 0.1827, + "step": 1961 + }, + { + "epoch": 4.016376663254862, + "grad_norm": 0.3658567404594509, + "learning_rate": 4.496771076998405e-06, + "loss": 0.2017, + "step": 1962 + }, + { + "epoch": 4.018423746161719, + "grad_norm": 0.3317640866707838, + "learning_rate": 4.478710967666371e-06, + "loss": 0.1817, + "step": 1963 + }, + { + "epoch": 4.020470829068577, + "grad_norm": 0.38931119420729465, + "learning_rate": 4.460682624352952e-06, + "loss": 0.1582, + "step": 1964 + }, + { + "epoch": 4.022517911975435, + "grad_norm": 0.4025748777608708, + "learning_rate": 4.442686083955132e-06, + "loss": 0.1692, + "step": 1965 + }, + { + "epoch": 4.024564994882293, + "grad_norm": 0.3465330209700674, + "learning_rate": 4.424721383304791e-06, + "loss": 0.1623, + "step": 1966 + }, + { + "epoch": 4.02661207778915, + "grad_norm": 0.33474185562540504, + "learning_rate": 4.4067885591686625e-06, + "loss": 0.1629, + "step": 1967 + }, + { + "epoch": 4.028659160696008, + "grad_norm": 0.2994162000223648, + "learning_rate": 4.388887648248237e-06, + "loss": 0.1943, + "step": 1968 + }, + { + "epoch": 4.030706243602866, + "grad_norm": 0.30244570971558177, + "learning_rate": 4.371018687179689e-06, + "loss": 0.2009, + "step": 1969 + }, + { + "epoch": 4.032753326509724, + "grad_norm": 0.31048799941390864, + "learning_rate": 4.353181712533807e-06, + "loss": 0.1763, + "step": 1970 + }, + { + "epoch": 4.034800409416581, + "grad_norm": 0.31183964535945935, + "learning_rate": 4.3353767608159125e-06, + "loss": 0.1695, + "step": 1971 + }, + { + "epoch": 4.036847492323439, + "grad_norm": 0.31495339654929116, + "learning_rate": 4.317603868465794e-06, + "loss": 0.156, + "step": 1972 + }, + { + "epoch": 4.038894575230297, + "grad_norm": 0.2985528590711877, + "learning_rate": 4.299863071857617e-06, + "loss": 0.1687, + "step": 1973 + }, + { + "epoch": 4.040941658137155, + "grad_norm": 0.2898165046108191, + "learning_rate": 4.2821544072998655e-06, + "loss": 0.1689, + "step": 1974 + }, + { + "epoch": 4.042988741044012, + "grad_norm": 0.2806552847148077, + "learning_rate": 4.264477911035265e-06, + "loss": 0.1463, + "step": 1975 + }, + { + "epoch": 4.04503582395087, + "grad_norm": 0.2973917660709716, + "learning_rate": 4.246833619240702e-06, + "loss": 0.1452, + "step": 1976 + }, + { + "epoch": 4.047082906857728, + "grad_norm": 0.3137432130391123, + "learning_rate": 4.229221568027151e-06, + "loss": 0.1821, + "step": 1977 + }, + { + "epoch": 4.049129989764586, + "grad_norm": 0.29853330078810963, + "learning_rate": 4.211641793439609e-06, + "loss": 0.168, + "step": 1978 + }, + { + "epoch": 4.051177072671443, + "grad_norm": 0.3165013554197021, + "learning_rate": 4.194094331457004e-06, + "loss": 0.1753, + "step": 1979 + }, + { + "epoch": 4.053224155578301, + "grad_norm": 0.30677661034631387, + "learning_rate": 4.176579217992143e-06, + "loss": 0.1525, + "step": 1980 + }, + { + "epoch": 4.055271238485159, + "grad_norm": 0.29846779652611327, + "learning_rate": 4.159096488891623e-06, + "loss": 0.1559, + "step": 1981 + }, + { + "epoch": 4.0573183213920165, + "grad_norm": 0.3014892204744538, + "learning_rate": 4.1416461799357675e-06, + "loss": 0.1732, + "step": 1982 + }, + { + "epoch": 4.059365404298874, + "grad_norm": 0.29810838609292906, + "learning_rate": 4.124228326838544e-06, + "loss": 0.1442, + "step": 1983 + }, + { + "epoch": 4.061412487205732, + "grad_norm": 0.28754100275614525, + "learning_rate": 4.106842965247497e-06, + "loss": 0.1587, + "step": 1984 + }, + { + "epoch": 4.06345957011259, + "grad_norm": 0.28330676946462274, + "learning_rate": 4.0894901307436805e-06, + "loss": 0.1697, + "step": 1985 + }, + { + "epoch": 4.0655066530194475, + "grad_norm": 0.285301416935174, + "learning_rate": 4.072169858841561e-06, + "loss": 0.1595, + "step": 1986 + }, + { + "epoch": 4.067553735926305, + "grad_norm": 0.2837041098864574, + "learning_rate": 4.054882184988971e-06, + "loss": 0.1871, + "step": 1987 + }, + { + "epoch": 4.0696008188331625, + "grad_norm": 0.28867050594006555, + "learning_rate": 4.0376271445670465e-06, + "loss": 0.1805, + "step": 1988 + }, + { + "epoch": 4.071647901740021, + "grad_norm": 0.27159571765823515, + "learning_rate": 4.020404772890101e-06, + "loss": 0.1782, + "step": 1989 + }, + { + "epoch": 4.073694984646878, + "grad_norm": 0.29070033660022687, + "learning_rate": 4.003215105205613e-06, + "loss": 0.1971, + "step": 1990 + }, + { + "epoch": 4.075742067553736, + "grad_norm": 0.29431870068785426, + "learning_rate": 3.986058176694123e-06, + "loss": 0.1836, + "step": 1991 + }, + { + "epoch": 4.077789150460593, + "grad_norm": 0.30901872319472756, + "learning_rate": 3.968934022469157e-06, + "loss": 0.1615, + "step": 1992 + }, + { + "epoch": 4.079836233367452, + "grad_norm": 0.26872512623266737, + "learning_rate": 3.951842677577171e-06, + "loss": 0.1571, + "step": 1993 + }, + { + "epoch": 4.081883316274309, + "grad_norm": 0.277601430071484, + "learning_rate": 3.9347841769974925e-06, + "loss": 0.1993, + "step": 1994 + }, + { + "epoch": 4.083930399181167, + "grad_norm": 0.2815481603482528, + "learning_rate": 3.917758555642195e-06, + "loss": 0.1776, + "step": 1995 + }, + { + "epoch": 4.085977482088024, + "grad_norm": 0.31199877087236383, + "learning_rate": 3.900765848356083e-06, + "loss": 0.1807, + "step": 1996 + }, + { + "epoch": 4.088024564994882, + "grad_norm": 0.3049249771323251, + "learning_rate": 3.883806089916593e-06, + "loss": 0.1738, + "step": 1997 + }, + { + "epoch": 4.09007164790174, + "grad_norm": 0.29235847173840074, + "learning_rate": 3.866879315033738e-06, + "loss": 0.178, + "step": 1998 + }, + { + "epoch": 4.092118730808598, + "grad_norm": 0.2849050552270467, + "learning_rate": 3.849985558349998e-06, + "loss": 0.1679, + "step": 1999 + }, + { + "epoch": 4.094165813715455, + "grad_norm": 0.2977669293917896, + "learning_rate": 3.8331248544403135e-06, + "loss": 0.1556, + "step": 2000 + }, + { + "epoch": 4.096212896622313, + "grad_norm": 0.2927932662986805, + "learning_rate": 3.8162972378119635e-06, + "loss": 0.1894, + "step": 2001 + }, + { + "epoch": 4.098259979529171, + "grad_norm": 0.29328702976060467, + "learning_rate": 3.799502742904497e-06, + "loss": 0.173, + "step": 2002 + }, + { + "epoch": 4.100307062436029, + "grad_norm": 0.2969906241666511, + "learning_rate": 3.7827414040896958e-06, + "loss": 0.1812, + "step": 2003 + }, + { + "epoch": 4.102354145342886, + "grad_norm": 0.32948789795871786, + "learning_rate": 3.766013255671479e-06, + "loss": 0.1825, + "step": 2004 + }, + { + "epoch": 4.104401228249744, + "grad_norm": 0.28692228882067206, + "learning_rate": 3.749318331885825e-06, + "loss": 0.1654, + "step": 2005 + }, + { + "epoch": 4.106448311156602, + "grad_norm": 0.27400445663207174, + "learning_rate": 3.7326566669007268e-06, + "loss": 0.1913, + "step": 2006 + }, + { + "epoch": 4.10849539406346, + "grad_norm": 0.30606472444023525, + "learning_rate": 3.716028294816119e-06, + "loss": 0.1545, + "step": 2007 + }, + { + "epoch": 4.110542476970317, + "grad_norm": 0.3123611845587673, + "learning_rate": 3.699433249663775e-06, + "loss": 0.201, + "step": 2008 + }, + { + "epoch": 4.112589559877175, + "grad_norm": 0.31139724775462246, + "learning_rate": 3.6828715654072776e-06, + "loss": 0.1618, + "step": 2009 + }, + { + "epoch": 4.114636642784033, + "grad_norm": 0.29336426025625717, + "learning_rate": 3.666343275941926e-06, + "loss": 0.1903, + "step": 2010 + }, + { + "epoch": 4.116683725690891, + "grad_norm": 0.2893384058971807, + "learning_rate": 3.649848415094681e-06, + "loss": 0.17, + "step": 2011 + }, + { + "epoch": 4.118730808597748, + "grad_norm": 0.29125235390587384, + "learning_rate": 3.6333870166240703e-06, + "loss": 0.1996, + "step": 2012 + }, + { + "epoch": 4.120777891504606, + "grad_norm": 0.29871612249436, + "learning_rate": 3.616959114220162e-06, + "loss": 0.1985, + "step": 2013 + }, + { + "epoch": 4.122824974411464, + "grad_norm": 0.271555858016362, + "learning_rate": 3.60056474150446e-06, + "loss": 0.1818, + "step": 2014 + }, + { + "epoch": 4.1248720573183215, + "grad_norm": 0.3012910833873169, + "learning_rate": 3.5842039320298327e-06, + "loss": 0.1414, + "step": 2015 + }, + { + "epoch": 4.126919140225179, + "grad_norm": 0.27667654273766706, + "learning_rate": 3.5678767192804764e-06, + "loss": 0.1882, + "step": 2016 + }, + { + "epoch": 4.1289662231320365, + "grad_norm": 0.2869593416102505, + "learning_rate": 3.551583136671817e-06, + "loss": 0.1906, + "step": 2017 + }, + { + "epoch": 4.131013306038895, + "grad_norm": 0.2822576095654785, + "learning_rate": 3.5353232175504614e-06, + "loss": 0.1828, + "step": 2018 + }, + { + "epoch": 4.1330603889457525, + "grad_norm": 0.3121511458643644, + "learning_rate": 3.5190969951941113e-06, + "loss": 0.161, + "step": 2019 + }, + { + "epoch": 4.13510747185261, + "grad_norm": 0.2829822535321191, + "learning_rate": 3.5029045028115105e-06, + "loss": 0.1514, + "step": 2020 + }, + { + "epoch": 4.1371545547594675, + "grad_norm": 0.2837746021156371, + "learning_rate": 3.486745773542375e-06, + "loss": 0.1935, + "step": 2021 + }, + { + "epoch": 4.139201637666326, + "grad_norm": 0.3025896496647199, + "learning_rate": 3.470620840457304e-06, + "loss": 0.208, + "step": 2022 + }, + { + "epoch": 4.141248720573183, + "grad_norm": 0.28695976072627655, + "learning_rate": 3.4545297365577437e-06, + "loss": 0.18, + "step": 2023 + }, + { + "epoch": 4.143295803480041, + "grad_norm": 0.29084046593921437, + "learning_rate": 3.438472494775902e-06, + "loss": 0.1797, + "step": 2024 + }, + { + "epoch": 4.145342886386898, + "grad_norm": 0.2964669445537806, + "learning_rate": 3.4224491479746822e-06, + "loss": 0.2066, + "step": 2025 + }, + { + "epoch": 4.147389969293757, + "grad_norm": 0.2735467093182229, + "learning_rate": 3.406459728947622e-06, + "loss": 0.1805, + "step": 2026 + }, + { + "epoch": 4.149437052200614, + "grad_norm": 0.2883440394869004, + "learning_rate": 3.390504270418822e-06, + "loss": 0.1935, + "step": 2027 + }, + { + "epoch": 4.151484135107472, + "grad_norm": 0.28347253308933157, + "learning_rate": 3.3745828050428675e-06, + "loss": 0.2042, + "step": 2028 + }, + { + "epoch": 4.153531218014329, + "grad_norm": 0.287714374967884, + "learning_rate": 3.358695365404785e-06, + "loss": 0.1965, + "step": 2029 + }, + { + "epoch": 4.155578300921187, + "grad_norm": 0.29340681821156966, + "learning_rate": 3.3428419840199623e-06, + "loss": 0.1603, + "step": 2030 + }, + { + "epoch": 4.157625383828045, + "grad_norm": 0.2727874360906274, + "learning_rate": 3.327022693334083e-06, + "loss": 0.155, + "step": 2031 + }, + { + "epoch": 4.159672466734903, + "grad_norm": 0.30308908503638726, + "learning_rate": 3.3112375257230547e-06, + "loss": 0.1909, + "step": 2032 + }, + { + "epoch": 4.16171954964176, + "grad_norm": 0.2782743108107041, + "learning_rate": 3.295486513492954e-06, + "loss": 0.1912, + "step": 2033 + }, + { + "epoch": 4.163766632548619, + "grad_norm": 0.2960778023128933, + "learning_rate": 3.279769688879959e-06, + "loss": 0.1706, + "step": 2034 + }, + { + "epoch": 4.165813715455476, + "grad_norm": 0.2855041966453197, + "learning_rate": 3.2640870840502646e-06, + "loss": 0.1812, + "step": 2035 + }, + { + "epoch": 4.167860798362334, + "grad_norm": 0.2982604591341762, + "learning_rate": 3.2484387311000364e-06, + "loss": 0.1651, + "step": 2036 + }, + { + "epoch": 4.169907881269191, + "grad_norm": 0.2957570058273647, + "learning_rate": 3.2328246620553605e-06, + "loss": 0.1632, + "step": 2037 + }, + { + "epoch": 4.171954964176049, + "grad_norm": 0.2951616591774616, + "learning_rate": 3.2172449088721235e-06, + "loss": 0.1624, + "step": 2038 + }, + { + "epoch": 4.174002047082907, + "grad_norm": 0.2851463082934784, + "learning_rate": 3.2016995034360045e-06, + "loss": 0.1808, + "step": 2039 + }, + { + "epoch": 4.176049129989765, + "grad_norm": 0.2845084237530719, + "learning_rate": 3.186188477562382e-06, + "loss": 0.1786, + "step": 2040 + }, + { + "epoch": 4.178096212896622, + "grad_norm": 0.30650921370752987, + "learning_rate": 3.1707118629962607e-06, + "loss": 0.1472, + "step": 2041 + }, + { + "epoch": 4.18014329580348, + "grad_norm": 0.2777664159680818, + "learning_rate": 3.1552696914122327e-06, + "loss": 0.1531, + "step": 2042 + }, + { + "epoch": 4.182190378710338, + "grad_norm": 0.28828599604321864, + "learning_rate": 3.139861994414397e-06, + "loss": 0.1845, + "step": 2043 + }, + { + "epoch": 4.184237461617196, + "grad_norm": 0.30773699136825977, + "learning_rate": 3.1244888035362875e-06, + "loss": 0.1769, + "step": 2044 + }, + { + "epoch": 4.186284544524053, + "grad_norm": 0.3155552426235748, + "learning_rate": 3.1091501502408293e-06, + "loss": 0.1643, + "step": 2045 + }, + { + "epoch": 4.188331627430911, + "grad_norm": 0.2898415631190067, + "learning_rate": 3.093846065920254e-06, + "loss": 0.168, + "step": 2046 + }, + { + "epoch": 4.190378710337769, + "grad_norm": 0.3080290062858047, + "learning_rate": 3.0785765818960534e-06, + "loss": 0.215, + "step": 2047 + }, + { + "epoch": 4.1924257932446265, + "grad_norm": 0.26531974258516, + "learning_rate": 3.0633417294188896e-06, + "loss": 0.1902, + "step": 2048 + }, + { + "epoch": 4.194472876151484, + "grad_norm": 0.29101494010309975, + "learning_rate": 3.0481415396685564e-06, + "loss": 0.1851, + "step": 2049 + }, + { + "epoch": 4.1965199590583415, + "grad_norm": 0.2805440252695212, + "learning_rate": 3.0329760437539233e-06, + "loss": 0.2106, + "step": 2050 + }, + { + "epoch": 4.1985670419652, + "grad_norm": 0.2954059617809206, + "learning_rate": 3.017845272712825e-06, + "loss": 0.178, + "step": 2051 + }, + { + "epoch": 4.200614124872057, + "grad_norm": 0.2750668596805512, + "learning_rate": 3.0027492575120453e-06, + "loss": 0.1639, + "step": 2052 + }, + { + "epoch": 4.202661207778915, + "grad_norm": 0.2954652665384484, + "learning_rate": 2.9876880290472376e-06, + "loss": 0.1654, + "step": 2053 + }, + { + "epoch": 4.2047082906857725, + "grad_norm": 0.2844611490184932, + "learning_rate": 2.9726616181428515e-06, + "loss": 0.1824, + "step": 2054 + }, + { + "epoch": 4.206755373592631, + "grad_norm": 0.28973919117898683, + "learning_rate": 2.957670055552078e-06, + "loss": 0.1785, + "step": 2055 + }, + { + "epoch": 4.208802456499488, + "grad_norm": 0.3065738121168008, + "learning_rate": 2.942713371956809e-06, + "loss": 0.1466, + "step": 2056 + }, + { + "epoch": 4.210849539406346, + "grad_norm": 0.27595990092382205, + "learning_rate": 2.927791597967522e-06, + "loss": 0.1674, + "step": 2057 + }, + { + "epoch": 4.212896622313203, + "grad_norm": 0.2787886162407018, + "learning_rate": 2.9129047641232653e-06, + "loss": 0.1739, + "step": 2058 + }, + { + "epoch": 4.214943705220062, + "grad_norm": 0.2835601286496118, + "learning_rate": 2.8980529008915793e-06, + "loss": 0.1851, + "step": 2059 + }, + { + "epoch": 4.216990788126919, + "grad_norm": 0.2734235721997136, + "learning_rate": 2.8832360386684287e-06, + "loss": 0.1894, + "step": 2060 + }, + { + "epoch": 4.219037871033777, + "grad_norm": 0.313282561765329, + "learning_rate": 2.8684542077781376e-06, + "loss": 0.1844, + "step": 2061 + }, + { + "epoch": 4.221084953940634, + "grad_norm": 0.29657107157085116, + "learning_rate": 2.853707438473352e-06, + "loss": 0.1861, + "step": 2062 + }, + { + "epoch": 4.223132036847493, + "grad_norm": 0.2835879428086791, + "learning_rate": 2.838995760934953e-06, + "loss": 0.1992, + "step": 2063 + }, + { + "epoch": 4.22517911975435, + "grad_norm": 0.28406764855143096, + "learning_rate": 2.8243192052719902e-06, + "loss": 0.1743, + "step": 2064 + }, + { + "epoch": 4.227226202661208, + "grad_norm": 0.2735358759641007, + "learning_rate": 2.8096778015216484e-06, + "loss": 0.1663, + "step": 2065 + }, + { + "epoch": 4.229273285568065, + "grad_norm": 0.28994987108300085, + "learning_rate": 2.7950715796491623e-06, + "loss": 0.1693, + "step": 2066 + }, + { + "epoch": 4.231320368474924, + "grad_norm": 0.2918242884998421, + "learning_rate": 2.7805005695477704e-06, + "loss": 0.1659, + "step": 2067 + }, + { + "epoch": 4.233367451381781, + "grad_norm": 0.2914555807372611, + "learning_rate": 2.7659648010386365e-06, + "loss": 0.2082, + "step": 2068 + }, + { + "epoch": 4.235414534288639, + "grad_norm": 0.31049832846223957, + "learning_rate": 2.75146430387081e-06, + "loss": 0.1745, + "step": 2069 + }, + { + "epoch": 4.237461617195496, + "grad_norm": 0.2786106859690272, + "learning_rate": 2.736999107721137e-06, + "loss": 0.1689, + "step": 2070 + }, + { + "epoch": 4.239508700102354, + "grad_norm": 0.2813229103286878, + "learning_rate": 2.7225692421942306e-06, + "loss": 0.1938, + "step": 2071 + }, + { + "epoch": 4.241555783009212, + "grad_norm": 0.2911201694179184, + "learning_rate": 2.7081747368223953e-06, + "loss": 0.1835, + "step": 2072 + }, + { + "epoch": 4.24360286591607, + "grad_norm": 0.29816674195659204, + "learning_rate": 2.6938156210655584e-06, + "loss": 0.1842, + "step": 2073 + }, + { + "epoch": 4.245649948822927, + "grad_norm": 0.2996092573579164, + "learning_rate": 2.679491924311226e-06, + "loss": 0.1847, + "step": 2074 + }, + { + "epoch": 4.247697031729785, + "grad_norm": 0.2895248563688147, + "learning_rate": 2.6652036758744148e-06, + "loss": 0.1827, + "step": 2075 + }, + { + "epoch": 4.249744114636643, + "grad_norm": 0.28226805129886656, + "learning_rate": 2.6509509049975913e-06, + "loss": 0.1765, + "step": 2076 + }, + { + "epoch": 4.2517911975435005, + "grad_norm": 0.2949640489147984, + "learning_rate": 2.6367336408506063e-06, + "loss": 0.1705, + "step": 2077 + }, + { + "epoch": 4.253838280450358, + "grad_norm": 0.2753720710190184, + "learning_rate": 2.622551912530653e-06, + "loss": 0.186, + "step": 2078 + }, + { + "epoch": 4.255885363357216, + "grad_norm": 0.2936131556811062, + "learning_rate": 2.608405749062193e-06, + "loss": 0.1855, + "step": 2079 + }, + { + "epoch": 4.257932446264074, + "grad_norm": 0.28222624495514065, + "learning_rate": 2.594295179396895e-06, + "loss": 0.1743, + "step": 2080 + }, + { + "epoch": 4.2599795291709315, + "grad_norm": 0.29367758291953516, + "learning_rate": 2.5802202324135926e-06, + "loss": 0.1926, + "step": 2081 + }, + { + "epoch": 4.262026612077789, + "grad_norm": 0.29329088174637774, + "learning_rate": 2.566180936918203e-06, + "loss": 0.1985, + "step": 2082 + }, + { + "epoch": 4.2640736949846465, + "grad_norm": 0.27495980250181845, + "learning_rate": 2.5521773216436875e-06, + "loss": 0.1694, + "step": 2083 + }, + { + "epoch": 4.266120777891505, + "grad_norm": 0.2745597317735457, + "learning_rate": 2.5382094152499705e-06, + "loss": 0.1623, + "step": 2084 + }, + { + "epoch": 4.268167860798362, + "grad_norm": 0.2860214870247153, + "learning_rate": 2.5242772463239075e-06, + "loss": 0.1759, + "step": 2085 + }, + { + "epoch": 4.27021494370522, + "grad_norm": 0.2829523123556956, + "learning_rate": 2.5103808433792075e-06, + "loss": 0.1953, + "step": 2086 + }, + { + "epoch": 4.272262026612077, + "grad_norm": 0.29162058098697563, + "learning_rate": 2.4965202348563834e-06, + "loss": 0.1851, + "step": 2087 + }, + { + "epoch": 4.274309109518936, + "grad_norm": 0.28401444187787156, + "learning_rate": 2.4826954491226875e-06, + "loss": 0.1625, + "step": 2088 + }, + { + "epoch": 4.276356192425793, + "grad_norm": 0.29640280698764426, + "learning_rate": 2.468906514472065e-06, + "loss": 0.1733, + "step": 2089 + }, + { + "epoch": 4.278403275332651, + "grad_norm": 0.28541928385296134, + "learning_rate": 2.4551534591250725e-06, + "loss": 0.2083, + "step": 2090 + }, + { + "epoch": 4.280450358239508, + "grad_norm": 0.2927844886794109, + "learning_rate": 2.4414363112288464e-06, + "loss": 0.1518, + "step": 2091 + }, + { + "epoch": 4.282497441146367, + "grad_norm": 0.29701394558657035, + "learning_rate": 2.4277550988570362e-06, + "loss": 0.1621, + "step": 2092 + }, + { + "epoch": 4.284544524053224, + "grad_norm": 0.290771795085261, + "learning_rate": 2.4141098500097403e-06, + "loss": 0.1648, + "step": 2093 + }, + { + "epoch": 4.286591606960082, + "grad_norm": 0.27774161175473394, + "learning_rate": 2.400500592613455e-06, + "loss": 0.1711, + "step": 2094 + }, + { + "epoch": 4.288638689866939, + "grad_norm": 0.30883225453627294, + "learning_rate": 2.3869273545210158e-06, + "loss": 0.134, + "step": 2095 + }, + { + "epoch": 4.290685772773798, + "grad_norm": 0.2797146271613822, + "learning_rate": 2.3733901635115486e-06, + "loss": 0.1692, + "step": 2096 + }, + { + "epoch": 4.292732855680655, + "grad_norm": 0.2969804226131455, + "learning_rate": 2.359889047290389e-06, + "loss": 0.1671, + "step": 2097 + }, + { + "epoch": 4.294779938587513, + "grad_norm": 0.3217916741414342, + "learning_rate": 2.3464240334890496e-06, + "loss": 0.1575, + "step": 2098 + }, + { + "epoch": 4.29682702149437, + "grad_norm": 0.27919309597905084, + "learning_rate": 2.332995149665169e-06, + "loss": 0.2121, + "step": 2099 + }, + { + "epoch": 4.298874104401229, + "grad_norm": 0.2800459009067448, + "learning_rate": 2.3196024233024185e-06, + "loss": 0.1837, + "step": 2100 + }, + { + "epoch": 4.300921187308086, + "grad_norm": 0.2671979507192257, + "learning_rate": 2.3062458818104804e-06, + "loss": 0.1952, + "step": 2101 + }, + { + "epoch": 4.302968270214944, + "grad_norm": 0.28262802438305323, + "learning_rate": 2.2929255525249894e-06, + "loss": 0.1713, + "step": 2102 + }, + { + "epoch": 4.305015353121801, + "grad_norm": 0.3005757045715249, + "learning_rate": 2.279641462707445e-06, + "loss": 0.1668, + "step": 2103 + }, + { + "epoch": 4.30706243602866, + "grad_norm": 0.286800196637234, + "learning_rate": 2.266393639545197e-06, + "loss": 0.1896, + "step": 2104 + }, + { + "epoch": 4.309109518935517, + "grad_norm": 0.2833256809700336, + "learning_rate": 2.2531821101513796e-06, + "loss": 0.1417, + "step": 2105 + }, + { + "epoch": 4.311156601842375, + "grad_norm": 0.31197523215491646, + "learning_rate": 2.2400069015648173e-06, + "loss": 0.1952, + "step": 2106 + }, + { + "epoch": 4.313203684749232, + "grad_norm": 0.2678003946049261, + "learning_rate": 2.22686804075003e-06, + "loss": 0.1807, + "step": 2107 + }, + { + "epoch": 4.3152507676560905, + "grad_norm": 0.30659909160474746, + "learning_rate": 2.213765554597129e-06, + "loss": 0.1873, + "step": 2108 + }, + { + "epoch": 4.317297850562948, + "grad_norm": 0.27320285898761, + "learning_rate": 2.2006994699217963e-06, + "loss": 0.1783, + "step": 2109 + }, + { + "epoch": 4.3193449334698055, + "grad_norm": 0.2952250752859734, + "learning_rate": 2.187669813465192e-06, + "loss": 0.1666, + "step": 2110 + }, + { + "epoch": 4.321392016376663, + "grad_norm": 0.2648240745330133, + "learning_rate": 2.174676611893947e-06, + "loss": 0.1671, + "step": 2111 + }, + { + "epoch": 4.3234390992835205, + "grad_norm": 0.2811036034280304, + "learning_rate": 2.1617198918000737e-06, + "loss": 0.1765, + "step": 2112 + }, + { + "epoch": 4.325486182190379, + "grad_norm": 0.29793182339880603, + "learning_rate": 2.1487996797009103e-06, + "loss": 0.167, + "step": 2113 + }, + { + "epoch": 4.3275332650972365, + "grad_norm": 0.2752900542965174, + "learning_rate": 2.135916002039089e-06, + "loss": 0.1821, + "step": 2114 + }, + { + "epoch": 4.329580348004094, + "grad_norm": 0.27908851781361865, + "learning_rate": 2.123068885182471e-06, + "loss": 0.1875, + "step": 2115 + }, + { + "epoch": 4.3316274309109515, + "grad_norm": 0.30369045741991585, + "learning_rate": 2.110258355424093e-06, + "loss": 0.1565, + "step": 2116 + }, + { + "epoch": 4.33367451381781, + "grad_norm": 0.27387260120165463, + "learning_rate": 2.0974844389820914e-06, + "loss": 0.2037, + "step": 2117 + }, + { + "epoch": 4.335721596724667, + "grad_norm": 0.2695259971121429, + "learning_rate": 2.084747161999703e-06, + "loss": 0.1883, + "step": 2118 + }, + { + "epoch": 4.337768679631525, + "grad_norm": 0.27646790728531134, + "learning_rate": 2.0720465505451524e-06, + "loss": 0.1674, + "step": 2119 + }, + { + "epoch": 4.339815762538382, + "grad_norm": 0.27363984511519157, + "learning_rate": 2.0593826306116328e-06, + "loss": 0.1987, + "step": 2120 + }, + { + "epoch": 4.341862845445241, + "grad_norm": 0.3058041319945061, + "learning_rate": 2.0467554281172443e-06, + "loss": 0.1668, + "step": 2121 + }, + { + "epoch": 4.343909928352098, + "grad_norm": 0.28711732195110085, + "learning_rate": 2.0341649689049458e-06, + "loss": 0.1589, + "step": 2122 + }, + { + "epoch": 4.345957011258956, + "grad_norm": 0.29676194594555955, + "learning_rate": 2.021611278742479e-06, + "loss": 0.2006, + "step": 2123 + }, + { + "epoch": 4.348004094165813, + "grad_norm": 0.2894509415136224, + "learning_rate": 2.009094383322356e-06, + "loss": 0.1759, + "step": 2124 + }, + { + "epoch": 4.350051177072672, + "grad_norm": 0.27051010203429876, + "learning_rate": 1.9966143082617797e-06, + "loss": 0.2051, + "step": 2125 + }, + { + "epoch": 4.352098259979529, + "grad_norm": 0.29097733973563933, + "learning_rate": 1.9841710791025793e-06, + "loss": 0.1718, + "step": 2126 + }, + { + "epoch": 4.354145342886387, + "grad_norm": 0.321019324398248, + "learning_rate": 1.971764721311191e-06, + "loss": 0.2109, + "step": 2127 + }, + { + "epoch": 4.356192425793244, + "grad_norm": 0.30390810271040936, + "learning_rate": 1.959395260278587e-06, + "loss": 0.1489, + "step": 2128 + }, + { + "epoch": 4.358239508700103, + "grad_norm": 0.29799569461187914, + "learning_rate": 1.947062721320221e-06, + "loss": 0.1661, + "step": 2129 + }, + { + "epoch": 4.36028659160696, + "grad_norm": 0.2832110200623686, + "learning_rate": 1.9347671296759896e-06, + "loss": 0.1714, + "step": 2130 + }, + { + "epoch": 4.362333674513818, + "grad_norm": 0.2951503600181551, + "learning_rate": 1.922508510510166e-06, + "loss": 0.157, + "step": 2131 + }, + { + "epoch": 4.364380757420675, + "grad_norm": 0.28017904966905216, + "learning_rate": 1.9102868889113613e-06, + "loss": 0.1822, + "step": 2132 + }, + { + "epoch": 4.366427840327534, + "grad_norm": 0.27123684319205094, + "learning_rate": 1.8981022898924562e-06, + "loss": 0.2035, + "step": 2133 + }, + { + "epoch": 4.368474923234391, + "grad_norm": 0.3005307169297444, + "learning_rate": 1.885954738390572e-06, + "loss": 0.178, + "step": 2134 + }, + { + "epoch": 4.370522006141249, + "grad_norm": 0.30373485662664135, + "learning_rate": 1.8738442592670014e-06, + "loss": 0.1791, + "step": 2135 + }, + { + "epoch": 4.372569089048106, + "grad_norm": 0.29560692277511486, + "learning_rate": 1.8617708773071698e-06, + "loss": 0.1702, + "step": 2136 + }, + { + "epoch": 4.3746161719549645, + "grad_norm": 0.2799036916467131, + "learning_rate": 1.8497346172205733e-06, + "loss": 0.1757, + "step": 2137 + }, + { + "epoch": 4.376663254861822, + "grad_norm": 0.2914915460350074, + "learning_rate": 1.8377355036407408e-06, + "loss": 0.1537, + "step": 2138 + }, + { + "epoch": 4.37871033776868, + "grad_norm": 0.28848134194988, + "learning_rate": 1.8257735611251704e-06, + "loss": 0.1794, + "step": 2139 + }, + { + "epoch": 4.380757420675537, + "grad_norm": 0.2663575035541803, + "learning_rate": 1.8138488141552856e-06, + "loss": 0.1895, + "step": 2140 + }, + { + "epoch": 4.3828045035823955, + "grad_norm": 0.28041126513615366, + "learning_rate": 1.801961287136391e-06, + "loss": 0.1705, + "step": 2141 + }, + { + "epoch": 4.384851586489253, + "grad_norm": 0.2878226194974552, + "learning_rate": 1.7901110043976122e-06, + "loss": 0.1869, + "step": 2142 + }, + { + "epoch": 4.3868986693961105, + "grad_norm": 0.2845987197363273, + "learning_rate": 1.7782979901918507e-06, + "loss": 0.1822, + "step": 2143 + }, + { + "epoch": 4.388945752302968, + "grad_norm": 0.2750503273666346, + "learning_rate": 1.7665222686957362e-06, + "loss": 0.1778, + "step": 2144 + }, + { + "epoch": 4.3909928352098255, + "grad_norm": 0.3136573248950073, + "learning_rate": 1.754783864009575e-06, + "loss": 0.1569, + "step": 2145 + }, + { + "epoch": 4.393039918116684, + "grad_norm": 0.2957114276760941, + "learning_rate": 1.7430828001572897e-06, + "loss": 0.2259, + "step": 2146 + }, + { + "epoch": 4.395087001023541, + "grad_norm": 0.306095495611476, + "learning_rate": 1.7314191010863933e-06, + "loss": 0.2185, + "step": 2147 + }, + { + "epoch": 4.397134083930399, + "grad_norm": 0.2870140982518843, + "learning_rate": 1.7197927906679335e-06, + "loss": 0.2054, + "step": 2148 + }, + { + "epoch": 4.399181166837257, + "grad_norm": 0.2679693208920003, + "learning_rate": 1.7082038926964162e-06, + "loss": 0.1553, + "step": 2149 + }, + { + "epoch": 4.401228249744115, + "grad_norm": 0.30186153363388674, + "learning_rate": 1.6966524308897935e-06, + "loss": 0.1927, + "step": 2150 + }, + { + "epoch": 4.403275332650972, + "grad_norm": 0.2784578317629372, + "learning_rate": 1.6851384288894058e-06, + "loss": 0.1776, + "step": 2151 + }, + { + "epoch": 4.40532241555783, + "grad_norm": 0.2843049715212868, + "learning_rate": 1.6736619102599073e-06, + "loss": 0.1672, + "step": 2152 + }, + { + "epoch": 4.407369498464687, + "grad_norm": 0.31694116691154817, + "learning_rate": 1.6622228984892585e-06, + "loss": 0.1539, + "step": 2153 + }, + { + "epoch": 4.409416581371546, + "grad_norm": 0.26686752025138655, + "learning_rate": 1.6508214169886483e-06, + "loss": 0.1754, + "step": 2154 + }, + { + "epoch": 4.411463664278403, + "grad_norm": 0.2654670971586521, + "learning_rate": 1.6394574890924574e-06, + "loss": 0.2013, + "step": 2155 + }, + { + "epoch": 4.413510747185261, + "grad_norm": 0.29492009598586416, + "learning_rate": 1.6281311380582087e-06, + "loss": 0.2082, + "step": 2156 + }, + { + "epoch": 4.415557830092118, + "grad_norm": 0.30045313874805496, + "learning_rate": 1.616842387066524e-06, + "loss": 0.1767, + "step": 2157 + }, + { + "epoch": 4.417604912998977, + "grad_norm": 0.2910283873769807, + "learning_rate": 1.605591259221071e-06, + "loss": 0.1766, + "step": 2158 + }, + { + "epoch": 4.419651995905834, + "grad_norm": 0.28823745158894704, + "learning_rate": 1.5943777775485058e-06, + "loss": 0.1868, + "step": 2159 + }, + { + "epoch": 4.421699078812692, + "grad_norm": 0.26486227426645287, + "learning_rate": 1.583201964998451e-06, + "loss": 0.2016, + "step": 2160 + }, + { + "epoch": 4.423746161719549, + "grad_norm": 0.28343110323179144, + "learning_rate": 1.572063844443441e-06, + "loss": 0.1855, + "step": 2161 + }, + { + "epoch": 4.425793244626408, + "grad_norm": 0.2863936086901088, + "learning_rate": 1.5609634386788485e-06, + "loss": 0.1952, + "step": 2162 + }, + { + "epoch": 4.427840327533265, + "grad_norm": 0.29827769122199016, + "learning_rate": 1.5499007704228742e-06, + "loss": 0.1679, + "step": 2163 + }, + { + "epoch": 4.429887410440123, + "grad_norm": 0.2492563784192527, + "learning_rate": 1.5388758623164802e-06, + "loss": 0.1679, + "step": 2164 + }, + { + "epoch": 4.43193449334698, + "grad_norm": 0.2802794951076192, + "learning_rate": 1.5278887369233509e-06, + "loss": 0.1792, + "step": 2165 + }, + { + "epoch": 4.433981576253839, + "grad_norm": 0.2859554141734814, + "learning_rate": 1.5169394167298367e-06, + "loss": 0.1771, + "step": 2166 + }, + { + "epoch": 4.436028659160696, + "grad_norm": 0.3090415351876153, + "learning_rate": 1.5060279241449304e-06, + "loss": 0.1803, + "step": 2167 + }, + { + "epoch": 4.438075742067554, + "grad_norm": 0.2838386004821354, + "learning_rate": 1.4951542815001886e-06, + "loss": 0.159, + "step": 2168 + }, + { + "epoch": 4.440122824974411, + "grad_norm": 0.28615497006671264, + "learning_rate": 1.4843185110497139e-06, + "loss": 0.1654, + "step": 2169 + }, + { + "epoch": 4.4421699078812695, + "grad_norm": 0.2828928258486327, + "learning_rate": 1.4735206349701003e-06, + "loss": 0.166, + "step": 2170 + }, + { + "epoch": 4.444216990788127, + "grad_norm": 0.294693410920466, + "learning_rate": 1.4627606753603886e-06, + "loss": 0.1708, + "step": 2171 + }, + { + "epoch": 4.4462640736949846, + "grad_norm": 0.28832385318223086, + "learning_rate": 1.4520386542420006e-06, + "loss": 0.173, + "step": 2172 + }, + { + "epoch": 4.448311156601842, + "grad_norm": 0.2750702995509294, + "learning_rate": 1.4413545935587415e-06, + "loss": 0.1612, + "step": 2173 + }, + { + "epoch": 4.4503582395087005, + "grad_norm": 0.2827730533019191, + "learning_rate": 1.4307085151767086e-06, + "loss": 0.1568, + "step": 2174 + }, + { + "epoch": 4.452405322415558, + "grad_norm": 0.2724089303161446, + "learning_rate": 1.4201004408842644e-06, + "loss": 0.1577, + "step": 2175 + }, + { + "epoch": 4.4544524053224155, + "grad_norm": 0.27196453176116103, + "learning_rate": 1.4095303923919956e-06, + "loss": 0.1773, + "step": 2176 + }, + { + "epoch": 4.456499488229273, + "grad_norm": 0.3178765316382206, + "learning_rate": 1.3989983913326665e-06, + "loss": 0.159, + "step": 2177 + }, + { + "epoch": 4.458546571136131, + "grad_norm": 0.29327822729767095, + "learning_rate": 1.3885044592611706e-06, + "loss": 0.1431, + "step": 2178 + }, + { + "epoch": 4.460593654042989, + "grad_norm": 0.29769593027711694, + "learning_rate": 1.3780486176544905e-06, + "loss": 0.1985, + "step": 2179 + }, + { + "epoch": 4.462640736949846, + "grad_norm": 0.2844568268296535, + "learning_rate": 1.3676308879116507e-06, + "loss": 0.1652, + "step": 2180 + }, + { + "epoch": 4.464687819856704, + "grad_norm": 0.2824834673707818, + "learning_rate": 1.3572512913536783e-06, + "loss": 0.1957, + "step": 2181 + }, + { + "epoch": 4.466734902763562, + "grad_norm": 0.2944499941869074, + "learning_rate": 1.3469098492235521e-06, + "loss": 0.1663, + "step": 2182 + }, + { + "epoch": 4.46878198567042, + "grad_norm": 0.2860326191078485, + "learning_rate": 1.3366065826861685e-06, + "loss": 0.1508, + "step": 2183 + }, + { + "epoch": 4.470829068577277, + "grad_norm": 0.29391486499526526, + "learning_rate": 1.3263415128282908e-06, + "loss": 0.1643, + "step": 2184 + }, + { + "epoch": 4.472876151484135, + "grad_norm": 0.28228277616536795, + "learning_rate": 1.316114660658505e-06, + "loss": 0.165, + "step": 2185 + }, + { + "epoch": 4.474923234390992, + "grad_norm": 0.28104222228106285, + "learning_rate": 1.305926047107191e-06, + "loss": 0.1787, + "step": 2186 + }, + { + "epoch": 4.476970317297851, + "grad_norm": 0.3067397142838367, + "learning_rate": 1.2957756930264642e-06, + "loss": 0.1708, + "step": 2187 + }, + { + "epoch": 4.479017400204708, + "grad_norm": 0.287797881234288, + "learning_rate": 1.2856636191901296e-06, + "loss": 0.1778, + "step": 2188 + }, + { + "epoch": 4.481064483111566, + "grad_norm": 0.29131274478787034, + "learning_rate": 1.2755898462936544e-06, + "loss": 0.1754, + "step": 2189 + }, + { + "epoch": 4.483111566018424, + "grad_norm": 0.2774665709211303, + "learning_rate": 1.265554394954125e-06, + "loss": 0.1702, + "step": 2190 + }, + { + "epoch": 4.485158648925282, + "grad_norm": 0.2895032215404063, + "learning_rate": 1.255557285710185e-06, + "loss": 0.1572, + "step": 2191 + }, + { + "epoch": 4.487205731832139, + "grad_norm": 0.29001265695476014, + "learning_rate": 1.2455985390220193e-06, + "loss": 0.2107, + "step": 2192 + }, + { + "epoch": 4.489252814738997, + "grad_norm": 0.28177925431653544, + "learning_rate": 1.2356781752712932e-06, + "loss": 0.1821, + "step": 2193 + }, + { + "epoch": 4.491299897645854, + "grad_norm": 0.3086719314394619, + "learning_rate": 1.225796214761117e-06, + "loss": 0.1513, + "step": 2194 + }, + { + "epoch": 4.493346980552713, + "grad_norm": 0.29208730292486546, + "learning_rate": 1.2159526777160036e-06, + "loss": 0.1387, + "step": 2195 + }, + { + "epoch": 4.49539406345957, + "grad_norm": 0.3397441507088236, + "learning_rate": 1.2061475842818337e-06, + "loss": 0.1566, + "step": 2196 + }, + { + "epoch": 4.497441146366428, + "grad_norm": 0.2873481591587359, + "learning_rate": 1.196380954525802e-06, + "loss": 0.1726, + "step": 2197 + }, + { + "epoch": 4.499488229273285, + "grad_norm": 0.27576099473692967, + "learning_rate": 1.1866528084363881e-06, + "loss": 0.1549, + "step": 2198 + }, + { + "epoch": 4.501535312180144, + "grad_norm": 0.3686313423170628, + "learning_rate": 1.1769631659233104e-06, + "loss": 0.1567, + "step": 2199 + }, + { + "epoch": 4.503582395087001, + "grad_norm": 0.29827782847327655, + "learning_rate": 1.1673120468174837e-06, + "loss": 0.1872, + "step": 2200 + }, + { + "epoch": 4.505629477993859, + "grad_norm": 0.28857294210571843, + "learning_rate": 1.1576994708709766e-06, + "loss": 0.182, + "step": 2201 + }, + { + "epoch": 4.507676560900716, + "grad_norm": 0.27379258065859163, + "learning_rate": 1.148125457756981e-06, + "loss": 0.1732, + "step": 2202 + }, + { + "epoch": 4.5097236438075745, + "grad_norm": 0.2746324365416677, + "learning_rate": 1.1385900270697658e-06, + "loss": 0.1962, + "step": 2203 + }, + { + "epoch": 4.511770726714432, + "grad_norm": 0.29241951735417715, + "learning_rate": 1.1290931983246334e-06, + "loss": 0.1652, + "step": 2204 + }, + { + "epoch": 4.5138178096212895, + "grad_norm": 0.3084497347616217, + "learning_rate": 1.119634990957883e-06, + "loss": 0.1849, + "step": 2205 + }, + { + "epoch": 4.515864892528147, + "grad_norm": 0.3071333917103865, + "learning_rate": 1.110215424326775e-06, + "loss": 0.1584, + "step": 2206 + }, + { + "epoch": 4.5179119754350054, + "grad_norm": 0.2857404999852703, + "learning_rate": 1.1008345177094859e-06, + "loss": 0.195, + "step": 2207 + }, + { + "epoch": 4.519959058341863, + "grad_norm": 0.2705119565200689, + "learning_rate": 1.091492290305063e-06, + "loss": 0.1665, + "step": 2208 + }, + { + "epoch": 4.5220061412487205, + "grad_norm": 0.3090579005503219, + "learning_rate": 1.0821887612333959e-06, + "loss": 0.1802, + "step": 2209 + }, + { + "epoch": 4.524053224155578, + "grad_norm": 0.27905532018019086, + "learning_rate": 1.0729239495351917e-06, + "loss": 0.1786, + "step": 2210 + }, + { + "epoch": 4.526100307062436, + "grad_norm": 0.2818996052768231, + "learning_rate": 1.0636978741718873e-06, + "loss": 0.1951, + "step": 2211 + }, + { + "epoch": 4.528147389969294, + "grad_norm": 0.28519212105139324, + "learning_rate": 1.0545105540256628e-06, + "loss": 0.1718, + "step": 2212 + }, + { + "epoch": 4.530194472876151, + "grad_norm": 0.2752841091314753, + "learning_rate": 1.0453620078993755e-06, + "loss": 0.1904, + "step": 2213 + }, + { + "epoch": 4.532241555783009, + "grad_norm": 0.27656267859889894, + "learning_rate": 1.0362522545165276e-06, + "loss": 0.1563, + "step": 2214 + }, + { + "epoch": 4.534288638689867, + "grad_norm": 0.3034310152471821, + "learning_rate": 1.0271813125212237e-06, + "loss": 0.1967, + "step": 2215 + }, + { + "epoch": 4.536335721596725, + "grad_norm": 0.28706993728461716, + "learning_rate": 1.0181492004781467e-06, + "loss": 0.159, + "step": 2216 + }, + { + "epoch": 4.538382804503582, + "grad_norm": 0.2782931986242475, + "learning_rate": 1.009155936872499e-06, + "loss": 0.1926, + "step": 2217 + }, + { + "epoch": 4.54042988741044, + "grad_norm": 0.2836768256218939, + "learning_rate": 1.0002015401099797e-06, + "loss": 0.1697, + "step": 2218 + }, + { + "epoch": 4.542476970317297, + "grad_norm": 0.2708987463201031, + "learning_rate": 9.91286028516747e-07, + "loss": 0.1936, + "step": 2219 + }, + { + "epoch": 4.544524053224156, + "grad_norm": 0.28619987569766664, + "learning_rate": 9.824094203393697e-07, + "loss": 0.1849, + "step": 2220 + }, + { + "epoch": 4.546571136131013, + "grad_norm": 0.2707540756476124, + "learning_rate": 9.735717337447981e-07, + "loss": 0.1751, + "step": 2221 + }, + { + "epoch": 4.548618219037871, + "grad_norm": 0.27410601443733645, + "learning_rate": 9.647729868203238e-07, + "loss": 0.1868, + "step": 2222 + }, + { + "epoch": 4.550665301944729, + "grad_norm": 0.2957604221652705, + "learning_rate": 9.56013197573553e-07, + "loss": 0.1462, + "step": 2223 + }, + { + "epoch": 4.552712384851587, + "grad_norm": 0.26338269002342407, + "learning_rate": 9.4729238393235e-07, + "loss": 0.193, + "step": 2224 + }, + { + "epoch": 4.554759467758444, + "grad_norm": 0.2833184625486789, + "learning_rate": 9.386105637448151e-07, + "loss": 0.1621, + "step": 2225 + }, + { + "epoch": 4.556806550665302, + "grad_norm": 0.2789058820952134, + "learning_rate": 9.299677547792463e-07, + "loss": 0.1593, + "step": 2226 + }, + { + "epoch": 4.558853633572159, + "grad_norm": 0.2720509751943245, + "learning_rate": 9.21363974724101e-07, + "loss": 0.1802, + "step": 2227 + }, + { + "epoch": 4.560900716479018, + "grad_norm": 0.2861949558256626, + "learning_rate": 9.127992411879494e-07, + "loss": 0.2003, + "step": 2228 + }, + { + "epoch": 4.562947799385875, + "grad_norm": 0.28455385830088115, + "learning_rate": 9.042735716994678e-07, + "loss": 0.1772, + "step": 2229 + }, + { + "epoch": 4.564994882292733, + "grad_norm": 0.2762848657480311, + "learning_rate": 8.957869837073673e-07, + "loss": 0.153, + "step": 2230 + }, + { + "epoch": 4.567041965199591, + "grad_norm": 0.28684432410227595, + "learning_rate": 8.873394945803793e-07, + "loss": 0.139, + "step": 2231 + }, + { + "epoch": 4.569089048106449, + "grad_norm": 0.27952234592413544, + "learning_rate": 8.789311216072183e-07, + "loss": 0.1655, + "step": 2232 + }, + { + "epoch": 4.571136131013306, + "grad_norm": 0.2712591839850155, + "learning_rate": 8.705618819965411e-07, + "loss": 0.1687, + "step": 2233 + }, + { + "epoch": 4.573183213920164, + "grad_norm": 0.31661374577038626, + "learning_rate": 8.622317928769086e-07, + "loss": 0.1797, + "step": 2234 + }, + { + "epoch": 4.575230296827021, + "grad_norm": 0.30388013921486445, + "learning_rate": 8.539408712967679e-07, + "loss": 0.205, + "step": 2235 + }, + { + "epoch": 4.5772773797338795, + "grad_norm": 0.29761363560711057, + "learning_rate": 8.456891342243945e-07, + "loss": 0.1323, + "step": 2236 + }, + { + "epoch": 4.579324462640737, + "grad_norm": 0.2749824129298373, + "learning_rate": 8.374765985478728e-07, + "loss": 0.1662, + "step": 2237 + }, + { + "epoch": 4.5813715455475945, + "grad_norm": 0.288719742941428, + "learning_rate": 8.293032810750579e-07, + "loss": 0.185, + "step": 2238 + }, + { + "epoch": 4.583418628454452, + "grad_norm": 0.29201488163729705, + "learning_rate": 8.211691985335357e-07, + "loss": 0.1763, + "step": 2239 + }, + { + "epoch": 4.58546571136131, + "grad_norm": 0.29009215527637505, + "learning_rate": 8.130743675706032e-07, + "loss": 0.1485, + "step": 2240 + }, + { + "epoch": 4.587512794268168, + "grad_norm": 0.2855149297724901, + "learning_rate": 8.050188047532148e-07, + "loss": 0.172, + "step": 2241 + }, + { + "epoch": 4.5895598771750254, + "grad_norm": 0.3379024479854454, + "learning_rate": 7.970025265679648e-07, + "loss": 0.173, + "step": 2242 + }, + { + "epoch": 4.591606960081883, + "grad_norm": 0.2783179411940068, + "learning_rate": 7.890255494210453e-07, + "loss": 0.1801, + "step": 2243 + }, + { + "epoch": 4.593654042988741, + "grad_norm": 0.30188617728523903, + "learning_rate": 7.810878896382101e-07, + "loss": 0.1668, + "step": 2244 + }, + { + "epoch": 4.595701125895599, + "grad_norm": 0.27939878533555546, + "learning_rate": 7.731895634647513e-07, + "loss": 0.1544, + "step": 2245 + }, + { + "epoch": 4.597748208802456, + "grad_norm": 0.29721546644084723, + "learning_rate": 7.653305870654604e-07, + "loss": 0.1459, + "step": 2246 + }, + { + "epoch": 4.599795291709314, + "grad_norm": 0.2761040647920302, + "learning_rate": 7.575109765245936e-07, + "loss": 0.1991, + "step": 2247 + }, + { + "epoch": 4.601842374616172, + "grad_norm": 0.26790523775903885, + "learning_rate": 7.497307478458382e-07, + "loss": 0.1881, + "step": 2248 + }, + { + "epoch": 4.60388945752303, + "grad_norm": 0.26902868723836015, + "learning_rate": 7.419899169522903e-07, + "loss": 0.1956, + "step": 2249 + }, + { + "epoch": 4.605936540429887, + "grad_norm": 0.27200231013008247, + "learning_rate": 7.342884996863997e-07, + "loss": 0.1656, + "step": 2250 + }, + { + "epoch": 4.607983623336745, + "grad_norm": 0.3124482032601389, + "learning_rate": 7.266265118099669e-07, + "loss": 0.1753, + "step": 2251 + }, + { + "epoch": 4.610030706243603, + "grad_norm": 0.2938080759180912, + "learning_rate": 7.190039690040884e-07, + "loss": 0.1864, + "step": 2252 + }, + { + "epoch": 4.612077789150461, + "grad_norm": 0.28057707746789584, + "learning_rate": 7.114208868691319e-07, + "loss": 0.1655, + "step": 2253 + }, + { + "epoch": 4.614124872057318, + "grad_norm": 0.3670804523977461, + "learning_rate": 7.038772809247075e-07, + "loss": 0.2006, + "step": 2254 + }, + { + "epoch": 4.616171954964176, + "grad_norm": 0.3156535130425976, + "learning_rate": 6.963731666096318e-07, + "loss": 0.1873, + "step": 2255 + }, + { + "epoch": 4.618219037871034, + "grad_norm": 0.29444479967748544, + "learning_rate": 6.889085592818956e-07, + "loss": 0.1698, + "step": 2256 + }, + { + "epoch": 4.620266120777892, + "grad_norm": 0.2743736628864778, + "learning_rate": 6.814834742186361e-07, + "loss": 0.1851, + "step": 2257 + }, + { + "epoch": 4.622313203684749, + "grad_norm": 0.2788944245236952, + "learning_rate": 6.740979266161018e-07, + "loss": 0.1649, + "step": 2258 + }, + { + "epoch": 4.624360286591607, + "grad_norm": 0.28277515238178413, + "learning_rate": 6.667519315896264e-07, + "loss": 0.1707, + "step": 2259 + }, + { + "epoch": 4.626407369498464, + "grad_norm": 0.29229311294327376, + "learning_rate": 6.594455041735925e-07, + "loss": 0.1577, + "step": 2260 + }, + { + "epoch": 4.628454452405323, + "grad_norm": 0.2826367197666537, + "learning_rate": 6.521786593214075e-07, + "loss": 0.1694, + "step": 2261 + }, + { + "epoch": 4.63050153531218, + "grad_norm": 0.27183167715863765, + "learning_rate": 6.449514119054634e-07, + "loss": 0.1821, + "step": 2262 + }, + { + "epoch": 4.632548618219038, + "grad_norm": 0.2705799558624456, + "learning_rate": 6.377637767171152e-07, + "loss": 0.16, + "step": 2263 + }, + { + "epoch": 4.634595701125896, + "grad_norm": 0.312451093844958, + "learning_rate": 6.306157684666425e-07, + "loss": 0.146, + "step": 2264 + }, + { + "epoch": 4.6366427840327535, + "grad_norm": 0.30534769600917144, + "learning_rate": 6.235074017832299e-07, + "loss": 0.2026, + "step": 2265 + }, + { + "epoch": 4.638689866939611, + "grad_norm": 0.2749885611375992, + "learning_rate": 6.164386912149289e-07, + "loss": 0.149, + "step": 2266 + }, + { + "epoch": 4.640736949846469, + "grad_norm": 0.3043305615254814, + "learning_rate": 6.094096512286297e-07, + "loss": 0.1931, + "step": 2267 + }, + { + "epoch": 4.642784032753326, + "grad_norm": 0.2926074707796185, + "learning_rate": 6.024202962100312e-07, + "loss": 0.1704, + "step": 2268 + }, + { + "epoch": 4.6448311156601845, + "grad_norm": 0.27826474993716993, + "learning_rate": 5.954706404636179e-07, + "loss": 0.1868, + "step": 2269 + }, + { + "epoch": 4.646878198567042, + "grad_norm": 0.2676299230869566, + "learning_rate": 5.88560698212619e-07, + "loss": 0.1656, + "step": 2270 + }, + { + "epoch": 4.6489252814738995, + "grad_norm": 0.3004879943235446, + "learning_rate": 5.816904835989867e-07, + "loss": 0.1462, + "step": 2271 + }, + { + "epoch": 4.650972364380758, + "grad_norm": 0.28512914647917514, + "learning_rate": 5.748600106833735e-07, + "loss": 0.1784, + "step": 2272 + }, + { + "epoch": 4.653019447287615, + "grad_norm": 0.2810502748083855, + "learning_rate": 5.680692934450837e-07, + "loss": 0.2186, + "step": 2273 + }, + { + "epoch": 4.655066530194473, + "grad_norm": 0.2855042987462666, + "learning_rate": 5.613183457820714e-07, + "loss": 0.1911, + "step": 2274 + }, + { + "epoch": 4.65711361310133, + "grad_norm": 0.30832919924707025, + "learning_rate": 5.546071815108845e-07, + "loss": 0.1853, + "step": 2275 + }, + { + "epoch": 4.659160696008188, + "grad_norm": 0.27863470748148456, + "learning_rate": 5.479358143666602e-07, + "loss": 0.1446, + "step": 2276 + }, + { + "epoch": 4.661207778915046, + "grad_norm": 0.2954741773503375, + "learning_rate": 5.413042580030792e-07, + "loss": 0.1861, + "step": 2277 + }, + { + "epoch": 4.663254861821904, + "grad_norm": 0.292054066538448, + "learning_rate": 5.347125259923491e-07, + "loss": 0.1693, + "step": 2278 + }, + { + "epoch": 4.665301944728761, + "grad_norm": 0.2736497121658198, + "learning_rate": 5.281606318251764e-07, + "loss": 0.1548, + "step": 2279 + }, + { + "epoch": 4.667349027635619, + "grad_norm": 0.30678020029811015, + "learning_rate": 5.216485889107214e-07, + "loss": 0.1982, + "step": 2280 + }, + { + "epoch": 4.669396110542477, + "grad_norm": 0.3007630538627819, + "learning_rate": 5.151764105766011e-07, + "loss": 0.2082, + "step": 2281 + }, + { + "epoch": 4.671443193449335, + "grad_norm": 0.30472720971722367, + "learning_rate": 5.087441100688351e-07, + "loss": 0.1913, + "step": 2282 + }, + { + "epoch": 4.673490276356192, + "grad_norm": 0.28472393369780435, + "learning_rate": 5.023517005518264e-07, + "loss": 0.1795, + "step": 2283 + }, + { + "epoch": 4.67553735926305, + "grad_norm": 0.30834546215081027, + "learning_rate": 4.959991951083498e-07, + "loss": 0.1677, + "step": 2284 + }, + { + "epoch": 4.677584442169908, + "grad_norm": 0.2707613967004538, + "learning_rate": 4.89686606739499e-07, + "loss": 0.1885, + "step": 2285 + }, + { + "epoch": 4.679631525076766, + "grad_norm": 0.2925354294386155, + "learning_rate": 4.834139483646793e-07, + "loss": 0.1369, + "step": 2286 + }, + { + "epoch": 4.681678607983623, + "grad_norm": 0.295052171708121, + "learning_rate": 4.771812328215708e-07, + "loss": 0.1684, + "step": 2287 + }, + { + "epoch": 4.683725690890481, + "grad_norm": 0.2816846080734314, + "learning_rate": 4.709884728661118e-07, + "loss": 0.1634, + "step": 2288 + }, + { + "epoch": 4.685772773797339, + "grad_norm": 0.29537583958782165, + "learning_rate": 4.648356811724619e-07, + "loss": 0.1501, + "step": 2289 + }, + { + "epoch": 4.687819856704197, + "grad_norm": 0.2776645155887449, + "learning_rate": 4.587228703329838e-07, + "loss": 0.1731, + "step": 2290 + }, + { + "epoch": 4.689866939611054, + "grad_norm": 0.27540787792841026, + "learning_rate": 4.5265005285821674e-07, + "loss": 0.1688, + "step": 2291 + }, + { + "epoch": 4.691914022517912, + "grad_norm": 0.2774319968994503, + "learning_rate": 4.4661724117684545e-07, + "loss": 0.1736, + "step": 2292 + }, + { + "epoch": 4.69396110542477, + "grad_norm": 0.27404159260754235, + "learning_rate": 4.40624447635678e-07, + "loss": 0.1473, + "step": 2293 + }, + { + "epoch": 4.696008188331628, + "grad_norm": 0.2944590643348067, + "learning_rate": 4.346716844996279e-07, + "loss": 0.1594, + "step": 2294 + }, + { + "epoch": 4.698055271238485, + "grad_norm": 0.2881991440998457, + "learning_rate": 4.2875896395167427e-07, + "loss": 0.1988, + "step": 2295 + }, + { + "epoch": 4.700102354145343, + "grad_norm": 0.28026776497774936, + "learning_rate": 4.228862980928439e-07, + "loss": 0.1784, + "step": 2296 + }, + { + "epoch": 4.702149437052201, + "grad_norm": 0.2955331369567261, + "learning_rate": 4.1705369894219584e-07, + "loss": 0.1786, + "step": 2297 + }, + { + "epoch": 4.7041965199590585, + "grad_norm": 0.292319027121268, + "learning_rate": 4.112611784367837e-07, + "loss": 0.1677, + "step": 2298 + }, + { + "epoch": 4.706243602865916, + "grad_norm": 0.31348235722088247, + "learning_rate": 4.0550874843163337e-07, + "loss": 0.1796, + "step": 2299 + }, + { + "epoch": 4.7082906857727735, + "grad_norm": 0.2903513190722839, + "learning_rate": 3.997964206997207e-07, + "loss": 0.1804, + "step": 2300 + }, + { + "epoch": 4.710337768679631, + "grad_norm": 0.2879799305264078, + "learning_rate": 3.941242069319562e-07, + "loss": 0.1895, + "step": 2301 + }, + { + "epoch": 4.7123848515864895, + "grad_norm": 0.3577665248128591, + "learning_rate": 3.8849211873714266e-07, + "loss": 0.1765, + "step": 2302 + }, + { + "epoch": 4.714431934493347, + "grad_norm": 0.2903669946216332, + "learning_rate": 3.8290016764196637e-07, + "loss": 0.1716, + "step": 2303 + }, + { + "epoch": 4.7164790174002045, + "grad_norm": 0.3014373493296439, + "learning_rate": 3.7734836509096596e-07, + "loss": 0.1388, + "step": 2304 + }, + { + "epoch": 4.718526100307063, + "grad_norm": 0.3083437231349992, + "learning_rate": 3.7183672244652135e-07, + "loss": 0.1903, + "step": 2305 + }, + { + "epoch": 4.72057318321392, + "grad_norm": 0.2957154629258413, + "learning_rate": 3.663652509888027e-07, + "loss": 0.1718, + "step": 2306 + }, + { + "epoch": 4.722620266120778, + "grad_norm": 0.3046648653682266, + "learning_rate": 3.6093396191578366e-07, + "loss": 0.1979, + "step": 2307 + }, + { + "epoch": 4.724667349027635, + "grad_norm": 0.28017443242244167, + "learning_rate": 3.5554286634318814e-07, + "loss": 0.1728, + "step": 2308 + }, + { + "epoch": 4.726714431934493, + "grad_norm": 0.2748026640801257, + "learning_rate": 3.501919753044836e-07, + "loss": 0.2096, + "step": 2309 + }, + { + "epoch": 4.728761514841351, + "grad_norm": 0.2900862431852642, + "learning_rate": 3.448812997508588e-07, + "loss": 0.1655, + "step": 2310 + }, + { + "epoch": 4.730808597748209, + "grad_norm": 0.2833002934703294, + "learning_rate": 3.3961085055119083e-07, + "loss": 0.1527, + "step": 2311 + }, + { + "epoch": 4.732855680655066, + "grad_norm": 0.27909247839420187, + "learning_rate": 3.3438063849203116e-07, + "loss": 0.1449, + "step": 2312 + }, + { + "epoch": 4.734902763561925, + "grad_norm": 0.2997312028695653, + "learning_rate": 3.2919067427758186e-07, + "loss": 0.153, + "step": 2313 + }, + { + "epoch": 4.736949846468782, + "grad_norm": 0.2788661977637299, + "learning_rate": 3.2404096852967305e-07, + "loss": 0.1686, + "step": 2314 + }, + { + "epoch": 4.73899692937564, + "grad_norm": 0.2844108440502707, + "learning_rate": 3.189315317877428e-07, + "loss": 0.1575, + "step": 2315 + }, + { + "epoch": 4.741044012282497, + "grad_norm": 0.29542954551351697, + "learning_rate": 3.138623745088132e-07, + "loss": 0.1489, + "step": 2316 + }, + { + "epoch": 4.743091095189355, + "grad_norm": 0.2931309390733155, + "learning_rate": 3.0883350706746973e-07, + "loss": 0.1793, + "step": 2317 + }, + { + "epoch": 4.745138178096213, + "grad_norm": 0.274362618181376, + "learning_rate": 3.038449397558396e-07, + "loss": 0.1635, + "step": 2318 + }, + { + "epoch": 4.747185261003071, + "grad_norm": 0.27428372927016115, + "learning_rate": 2.9889668278357376e-07, + "loss": 0.1588, + "step": 2319 + }, + { + "epoch": 4.749232343909928, + "grad_norm": 0.29732674571188733, + "learning_rate": 2.9398874627782014e-07, + "loss": 0.1708, + "step": 2320 + }, + { + "epoch": 4.751279426816786, + "grad_norm": 0.3003256230614381, + "learning_rate": 2.891211402832128e-07, + "loss": 0.1725, + "step": 2321 + }, + { + "epoch": 4.753326509723644, + "grad_norm": 0.3084720969154894, + "learning_rate": 2.8429387476183624e-07, + "loss": 0.1483, + "step": 2322 + }, + { + "epoch": 4.755373592630502, + "grad_norm": 0.3036705097265483, + "learning_rate": 2.7950695959322093e-07, + "loss": 0.1623, + "step": 2323 + }, + { + "epoch": 4.757420675537359, + "grad_norm": 0.26881070505536797, + "learning_rate": 2.747604045743102e-07, + "loss": 0.171, + "step": 2324 + }, + { + "epoch": 4.759467758444217, + "grad_norm": 0.2880629649819299, + "learning_rate": 2.7005421941945555e-07, + "loss": 0.1646, + "step": 2325 + }, + { + "epoch": 4.761514841351075, + "grad_norm": 0.28081087918300657, + "learning_rate": 2.653884137603702e-07, + "loss": 0.1427, + "step": 2326 + }, + { + "epoch": 4.763561924257933, + "grad_norm": 0.29698506829638605, + "learning_rate": 2.6076299714614673e-07, + "loss": 0.1612, + "step": 2327 + }, + { + "epoch": 4.76560900716479, + "grad_norm": 0.2830061118167646, + "learning_rate": 2.5617797904320396e-07, + "loss": 0.1731, + "step": 2328 + }, + { + "epoch": 4.767656090071648, + "grad_norm": 0.3123860044528765, + "learning_rate": 2.516333688352801e-07, + "loss": 0.1561, + "step": 2329 + }, + { + "epoch": 4.769703172978506, + "grad_norm": 0.2798082098938455, + "learning_rate": 2.471291758234218e-07, + "loss": 0.1902, + "step": 2330 + }, + { + "epoch": 4.7717502558853635, + "grad_norm": 0.2987315382953632, + "learning_rate": 2.426654092259528e-07, + "loss": 0.1551, + "step": 2331 + }, + { + "epoch": 4.773797338792221, + "grad_norm": 0.2808063055566694, + "learning_rate": 2.382420781784589e-07, + "loss": 0.1749, + "step": 2332 + }, + { + "epoch": 4.7758444216990785, + "grad_norm": 0.2796699754953347, + "learning_rate": 2.338591917337696e-07, + "loss": 0.1727, + "step": 2333 + }, + { + "epoch": 4.777891504605937, + "grad_norm": 0.277747972615087, + "learning_rate": 2.295167588619518e-07, + "loss": 0.1507, + "step": 2334 + }, + { + "epoch": 4.779938587512794, + "grad_norm": 0.2987700878833016, + "learning_rate": 2.2521478845025867e-07, + "loss": 0.1798, + "step": 2335 + }, + { + "epoch": 4.781985670419652, + "grad_norm": 0.2728906304255605, + "learning_rate": 2.2095328930315184e-07, + "loss": 0.171, + "step": 2336 + }, + { + "epoch": 4.7840327533265095, + "grad_norm": 0.300885418994469, + "learning_rate": 2.167322701422525e-07, + "loss": 0.163, + "step": 2337 + }, + { + "epoch": 4.786079836233368, + "grad_norm": 0.2843769306835108, + "learning_rate": 2.1255173960634146e-07, + "loss": 0.1788, + "step": 2338 + }, + { + "epoch": 4.788126919140225, + "grad_norm": 0.29006108226073735, + "learning_rate": 2.08411706251328e-07, + "loss": 0.177, + "step": 2339 + }, + { + "epoch": 4.790174002047083, + "grad_norm": 0.28369183368200673, + "learning_rate": 2.0431217855025e-07, + "loss": 0.18, + "step": 2340 + }, + { + "epoch": 4.79222108495394, + "grad_norm": 0.28761647181705274, + "learning_rate": 2.0025316489323597e-07, + "loss": 0.1656, + "step": 2341 + }, + { + "epoch": 4.794268167860798, + "grad_norm": 0.2839547733000503, + "learning_rate": 1.9623467358750315e-07, + "loss": 0.1743, + "step": 2342 + }, + { + "epoch": 4.796315250767656, + "grad_norm": 0.2792802390584324, + "learning_rate": 1.9225671285733272e-07, + "loss": 0.1793, + "step": 2343 + }, + { + "epoch": 4.798362333674514, + "grad_norm": 0.2953632540852749, + "learning_rate": 1.8831929084406119e-07, + "loss": 0.1697, + "step": 2344 + }, + { + "epoch": 4.800409416581371, + "grad_norm": 0.26668806789326216, + "learning_rate": 1.8442241560604922e-07, + "loss": 0.1517, + "step": 2345 + }, + { + "epoch": 4.80245649948823, + "grad_norm": 0.27943039216113114, + "learning_rate": 1.8056609511868163e-07, + "loss": 0.2131, + "step": 2346 + }, + { + "epoch": 4.804503582395087, + "grad_norm": 0.26345026664491444, + "learning_rate": 1.7675033727434288e-07, + "loss": 0.1688, + "step": 2347 + }, + { + "epoch": 4.806550665301945, + "grad_norm": 0.28602587264323004, + "learning_rate": 1.7297514988239505e-07, + "loss": 0.1958, + "step": 2348 + }, + { + "epoch": 4.808597748208802, + "grad_norm": 0.27558338217328315, + "learning_rate": 1.692405406691755e-07, + "loss": 0.1796, + "step": 2349 + }, + { + "epoch": 4.81064483111566, + "grad_norm": 0.2966627143037965, + "learning_rate": 1.655465172779702e-07, + "loss": 0.192, + "step": 2350 + }, + { + "epoch": 4.812691914022518, + "grad_norm": 0.2944395579726434, + "learning_rate": 1.6189308726900277e-07, + "loss": 0.172, + "step": 2351 + }, + { + "epoch": 4.814738996929376, + "grad_norm": 0.2711839256534087, + "learning_rate": 1.5828025811941872e-07, + "loss": 0.1901, + "step": 2352 + }, + { + "epoch": 4.816786079836233, + "grad_norm": 0.29155015705326437, + "learning_rate": 1.547080372232679e-07, + "loss": 0.167, + "step": 2353 + }, + { + "epoch": 4.818833162743092, + "grad_norm": 0.2815618639741477, + "learning_rate": 1.5117643189149546e-07, + "loss": 0.1516, + "step": 2354 + }, + { + "epoch": 4.820880245649949, + "grad_norm": 0.29038125469870796, + "learning_rate": 1.4768544935191088e-07, + "loss": 0.1657, + "step": 2355 + }, + { + "epoch": 4.822927328556807, + "grad_norm": 0.2867465831992582, + "learning_rate": 1.44235096749199e-07, + "loss": 0.1824, + "step": 2356 + }, + { + "epoch": 4.824974411463664, + "grad_norm": 0.2708843561414753, + "learning_rate": 1.408253811448823e-07, + "loss": 0.1735, + "step": 2357 + }, + { + "epoch": 4.827021494370522, + "grad_norm": 0.2891677780067326, + "learning_rate": 1.374563095173187e-07, + "loss": 0.1594, + "step": 2358 + }, + { + "epoch": 4.82906857727738, + "grad_norm": 0.29721811114798363, + "learning_rate": 1.3412788876167925e-07, + "loss": 0.1681, + "step": 2359 + }, + { + "epoch": 4.8311156601842375, + "grad_norm": 0.2819368121687383, + "learning_rate": 1.3084012568994608e-07, + "loss": 0.2588, + "step": 2360 + }, + { + "epoch": 4.833162743091095, + "grad_norm": 0.2637703601334377, + "learning_rate": 1.2759302703088117e-07, + "loss": 0.1686, + "step": 2361 + }, + { + "epoch": 4.835209825997953, + "grad_norm": 0.2790376537691832, + "learning_rate": 1.2438659943003306e-07, + "loss": 0.1824, + "step": 2362 + }, + { + "epoch": 4.837256908904811, + "grad_norm": 0.2855413561087752, + "learning_rate": 1.212208494497036e-07, + "loss": 0.18, + "step": 2363 + }, + { + "epoch": 4.8393039918116685, + "grad_norm": 0.2966645113711715, + "learning_rate": 1.180957835689478e-07, + "loss": 0.1462, + "step": 2364 + }, + { + "epoch": 4.841351074718526, + "grad_norm": 0.28073303467220956, + "learning_rate": 1.1501140818355627e-07, + "loss": 0.2037, + "step": 2365 + }, + { + "epoch": 4.8433981576253835, + "grad_norm": 0.27323730338581254, + "learning_rate": 1.1196772960603952e-07, + "loss": 0.1552, + "step": 2366 + }, + { + "epoch": 4.845445240532242, + "grad_norm": 0.27979495089452416, + "learning_rate": 1.0896475406562135e-07, + "loss": 0.1911, + "step": 2367 + }, + { + "epoch": 4.847492323439099, + "grad_norm": 0.27315088792468245, + "learning_rate": 1.0600248770821886e-07, + "loss": 0.1945, + "step": 2368 + }, + { + "epoch": 4.849539406345957, + "grad_norm": 0.29508373388397574, + "learning_rate": 1.0308093659643582e-07, + "loss": 0.158, + "step": 2369 + }, + { + "epoch": 4.851586489252814, + "grad_norm": 0.2790657492666217, + "learning_rate": 1.0020010670954483e-07, + "loss": 0.1798, + "step": 2370 + }, + { + "epoch": 4.853633572159673, + "grad_norm": 0.2863828008417254, + "learning_rate": 9.736000394348299e-08, + "loss": 0.1688, + "step": 2371 + }, + { + "epoch": 4.85568065506653, + "grad_norm": 0.2837857995206189, + "learning_rate": 9.456063411082738e-08, + "loss": 0.1818, + "step": 2372 + }, + { + "epoch": 4.857727737973388, + "grad_norm": 0.26660018975441796, + "learning_rate": 9.180200294079955e-08, + "loss": 0.1681, + "step": 2373 + }, + { + "epoch": 4.859774820880245, + "grad_norm": 0.30489283035917186, + "learning_rate": 8.908411607923884e-08, + "loss": 0.1724, + "step": 2374 + }, + { + "epoch": 4.861821903787103, + "grad_norm": 0.27668845133173764, + "learning_rate": 8.640697908859575e-08, + "loss": 0.1871, + "step": 2375 + }, + { + "epoch": 4.863868986693961, + "grad_norm": 0.27941980307619757, + "learning_rate": 8.377059744792748e-08, + "loss": 0.1875, + "step": 2376 + }, + { + "epoch": 4.865916069600819, + "grad_norm": 0.2991779658323062, + "learning_rate": 8.117497655287798e-08, + "loss": 0.1607, + "step": 2377 + }, + { + "epoch": 4.867963152507676, + "grad_norm": 0.2960605235453732, + "learning_rate": 7.862012171566902e-08, + "loss": 0.2025, + "step": 2378 + }, + { + "epoch": 4.870010235414535, + "grad_norm": 0.2909596199436068, + "learning_rate": 7.61060381650891e-08, + "loss": 0.1976, + "step": 2379 + }, + { + "epoch": 4.872057318321392, + "grad_norm": 0.3022247341372479, + "learning_rate": 7.363273104648904e-08, + "loss": 0.1873, + "step": 2380 + }, + { + "epoch": 4.87410440122825, + "grad_norm": 0.27635276227359323, + "learning_rate": 7.120020542176198e-08, + "loss": 0.1815, + "step": 2381 + }, + { + "epoch": 4.876151484135107, + "grad_norm": 0.3235414628513566, + "learning_rate": 6.880846626933668e-08, + "loss": 0.1526, + "step": 2382 + }, + { + "epoch": 4.878198567041965, + "grad_norm": 0.3061161602814933, + "learning_rate": 6.645751848417093e-08, + "loss": 0.1672, + "step": 2383 + }, + { + "epoch": 4.880245649948823, + "grad_norm": 0.2857967006307856, + "learning_rate": 6.414736687773371e-08, + "loss": 0.1662, + "step": 2384 + }, + { + "epoch": 4.882292732855681, + "grad_norm": 0.27457282899975494, + "learning_rate": 6.187801617800748e-08, + "loss": 0.1564, + "step": 2385 + }, + { + "epoch": 4.884339815762538, + "grad_norm": 0.2990684370251101, + "learning_rate": 5.964947102946594e-08, + "loss": 0.193, + "step": 2386 + }, + { + "epoch": 4.886386898669397, + "grad_norm": 0.27143331999482234, + "learning_rate": 5.746173599307181e-08, + "loss": 0.172, + "step": 2387 + }, + { + "epoch": 4.888433981576254, + "grad_norm": 0.2720324730535623, + "learning_rate": 5.531481554626128e-08, + "loss": 0.1466, + "step": 2388 + }, + { + "epoch": 4.890481064483112, + "grad_norm": 0.29144216168390374, + "learning_rate": 5.320871408294403e-08, + "loss": 0.1622, + "step": 2389 + }, + { + "epoch": 4.892528147389969, + "grad_norm": 0.27957166407177925, + "learning_rate": 5.114343591348769e-08, + "loss": 0.1744, + "step": 2390 + }, + { + "epoch": 4.894575230296827, + "grad_norm": 0.2743372447650692, + "learning_rate": 4.9118985264711147e-08, + "loss": 0.1779, + "step": 2391 + }, + { + "epoch": 4.896622313203685, + "grad_norm": 0.2864112361999076, + "learning_rate": 4.713536627987347e-08, + "loss": 0.1783, + "step": 2392 + }, + { + "epoch": 4.8986693961105425, + "grad_norm": 0.2820613867682801, + "learning_rate": 4.519258301866947e-08, + "loss": 0.1764, + "step": 2393 + }, + { + "epoch": 4.9007164790174, + "grad_norm": 0.2790446443208781, + "learning_rate": 4.3290639457214125e-08, + "loss": 0.1983, + "step": 2394 + }, + { + "epoch": 4.9027635619242576, + "grad_norm": 0.3009212756955286, + "learning_rate": 4.1429539488047066e-08, + "loss": 0.1632, + "step": 2395 + }, + { + "epoch": 4.904810644831116, + "grad_norm": 0.3048977634842906, + "learning_rate": 3.960928692011257e-08, + "loss": 0.1775, + "step": 2396 + }, + { + "epoch": 4.9068577277379735, + "grad_norm": 0.2999700691804944, + "learning_rate": 3.7829885478757324e-08, + "loss": 0.1811, + "step": 2397 + }, + { + "epoch": 4.908904810644831, + "grad_norm": 0.2867226352655605, + "learning_rate": 3.6091338805719356e-08, + "loss": 0.1749, + "step": 2398 + }, + { + "epoch": 4.9109518935516885, + "grad_norm": 0.27853817331380126, + "learning_rate": 3.439365045912801e-08, + "loss": 0.1715, + "step": 2399 + }, + { + "epoch": 4.912998976458547, + "grad_norm": 0.26671396491728905, + "learning_rate": 3.273682391348398e-08, + "loss": 0.1736, + "step": 2400 + }, + { + "epoch": 4.915046059365404, + "grad_norm": 0.2981072341365868, + "learning_rate": 3.1120862559670396e-08, + "loss": 0.1608, + "step": 2401 + }, + { + "epoch": 4.917093142272262, + "grad_norm": 0.3006161506174685, + "learning_rate": 2.9545769704923954e-08, + "loss": 0.174, + "step": 2402 + }, + { + "epoch": 4.919140225179119, + "grad_norm": 0.2700128965804573, + "learning_rate": 2.8011548572846047e-08, + "loss": 0.1666, + "step": 2403 + }, + { + "epoch": 4.921187308085978, + "grad_norm": 0.27640788721099124, + "learning_rate": 2.651820230338942e-08, + "loss": 0.1786, + "step": 2404 + }, + { + "epoch": 4.923234390992835, + "grad_norm": 0.30195005723908613, + "learning_rate": 2.50657339528515e-08, + "loss": 0.1683, + "step": 2405 + }, + { + "epoch": 4.925281473899693, + "grad_norm": 0.27803519346079886, + "learning_rate": 2.365414649386555e-08, + "loss": 0.2196, + "step": 2406 + }, + { + "epoch": 4.92732855680655, + "grad_norm": 0.27873854091593087, + "learning_rate": 2.2283442815402845e-08, + "loss": 0.1772, + "step": 2407 + }, + { + "epoch": 4.929375639713409, + "grad_norm": 0.31272442814047646, + "learning_rate": 2.0953625722754943e-08, + "loss": 0.1917, + "step": 2408 + }, + { + "epoch": 4.931422722620266, + "grad_norm": 0.27074887610829274, + "learning_rate": 1.9664697937542554e-08, + "loss": 0.177, + "step": 2409 + }, + { + "epoch": 4.933469805527124, + "grad_norm": 0.2809859827116871, + "learning_rate": 1.8416662097693326e-08, + "loss": 0.1735, + "step": 2410 + }, + { + "epoch": 4.935516888433981, + "grad_norm": 0.2922562803880012, + "learning_rate": 1.720952075745075e-08, + "loss": 0.164, + "step": 2411 + }, + { + "epoch": 4.93756397134084, + "grad_norm": 0.27866762770170544, + "learning_rate": 1.604327638736525e-08, + "loss": 0.1509, + "step": 2412 + }, + { + "epoch": 4.939611054247697, + "grad_norm": 0.2757657750150201, + "learning_rate": 1.491793137427866e-08, + "loss": 0.1694, + "step": 2413 + }, + { + "epoch": 4.941658137154555, + "grad_norm": 0.3690880615237033, + "learning_rate": 1.3833488021335328e-08, + "loss": 0.1459, + "step": 2414 + }, + { + "epoch": 4.943705220061412, + "grad_norm": 0.28975968339512975, + "learning_rate": 1.2789948547968779e-08, + "loss": 0.1723, + "step": 2415 + }, + { + "epoch": 4.94575230296827, + "grad_norm": 0.2676510309996025, + "learning_rate": 1.1787315089895057e-08, + "loss": 0.1587, + "step": 2416 + }, + { + "epoch": 4.947799385875128, + "grad_norm": 0.29535503732393725, + "learning_rate": 1.0825589699112737e-08, + "loss": 0.1429, + "step": 2417 + }, + { + "epoch": 4.949846468781986, + "grad_norm": 0.2851419625396727, + "learning_rate": 9.904774343898471e-09, + "loss": 0.1529, + "step": 2418 + }, + { + "epoch": 4.951893551688843, + "grad_norm": 0.25772957787571077, + "learning_rate": 9.024870908802552e-09, + "loss": 0.187, + "step": 2419 + }, + { + "epoch": 4.9539406345957016, + "grad_norm": 0.2900373956230706, + "learning_rate": 8.185881194644474e-09, + "loss": 0.1541, + "step": 2420 + }, + { + "epoch": 4.955987717502559, + "grad_norm": 0.291463457270851, + "learning_rate": 7.387806918508489e-09, + "loss": 0.1585, + "step": 2421 + }, + { + "epoch": 4.958034800409417, + "grad_norm": 0.2796933717518039, + "learning_rate": 6.630649713739168e-09, + "loss": 0.2045, + "step": 2422 + }, + { + "epoch": 4.960081883316274, + "grad_norm": 0.2809689443794419, + "learning_rate": 5.9144111299414e-09, + "loss": 0.169, + "step": 2423 + }, + { + "epoch": 4.962128966223132, + "grad_norm": 0.3074472408506574, + "learning_rate": 5.239092632980391e-09, + "loss": 0.1593, + "step": 2424 + }, + { + "epoch": 4.96417604912999, + "grad_norm": 0.28551605159632193, + "learning_rate": 4.6046956049639045e-09, + "loss": 0.1518, + "step": 2425 + }, + { + "epoch": 4.9662231320368475, + "grad_norm": 0.2889141054976907, + "learning_rate": 4.011221344257799e-09, + "loss": 0.1781, + "step": 2426 + }, + { + "epoch": 4.968270214943705, + "grad_norm": 0.25486448368902137, + "learning_rate": 3.4586710654727074e-09, + "loss": 0.1688, + "step": 2427 + }, + { + "epoch": 4.970317297850563, + "grad_norm": 0.3024410370860751, + "learning_rate": 2.94704589946182e-09, + "loss": 0.2105, + "step": 2428 + }, + { + "epoch": 4.972364380757421, + "grad_norm": 0.29484058319269135, + "learning_rate": 2.4763468933231005e-09, + "loss": 0.178, + "step": 2429 + }, + { + "epoch": 4.974411463664278, + "grad_norm": 0.28427681285996675, + "learning_rate": 2.0465750103926263e-09, + "loss": 0.2151, + "step": 2430 + }, + { + "epoch": 4.976458546571136, + "grad_norm": 0.2758137593263903, + "learning_rate": 1.657731130246809e-09, + "loss": 0.1934, + "step": 2431 + }, + { + "epoch": 4.9785056294779935, + "grad_norm": 0.3024473446528619, + "learning_rate": 1.309816048697954e-09, + "loss": 0.1732, + "step": 2432 + }, + { + "epoch": 4.980552712384852, + "grad_norm": 0.26746264280387694, + "learning_rate": 1.0028304777875975e-09, + "loss": 0.1807, + "step": 2433 + }, + { + "epoch": 4.982599795291709, + "grad_norm": 0.2689810857541332, + "learning_rate": 7.367750458020518e-10, + "loss": 0.1713, + "step": 2434 + }, + { + "epoch": 4.984646878198567, + "grad_norm": 0.30444153746664904, + "learning_rate": 5.116502972479787e-10, + "loss": 0.172, + "step": 2435 + }, + { + "epoch": 4.986693961105424, + "grad_norm": 0.2724209053996081, + "learning_rate": 3.2745669287237435e-10, + "loss": 0.1994, + "step": 2436 + }, + { + "epoch": 4.988741044012283, + "grad_norm": 0.2666986989444724, + "learning_rate": 1.8419460964258505e-10, + "loss": 0.1735, + "step": 2437 + }, + { + "epoch": 4.99078812691914, + "grad_norm": 0.2846154192735168, + "learning_rate": 8.186434076185024e-11, + "loss": 0.1539, + "step": 2438 + }, + { + "epoch": 4.992835209825998, + "grad_norm": 0.28158014093768946, + "learning_rate": 2.046609566264124e-11, + "loss": 0.1756, + "step": 2439 + }, + { + "epoch": 4.994882292732855, + "grad_norm": 0.3150094705291199, + "learning_rate": 0.0, + "loss": 0.2073, + "step": 2440 + }, + { + "epoch": 4.994882292732855, + "step": 2440, + "total_flos": 2619216084533248.0, + "train_loss": 0.34357271391715183, + "train_runtime": 45861.8772, + "train_samples_per_second": 6.814, + "train_steps_per_second": 0.053 + } + ], + "logging_steps": 1, + "max_steps": 2440, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2619216084533248.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}