diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,7033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.023323139177917653, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 2.332313917791765e-05, + "grad_norm": 1.6235620975494385, + "learning_rate": 5.182689816014512e-09, + "loss": 1.9275, + "step": 1 + }, + { + "epoch": 4.66462783558353e-05, + "grad_norm": 1.5710082054138184, + "learning_rate": 1.0365379632029025e-08, + "loss": 1.5593, + "step": 2 + }, + { + "epoch": 6.996941753375295e-05, + "grad_norm": 2.3231985569000244, + "learning_rate": 1.5548069448043534e-08, + "loss": 2.0021, + "step": 3 + }, + { + "epoch": 9.32925567116706e-05, + "grad_norm": 1.8349288702011108, + "learning_rate": 2.073075926405805e-08, + "loss": 2.1141, + "step": 4 + }, + { + "epoch": 0.00011661569588958826, + "grad_norm": 2.039928436279297, + "learning_rate": 2.5913449080072562e-08, + "loss": 1.9361, + "step": 5 + }, + { + "epoch": 0.0001399388350675059, + "grad_norm": 1.8988783359527588, + "learning_rate": 3.109613889608707e-08, + "loss": 2.2441, + "step": 6 + }, + { + "epoch": 0.00016326197424542356, + "grad_norm": 1.4865813255310059, + "learning_rate": 3.6278828712101586e-08, + "loss": 1.8118, + "step": 7 + }, + { + "epoch": 0.0001865851134233412, + "grad_norm": 1.4033368825912476, + "learning_rate": 4.14615185281161e-08, + "loss": 1.8838, + "step": 8 + }, + { + "epoch": 0.00020990825260125886, + "grad_norm": 1.876894235610962, + "learning_rate": 4.6644208344130604e-08, + "loss": 1.9916, + "step": 9 + }, + { + "epoch": 0.00023323139177917651, + "grad_norm": 2.4104366302490234, + "learning_rate": 5.1826898160145123e-08, + "loss": 1.8618, + "step": 10 + }, + { + "epoch": 0.0002565545309570942, + "grad_norm": 1.8457229137420654, + "learning_rate": 5.700958797615963e-08, + "loss": 1.7303, + "step": 11 + }, + { + "epoch": 0.0002798776701350118, + "grad_norm": 1.940317988395691, + "learning_rate": 6.219227779217413e-08, + "loss": 2.2692, + "step": 12 + }, + { + "epoch": 0.0003032008093129295, + "grad_norm": 2.455432891845703, + "learning_rate": 6.737496760818865e-08, + "loss": 2.3401, + "step": 13 + }, + { + "epoch": 0.0003265239484908471, + "grad_norm": 1.5163850784301758, + "learning_rate": 7.255765742420317e-08, + "loss": 2.1687, + "step": 14 + }, + { + "epoch": 0.0003498470876687648, + "grad_norm": 1.3012642860412598, + "learning_rate": 7.774034724021768e-08, + "loss": 1.8693, + "step": 15 + }, + { + "epoch": 0.0003731702268466824, + "grad_norm": 2.0896522998809814, + "learning_rate": 8.29230370562322e-08, + "loss": 1.7031, + "step": 16 + }, + { + "epoch": 0.0003964933660246001, + "grad_norm": 1.7818728685379028, + "learning_rate": 8.810572687224672e-08, + "loss": 2.0829, + "step": 17 + }, + { + "epoch": 0.0004198165052025177, + "grad_norm": 2.569828510284424, + "learning_rate": 9.328841668826121e-08, + "loss": 1.8998, + "step": 18 + }, + { + "epoch": 0.0004431396443804354, + "grad_norm": 1.4619100093841553, + "learning_rate": 9.847110650427573e-08, + "loss": 1.5964, + "step": 19 + }, + { + "epoch": 0.00046646278355835303, + "grad_norm": 1.9832793474197388, + "learning_rate": 1.0365379632029025e-07, + "loss": 1.9292, + "step": 20 + }, + { + "epoch": 0.0004897859227362707, + "grad_norm": 2.0182175636291504, + "learning_rate": 1.0883648613630475e-07, + "loss": 2.0115, + "step": 21 + }, + { + "epoch": 0.0005131090619141884, + "grad_norm": 1.4642307758331299, + "learning_rate": 1.1401917595231926e-07, + "loss": 2.0291, + "step": 22 + }, + { + "epoch": 0.000536432201092106, + "grad_norm": 2.887909173965454, + "learning_rate": 1.1920186576833378e-07, + "loss": 2.1946, + "step": 23 + }, + { + "epoch": 0.0005597553402700236, + "grad_norm": 1.595544457435608, + "learning_rate": 1.2438455558434827e-07, + "loss": 2.0246, + "step": 24 + }, + { + "epoch": 0.0005830784794479413, + "grad_norm": 1.5648566484451294, + "learning_rate": 1.295672454003628e-07, + "loss": 2.1832, + "step": 25 + }, + { + "epoch": 0.000606401618625859, + "grad_norm": 1.4702372550964355, + "learning_rate": 1.347499352163773e-07, + "loss": 1.6395, + "step": 26 + }, + { + "epoch": 0.0006297247578037766, + "grad_norm": 1.7178195714950562, + "learning_rate": 1.399326250323918e-07, + "loss": 1.6264, + "step": 27 + }, + { + "epoch": 0.0006530478969816942, + "grad_norm": 2.1751515865325928, + "learning_rate": 1.4511531484840635e-07, + "loss": 2.511, + "step": 28 + }, + { + "epoch": 0.0006763710361596119, + "grad_norm": 2.9443299770355225, + "learning_rate": 1.5029800466442085e-07, + "loss": 2.229, + "step": 29 + }, + { + "epoch": 0.0006996941753375296, + "grad_norm": 1.8316481113433838, + "learning_rate": 1.5548069448043536e-07, + "loss": 1.8414, + "step": 30 + }, + { + "epoch": 0.0007230173145154472, + "grad_norm": 1.9659239053726196, + "learning_rate": 1.6066338429644986e-07, + "loss": 2.0109, + "step": 31 + }, + { + "epoch": 0.0007463404536933648, + "grad_norm": 2.1653449535369873, + "learning_rate": 1.658460741124644e-07, + "loss": 2.0155, + "step": 32 + }, + { + "epoch": 0.0007696635928712825, + "grad_norm": 1.8755710124969482, + "learning_rate": 1.710287639284789e-07, + "loss": 2.1105, + "step": 33 + }, + { + "epoch": 0.0007929867320492002, + "grad_norm": 1.5989196300506592, + "learning_rate": 1.7621145374449343e-07, + "loss": 2.1583, + "step": 34 + }, + { + "epoch": 0.0008163098712271178, + "grad_norm": 1.865307331085205, + "learning_rate": 1.813941435605079e-07, + "loss": 2.001, + "step": 35 + }, + { + "epoch": 0.0008396330104050355, + "grad_norm": 1.4584789276123047, + "learning_rate": 1.8657683337652242e-07, + "loss": 1.8854, + "step": 36 + }, + { + "epoch": 0.0008629561495829531, + "grad_norm": 2.6818912029266357, + "learning_rate": 1.9175952319253695e-07, + "loss": 2.1888, + "step": 37 + }, + { + "epoch": 0.0008862792887608708, + "grad_norm": 2.17561674118042, + "learning_rate": 1.9694221300855146e-07, + "loss": 1.9616, + "step": 38 + }, + { + "epoch": 0.0009096024279387884, + "grad_norm": 1.252475619316101, + "learning_rate": 2.02124902824566e-07, + "loss": 1.9585, + "step": 39 + }, + { + "epoch": 0.0009329255671167061, + "grad_norm": 1.884366750717163, + "learning_rate": 2.073075926405805e-07, + "loss": 2.2436, + "step": 40 + }, + { + "epoch": 0.0009562487062946237, + "grad_norm": 1.4951350688934326, + "learning_rate": 2.1249028245659497e-07, + "loss": 1.7149, + "step": 41 + }, + { + "epoch": 0.0009795718454725414, + "grad_norm": 1.891728162765503, + "learning_rate": 2.176729722726095e-07, + "loss": 2.0472, + "step": 42 + }, + { + "epoch": 0.001002894984650459, + "grad_norm": 1.8992432355880737, + "learning_rate": 2.22855662088624e-07, + "loss": 2.1471, + "step": 43 + }, + { + "epoch": 0.0010262181238283768, + "grad_norm": 1.3931283950805664, + "learning_rate": 2.2803835190463852e-07, + "loss": 1.5292, + "step": 44 + }, + { + "epoch": 0.0010495412630062942, + "grad_norm": 1.8894548416137695, + "learning_rate": 2.3322104172065305e-07, + "loss": 1.7759, + "step": 45 + }, + { + "epoch": 0.001072864402184212, + "grad_norm": 1.592050552368164, + "learning_rate": 2.3840373153666755e-07, + "loss": 2.2498, + "step": 46 + }, + { + "epoch": 0.0010961875413621296, + "grad_norm": 1.3746178150177002, + "learning_rate": 2.4358642135268203e-07, + "loss": 1.8503, + "step": 47 + }, + { + "epoch": 0.0011195106805400473, + "grad_norm": 2.0268595218658447, + "learning_rate": 2.4876911116869654e-07, + "loss": 1.9358, + "step": 48 + }, + { + "epoch": 0.001142833819717965, + "grad_norm": 1.7836228609085083, + "learning_rate": 2.539518009847111e-07, + "loss": 1.9855, + "step": 49 + }, + { + "epoch": 0.0011661569588958826, + "grad_norm": 1.829447627067566, + "learning_rate": 2.591344908007256e-07, + "loss": 2.2802, + "step": 50 + }, + { + "epoch": 0.0011894800980738003, + "grad_norm": 2.2813496589660645, + "learning_rate": 2.643171806167401e-07, + "loss": 2.1593, + "step": 51 + }, + { + "epoch": 0.001212803237251718, + "grad_norm": 3.019044876098633, + "learning_rate": 2.694998704327546e-07, + "loss": 1.9534, + "step": 52 + }, + { + "epoch": 0.0012361263764296354, + "grad_norm": 2.011425256729126, + "learning_rate": 2.746825602487691e-07, + "loss": 2.1284, + "step": 53 + }, + { + "epoch": 0.0012594495156075531, + "grad_norm": 2.207106590270996, + "learning_rate": 2.798652500647836e-07, + "loss": 2.2427, + "step": 54 + }, + { + "epoch": 0.0012827726547854708, + "grad_norm": 1.3172473907470703, + "learning_rate": 2.8504793988079813e-07, + "loss": 1.9782, + "step": 55 + }, + { + "epoch": 0.0013060957939633885, + "grad_norm": 1.522895097732544, + "learning_rate": 2.902306296968127e-07, + "loss": 1.9455, + "step": 56 + }, + { + "epoch": 0.0013294189331413062, + "grad_norm": 2.657248020172119, + "learning_rate": 2.954133195128272e-07, + "loss": 1.959, + "step": 57 + }, + { + "epoch": 0.0013527420723192238, + "grad_norm": 1.9738789796829224, + "learning_rate": 3.005960093288417e-07, + "loss": 1.7878, + "step": 58 + }, + { + "epoch": 0.0013760652114971415, + "grad_norm": 1.5549254417419434, + "learning_rate": 3.057786991448562e-07, + "loss": 1.9405, + "step": 59 + }, + { + "epoch": 0.0013993883506750592, + "grad_norm": 2.9688899517059326, + "learning_rate": 3.109613889608707e-07, + "loss": 1.9969, + "step": 60 + }, + { + "epoch": 0.0014227114898529767, + "grad_norm": 1.4602586030960083, + "learning_rate": 3.1614407877688527e-07, + "loss": 1.9339, + "step": 61 + }, + { + "epoch": 0.0014460346290308943, + "grad_norm": 2.4017045497894287, + "learning_rate": 3.213267685928997e-07, + "loss": 2.0842, + "step": 62 + }, + { + "epoch": 0.001469357768208812, + "grad_norm": 1.7433497905731201, + "learning_rate": 3.2650945840891423e-07, + "loss": 2.0223, + "step": 63 + }, + { + "epoch": 0.0014926809073867297, + "grad_norm": 1.7395591735839844, + "learning_rate": 3.316921482249288e-07, + "loss": 1.9257, + "step": 64 + }, + { + "epoch": 0.0015160040465646474, + "grad_norm": 1.8336257934570312, + "learning_rate": 3.3687483804094324e-07, + "loss": 1.948, + "step": 65 + }, + { + "epoch": 0.001539327185742565, + "grad_norm": 1.6493985652923584, + "learning_rate": 3.420575278569578e-07, + "loss": 1.8672, + "step": 66 + }, + { + "epoch": 0.0015626503249204827, + "grad_norm": 1.5789337158203125, + "learning_rate": 3.472402176729723e-07, + "loss": 1.9446, + "step": 67 + }, + { + "epoch": 0.0015859734640984004, + "grad_norm": 1.3755509853363037, + "learning_rate": 3.5242290748898686e-07, + "loss": 2.1796, + "step": 68 + }, + { + "epoch": 0.001609296603276318, + "grad_norm": 1.7978087663650513, + "learning_rate": 3.576055973050013e-07, + "loss": 1.8974, + "step": 69 + }, + { + "epoch": 0.0016326197424542355, + "grad_norm": 1.8888216018676758, + "learning_rate": 3.627882871210158e-07, + "loss": 1.915, + "step": 70 + }, + { + "epoch": 0.0016559428816321532, + "grad_norm": 2.6150593757629395, + "learning_rate": 3.679709769370304e-07, + "loss": 2.2133, + "step": 71 + }, + { + "epoch": 0.001679266020810071, + "grad_norm": 1.7009005546569824, + "learning_rate": 3.7315366675304483e-07, + "loss": 2.1024, + "step": 72 + }, + { + "epoch": 0.0017025891599879886, + "grad_norm": 1.741734266281128, + "learning_rate": 3.783363565690594e-07, + "loss": 2.1839, + "step": 73 + }, + { + "epoch": 0.0017259122991659063, + "grad_norm": 2.7715041637420654, + "learning_rate": 3.835190463850739e-07, + "loss": 2.0734, + "step": 74 + }, + { + "epoch": 0.001749235438343824, + "grad_norm": 1.9710502624511719, + "learning_rate": 3.8870173620108835e-07, + "loss": 2.18, + "step": 75 + }, + { + "epoch": 0.0017725585775217416, + "grad_norm": 2.077986478805542, + "learning_rate": 3.938844260171029e-07, + "loss": 2.1482, + "step": 76 + }, + { + "epoch": 0.0017958817166996593, + "grad_norm": 2.583721160888672, + "learning_rate": 3.990671158331174e-07, + "loss": 2.5364, + "step": 77 + }, + { + "epoch": 0.0018192048558775768, + "grad_norm": 1.3425930738449097, + "learning_rate": 4.04249805649132e-07, + "loss": 1.8194, + "step": 78 + }, + { + "epoch": 0.0018425279950554944, + "grad_norm": 2.1111888885498047, + "learning_rate": 4.0943249546514643e-07, + "loss": 1.7878, + "step": 79 + }, + { + "epoch": 0.0018658511342334121, + "grad_norm": 2.0795626640319824, + "learning_rate": 4.14615185281161e-07, + "loss": 2.3006, + "step": 80 + }, + { + "epoch": 0.0018891742734113298, + "grad_norm": 1.273370623588562, + "learning_rate": 4.197978750971755e-07, + "loss": 1.7599, + "step": 81 + }, + { + "epoch": 0.0019124974125892475, + "grad_norm": 1.6202706098556519, + "learning_rate": 4.2498056491318994e-07, + "loss": 2.1727, + "step": 82 + }, + { + "epoch": 0.0019358205517671651, + "grad_norm": 2.4593732357025146, + "learning_rate": 4.301632547292045e-07, + "loss": 2.4588, + "step": 83 + }, + { + "epoch": 0.001959143690945083, + "grad_norm": 1.2617835998535156, + "learning_rate": 4.35345944545219e-07, + "loss": 1.9078, + "step": 84 + }, + { + "epoch": 0.0019824668301230003, + "grad_norm": 2.2640504837036133, + "learning_rate": 4.405286343612335e-07, + "loss": 1.8983, + "step": 85 + }, + { + "epoch": 0.002005789969300918, + "grad_norm": 1.6804454326629639, + "learning_rate": 4.45711324177248e-07, + "loss": 2.1049, + "step": 86 + }, + { + "epoch": 0.0020291131084788356, + "grad_norm": 2.060009717941284, + "learning_rate": 4.5089401399326253e-07, + "loss": 2.0153, + "step": 87 + }, + { + "epoch": 0.0020524362476567535, + "grad_norm": 1.7166160345077515, + "learning_rate": 4.5607670380927703e-07, + "loss": 2.1093, + "step": 88 + }, + { + "epoch": 0.002075759386834671, + "grad_norm": 1.6695979833602905, + "learning_rate": 4.6125939362529154e-07, + "loss": 1.8607, + "step": 89 + }, + { + "epoch": 0.0020990825260125885, + "grad_norm": 1.4339056015014648, + "learning_rate": 4.664420834413061e-07, + "loss": 2.2632, + "step": 90 + }, + { + "epoch": 0.0021224056651905064, + "grad_norm": 1.5228222608566284, + "learning_rate": 4.7162477325732055e-07, + "loss": 2.0851, + "step": 91 + }, + { + "epoch": 0.002145728804368424, + "grad_norm": 1.540848731994629, + "learning_rate": 4.768074630733351e-07, + "loss": 2.1446, + "step": 92 + }, + { + "epoch": 0.0021690519435463417, + "grad_norm": 1.480702519416809, + "learning_rate": 4.819901528893496e-07, + "loss": 2.0718, + "step": 93 + }, + { + "epoch": 0.002192375082724259, + "grad_norm": 2.23518705368042, + "learning_rate": 4.871728427053641e-07, + "loss": 1.6198, + "step": 94 + }, + { + "epoch": 0.002215698221902177, + "grad_norm": 1.6477755308151245, + "learning_rate": 4.923555325213786e-07, + "loss": 2.1136, + "step": 95 + }, + { + "epoch": 0.0022390213610800945, + "grad_norm": 1.9548614025115967, + "learning_rate": 4.975382223373931e-07, + "loss": 1.9143, + "step": 96 + }, + { + "epoch": 0.0022623445002580124, + "grad_norm": 1.3557407855987549, + "learning_rate": 5.027209121534076e-07, + "loss": 2.0044, + "step": 97 + }, + { + "epoch": 0.00228566763943593, + "grad_norm": 2.2781455516815186, + "learning_rate": 5.079036019694222e-07, + "loss": 1.7761, + "step": 98 + }, + { + "epoch": 0.0023089907786138474, + "grad_norm": 2.1195600032806396, + "learning_rate": 5.130862917854368e-07, + "loss": 1.8174, + "step": 99 + }, + { + "epoch": 0.0023323139177917653, + "grad_norm": 2.0798068046569824, + "learning_rate": 5.182689816014512e-07, + "loss": 2.1431, + "step": 100 + }, + { + "epoch": 0.0023556370569696827, + "grad_norm": 1.8773006200790405, + "learning_rate": 5.234516714174657e-07, + "loss": 1.5221, + "step": 101 + }, + { + "epoch": 0.0023789601961476006, + "grad_norm": 1.7917876243591309, + "learning_rate": 5.286343612334802e-07, + "loss": 1.9383, + "step": 102 + }, + { + "epoch": 0.002402283335325518, + "grad_norm": 1.4980329275131226, + "learning_rate": 5.338170510494947e-07, + "loss": 1.846, + "step": 103 + }, + { + "epoch": 0.002425606474503436, + "grad_norm": 2.0081095695495605, + "learning_rate": 5.389997408655092e-07, + "loss": 1.8777, + "step": 104 + }, + { + "epoch": 0.0024489296136813534, + "grad_norm": 1.525317907333374, + "learning_rate": 5.441824306815238e-07, + "loss": 1.971, + "step": 105 + }, + { + "epoch": 0.002472252752859271, + "grad_norm": 1.4131786823272705, + "learning_rate": 5.493651204975382e-07, + "loss": 2.2224, + "step": 106 + }, + { + "epoch": 0.002495575892037189, + "grad_norm": 1.164492130279541, + "learning_rate": 5.545478103135528e-07, + "loss": 1.8909, + "step": 107 + }, + { + "epoch": 0.0025188990312151062, + "grad_norm": 1.9998016357421875, + "learning_rate": 5.597305001295673e-07, + "loss": 2.1197, + "step": 108 + }, + { + "epoch": 0.002542222170393024, + "grad_norm": 1.6218236684799194, + "learning_rate": 5.649131899455818e-07, + "loss": 1.7799, + "step": 109 + }, + { + "epoch": 0.0025655453095709416, + "grad_norm": 1.535388708114624, + "learning_rate": 5.700958797615963e-07, + "loss": 1.7878, + "step": 110 + }, + { + "epoch": 0.0025888684487488595, + "grad_norm": 1.4929994344711304, + "learning_rate": 5.752785695776108e-07, + "loss": 2.0802, + "step": 111 + }, + { + "epoch": 0.002612191587926777, + "grad_norm": 2.183293104171753, + "learning_rate": 5.804612593936254e-07, + "loss": 2.0506, + "step": 112 + }, + { + "epoch": 0.002635514727104695, + "grad_norm": 1.6339191198349, + "learning_rate": 5.856439492096398e-07, + "loss": 1.7152, + "step": 113 + }, + { + "epoch": 0.0026588378662826123, + "grad_norm": 1.4886974096298218, + "learning_rate": 5.908266390256544e-07, + "loss": 1.8327, + "step": 114 + }, + { + "epoch": 0.0026821610054605298, + "grad_norm": 1.4198302030563354, + "learning_rate": 5.960093288416688e-07, + "loss": 1.8342, + "step": 115 + }, + { + "epoch": 0.0027054841446384477, + "grad_norm": 2.041900157928467, + "learning_rate": 6.011920186576834e-07, + "loss": 1.9101, + "step": 116 + }, + { + "epoch": 0.002728807283816365, + "grad_norm": 1.7576725482940674, + "learning_rate": 6.063747084736979e-07, + "loss": 2.3793, + "step": 117 + }, + { + "epoch": 0.002752130422994283, + "grad_norm": 1.620440125465393, + "learning_rate": 6.115573982897124e-07, + "loss": 1.7363, + "step": 118 + }, + { + "epoch": 0.0027754535621722005, + "grad_norm": 1.972102403640747, + "learning_rate": 6.16740088105727e-07, + "loss": 2.0338, + "step": 119 + }, + { + "epoch": 0.0027987767013501184, + "grad_norm": 1.5385342836380005, + "learning_rate": 6.219227779217414e-07, + "loss": 1.829, + "step": 120 + }, + { + "epoch": 0.002822099840528036, + "grad_norm": 1.4439769983291626, + "learning_rate": 6.27105467737756e-07, + "loss": 1.9893, + "step": 121 + }, + { + "epoch": 0.0028454229797059533, + "grad_norm": 1.5146026611328125, + "learning_rate": 6.322881575537705e-07, + "loss": 1.6563, + "step": 122 + }, + { + "epoch": 0.002868746118883871, + "grad_norm": 1.7177401781082153, + "learning_rate": 6.374708473697849e-07, + "loss": 1.9483, + "step": 123 + }, + { + "epoch": 0.0028920692580617887, + "grad_norm": 2.484865188598633, + "learning_rate": 6.426535371857994e-07, + "loss": 2.0949, + "step": 124 + }, + { + "epoch": 0.0029153923972397066, + "grad_norm": 1.5320651531219482, + "learning_rate": 6.47836227001814e-07, + "loss": 1.8557, + "step": 125 + }, + { + "epoch": 0.002938715536417624, + "grad_norm": 1.3804417848587036, + "learning_rate": 6.530189168178285e-07, + "loss": 1.8733, + "step": 126 + }, + { + "epoch": 0.002962038675595542, + "grad_norm": 2.0832831859588623, + "learning_rate": 6.58201606633843e-07, + "loss": 1.8556, + "step": 127 + }, + { + "epoch": 0.0029853618147734594, + "grad_norm": 1.2582931518554688, + "learning_rate": 6.633842964498576e-07, + "loss": 2.1239, + "step": 128 + }, + { + "epoch": 0.0030086849539513773, + "grad_norm": 1.6449629068374634, + "learning_rate": 6.685669862658721e-07, + "loss": 2.1635, + "step": 129 + }, + { + "epoch": 0.0030320080931292947, + "grad_norm": 1.3350502252578735, + "learning_rate": 6.737496760818865e-07, + "loss": 1.801, + "step": 130 + }, + { + "epoch": 0.003055331232307212, + "grad_norm": 1.7689651250839233, + "learning_rate": 6.78932365897901e-07, + "loss": 1.7541, + "step": 131 + }, + { + "epoch": 0.00307865437148513, + "grad_norm": 1.4711276292800903, + "learning_rate": 6.841150557139156e-07, + "loss": 2.3916, + "step": 132 + }, + { + "epoch": 0.0031019775106630476, + "grad_norm": 1.2806516885757446, + "learning_rate": 6.892977455299301e-07, + "loss": 1.8609, + "step": 133 + }, + { + "epoch": 0.0031253006498409655, + "grad_norm": 1.5531939268112183, + "learning_rate": 6.944804353459446e-07, + "loss": 1.7721, + "step": 134 + }, + { + "epoch": 0.003148623789018883, + "grad_norm": 1.6541032791137695, + "learning_rate": 6.996631251619592e-07, + "loss": 2.1091, + "step": 135 + }, + { + "epoch": 0.003171946928196801, + "grad_norm": 2.050734281539917, + "learning_rate": 7.048458149779737e-07, + "loss": 1.8932, + "step": 136 + }, + { + "epoch": 0.0031952700673747183, + "grad_norm": 1.2903157472610474, + "learning_rate": 7.100285047939881e-07, + "loss": 2.0833, + "step": 137 + }, + { + "epoch": 0.003218593206552636, + "grad_norm": 1.3316091299057007, + "learning_rate": 7.152111946100026e-07, + "loss": 1.9307, + "step": 138 + }, + { + "epoch": 0.0032419163457305536, + "grad_norm": 1.441341519355774, + "learning_rate": 7.203938844260172e-07, + "loss": 2.2529, + "step": 139 + }, + { + "epoch": 0.003265239484908471, + "grad_norm": 2.159276008605957, + "learning_rate": 7.255765742420316e-07, + "loss": 1.847, + "step": 140 + }, + { + "epoch": 0.003288562624086389, + "grad_norm": 1.8410853147506714, + "learning_rate": 7.307592640580462e-07, + "loss": 2.2465, + "step": 141 + }, + { + "epoch": 0.0033118857632643064, + "grad_norm": 1.8678739070892334, + "learning_rate": 7.359419538740608e-07, + "loss": 1.9261, + "step": 142 + }, + { + "epoch": 0.0033352089024422243, + "grad_norm": 1.2097922563552856, + "learning_rate": 7.411246436900751e-07, + "loss": 2.0205, + "step": 143 + }, + { + "epoch": 0.003358532041620142, + "grad_norm": 1.733077883720398, + "learning_rate": 7.463073335060897e-07, + "loss": 1.8389, + "step": 144 + }, + { + "epoch": 0.0033818551807980597, + "grad_norm": 1.7118474245071411, + "learning_rate": 7.514900233221042e-07, + "loss": 1.9511, + "step": 145 + }, + { + "epoch": 0.003405178319975977, + "grad_norm": 1.6960872411727905, + "learning_rate": 7.566727131381188e-07, + "loss": 1.8828, + "step": 146 + }, + { + "epoch": 0.0034285014591538946, + "grad_norm": 1.2409390211105347, + "learning_rate": 7.618554029541332e-07, + "loss": 1.6878, + "step": 147 + }, + { + "epoch": 0.0034518245983318125, + "grad_norm": 1.3440965414047241, + "learning_rate": 7.670380927701478e-07, + "loss": 1.64, + "step": 148 + }, + { + "epoch": 0.00347514773750973, + "grad_norm": 1.539393663406372, + "learning_rate": 7.722207825861624e-07, + "loss": 1.6754, + "step": 149 + }, + { + "epoch": 0.003498470876687648, + "grad_norm": 1.5395653247833252, + "learning_rate": 7.774034724021767e-07, + "loss": 1.9761, + "step": 150 + }, + { + "epoch": 0.0035217940158655653, + "grad_norm": 2.0169472694396973, + "learning_rate": 7.825861622181913e-07, + "loss": 1.6927, + "step": 151 + }, + { + "epoch": 0.0035451171550434832, + "grad_norm": 1.8776079416275024, + "learning_rate": 7.877688520342058e-07, + "loss": 1.9273, + "step": 152 + }, + { + "epoch": 0.0035684402942214007, + "grad_norm": 2.078824043273926, + "learning_rate": 7.929515418502204e-07, + "loss": 1.6756, + "step": 153 + }, + { + "epoch": 0.0035917634333993186, + "grad_norm": 1.407560110092163, + "learning_rate": 7.981342316662348e-07, + "loss": 1.6038, + "step": 154 + }, + { + "epoch": 0.003615086572577236, + "grad_norm": 1.1770573854446411, + "learning_rate": 8.033169214822494e-07, + "loss": 1.6679, + "step": 155 + }, + { + "epoch": 0.0036384097117551535, + "grad_norm": 1.2057602405548096, + "learning_rate": 8.08499611298264e-07, + "loss": 1.7916, + "step": 156 + }, + { + "epoch": 0.0036617328509330714, + "grad_norm": 1.117970585823059, + "learning_rate": 8.136823011142783e-07, + "loss": 1.7974, + "step": 157 + }, + { + "epoch": 0.003685055990110989, + "grad_norm": 1.5996465682983398, + "learning_rate": 8.188649909302929e-07, + "loss": 1.6053, + "step": 158 + }, + { + "epoch": 0.0037083791292889068, + "grad_norm": 1.4170929193496704, + "learning_rate": 8.240476807463074e-07, + "loss": 1.7155, + "step": 159 + }, + { + "epoch": 0.0037317022684668242, + "grad_norm": 1.8114391565322876, + "learning_rate": 8.29230370562322e-07, + "loss": 1.9192, + "step": 160 + }, + { + "epoch": 0.003755025407644742, + "grad_norm": 1.3462793827056885, + "learning_rate": 8.344130603783364e-07, + "loss": 1.4624, + "step": 161 + }, + { + "epoch": 0.0037783485468226596, + "grad_norm": 1.6305956840515137, + "learning_rate": 8.39595750194351e-07, + "loss": 1.8017, + "step": 162 + }, + { + "epoch": 0.003801671686000577, + "grad_norm": 1.662576675415039, + "learning_rate": 8.447784400103655e-07, + "loss": 1.733, + "step": 163 + }, + { + "epoch": 0.003824994825178495, + "grad_norm": 1.556788682937622, + "learning_rate": 8.499611298263799e-07, + "loss": 1.9586, + "step": 164 + }, + { + "epoch": 0.0038483179643564124, + "grad_norm": 1.5282272100448608, + "learning_rate": 8.551438196423944e-07, + "loss": 1.8254, + "step": 165 + }, + { + "epoch": 0.0038716411035343303, + "grad_norm": 1.6790592670440674, + "learning_rate": 8.60326509458409e-07, + "loss": 2.1866, + "step": 166 + }, + { + "epoch": 0.0038949642427122478, + "grad_norm": 1.5164263248443604, + "learning_rate": 8.655091992744236e-07, + "loss": 1.6651, + "step": 167 + }, + { + "epoch": 0.003918287381890166, + "grad_norm": 1.5002336502075195, + "learning_rate": 8.70691889090438e-07, + "loss": 1.9295, + "step": 168 + }, + { + "epoch": 0.0039416105210680836, + "grad_norm": 1.2122441530227661, + "learning_rate": 8.758745789064526e-07, + "loss": 1.761, + "step": 169 + }, + { + "epoch": 0.003964933660246001, + "grad_norm": 1.637898564338684, + "learning_rate": 8.81057268722467e-07, + "loss": 1.8697, + "step": 170 + }, + { + "epoch": 0.0039882567994239185, + "grad_norm": 0.988777220249176, + "learning_rate": 8.862399585384815e-07, + "loss": 2.1249, + "step": 171 + }, + { + "epoch": 0.004011579938601836, + "grad_norm": 1.8833587169647217, + "learning_rate": 8.91422648354496e-07, + "loss": 1.6915, + "step": 172 + }, + { + "epoch": 0.004034903077779753, + "grad_norm": 1.8418108224868774, + "learning_rate": 8.966053381705106e-07, + "loss": 2.0019, + "step": 173 + }, + { + "epoch": 0.004058226216957671, + "grad_norm": 1.6375901699066162, + "learning_rate": 9.017880279865251e-07, + "loss": 1.7625, + "step": 174 + }, + { + "epoch": 0.004081549356135589, + "grad_norm": 1.8701720237731934, + "learning_rate": 9.069707178025396e-07, + "loss": 1.801, + "step": 175 + }, + { + "epoch": 0.004104872495313507, + "grad_norm": 1.4488773345947266, + "learning_rate": 9.121534076185541e-07, + "loss": 1.9971, + "step": 176 + }, + { + "epoch": 0.004128195634491424, + "grad_norm": 0.9587986469268799, + "learning_rate": 9.173360974345686e-07, + "loss": 1.6253, + "step": 177 + }, + { + "epoch": 0.004151518773669342, + "grad_norm": 2.6533186435699463, + "learning_rate": 9.225187872505831e-07, + "loss": 1.572, + "step": 178 + }, + { + "epoch": 0.00417484191284726, + "grad_norm": 2.4528841972351074, + "learning_rate": 9.277014770665976e-07, + "loss": 1.7586, + "step": 179 + }, + { + "epoch": 0.004198165052025177, + "grad_norm": 1.1871824264526367, + "learning_rate": 9.328841668826122e-07, + "loss": 1.6765, + "step": 180 + }, + { + "epoch": 0.004221488191203095, + "grad_norm": 1.1292660236358643, + "learning_rate": 9.380668566986266e-07, + "loss": 2.0673, + "step": 181 + }, + { + "epoch": 0.004244811330381013, + "grad_norm": 1.3055285215377808, + "learning_rate": 9.432495465146411e-07, + "loss": 1.8103, + "step": 182 + }, + { + "epoch": 0.004268134469558931, + "grad_norm": 1.5225868225097656, + "learning_rate": 9.484322363306557e-07, + "loss": 2.0813, + "step": 183 + }, + { + "epoch": 0.004291457608736848, + "grad_norm": 1.2439767122268677, + "learning_rate": 9.536149261466702e-07, + "loss": 1.6919, + "step": 184 + }, + { + "epoch": 0.0043147807479147655, + "grad_norm": 1.2424002885818481, + "learning_rate": 9.587976159626847e-07, + "loss": 1.9506, + "step": 185 + }, + { + "epoch": 0.0043381038870926834, + "grad_norm": 0.9796323776245117, + "learning_rate": 9.639803057786992e-07, + "loss": 1.7342, + "step": 186 + }, + { + "epoch": 0.0043614270262706005, + "grad_norm": 1.2240192890167236, + "learning_rate": 9.691629955947138e-07, + "loss": 2.0646, + "step": 187 + }, + { + "epoch": 0.004384750165448518, + "grad_norm": 0.8779449462890625, + "learning_rate": 9.743456854107281e-07, + "loss": 1.4535, + "step": 188 + }, + { + "epoch": 0.004408073304626436, + "grad_norm": 1.3131407499313354, + "learning_rate": 9.795283752267427e-07, + "loss": 1.9817, + "step": 189 + }, + { + "epoch": 0.004431396443804354, + "grad_norm": 1.3259912729263306, + "learning_rate": 9.847110650427573e-07, + "loss": 1.709, + "step": 190 + }, + { + "epoch": 0.004454719582982271, + "grad_norm": 1.4236465692520142, + "learning_rate": 9.898937548587718e-07, + "loss": 1.7059, + "step": 191 + }, + { + "epoch": 0.004478042722160189, + "grad_norm": 1.2791959047317505, + "learning_rate": 9.950764446747862e-07, + "loss": 1.9633, + "step": 192 + }, + { + "epoch": 0.004501365861338107, + "grad_norm": 0.9857053160667419, + "learning_rate": 1.0002591344908007e-06, + "loss": 1.807, + "step": 193 + }, + { + "epoch": 0.004524689000516025, + "grad_norm": 1.264302372932434, + "learning_rate": 1.0054418243068153e-06, + "loss": 1.5389, + "step": 194 + }, + { + "epoch": 0.004548012139693942, + "grad_norm": 1.2205390930175781, + "learning_rate": 1.0106245141228298e-06, + "loss": 1.4549, + "step": 195 + }, + { + "epoch": 0.00457133527887186, + "grad_norm": 1.055471420288086, + "learning_rate": 1.0158072039388444e-06, + "loss": 1.6931, + "step": 196 + }, + { + "epoch": 0.004594658418049778, + "grad_norm": 1.0585546493530273, + "learning_rate": 1.020989893754859e-06, + "loss": 1.8054, + "step": 197 + }, + { + "epoch": 0.004617981557227695, + "grad_norm": 2.16025972366333, + "learning_rate": 1.0261725835708735e-06, + "loss": 2.0077, + "step": 198 + }, + { + "epoch": 0.004641304696405613, + "grad_norm": 2.125786781311035, + "learning_rate": 1.0313552733868879e-06, + "loss": 1.9117, + "step": 199 + }, + { + "epoch": 0.0046646278355835305, + "grad_norm": 1.3560391664505005, + "learning_rate": 1.0365379632029024e-06, + "loss": 1.9871, + "step": 200 + }, + { + "epoch": 0.004687950974761448, + "grad_norm": 1.3505181074142456, + "learning_rate": 1.041720653018917e-06, + "loss": 1.714, + "step": 201 + }, + { + "epoch": 0.004711274113939365, + "grad_norm": 1.1724427938461304, + "learning_rate": 1.0469033428349313e-06, + "loss": 1.7611, + "step": 202 + }, + { + "epoch": 0.004734597253117283, + "grad_norm": 1.1746799945831299, + "learning_rate": 1.0520860326509459e-06, + "loss": 1.867, + "step": 203 + }, + { + "epoch": 0.004757920392295201, + "grad_norm": 1.0976382493972778, + "learning_rate": 1.0572687224669604e-06, + "loss": 1.808, + "step": 204 + }, + { + "epoch": 0.004781243531473118, + "grad_norm": 1.3842298984527588, + "learning_rate": 1.062451412282975e-06, + "loss": 1.7973, + "step": 205 + }, + { + "epoch": 0.004804566670651036, + "grad_norm": 1.6715288162231445, + "learning_rate": 1.0676341020989893e-06, + "loss": 1.9817, + "step": 206 + }, + { + "epoch": 0.004827889809828954, + "grad_norm": 1.0734590291976929, + "learning_rate": 1.072816791915004e-06, + "loss": 1.4297, + "step": 207 + }, + { + "epoch": 0.004851212949006872, + "grad_norm": 1.0182546377182007, + "learning_rate": 1.0779994817310185e-06, + "loss": 1.713, + "step": 208 + }, + { + "epoch": 0.004874536088184789, + "grad_norm": 1.1884313821792603, + "learning_rate": 1.083182171547033e-06, + "loss": 1.5234, + "step": 209 + }, + { + "epoch": 0.004897859227362707, + "grad_norm": 1.520266056060791, + "learning_rate": 1.0883648613630476e-06, + "loss": 2.0598, + "step": 210 + }, + { + "epoch": 0.004921182366540625, + "grad_norm": 1.1709904670715332, + "learning_rate": 1.0935475511790621e-06, + "loss": 2.1461, + "step": 211 + }, + { + "epoch": 0.004944505505718542, + "grad_norm": 1.2634027004241943, + "learning_rate": 1.0987302409950765e-06, + "loss": 1.5076, + "step": 212 + }, + { + "epoch": 0.00496782864489646, + "grad_norm": 1.490717887878418, + "learning_rate": 1.103912930811091e-06, + "loss": 1.8628, + "step": 213 + }, + { + "epoch": 0.004991151784074378, + "grad_norm": 2.077373743057251, + "learning_rate": 1.1090956206271056e-06, + "loss": 1.9295, + "step": 214 + }, + { + "epoch": 0.0050144749232522955, + "grad_norm": 1.647877812385559, + "learning_rate": 1.1142783104431202e-06, + "loss": 1.7929, + "step": 215 + }, + { + "epoch": 0.0050377980624302125, + "grad_norm": 1.1937353610992432, + "learning_rate": 1.1194610002591345e-06, + "loss": 1.6509, + "step": 216 + }, + { + "epoch": 0.00506112120160813, + "grad_norm": 1.0805108547210693, + "learning_rate": 1.124643690075149e-06, + "loss": 1.6447, + "step": 217 + }, + { + "epoch": 0.005084444340786048, + "grad_norm": 1.1077872514724731, + "learning_rate": 1.1298263798911636e-06, + "loss": 1.7675, + "step": 218 + }, + { + "epoch": 0.005107767479963966, + "grad_norm": 0.8648241758346558, + "learning_rate": 1.135009069707178e-06, + "loss": 1.6687, + "step": 219 + }, + { + "epoch": 0.005131090619141883, + "grad_norm": 1.0522700548171997, + "learning_rate": 1.1401917595231925e-06, + "loss": 1.2878, + "step": 220 + }, + { + "epoch": 0.005154413758319801, + "grad_norm": 1.3021256923675537, + "learning_rate": 1.145374449339207e-06, + "loss": 1.8535, + "step": 221 + }, + { + "epoch": 0.005177736897497719, + "grad_norm": 1.2912962436676025, + "learning_rate": 1.1505571391552216e-06, + "loss": 1.865, + "step": 222 + }, + { + "epoch": 0.005201060036675636, + "grad_norm": 1.6733994483947754, + "learning_rate": 1.1557398289712362e-06, + "loss": 1.5748, + "step": 223 + }, + { + "epoch": 0.005224383175853554, + "grad_norm": 1.0865724086761475, + "learning_rate": 1.1609225187872508e-06, + "loss": 1.8159, + "step": 224 + }, + { + "epoch": 0.005247706315031472, + "grad_norm": 1.1498301029205322, + "learning_rate": 1.1661052086032653e-06, + "loss": 1.8579, + "step": 225 + }, + { + "epoch": 0.00527102945420939, + "grad_norm": 1.9360573291778564, + "learning_rate": 1.1712878984192797e-06, + "loss": 1.7366, + "step": 226 + }, + { + "epoch": 0.005294352593387307, + "grad_norm": 1.0133939981460571, + "learning_rate": 1.1764705882352942e-06, + "loss": 1.4571, + "step": 227 + }, + { + "epoch": 0.005317675732565225, + "grad_norm": 1.6443811655044556, + "learning_rate": 1.1816532780513088e-06, + "loss": 1.5312, + "step": 228 + }, + { + "epoch": 0.0053409988717431425, + "grad_norm": 1.1923338174819946, + "learning_rate": 1.1868359678673233e-06, + "loss": 1.6993, + "step": 229 + }, + { + "epoch": 0.0053643220109210596, + "grad_norm": 1.0345349311828613, + "learning_rate": 1.1920186576833377e-06, + "loss": 1.5739, + "step": 230 + }, + { + "epoch": 0.0053876451500989775, + "grad_norm": 0.9833806753158569, + "learning_rate": 1.1972013474993522e-06, + "loss": 1.819, + "step": 231 + }, + { + "epoch": 0.005410968289276895, + "grad_norm": 1.3315545320510864, + "learning_rate": 1.2023840373153668e-06, + "loss": 1.9472, + "step": 232 + }, + { + "epoch": 0.005434291428454813, + "grad_norm": 1.0042314529418945, + "learning_rate": 1.2075667271313812e-06, + "loss": 1.993, + "step": 233 + }, + { + "epoch": 0.00545761456763273, + "grad_norm": 1.2731118202209473, + "learning_rate": 1.2127494169473957e-06, + "loss": 1.6763, + "step": 234 + }, + { + "epoch": 0.005480937706810648, + "grad_norm": 0.9664155840873718, + "learning_rate": 1.2179321067634103e-06, + "loss": 1.3091, + "step": 235 + }, + { + "epoch": 0.005504260845988566, + "grad_norm": 1.6930897235870361, + "learning_rate": 1.2231147965794248e-06, + "loss": 1.6111, + "step": 236 + }, + { + "epoch": 0.005527583985166483, + "grad_norm": 0.9807016253471375, + "learning_rate": 1.2282974863954394e-06, + "loss": 1.6131, + "step": 237 + }, + { + "epoch": 0.005550907124344401, + "grad_norm": 1.321951150894165, + "learning_rate": 1.233480176211454e-06, + "loss": 1.242, + "step": 238 + }, + { + "epoch": 0.005574230263522319, + "grad_norm": 1.1465637683868408, + "learning_rate": 1.2386628660274685e-06, + "loss": 1.7035, + "step": 239 + }, + { + "epoch": 0.005597553402700237, + "grad_norm": 2.4264347553253174, + "learning_rate": 1.2438455558434829e-06, + "loss": 1.9859, + "step": 240 + }, + { + "epoch": 0.005620876541878154, + "grad_norm": 1.429149866104126, + "learning_rate": 1.2490282456594974e-06, + "loss": 1.8249, + "step": 241 + }, + { + "epoch": 0.005644199681056072, + "grad_norm": 1.1119049787521362, + "learning_rate": 1.254210935475512e-06, + "loss": 1.8005, + "step": 242 + }, + { + "epoch": 0.00566752282023399, + "grad_norm": 1.9002227783203125, + "learning_rate": 1.2593936252915265e-06, + "loss": 1.6951, + "step": 243 + }, + { + "epoch": 0.005690845959411907, + "grad_norm": 1.067659854888916, + "learning_rate": 1.264576315107541e-06, + "loss": 1.799, + "step": 244 + }, + { + "epoch": 0.0057141690985898245, + "grad_norm": 1.2947990894317627, + "learning_rate": 1.2697590049235552e-06, + "loss": 1.7837, + "step": 245 + }, + { + "epoch": 0.005737492237767742, + "grad_norm": 1.0790272951126099, + "learning_rate": 1.2749416947395698e-06, + "loss": 1.67, + "step": 246 + }, + { + "epoch": 0.00576081537694566, + "grad_norm": 1.3589330911636353, + "learning_rate": 1.2801243845555843e-06, + "loss": 1.9282, + "step": 247 + }, + { + "epoch": 0.005784138516123577, + "grad_norm": 1.4140998125076294, + "learning_rate": 1.285307074371599e-06, + "loss": 1.6708, + "step": 248 + }, + { + "epoch": 0.005807461655301495, + "grad_norm": 1.000994086265564, + "learning_rate": 1.2904897641876135e-06, + "loss": 1.4077, + "step": 249 + }, + { + "epoch": 0.005830784794479413, + "grad_norm": 1.3655062913894653, + "learning_rate": 1.295672454003628e-06, + "loss": 1.8862, + "step": 250 + }, + { + "epoch": 0.005854107933657331, + "grad_norm": 1.1164065599441528, + "learning_rate": 1.3008551438196426e-06, + "loss": 1.528, + "step": 251 + }, + { + "epoch": 0.005877431072835248, + "grad_norm": 1.1792149543762207, + "learning_rate": 1.306037833635657e-06, + "loss": 1.2879, + "step": 252 + }, + { + "epoch": 0.005900754212013166, + "grad_norm": 2.236320734024048, + "learning_rate": 1.3112205234516715e-06, + "loss": 1.4929, + "step": 253 + }, + { + "epoch": 0.005924077351191084, + "grad_norm": 1.8795088529586792, + "learning_rate": 1.316403213267686e-06, + "loss": 1.2468, + "step": 254 + }, + { + "epoch": 0.005947400490369001, + "grad_norm": 1.2248806953430176, + "learning_rate": 1.3215859030837006e-06, + "loss": 1.769, + "step": 255 + }, + { + "epoch": 0.005970723629546919, + "grad_norm": 1.252236008644104, + "learning_rate": 1.3267685928997152e-06, + "loss": 1.9014, + "step": 256 + }, + { + "epoch": 0.005994046768724837, + "grad_norm": 1.3926386833190918, + "learning_rate": 1.3319512827157297e-06, + "loss": 1.9599, + "step": 257 + }, + { + "epoch": 0.0060173699079027546, + "grad_norm": 1.5681990385055542, + "learning_rate": 1.3371339725317443e-06, + "loss": 1.8109, + "step": 258 + }, + { + "epoch": 0.006040693047080672, + "grad_norm": 1.6841275691986084, + "learning_rate": 1.3423166623477584e-06, + "loss": 1.4601, + "step": 259 + }, + { + "epoch": 0.0060640161862585895, + "grad_norm": 1.5262291431427002, + "learning_rate": 1.347499352163773e-06, + "loss": 1.6493, + "step": 260 + }, + { + "epoch": 0.006087339325436507, + "grad_norm": 1.0905576944351196, + "learning_rate": 1.3526820419797875e-06, + "loss": 2.0847, + "step": 261 + }, + { + "epoch": 0.006110662464614424, + "grad_norm": 1.4682683944702148, + "learning_rate": 1.357864731795802e-06, + "loss": 1.6889, + "step": 262 + }, + { + "epoch": 0.006133985603792342, + "grad_norm": 1.1054515838623047, + "learning_rate": 1.3630474216118166e-06, + "loss": 1.55, + "step": 263 + }, + { + "epoch": 0.00615730874297026, + "grad_norm": 1.3931388854980469, + "learning_rate": 1.3682301114278312e-06, + "loss": 1.655, + "step": 264 + }, + { + "epoch": 0.006180631882148178, + "grad_norm": 1.1766420602798462, + "learning_rate": 1.3734128012438458e-06, + "loss": 1.9555, + "step": 265 + }, + { + "epoch": 0.006203955021326095, + "grad_norm": 1.1652954816818237, + "learning_rate": 1.3785954910598601e-06, + "loss": 1.8446, + "step": 266 + }, + { + "epoch": 0.006227278160504013, + "grad_norm": 1.378980278968811, + "learning_rate": 1.3837781808758747e-06, + "loss": 1.4449, + "step": 267 + }, + { + "epoch": 0.006250601299681931, + "grad_norm": 1.2017453908920288, + "learning_rate": 1.3889608706918892e-06, + "loss": 1.6272, + "step": 268 + }, + { + "epoch": 0.006273924438859848, + "grad_norm": 1.2221115827560425, + "learning_rate": 1.3941435605079038e-06, + "loss": 1.7299, + "step": 269 + }, + { + "epoch": 0.006297247578037766, + "grad_norm": 1.189775824546814, + "learning_rate": 1.3993262503239183e-06, + "loss": 1.1664, + "step": 270 + }, + { + "epoch": 0.006320570717215684, + "grad_norm": 1.0103381872177124, + "learning_rate": 1.404508940139933e-06, + "loss": 1.3519, + "step": 271 + }, + { + "epoch": 0.006343893856393602, + "grad_norm": 1.1243481636047363, + "learning_rate": 1.4096916299559475e-06, + "loss": 1.6704, + "step": 272 + }, + { + "epoch": 0.006367216995571519, + "grad_norm": 1.8137811422348022, + "learning_rate": 1.4148743197719616e-06, + "loss": 1.279, + "step": 273 + }, + { + "epoch": 0.0063905401347494365, + "grad_norm": 1.0875202417373657, + "learning_rate": 1.4200570095879762e-06, + "loss": 1.1564, + "step": 274 + }, + { + "epoch": 0.0064138632739273544, + "grad_norm": 1.0839550495147705, + "learning_rate": 1.4252396994039907e-06, + "loss": 1.7263, + "step": 275 + }, + { + "epoch": 0.006437186413105272, + "grad_norm": 1.7203173637390137, + "learning_rate": 1.4304223892200053e-06, + "loss": 1.9309, + "step": 276 + }, + { + "epoch": 0.006460509552283189, + "grad_norm": 1.3320658206939697, + "learning_rate": 1.4356050790360198e-06, + "loss": 1.8276, + "step": 277 + }, + { + "epoch": 0.006483832691461107, + "grad_norm": 1.5260910987854004, + "learning_rate": 1.4407877688520344e-06, + "loss": 1.413, + "step": 278 + }, + { + "epoch": 0.006507155830639025, + "grad_norm": 1.2401058673858643, + "learning_rate": 1.445970458668049e-06, + "loss": 1.4087, + "step": 279 + }, + { + "epoch": 0.006530478969816942, + "grad_norm": 1.2722922563552856, + "learning_rate": 1.4511531484840633e-06, + "loss": 1.6216, + "step": 280 + }, + { + "epoch": 0.00655380210899486, + "grad_norm": 1.2668229341506958, + "learning_rate": 1.4563358383000779e-06, + "loss": 1.6252, + "step": 281 + }, + { + "epoch": 0.006577125248172778, + "grad_norm": 1.4556583166122437, + "learning_rate": 1.4615185281160924e-06, + "loss": 2.3276, + "step": 282 + }, + { + "epoch": 0.006600448387350696, + "grad_norm": 1.537610411643982, + "learning_rate": 1.466701217932107e-06, + "loss": 1.4319, + "step": 283 + }, + { + "epoch": 0.006623771526528613, + "grad_norm": 1.3130170106887817, + "learning_rate": 1.4718839077481215e-06, + "loss": 1.4978, + "step": 284 + }, + { + "epoch": 0.006647094665706531, + "grad_norm": 1.5020934343338013, + "learning_rate": 1.477066597564136e-06, + "loss": 1.8697, + "step": 285 + }, + { + "epoch": 0.006670417804884449, + "grad_norm": 1.6949779987335205, + "learning_rate": 1.4822492873801502e-06, + "loss": 1.7433, + "step": 286 + }, + { + "epoch": 0.006693740944062366, + "grad_norm": 1.5566325187683105, + "learning_rate": 1.4874319771961648e-06, + "loss": 1.5674, + "step": 287 + }, + { + "epoch": 0.006717064083240284, + "grad_norm": 1.015093445777893, + "learning_rate": 1.4926146670121793e-06, + "loss": 1.9903, + "step": 288 + }, + { + "epoch": 0.0067403872224182015, + "grad_norm": 2.229853868484497, + "learning_rate": 1.497797356828194e-06, + "loss": 1.1905, + "step": 289 + }, + { + "epoch": 0.006763710361596119, + "grad_norm": 1.5241860151290894, + "learning_rate": 1.5029800466442085e-06, + "loss": 1.958, + "step": 290 + }, + { + "epoch": 0.006787033500774036, + "grad_norm": 0.8666454553604126, + "learning_rate": 1.508162736460223e-06, + "loss": 1.7141, + "step": 291 + }, + { + "epoch": 0.006810356639951954, + "grad_norm": 1.4594520330429077, + "learning_rate": 1.5133454262762376e-06, + "loss": 1.7235, + "step": 292 + }, + { + "epoch": 0.006833679779129872, + "grad_norm": 1.3267074823379517, + "learning_rate": 1.518528116092252e-06, + "loss": 1.6172, + "step": 293 + }, + { + "epoch": 0.006857002918307789, + "grad_norm": 1.5386312007904053, + "learning_rate": 1.5237108059082665e-06, + "loss": 1.4843, + "step": 294 + }, + { + "epoch": 0.006880326057485707, + "grad_norm": 1.3275539875030518, + "learning_rate": 1.528893495724281e-06, + "loss": 1.5444, + "step": 295 + }, + { + "epoch": 0.006903649196663625, + "grad_norm": 1.1002707481384277, + "learning_rate": 1.5340761855402956e-06, + "loss": 1.717, + "step": 296 + }, + { + "epoch": 0.006926972335841543, + "grad_norm": 1.172974944114685, + "learning_rate": 1.5392588753563102e-06, + "loss": 1.6963, + "step": 297 + }, + { + "epoch": 0.00695029547501946, + "grad_norm": 1.0728440284729004, + "learning_rate": 1.5444415651723247e-06, + "loss": 1.6228, + "step": 298 + }, + { + "epoch": 0.006973618614197378, + "grad_norm": 1.274348258972168, + "learning_rate": 1.5496242549883393e-06, + "loss": 1.2559, + "step": 299 + }, + { + "epoch": 0.006996941753375296, + "grad_norm": 1.2520028352737427, + "learning_rate": 1.5548069448043534e-06, + "loss": 1.6118, + "step": 300 + }, + { + "epoch": 0.007020264892553213, + "grad_norm": 1.5844305753707886, + "learning_rate": 1.559989634620368e-06, + "loss": 1.5645, + "step": 301 + }, + { + "epoch": 0.007043588031731131, + "grad_norm": 2.285438299179077, + "learning_rate": 1.5651723244363825e-06, + "loss": 1.4541, + "step": 302 + }, + { + "epoch": 0.007066911170909049, + "grad_norm": 1.2873152494430542, + "learning_rate": 1.570355014252397e-06, + "loss": 1.4835, + "step": 303 + }, + { + "epoch": 0.0070902343100869665, + "grad_norm": 1.1332640647888184, + "learning_rate": 1.5755377040684116e-06, + "loss": 1.8279, + "step": 304 + }, + { + "epoch": 0.0071135574492648835, + "grad_norm": 1.6483525037765503, + "learning_rate": 1.5807203938844262e-06, + "loss": 1.2509, + "step": 305 + }, + { + "epoch": 0.007136880588442801, + "grad_norm": 1.0219485759735107, + "learning_rate": 1.5859030837004408e-06, + "loss": 1.8421, + "step": 306 + }, + { + "epoch": 0.007160203727620719, + "grad_norm": 1.2478340864181519, + "learning_rate": 1.5910857735164551e-06, + "loss": 1.9144, + "step": 307 + }, + { + "epoch": 0.007183526866798637, + "grad_norm": 1.4016437530517578, + "learning_rate": 1.5962684633324697e-06, + "loss": 1.5146, + "step": 308 + }, + { + "epoch": 0.007206850005976554, + "grad_norm": 1.1399790048599243, + "learning_rate": 1.6014511531484842e-06, + "loss": 1.6714, + "step": 309 + }, + { + "epoch": 0.007230173145154472, + "grad_norm": 2.047961473464966, + "learning_rate": 1.6066338429644988e-06, + "loss": 1.1777, + "step": 310 + }, + { + "epoch": 0.00725349628433239, + "grad_norm": 1.1410201787948608, + "learning_rate": 1.6118165327805133e-06, + "loss": 1.6783, + "step": 311 + }, + { + "epoch": 0.007276819423510307, + "grad_norm": 1.2840640544891357, + "learning_rate": 1.616999222596528e-06, + "loss": 1.9351, + "step": 312 + }, + { + "epoch": 0.007300142562688225, + "grad_norm": 0.9116181135177612, + "learning_rate": 1.6221819124125425e-06, + "loss": 1.7705, + "step": 313 + }, + { + "epoch": 0.007323465701866143, + "grad_norm": 1.3190463781356812, + "learning_rate": 1.6273646022285566e-06, + "loss": 1.4484, + "step": 314 + }, + { + "epoch": 0.007346788841044061, + "grad_norm": 0.9988270401954651, + "learning_rate": 1.6325472920445712e-06, + "loss": 1.5159, + "step": 315 + }, + { + "epoch": 0.007370111980221978, + "grad_norm": 0.8620725870132446, + "learning_rate": 1.6377299818605857e-06, + "loss": 1.5605, + "step": 316 + }, + { + "epoch": 0.007393435119399896, + "grad_norm": 1.284604549407959, + "learning_rate": 1.6429126716766003e-06, + "loss": 1.4822, + "step": 317 + }, + { + "epoch": 0.0074167582585778135, + "grad_norm": 1.2546097040176392, + "learning_rate": 1.6480953614926148e-06, + "loss": 1.436, + "step": 318 + }, + { + "epoch": 0.0074400813977557306, + "grad_norm": 0.9116978645324707, + "learning_rate": 1.6532780513086294e-06, + "loss": 1.2708, + "step": 319 + }, + { + "epoch": 0.0074634045369336485, + "grad_norm": 0.9910548329353333, + "learning_rate": 1.658460741124644e-06, + "loss": 1.8144, + "step": 320 + }, + { + "epoch": 0.007486727676111566, + "grad_norm": 1.9879093170166016, + "learning_rate": 1.6636434309406583e-06, + "loss": 1.4826, + "step": 321 + }, + { + "epoch": 0.007510050815289484, + "grad_norm": 1.0845030546188354, + "learning_rate": 1.6688261207566729e-06, + "loss": 1.3364, + "step": 322 + }, + { + "epoch": 0.007533373954467401, + "grad_norm": 1.342966079711914, + "learning_rate": 1.6740088105726874e-06, + "loss": 1.6453, + "step": 323 + }, + { + "epoch": 0.007556697093645319, + "grad_norm": 0.9570252895355225, + "learning_rate": 1.679191500388702e-06, + "loss": 1.5384, + "step": 324 + }, + { + "epoch": 0.007580020232823237, + "grad_norm": 1.531516671180725, + "learning_rate": 1.6843741902047165e-06, + "loss": 1.5775, + "step": 325 + }, + { + "epoch": 0.007603343372001154, + "grad_norm": 1.4623240232467651, + "learning_rate": 1.689556880020731e-06, + "loss": 1.7159, + "step": 326 + }, + { + "epoch": 0.007626666511179072, + "grad_norm": 1.109586238861084, + "learning_rate": 1.6947395698367454e-06, + "loss": 1.7403, + "step": 327 + }, + { + "epoch": 0.00764998965035699, + "grad_norm": 1.3199604749679565, + "learning_rate": 1.6999222596527598e-06, + "loss": 1.7208, + "step": 328 + }, + { + "epoch": 0.007673312789534908, + "grad_norm": 1.0979784727096558, + "learning_rate": 1.7051049494687743e-06, + "loss": 1.6097, + "step": 329 + }, + { + "epoch": 0.007696635928712825, + "grad_norm": 1.0952926874160767, + "learning_rate": 1.710287639284789e-06, + "loss": 1.8262, + "step": 330 + }, + { + "epoch": 0.007719959067890743, + "grad_norm": 1.1149373054504395, + "learning_rate": 1.7154703291008035e-06, + "loss": 1.5762, + "step": 331 + }, + { + "epoch": 0.007743282207068661, + "grad_norm": 1.2090753316879272, + "learning_rate": 1.720653018916818e-06, + "loss": 1.6161, + "step": 332 + }, + { + "epoch": 0.007766605346246578, + "grad_norm": 1.3476163148880005, + "learning_rate": 1.7258357087328326e-06, + "loss": 1.6854, + "step": 333 + }, + { + "epoch": 0.0077899284854244955, + "grad_norm": 1.3222614526748657, + "learning_rate": 1.7310183985488471e-06, + "loss": 1.5996, + "step": 334 + }, + { + "epoch": 0.007813251624602413, + "grad_norm": 1.2350871562957764, + "learning_rate": 1.7362010883648615e-06, + "loss": 1.5052, + "step": 335 + }, + { + "epoch": 0.007836574763780331, + "grad_norm": 1.4628745317459106, + "learning_rate": 1.741383778180876e-06, + "loss": 1.6268, + "step": 336 + }, + { + "epoch": 0.00785989790295825, + "grad_norm": 1.3481048345565796, + "learning_rate": 1.7465664679968906e-06, + "loss": 1.4308, + "step": 337 + }, + { + "epoch": 0.007883221042136167, + "grad_norm": 1.0008901357650757, + "learning_rate": 1.7517491578129052e-06, + "loss": 1.6487, + "step": 338 + }, + { + "epoch": 0.007906544181314083, + "grad_norm": 2.4258437156677246, + "learning_rate": 1.7569318476289195e-06, + "loss": 1.5327, + "step": 339 + }, + { + "epoch": 0.007929867320492001, + "grad_norm": 1.3444914817810059, + "learning_rate": 1.762114537444934e-06, + "loss": 1.5257, + "step": 340 + }, + { + "epoch": 0.007953190459669919, + "grad_norm": 2.297591209411621, + "learning_rate": 1.7672972272609486e-06, + "loss": 1.9581, + "step": 341 + }, + { + "epoch": 0.007976513598847837, + "grad_norm": 1.107711672782898, + "learning_rate": 1.772479917076963e-06, + "loss": 1.3486, + "step": 342 + }, + { + "epoch": 0.007999836738025755, + "grad_norm": 1.4064106941223145, + "learning_rate": 1.7776626068929775e-06, + "loss": 1.3169, + "step": 343 + }, + { + "epoch": 0.008023159877203673, + "grad_norm": 1.1236720085144043, + "learning_rate": 1.782845296708992e-06, + "loss": 2.0225, + "step": 344 + }, + { + "epoch": 0.00804648301638159, + "grad_norm": 1.9214081764221191, + "learning_rate": 1.7880279865250066e-06, + "loss": 1.7269, + "step": 345 + }, + { + "epoch": 0.008069806155559507, + "grad_norm": 1.1544204950332642, + "learning_rate": 1.7932106763410212e-06, + "loss": 1.8407, + "step": 346 + }, + { + "epoch": 0.008093129294737425, + "grad_norm": 1.3266545534133911, + "learning_rate": 1.7983933661570358e-06, + "loss": 1.3316, + "step": 347 + }, + { + "epoch": 0.008116452433915343, + "grad_norm": 1.4208300113677979, + "learning_rate": 1.8035760559730501e-06, + "loss": 1.7712, + "step": 348 + }, + { + "epoch": 0.00813977557309326, + "grad_norm": 1.1849939823150635, + "learning_rate": 1.8087587457890647e-06, + "loss": 1.3843, + "step": 349 + }, + { + "epoch": 0.008163098712271178, + "grad_norm": 0.9147690534591675, + "learning_rate": 1.8139414356050792e-06, + "loss": 1.703, + "step": 350 + }, + { + "epoch": 0.008186421851449096, + "grad_norm": 1.2026822566986084, + "learning_rate": 1.8191241254210938e-06, + "loss": 1.642, + "step": 351 + }, + { + "epoch": 0.008209744990627014, + "grad_norm": 1.6620279550552368, + "learning_rate": 1.8243068152371081e-06, + "loss": 1.2861, + "step": 352 + }, + { + "epoch": 0.00823306812980493, + "grad_norm": 1.20318603515625, + "learning_rate": 1.8294895050531227e-06, + "loss": 1.7781, + "step": 353 + }, + { + "epoch": 0.008256391268982848, + "grad_norm": 1.117148756980896, + "learning_rate": 1.8346721948691372e-06, + "loss": 1.7056, + "step": 354 + }, + { + "epoch": 0.008279714408160766, + "grad_norm": 1.3435394763946533, + "learning_rate": 1.8398548846851516e-06, + "loss": 1.7352, + "step": 355 + }, + { + "epoch": 0.008303037547338684, + "grad_norm": 1.6550534963607788, + "learning_rate": 1.8450375745011662e-06, + "loss": 1.4283, + "step": 356 + }, + { + "epoch": 0.008326360686516602, + "grad_norm": 1.0326530933380127, + "learning_rate": 1.8502202643171807e-06, + "loss": 1.8726, + "step": 357 + }, + { + "epoch": 0.00834968382569452, + "grad_norm": 1.1237214803695679, + "learning_rate": 1.8554029541331953e-06, + "loss": 1.7547, + "step": 358 + }, + { + "epoch": 0.008373006964872438, + "grad_norm": 1.3457711935043335, + "learning_rate": 1.8605856439492098e-06, + "loss": 1.5047, + "step": 359 + }, + { + "epoch": 0.008396330104050354, + "grad_norm": 1.3615081310272217, + "learning_rate": 1.8657683337652244e-06, + "loss": 1.3476, + "step": 360 + }, + { + "epoch": 0.008419653243228272, + "grad_norm": 1.4443084001541138, + "learning_rate": 1.870951023581239e-06, + "loss": 1.4259, + "step": 361 + }, + { + "epoch": 0.00844297638240619, + "grad_norm": 0.9154095649719238, + "learning_rate": 1.8761337133972533e-06, + "loss": 1.6089, + "step": 362 + }, + { + "epoch": 0.008466299521584108, + "grad_norm": 1.1972756385803223, + "learning_rate": 1.8813164032132679e-06, + "loss": 1.5704, + "step": 363 + }, + { + "epoch": 0.008489622660762025, + "grad_norm": 1.1325738430023193, + "learning_rate": 1.8864990930292822e-06, + "loss": 1.7252, + "step": 364 + }, + { + "epoch": 0.008512945799939943, + "grad_norm": 1.2257301807403564, + "learning_rate": 1.8916817828452968e-06, + "loss": 1.5124, + "step": 365 + }, + { + "epoch": 0.008536268939117861, + "grad_norm": 1.7714002132415771, + "learning_rate": 1.8968644726613113e-06, + "loss": 1.5799, + "step": 366 + }, + { + "epoch": 0.008559592078295777, + "grad_norm": 1.1215579509735107, + "learning_rate": 1.9020471624773259e-06, + "loss": 1.7692, + "step": 367 + }, + { + "epoch": 0.008582915217473695, + "grad_norm": 1.3264069557189941, + "learning_rate": 1.9072298522933404e-06, + "loss": 1.7848, + "step": 368 + }, + { + "epoch": 0.008606238356651613, + "grad_norm": 0.9898104667663574, + "learning_rate": 1.912412542109355e-06, + "loss": 1.945, + "step": 369 + }, + { + "epoch": 0.008629561495829531, + "grad_norm": 0.9507944583892822, + "learning_rate": 1.9175952319253693e-06, + "loss": 1.6469, + "step": 370 + }, + { + "epoch": 0.008652884635007449, + "grad_norm": 1.1940997838974, + "learning_rate": 1.9227779217413837e-06, + "loss": 1.5144, + "step": 371 + }, + { + "epoch": 0.008676207774185367, + "grad_norm": 1.2926305532455444, + "learning_rate": 1.9279606115573985e-06, + "loss": 1.6527, + "step": 372 + }, + { + "epoch": 0.008699530913363285, + "grad_norm": 0.9909786581993103, + "learning_rate": 1.933143301373413e-06, + "loss": 1.8003, + "step": 373 + }, + { + "epoch": 0.008722854052541201, + "grad_norm": 1.3900662660598755, + "learning_rate": 1.9383259911894276e-06, + "loss": 1.7743, + "step": 374 + }, + { + "epoch": 0.008746177191719119, + "grad_norm": 0.9942039251327515, + "learning_rate": 1.943508681005442e-06, + "loss": 1.5635, + "step": 375 + }, + { + "epoch": 0.008769500330897037, + "grad_norm": 1.3887672424316406, + "learning_rate": 1.9486913708214563e-06, + "loss": 1.744, + "step": 376 + }, + { + "epoch": 0.008792823470074955, + "grad_norm": 1.2873059511184692, + "learning_rate": 1.953874060637471e-06, + "loss": 1.64, + "step": 377 + }, + { + "epoch": 0.008816146609252873, + "grad_norm": 1.2259247303009033, + "learning_rate": 1.9590567504534854e-06, + "loss": 1.6418, + "step": 378 + }, + { + "epoch": 0.00883946974843079, + "grad_norm": 1.5709097385406494, + "learning_rate": 1.9642394402695e-06, + "loss": 1.4343, + "step": 379 + }, + { + "epoch": 0.008862792887608708, + "grad_norm": 1.016625165939331, + "learning_rate": 1.9694221300855145e-06, + "loss": 1.5838, + "step": 380 + }, + { + "epoch": 0.008886116026786626, + "grad_norm": 1.5763674974441528, + "learning_rate": 1.9746048199015293e-06, + "loss": 1.3391, + "step": 381 + }, + { + "epoch": 0.008909439165964542, + "grad_norm": 1.014722466468811, + "learning_rate": 1.9797875097175436e-06, + "loss": 1.7185, + "step": 382 + }, + { + "epoch": 0.00893276230514246, + "grad_norm": 1.5255705118179321, + "learning_rate": 1.984970199533558e-06, + "loss": 1.5749, + "step": 383 + }, + { + "epoch": 0.008956085444320378, + "grad_norm": 1.4036648273468018, + "learning_rate": 1.9901528893495723e-06, + "loss": 1.4134, + "step": 384 + }, + { + "epoch": 0.008979408583498296, + "grad_norm": 1.327813982963562, + "learning_rate": 1.995335579165587e-06, + "loss": 1.8475, + "step": 385 + }, + { + "epoch": 0.009002731722676214, + "grad_norm": 1.357269287109375, + "learning_rate": 2.0005182689816014e-06, + "loss": 1.4145, + "step": 386 + }, + { + "epoch": 0.009026054861854132, + "grad_norm": 1.4663738012313843, + "learning_rate": 2.005700958797616e-06, + "loss": 1.5207, + "step": 387 + }, + { + "epoch": 0.00904937800103205, + "grad_norm": 0.9792691469192505, + "learning_rate": 2.0108836486136305e-06, + "loss": 1.7392, + "step": 388 + }, + { + "epoch": 0.009072701140209966, + "grad_norm": 1.9074856042861938, + "learning_rate": 2.0160663384296453e-06, + "loss": 1.5931, + "step": 389 + }, + { + "epoch": 0.009096024279387884, + "grad_norm": 1.562455654144287, + "learning_rate": 2.0212490282456597e-06, + "loss": 1.3503, + "step": 390 + }, + { + "epoch": 0.009119347418565802, + "grad_norm": 1.6827714443206787, + "learning_rate": 2.026431718061674e-06, + "loss": 1.8409, + "step": 391 + }, + { + "epoch": 0.00914267055774372, + "grad_norm": 0.969691276550293, + "learning_rate": 2.0316144078776888e-06, + "loss": 1.5167, + "step": 392 + }, + { + "epoch": 0.009165993696921637, + "grad_norm": 1.1107996702194214, + "learning_rate": 2.036797097693703e-06, + "loss": 1.5723, + "step": 393 + }, + { + "epoch": 0.009189316836099555, + "grad_norm": 0.9862359762191772, + "learning_rate": 2.041979787509718e-06, + "loss": 1.1188, + "step": 394 + }, + { + "epoch": 0.009212639975277473, + "grad_norm": 1.4997074604034424, + "learning_rate": 2.0471624773257322e-06, + "loss": 1.6742, + "step": 395 + }, + { + "epoch": 0.00923596311445539, + "grad_norm": 1.1336885690689087, + "learning_rate": 2.052345167141747e-06, + "loss": 1.5602, + "step": 396 + }, + { + "epoch": 0.009259286253633307, + "grad_norm": 1.4929397106170654, + "learning_rate": 2.057527856957761e-06, + "loss": 1.4891, + "step": 397 + }, + { + "epoch": 0.009282609392811225, + "grad_norm": 1.3118637800216675, + "learning_rate": 2.0627105467737757e-06, + "loss": 1.5758, + "step": 398 + }, + { + "epoch": 0.009305932531989143, + "grad_norm": 1.1043623685836792, + "learning_rate": 2.06789323658979e-06, + "loss": 1.9455, + "step": 399 + }, + { + "epoch": 0.009329255671167061, + "grad_norm": 1.3472813367843628, + "learning_rate": 2.073075926405805e-06, + "loss": 1.4657, + "step": 400 + }, + { + "epoch": 0.009352578810344979, + "grad_norm": 1.5614628791809082, + "learning_rate": 2.078258616221819e-06, + "loss": 1.3351, + "step": 401 + }, + { + "epoch": 0.009375901949522897, + "grad_norm": 1.393477439880371, + "learning_rate": 2.083441306037834e-06, + "loss": 1.8887, + "step": 402 + }, + { + "epoch": 0.009399225088700813, + "grad_norm": 1.0576095581054688, + "learning_rate": 2.0886239958538483e-06, + "loss": 1.7814, + "step": 403 + }, + { + "epoch": 0.00942254822787873, + "grad_norm": 1.5161347389221191, + "learning_rate": 2.0938066856698626e-06, + "loss": 1.2316, + "step": 404 + }, + { + "epoch": 0.009445871367056649, + "grad_norm": 1.05890691280365, + "learning_rate": 2.0989893754858774e-06, + "loss": 1.5303, + "step": 405 + }, + { + "epoch": 0.009469194506234567, + "grad_norm": 0.801816463470459, + "learning_rate": 2.1041720653018918e-06, + "loss": 1.5165, + "step": 406 + }, + { + "epoch": 0.009492517645412485, + "grad_norm": 1.2811832427978516, + "learning_rate": 2.1093547551179065e-06, + "loss": 1.8638, + "step": 407 + }, + { + "epoch": 0.009515840784590402, + "grad_norm": 1.2984956502914429, + "learning_rate": 2.114537444933921e-06, + "loss": 1.4195, + "step": 408 + }, + { + "epoch": 0.00953916392376832, + "grad_norm": 2.3772926330566406, + "learning_rate": 2.1197201347499356e-06, + "loss": 1.2616, + "step": 409 + }, + { + "epoch": 0.009562487062946236, + "grad_norm": 1.102181315422058, + "learning_rate": 2.12490282456595e-06, + "loss": 1.6683, + "step": 410 + }, + { + "epoch": 0.009585810202124154, + "grad_norm": 1.4473963975906372, + "learning_rate": 2.1300855143819643e-06, + "loss": 1.6474, + "step": 411 + }, + { + "epoch": 0.009609133341302072, + "grad_norm": 2.3995816707611084, + "learning_rate": 2.1352682041979787e-06, + "loss": 1.6203, + "step": 412 + }, + { + "epoch": 0.00963245648047999, + "grad_norm": 0.9490773677825928, + "learning_rate": 2.1404508940139935e-06, + "loss": 1.8082, + "step": 413 + }, + { + "epoch": 0.009655779619657908, + "grad_norm": 0.9358771443367004, + "learning_rate": 2.145633583830008e-06, + "loss": 1.5929, + "step": 414 + }, + { + "epoch": 0.009679102758835826, + "grad_norm": 0.9875616431236267, + "learning_rate": 2.1508162736460226e-06, + "loss": 1.4312, + "step": 415 + }, + { + "epoch": 0.009702425898013744, + "grad_norm": 1.197416067123413, + "learning_rate": 2.155998963462037e-06, + "loss": 1.3165, + "step": 416 + }, + { + "epoch": 0.00972574903719166, + "grad_norm": 2.0210750102996826, + "learning_rate": 2.1611816532780513e-06, + "loss": 1.4962, + "step": 417 + }, + { + "epoch": 0.009749072176369578, + "grad_norm": 1.2700085639953613, + "learning_rate": 2.166364343094066e-06, + "loss": 1.6101, + "step": 418 + }, + { + "epoch": 0.009772395315547496, + "grad_norm": 1.124679684638977, + "learning_rate": 2.1715470329100804e-06, + "loss": 1.7477, + "step": 419 + }, + { + "epoch": 0.009795718454725414, + "grad_norm": 1.178290843963623, + "learning_rate": 2.176729722726095e-06, + "loss": 1.4108, + "step": 420 + }, + { + "epoch": 0.009819041593903332, + "grad_norm": 1.792117953300476, + "learning_rate": 2.1819124125421095e-06, + "loss": 1.5568, + "step": 421 + }, + { + "epoch": 0.00984236473308125, + "grad_norm": 1.7381610870361328, + "learning_rate": 2.1870951023581243e-06, + "loss": 1.3229, + "step": 422 + }, + { + "epoch": 0.009865687872259167, + "grad_norm": 1.023553490638733, + "learning_rate": 2.1922777921741386e-06, + "loss": 1.1633, + "step": 423 + }, + { + "epoch": 0.009889011011437084, + "grad_norm": 1.5537900924682617, + "learning_rate": 2.197460481990153e-06, + "loss": 1.291, + "step": 424 + }, + { + "epoch": 0.009912334150615001, + "grad_norm": 1.722598671913147, + "learning_rate": 2.2026431718061673e-06, + "loss": 1.5201, + "step": 425 + }, + { + "epoch": 0.00993565728979292, + "grad_norm": 1.546295166015625, + "learning_rate": 2.207825861622182e-06, + "loss": 1.3554, + "step": 426 + }, + { + "epoch": 0.009958980428970837, + "grad_norm": 1.4075593948364258, + "learning_rate": 2.2130085514381964e-06, + "loss": 1.3831, + "step": 427 + }, + { + "epoch": 0.009982303568148755, + "grad_norm": 1.441125512123108, + "learning_rate": 2.218191241254211e-06, + "loss": 1.4806, + "step": 428 + }, + { + "epoch": 0.010005626707326673, + "grad_norm": 1.4198213815689087, + "learning_rate": 2.2233739310702255e-06, + "loss": 1.6962, + "step": 429 + }, + { + "epoch": 0.010028949846504591, + "grad_norm": 1.1716971397399902, + "learning_rate": 2.2285566208862403e-06, + "loss": 1.0423, + "step": 430 + }, + { + "epoch": 0.010052272985682507, + "grad_norm": 1.1271895170211792, + "learning_rate": 2.2337393107022547e-06, + "loss": 1.4246, + "step": 431 + }, + { + "epoch": 0.010075596124860425, + "grad_norm": 1.2987208366394043, + "learning_rate": 2.238922000518269e-06, + "loss": 1.5946, + "step": 432 + }, + { + "epoch": 0.010098919264038343, + "grad_norm": 1.7283997535705566, + "learning_rate": 2.2441046903342838e-06, + "loss": 1.5761, + "step": 433 + }, + { + "epoch": 0.01012224240321626, + "grad_norm": 1.635098934173584, + "learning_rate": 2.249287380150298e-06, + "loss": 1.6912, + "step": 434 + }, + { + "epoch": 0.010145565542394179, + "grad_norm": 2.1896469593048096, + "learning_rate": 2.254470069966313e-06, + "loss": 1.2961, + "step": 435 + }, + { + "epoch": 0.010168888681572097, + "grad_norm": 1.1874053478240967, + "learning_rate": 2.2596527597823272e-06, + "loss": 1.4999, + "step": 436 + }, + { + "epoch": 0.010192211820750014, + "grad_norm": 1.2898855209350586, + "learning_rate": 2.264835449598342e-06, + "loss": 1.7152, + "step": 437 + }, + { + "epoch": 0.010215534959927932, + "grad_norm": 0.792107105255127, + "learning_rate": 2.270018139414356e-06, + "loss": 1.4129, + "step": 438 + }, + { + "epoch": 0.010238858099105849, + "grad_norm": 1.2092666625976562, + "learning_rate": 2.2752008292303707e-06, + "loss": 1.4687, + "step": 439 + }, + { + "epoch": 0.010262181238283766, + "grad_norm": 1.2261115312576294, + "learning_rate": 2.280383519046385e-06, + "loss": 1.5548, + "step": 440 + }, + { + "epoch": 0.010285504377461684, + "grad_norm": 2.0835094451904297, + "learning_rate": 2.2855662088624e-06, + "loss": 1.5925, + "step": 441 + }, + { + "epoch": 0.010308827516639602, + "grad_norm": 1.075907826423645, + "learning_rate": 2.290748898678414e-06, + "loss": 1.4967, + "step": 442 + }, + { + "epoch": 0.01033215065581752, + "grad_norm": 0.9633646011352539, + "learning_rate": 2.295931588494429e-06, + "loss": 1.6798, + "step": 443 + }, + { + "epoch": 0.010355473794995438, + "grad_norm": 1.6833699941635132, + "learning_rate": 2.3011142783104433e-06, + "loss": 1.3053, + "step": 444 + }, + { + "epoch": 0.010378796934173356, + "grad_norm": 1.1333974599838257, + "learning_rate": 2.3062969681264576e-06, + "loss": 1.3658, + "step": 445 + }, + { + "epoch": 0.010402120073351272, + "grad_norm": 1.3382309675216675, + "learning_rate": 2.3114796579424724e-06, + "loss": 1.6492, + "step": 446 + }, + { + "epoch": 0.01042544321252919, + "grad_norm": 0.7148923873901367, + "learning_rate": 2.3166623477584868e-06, + "loss": 1.6269, + "step": 447 + }, + { + "epoch": 0.010448766351707108, + "grad_norm": 1.084245204925537, + "learning_rate": 2.3218450375745015e-06, + "loss": 2.0708, + "step": 448 + }, + { + "epoch": 0.010472089490885026, + "grad_norm": 1.1463004350662231, + "learning_rate": 2.327027727390516e-06, + "loss": 2.0115, + "step": 449 + }, + { + "epoch": 0.010495412630062944, + "grad_norm": 1.5500133037567139, + "learning_rate": 2.3322104172065306e-06, + "loss": 1.5454, + "step": 450 + }, + { + "epoch": 0.010518735769240862, + "grad_norm": 1.2993839979171753, + "learning_rate": 2.337393107022545e-06, + "loss": 1.5475, + "step": 451 + }, + { + "epoch": 0.01054205890841878, + "grad_norm": 1.295839786529541, + "learning_rate": 2.3425757968385593e-06, + "loss": 1.2895, + "step": 452 + }, + { + "epoch": 0.010565382047596696, + "grad_norm": 1.045040488243103, + "learning_rate": 2.3477584866545737e-06, + "loss": 1.7306, + "step": 453 + }, + { + "epoch": 0.010588705186774613, + "grad_norm": 1.4592766761779785, + "learning_rate": 2.3529411764705885e-06, + "loss": 1.7795, + "step": 454 + }, + { + "epoch": 0.010612028325952531, + "grad_norm": 0.9432761073112488, + "learning_rate": 2.358123866286603e-06, + "loss": 1.6963, + "step": 455 + }, + { + "epoch": 0.01063535146513045, + "grad_norm": 1.3770086765289307, + "learning_rate": 2.3633065561026176e-06, + "loss": 1.2003, + "step": 456 + }, + { + "epoch": 0.010658674604308367, + "grad_norm": 1.1453793048858643, + "learning_rate": 2.368489245918632e-06, + "loss": 1.9012, + "step": 457 + }, + { + "epoch": 0.010681997743486285, + "grad_norm": 1.2836976051330566, + "learning_rate": 2.3736719357346467e-06, + "loss": 1.4324, + "step": 458 + }, + { + "epoch": 0.010705320882664203, + "grad_norm": 1.6498123407363892, + "learning_rate": 2.378854625550661e-06, + "loss": 1.6212, + "step": 459 + }, + { + "epoch": 0.010728644021842119, + "grad_norm": 1.3681795597076416, + "learning_rate": 2.3840373153666754e-06, + "loss": 1.6047, + "step": 460 + }, + { + "epoch": 0.010751967161020037, + "grad_norm": 1.4474722146987915, + "learning_rate": 2.38922000518269e-06, + "loss": 1.5279, + "step": 461 + }, + { + "epoch": 0.010775290300197955, + "grad_norm": 1.4832510948181152, + "learning_rate": 2.3944026949987045e-06, + "loss": 1.7073, + "step": 462 + }, + { + "epoch": 0.010798613439375873, + "grad_norm": 1.343935251235962, + "learning_rate": 2.3995853848147193e-06, + "loss": 1.4637, + "step": 463 + }, + { + "epoch": 0.01082193657855379, + "grad_norm": 1.8285539150238037, + "learning_rate": 2.4047680746307336e-06, + "loss": 1.3944, + "step": 464 + }, + { + "epoch": 0.010845259717731709, + "grad_norm": 1.4653230905532837, + "learning_rate": 2.4099507644467484e-06, + "loss": 1.8847, + "step": 465 + }, + { + "epoch": 0.010868582856909626, + "grad_norm": 1.4410351514816284, + "learning_rate": 2.4151334542627623e-06, + "loss": 1.7298, + "step": 466 + }, + { + "epoch": 0.010891905996087543, + "grad_norm": 1.3057256937026978, + "learning_rate": 2.420316144078777e-06, + "loss": 1.6188, + "step": 467 + }, + { + "epoch": 0.01091522913526546, + "grad_norm": 1.574479103088379, + "learning_rate": 2.4254988338947914e-06, + "loss": 1.585, + "step": 468 + }, + { + "epoch": 0.010938552274443378, + "grad_norm": 1.4391696453094482, + "learning_rate": 2.430681523710806e-06, + "loss": 1.7272, + "step": 469 + }, + { + "epoch": 0.010961875413621296, + "grad_norm": 2.304706335067749, + "learning_rate": 2.4358642135268205e-06, + "loss": 1.7127, + "step": 470 + }, + { + "epoch": 0.010985198552799214, + "grad_norm": 1.2380545139312744, + "learning_rate": 2.4410469033428353e-06, + "loss": 1.5428, + "step": 471 + }, + { + "epoch": 0.011008521691977132, + "grad_norm": 1.303446888923645, + "learning_rate": 2.4462295931588497e-06, + "loss": 1.609, + "step": 472 + }, + { + "epoch": 0.01103184483115505, + "grad_norm": 1.3888837099075317, + "learning_rate": 2.451412282974864e-06, + "loss": 1.7134, + "step": 473 + }, + { + "epoch": 0.011055167970332966, + "grad_norm": 0.9802701473236084, + "learning_rate": 2.4565949727908788e-06, + "loss": 1.4401, + "step": 474 + }, + { + "epoch": 0.011078491109510884, + "grad_norm": 1.5808403491973877, + "learning_rate": 2.461777662606893e-06, + "loss": 1.7415, + "step": 475 + }, + { + "epoch": 0.011101814248688802, + "grad_norm": 1.299912691116333, + "learning_rate": 2.466960352422908e-06, + "loss": 1.361, + "step": 476 + }, + { + "epoch": 0.01112513738786672, + "grad_norm": 0.9326110482215881, + "learning_rate": 2.4721430422389222e-06, + "loss": 1.222, + "step": 477 + }, + { + "epoch": 0.011148460527044638, + "grad_norm": 1.0385396480560303, + "learning_rate": 2.477325732054937e-06, + "loss": 1.4813, + "step": 478 + }, + { + "epoch": 0.011171783666222556, + "grad_norm": 1.1004397869110107, + "learning_rate": 2.482508421870951e-06, + "loss": 1.5064, + "step": 479 + }, + { + "epoch": 0.011195106805400474, + "grad_norm": 1.274898886680603, + "learning_rate": 2.4876911116869657e-06, + "loss": 1.3046, + "step": 480 + }, + { + "epoch": 0.01121842994457839, + "grad_norm": 1.0818660259246826, + "learning_rate": 2.49287380150298e-06, + "loss": 1.878, + "step": 481 + }, + { + "epoch": 0.011241753083756308, + "grad_norm": 1.2744652032852173, + "learning_rate": 2.498056491318995e-06, + "loss": 1.6394, + "step": 482 + }, + { + "epoch": 0.011265076222934226, + "grad_norm": 1.0467538833618164, + "learning_rate": 2.503239181135009e-06, + "loss": 1.8949, + "step": 483 + }, + { + "epoch": 0.011288399362112143, + "grad_norm": 1.2507177591323853, + "learning_rate": 2.508421870951024e-06, + "loss": 1.5386, + "step": 484 + }, + { + "epoch": 0.011311722501290061, + "grad_norm": 2.0707380771636963, + "learning_rate": 2.5136045607670383e-06, + "loss": 1.3359, + "step": 485 + }, + { + "epoch": 0.01133504564046798, + "grad_norm": 1.0060955286026, + "learning_rate": 2.518787250583053e-06, + "loss": 1.5551, + "step": 486 + }, + { + "epoch": 0.011358368779645897, + "grad_norm": 2.1019294261932373, + "learning_rate": 2.5239699403990674e-06, + "loss": 1.4009, + "step": 487 + }, + { + "epoch": 0.011381691918823813, + "grad_norm": 1.2085974216461182, + "learning_rate": 2.529152630215082e-06, + "loss": 1.1264, + "step": 488 + }, + { + "epoch": 0.011405015058001731, + "grad_norm": 1.2670215368270874, + "learning_rate": 2.5343353200310965e-06, + "loss": 1.4005, + "step": 489 + }, + { + "epoch": 0.011428338197179649, + "grad_norm": 0.976809024810791, + "learning_rate": 2.5395180098471104e-06, + "loss": 1.6539, + "step": 490 + }, + { + "epoch": 0.011451661336357567, + "grad_norm": 1.8012447357177734, + "learning_rate": 2.5447006996631252e-06, + "loss": 1.5083, + "step": 491 + }, + { + "epoch": 0.011474984475535485, + "grad_norm": 2.0657784938812256, + "learning_rate": 2.5498833894791396e-06, + "loss": 1.4127, + "step": 492 + }, + { + "epoch": 0.011498307614713403, + "grad_norm": 1.4070103168487549, + "learning_rate": 2.5550660792951543e-06, + "loss": 1.4707, + "step": 493 + }, + { + "epoch": 0.01152163075389132, + "grad_norm": 0.859045147895813, + "learning_rate": 2.5602487691111687e-06, + "loss": 1.6301, + "step": 494 + }, + { + "epoch": 0.011544953893069239, + "grad_norm": 1.5209952592849731, + "learning_rate": 2.5654314589271835e-06, + "loss": 1.8438, + "step": 495 + }, + { + "epoch": 0.011568277032247155, + "grad_norm": 1.1508231163024902, + "learning_rate": 2.570614148743198e-06, + "loss": 1.2495, + "step": 496 + }, + { + "epoch": 0.011591600171425073, + "grad_norm": 0.9130313396453857, + "learning_rate": 2.5757968385592126e-06, + "loss": 1.1848, + "step": 497 + }, + { + "epoch": 0.01161492331060299, + "grad_norm": 1.5925562381744385, + "learning_rate": 2.580979528375227e-06, + "loss": 1.4745, + "step": 498 + }, + { + "epoch": 0.011638246449780908, + "grad_norm": 2.5118539333343506, + "learning_rate": 2.5861622181912417e-06, + "loss": 1.6218, + "step": 499 + }, + { + "epoch": 0.011661569588958826, + "grad_norm": 1.272691249847412, + "learning_rate": 2.591344908007256e-06, + "loss": 1.2147, + "step": 500 + }, + { + "epoch": 0.011684892728136744, + "grad_norm": 1.1436160802841187, + "learning_rate": 2.596527597823271e-06, + "loss": 1.5556, + "step": 501 + }, + { + "epoch": 0.011708215867314662, + "grad_norm": 1.0195647478103638, + "learning_rate": 2.601710287639285e-06, + "loss": 1.3303, + "step": 502 + }, + { + "epoch": 0.011731539006492578, + "grad_norm": 1.4576568603515625, + "learning_rate": 2.6068929774553e-06, + "loss": 1.6531, + "step": 503 + }, + { + "epoch": 0.011754862145670496, + "grad_norm": 1.360716462135315, + "learning_rate": 2.612075667271314e-06, + "loss": 1.1761, + "step": 504 + }, + { + "epoch": 0.011778185284848414, + "grad_norm": 2.7770462036132812, + "learning_rate": 2.617258357087328e-06, + "loss": 1.247, + "step": 505 + }, + { + "epoch": 0.011801508424026332, + "grad_norm": 1.3706661462783813, + "learning_rate": 2.622441046903343e-06, + "loss": 1.5103, + "step": 506 + }, + { + "epoch": 0.01182483156320425, + "grad_norm": 1.5405017137527466, + "learning_rate": 2.6276237367193573e-06, + "loss": 1.6827, + "step": 507 + }, + { + "epoch": 0.011848154702382168, + "grad_norm": 1.1809494495391846, + "learning_rate": 2.632806426535372e-06, + "loss": 1.7162, + "step": 508 + }, + { + "epoch": 0.011871477841560086, + "grad_norm": 1.085557222366333, + "learning_rate": 2.6379891163513864e-06, + "loss": 1.514, + "step": 509 + }, + { + "epoch": 0.011894800980738002, + "grad_norm": 1.2155910730361938, + "learning_rate": 2.643171806167401e-06, + "loss": 1.4029, + "step": 510 + }, + { + "epoch": 0.01191812411991592, + "grad_norm": 1.240242600440979, + "learning_rate": 2.6483544959834155e-06, + "loss": 1.4336, + "step": 511 + }, + { + "epoch": 0.011941447259093838, + "grad_norm": 1.649802327156067, + "learning_rate": 2.6535371857994303e-06, + "loss": 1.9082, + "step": 512 + }, + { + "epoch": 0.011964770398271755, + "grad_norm": 1.3479831218719482, + "learning_rate": 2.6587198756154447e-06, + "loss": 1.5424, + "step": 513 + }, + { + "epoch": 0.011988093537449673, + "grad_norm": 1.2537102699279785, + "learning_rate": 2.6639025654314594e-06, + "loss": 1.6061, + "step": 514 + }, + { + "epoch": 0.012011416676627591, + "grad_norm": 1.1049939393997192, + "learning_rate": 2.6690852552474738e-06, + "loss": 1.8361, + "step": 515 + }, + { + "epoch": 0.012034739815805509, + "grad_norm": 2.9946062564849854, + "learning_rate": 2.6742679450634885e-06, + "loss": 1.4471, + "step": 516 + }, + { + "epoch": 0.012058062954983425, + "grad_norm": 0.9455610513687134, + "learning_rate": 2.6794506348795025e-06, + "loss": 1.6831, + "step": 517 + }, + { + "epoch": 0.012081386094161343, + "grad_norm": 1.4750438928604126, + "learning_rate": 2.684633324695517e-06, + "loss": 1.3143, + "step": 518 + }, + { + "epoch": 0.012104709233339261, + "grad_norm": 1.1056557893753052, + "learning_rate": 2.6898160145115316e-06, + "loss": 1.5054, + "step": 519 + }, + { + "epoch": 0.012128032372517179, + "grad_norm": 0.9718064069747925, + "learning_rate": 2.694998704327546e-06, + "loss": 1.3134, + "step": 520 + }, + { + "epoch": 0.012151355511695097, + "grad_norm": 2.2384724617004395, + "learning_rate": 2.7001813941435607e-06, + "loss": 1.4851, + "step": 521 + }, + { + "epoch": 0.012174678650873015, + "grad_norm": 1.2468239068984985, + "learning_rate": 2.705364083959575e-06, + "loss": 1.4873, + "step": 522 + }, + { + "epoch": 0.012198001790050933, + "grad_norm": 1.4248602390289307, + "learning_rate": 2.71054677377559e-06, + "loss": 1.7643, + "step": 523 + }, + { + "epoch": 0.012221324929228849, + "grad_norm": 1.3377385139465332, + "learning_rate": 2.715729463591604e-06, + "loss": 1.7064, + "step": 524 + }, + { + "epoch": 0.012244648068406767, + "grad_norm": 0.9933966994285583, + "learning_rate": 2.720912153407619e-06, + "loss": 1.7187, + "step": 525 + }, + { + "epoch": 0.012267971207584685, + "grad_norm": 1.018750548362732, + "learning_rate": 2.7260948432236333e-06, + "loss": 1.5915, + "step": 526 + }, + { + "epoch": 0.012291294346762602, + "grad_norm": 1.356325387954712, + "learning_rate": 2.731277533039648e-06, + "loss": 1.7193, + "step": 527 + }, + { + "epoch": 0.01231461748594052, + "grad_norm": 1.2781217098236084, + "learning_rate": 2.7364602228556624e-06, + "loss": 1.5494, + "step": 528 + }, + { + "epoch": 0.012337940625118438, + "grad_norm": 1.561498761177063, + "learning_rate": 2.741642912671677e-06, + "loss": 1.6972, + "step": 529 + }, + { + "epoch": 0.012361263764296356, + "grad_norm": 1.1695748567581177, + "learning_rate": 2.7468256024876915e-06, + "loss": 2.1633, + "step": 530 + }, + { + "epoch": 0.012384586903474272, + "grad_norm": 1.4304964542388916, + "learning_rate": 2.7520082923037054e-06, + "loss": 1.6321, + "step": 531 + }, + { + "epoch": 0.01240791004265219, + "grad_norm": 1.0513828992843628, + "learning_rate": 2.7571909821197202e-06, + "loss": 1.2897, + "step": 532 + }, + { + "epoch": 0.012431233181830108, + "grad_norm": 1.0206960439682007, + "learning_rate": 2.7623736719357346e-06, + "loss": 1.7842, + "step": 533 + }, + { + "epoch": 0.012454556321008026, + "grad_norm": 1.1440876722335815, + "learning_rate": 2.7675563617517493e-06, + "loss": 1.4399, + "step": 534 + }, + { + "epoch": 0.012477879460185944, + "grad_norm": 1.0837441682815552, + "learning_rate": 2.7727390515677637e-06, + "loss": 1.5155, + "step": 535 + }, + { + "epoch": 0.012501202599363862, + "grad_norm": 1.071378231048584, + "learning_rate": 2.7779217413837785e-06, + "loss": 1.6459, + "step": 536 + }, + { + "epoch": 0.01252452573854178, + "grad_norm": 1.6966552734375, + "learning_rate": 2.783104431199793e-06, + "loss": 1.6015, + "step": 537 + }, + { + "epoch": 0.012547848877719696, + "grad_norm": 1.2789183855056763, + "learning_rate": 2.7882871210158076e-06, + "loss": 1.2423, + "step": 538 + }, + { + "epoch": 0.012571172016897614, + "grad_norm": 1.2072651386260986, + "learning_rate": 2.793469810831822e-06, + "loss": 1.69, + "step": 539 + }, + { + "epoch": 0.012594495156075532, + "grad_norm": 1.5257117748260498, + "learning_rate": 2.7986525006478367e-06, + "loss": 1.7608, + "step": 540 + }, + { + "epoch": 0.01261781829525345, + "grad_norm": 1.0233759880065918, + "learning_rate": 2.803835190463851e-06, + "loss": 1.1299, + "step": 541 + }, + { + "epoch": 0.012641141434431367, + "grad_norm": 1.8280616998672485, + "learning_rate": 2.809017880279866e-06, + "loss": 1.3338, + "step": 542 + }, + { + "epoch": 0.012664464573609285, + "grad_norm": 1.6891363859176636, + "learning_rate": 2.81420057009588e-06, + "loss": 1.5505, + "step": 543 + }, + { + "epoch": 0.012687787712787203, + "grad_norm": 1.1501421928405762, + "learning_rate": 2.819383259911895e-06, + "loss": 1.6788, + "step": 544 + }, + { + "epoch": 0.01271111085196512, + "grad_norm": 1.107029914855957, + "learning_rate": 2.824565949727909e-06, + "loss": 1.3782, + "step": 545 + }, + { + "epoch": 0.012734433991143037, + "grad_norm": 0.9627429246902466, + "learning_rate": 2.829748639543923e-06, + "loss": 1.3155, + "step": 546 + }, + { + "epoch": 0.012757757130320955, + "grad_norm": 2.330007791519165, + "learning_rate": 2.834931329359938e-06, + "loss": 1.425, + "step": 547 + }, + { + "epoch": 0.012781080269498873, + "grad_norm": 1.4026503562927246, + "learning_rate": 2.8401140191759523e-06, + "loss": 1.5578, + "step": 548 + }, + { + "epoch": 0.012804403408676791, + "grad_norm": 0.9430487155914307, + "learning_rate": 2.845296708991967e-06, + "loss": 1.6075, + "step": 549 + }, + { + "epoch": 0.012827726547854709, + "grad_norm": 1.0779294967651367, + "learning_rate": 2.8504793988079814e-06, + "loss": 1.5169, + "step": 550 + }, + { + "epoch": 0.012851049687032627, + "grad_norm": 1.130324125289917, + "learning_rate": 2.855662088623996e-06, + "loss": 1.5016, + "step": 551 + }, + { + "epoch": 0.012874372826210545, + "grad_norm": 1.0127092599868774, + "learning_rate": 2.8608447784400105e-06, + "loss": 1.8715, + "step": 552 + }, + { + "epoch": 0.01289769596538846, + "grad_norm": 1.1831302642822266, + "learning_rate": 2.8660274682560253e-06, + "loss": 1.678, + "step": 553 + }, + { + "epoch": 0.012921019104566379, + "grad_norm": 1.3394455909729004, + "learning_rate": 2.8712101580720397e-06, + "loss": 1.4129, + "step": 554 + }, + { + "epoch": 0.012944342243744297, + "grad_norm": 1.2189030647277832, + "learning_rate": 2.8763928478880544e-06, + "loss": 1.7364, + "step": 555 + }, + { + "epoch": 0.012967665382922215, + "grad_norm": 1.2808138132095337, + "learning_rate": 2.8815755377040688e-06, + "loss": 1.6274, + "step": 556 + }, + { + "epoch": 0.012990988522100132, + "grad_norm": 1.0384689569473267, + "learning_rate": 2.8867582275200835e-06, + "loss": 1.5942, + "step": 557 + }, + { + "epoch": 0.01301431166127805, + "grad_norm": 1.8520807027816772, + "learning_rate": 2.891940917336098e-06, + "loss": 1.3067, + "step": 558 + }, + { + "epoch": 0.013037634800455968, + "grad_norm": 1.1817374229431152, + "learning_rate": 2.897123607152112e-06, + "loss": 1.6405, + "step": 559 + }, + { + "epoch": 0.013060957939633884, + "grad_norm": 1.1010823249816895, + "learning_rate": 2.9023062969681266e-06, + "loss": 1.4339, + "step": 560 + }, + { + "epoch": 0.013084281078811802, + "grad_norm": 1.2461942434310913, + "learning_rate": 2.907488986784141e-06, + "loss": 1.9866, + "step": 561 + }, + { + "epoch": 0.01310760421798972, + "grad_norm": 1.1503125429153442, + "learning_rate": 2.9126716766001557e-06, + "loss": 1.585, + "step": 562 + }, + { + "epoch": 0.013130927357167638, + "grad_norm": 1.542434573173523, + "learning_rate": 2.91785436641617e-06, + "loss": 1.4524, + "step": 563 + }, + { + "epoch": 0.013154250496345556, + "grad_norm": 1.0469673871994019, + "learning_rate": 2.923037056232185e-06, + "loss": 1.6884, + "step": 564 + }, + { + "epoch": 0.013177573635523474, + "grad_norm": 1.5137437582015991, + "learning_rate": 2.928219746048199e-06, + "loss": 1.5377, + "step": 565 + }, + { + "epoch": 0.013200896774701392, + "grad_norm": 1.1454534530639648, + "learning_rate": 2.933402435864214e-06, + "loss": 1.8508, + "step": 566 + }, + { + "epoch": 0.013224219913879308, + "grad_norm": 1.310381531715393, + "learning_rate": 2.9385851256802283e-06, + "loss": 1.5774, + "step": 567 + }, + { + "epoch": 0.013247543053057226, + "grad_norm": 1.1223838329315186, + "learning_rate": 2.943767815496243e-06, + "loss": 1.4496, + "step": 568 + }, + { + "epoch": 0.013270866192235144, + "grad_norm": 1.4537910223007202, + "learning_rate": 2.9489505053122574e-06, + "loss": 1.4423, + "step": 569 + }, + { + "epoch": 0.013294189331413062, + "grad_norm": 1.1783167123794556, + "learning_rate": 2.954133195128272e-06, + "loss": 1.9314, + "step": 570 + }, + { + "epoch": 0.01331751247059098, + "grad_norm": 1.211719274520874, + "learning_rate": 2.9593158849442865e-06, + "loss": 1.5366, + "step": 571 + }, + { + "epoch": 0.013340835609768897, + "grad_norm": 2.9552671909332275, + "learning_rate": 2.9644985747603004e-06, + "loss": 1.3431, + "step": 572 + }, + { + "epoch": 0.013364158748946815, + "grad_norm": 1.2814795970916748, + "learning_rate": 2.9696812645763152e-06, + "loss": 1.3879, + "step": 573 + }, + { + "epoch": 0.013387481888124731, + "grad_norm": 1.2598010301589966, + "learning_rate": 2.9748639543923296e-06, + "loss": 1.4775, + "step": 574 + }, + { + "epoch": 0.01341080502730265, + "grad_norm": 1.3874925374984741, + "learning_rate": 2.9800466442083443e-06, + "loss": 1.4012, + "step": 575 + }, + { + "epoch": 0.013434128166480567, + "grad_norm": 1.1846306324005127, + "learning_rate": 2.9852293340243587e-06, + "loss": 1.4491, + "step": 576 + }, + { + "epoch": 0.013457451305658485, + "grad_norm": 1.388150691986084, + "learning_rate": 2.9904120238403734e-06, + "loss": 1.6913, + "step": 577 + }, + { + "epoch": 0.013480774444836403, + "grad_norm": 1.8026880025863647, + "learning_rate": 2.995594713656388e-06, + "loss": 1.1754, + "step": 578 + }, + { + "epoch": 0.013504097584014321, + "grad_norm": 1.9366620779037476, + "learning_rate": 3.0007774034724026e-06, + "loss": 1.4406, + "step": 579 + }, + { + "epoch": 0.013527420723192239, + "grad_norm": 1.039657473564148, + "learning_rate": 3.005960093288417e-06, + "loss": 1.4823, + "step": 580 + }, + { + "epoch": 0.013550743862370155, + "grad_norm": 1.0928449630737305, + "learning_rate": 3.0111427831044317e-06, + "loss": 1.4502, + "step": 581 + }, + { + "epoch": 0.013574067001548073, + "grad_norm": 2.408292531967163, + "learning_rate": 3.016325472920446e-06, + "loss": 1.4778, + "step": 582 + }, + { + "epoch": 0.01359739014072599, + "grad_norm": 1.2284953594207764, + "learning_rate": 3.021508162736461e-06, + "loss": 1.5887, + "step": 583 + }, + { + "epoch": 0.013620713279903909, + "grad_norm": 1.3841763734817505, + "learning_rate": 3.026690852552475e-06, + "loss": 1.3778, + "step": 584 + }, + { + "epoch": 0.013644036419081827, + "grad_norm": 1.305172324180603, + "learning_rate": 3.03187354236849e-06, + "loss": 1.2837, + "step": 585 + }, + { + "epoch": 0.013667359558259744, + "grad_norm": 1.087904691696167, + "learning_rate": 3.037056232184504e-06, + "loss": 1.4361, + "step": 586 + }, + { + "epoch": 0.013690682697437662, + "grad_norm": 1.1818716526031494, + "learning_rate": 3.042238922000518e-06, + "loss": 1.4903, + "step": 587 + }, + { + "epoch": 0.013714005836615578, + "grad_norm": 0.9969412088394165, + "learning_rate": 3.047421611816533e-06, + "loss": 1.6923, + "step": 588 + }, + { + "epoch": 0.013737328975793496, + "grad_norm": 1.3729232549667358, + "learning_rate": 3.0526043016325473e-06, + "loss": 1.4219, + "step": 589 + }, + { + "epoch": 0.013760652114971414, + "grad_norm": 1.091769814491272, + "learning_rate": 3.057786991448562e-06, + "loss": 1.6978, + "step": 590 + }, + { + "epoch": 0.013783975254149332, + "grad_norm": 1.1668254137039185, + "learning_rate": 3.0629696812645764e-06, + "loss": 1.4609, + "step": 591 + }, + { + "epoch": 0.01380729839332725, + "grad_norm": 1.3739502429962158, + "learning_rate": 3.068152371080591e-06, + "loss": 1.7247, + "step": 592 + }, + { + "epoch": 0.013830621532505168, + "grad_norm": 1.480758547782898, + "learning_rate": 3.0733350608966055e-06, + "loss": 1.6142, + "step": 593 + }, + { + "epoch": 0.013853944671683086, + "grad_norm": 0.853581964969635, + "learning_rate": 3.0785177507126203e-06, + "loss": 1.5563, + "step": 594 + }, + { + "epoch": 0.013877267810861002, + "grad_norm": 1.144692063331604, + "learning_rate": 3.0837004405286347e-06, + "loss": 1.6145, + "step": 595 + }, + { + "epoch": 0.01390059095003892, + "grad_norm": 1.2413440942764282, + "learning_rate": 3.0888831303446494e-06, + "loss": 1.5762, + "step": 596 + }, + { + "epoch": 0.013923914089216838, + "grad_norm": 1.147834062576294, + "learning_rate": 3.0940658201606638e-06, + "loss": 1.4478, + "step": 597 + }, + { + "epoch": 0.013947237228394756, + "grad_norm": 1.0349398851394653, + "learning_rate": 3.0992485099766785e-06, + "loss": 1.612, + "step": 598 + }, + { + "epoch": 0.013970560367572674, + "grad_norm": 1.4780391454696655, + "learning_rate": 3.104431199792693e-06, + "loss": 1.5179, + "step": 599 + }, + { + "epoch": 0.013993883506750592, + "grad_norm": 1.1395933628082275, + "learning_rate": 3.109613889608707e-06, + "loss": 1.4845, + "step": 600 + }, + { + "epoch": 0.01401720664592851, + "grad_norm": 1.37168550491333, + "learning_rate": 3.1147965794247216e-06, + "loss": 1.581, + "step": 601 + }, + { + "epoch": 0.014040529785106426, + "grad_norm": 1.8260347843170166, + "learning_rate": 3.119979269240736e-06, + "loss": 1.1221, + "step": 602 + }, + { + "epoch": 0.014063852924284343, + "grad_norm": 2.5528669357299805, + "learning_rate": 3.1251619590567507e-06, + "loss": 1.255, + "step": 603 + }, + { + "epoch": 0.014087176063462261, + "grad_norm": 1.3272032737731934, + "learning_rate": 3.130344648872765e-06, + "loss": 1.2713, + "step": 604 + }, + { + "epoch": 0.01411049920264018, + "grad_norm": 1.147449254989624, + "learning_rate": 3.13552733868878e-06, + "loss": 1.3694, + "step": 605 + }, + { + "epoch": 0.014133822341818097, + "grad_norm": 1.173793077468872, + "learning_rate": 3.140710028504794e-06, + "loss": 1.5818, + "step": 606 + }, + { + "epoch": 0.014157145480996015, + "grad_norm": 1.2347713708877563, + "learning_rate": 3.145892718320809e-06, + "loss": 1.501, + "step": 607 + }, + { + "epoch": 0.014180468620173933, + "grad_norm": 1.3945446014404297, + "learning_rate": 3.1510754081368233e-06, + "loss": 1.8674, + "step": 608 + }, + { + "epoch": 0.01420379175935185, + "grad_norm": 1.239762544631958, + "learning_rate": 3.156258097952838e-06, + "loss": 1.2516, + "step": 609 + }, + { + "epoch": 0.014227114898529767, + "grad_norm": 1.552531361579895, + "learning_rate": 3.1614407877688524e-06, + "loss": 1.5358, + "step": 610 + }, + { + "epoch": 0.014250438037707685, + "grad_norm": 1.576997995376587, + "learning_rate": 3.166623477584867e-06, + "loss": 1.7601, + "step": 611 + }, + { + "epoch": 0.014273761176885603, + "grad_norm": 1.3251402378082275, + "learning_rate": 3.1718061674008815e-06, + "loss": 1.2758, + "step": 612 + }, + { + "epoch": 0.01429708431606352, + "grad_norm": 1.2837574481964111, + "learning_rate": 3.1769888572168963e-06, + "loss": 1.528, + "step": 613 + }, + { + "epoch": 0.014320407455241439, + "grad_norm": 0.9697505831718445, + "learning_rate": 3.1821715470329102e-06, + "loss": 1.6359, + "step": 614 + }, + { + "epoch": 0.014343730594419356, + "grad_norm": 1.2682685852050781, + "learning_rate": 3.1873542368489246e-06, + "loss": 1.4759, + "step": 615 + }, + { + "epoch": 0.014367053733597274, + "grad_norm": 0.9607746005058289, + "learning_rate": 3.1925369266649393e-06, + "loss": 1.7474, + "step": 616 + }, + { + "epoch": 0.01439037687277519, + "grad_norm": 1.056736946105957, + "learning_rate": 3.1977196164809537e-06, + "loss": 1.8812, + "step": 617 + }, + { + "epoch": 0.014413700011953108, + "grad_norm": 1.1990852355957031, + "learning_rate": 3.2029023062969684e-06, + "loss": 1.6217, + "step": 618 + }, + { + "epoch": 0.014437023151131026, + "grad_norm": 1.1339764595031738, + "learning_rate": 3.208084996112983e-06, + "loss": 1.3557, + "step": 619 + }, + { + "epoch": 0.014460346290308944, + "grad_norm": 1.0672523975372314, + "learning_rate": 3.2132676859289976e-06, + "loss": 1.8239, + "step": 620 + }, + { + "epoch": 0.014483669429486862, + "grad_norm": 1.4371954202651978, + "learning_rate": 3.218450375745012e-06, + "loss": 1.4571, + "step": 621 + }, + { + "epoch": 0.01450699256866478, + "grad_norm": 1.9893105030059814, + "learning_rate": 3.2236330655610267e-06, + "loss": 1.3716, + "step": 622 + }, + { + "epoch": 0.014530315707842698, + "grad_norm": 1.7084318399429321, + "learning_rate": 3.228815755377041e-06, + "loss": 1.5201, + "step": 623 + }, + { + "epoch": 0.014553638847020614, + "grad_norm": 1.308225154876709, + "learning_rate": 3.233998445193056e-06, + "loss": 1.9173, + "step": 624 + }, + { + "epoch": 0.014576961986198532, + "grad_norm": 0.9914215803146362, + "learning_rate": 3.23918113500907e-06, + "loss": 1.7351, + "step": 625 + }, + { + "epoch": 0.01460028512537645, + "grad_norm": 1.0292766094207764, + "learning_rate": 3.244363824825085e-06, + "loss": 1.4073, + "step": 626 + }, + { + "epoch": 0.014623608264554368, + "grad_norm": 1.0998982191085815, + "learning_rate": 3.2495465146410993e-06, + "loss": 1.5979, + "step": 627 + }, + { + "epoch": 0.014646931403732286, + "grad_norm": 1.1409685611724854, + "learning_rate": 3.254729204457113e-06, + "loss": 1.3442, + "step": 628 + }, + { + "epoch": 0.014670254542910204, + "grad_norm": 1.7685736417770386, + "learning_rate": 3.259911894273128e-06, + "loss": 1.251, + "step": 629 + }, + { + "epoch": 0.014693577682088121, + "grad_norm": 1.6536918878555298, + "learning_rate": 3.2650945840891423e-06, + "loss": 1.4698, + "step": 630 + }, + { + "epoch": 0.014716900821266038, + "grad_norm": 2.046391248703003, + "learning_rate": 3.270277273905157e-06, + "loss": 1.5142, + "step": 631 + }, + { + "epoch": 0.014740223960443955, + "grad_norm": 1.3458948135375977, + "learning_rate": 3.2754599637211714e-06, + "loss": 1.3999, + "step": 632 + }, + { + "epoch": 0.014763547099621873, + "grad_norm": 1.7265046834945679, + "learning_rate": 3.280642653537186e-06, + "loss": 1.2212, + "step": 633 + }, + { + "epoch": 0.014786870238799791, + "grad_norm": 1.3191124200820923, + "learning_rate": 3.2858253433532005e-06, + "loss": 1.4354, + "step": 634 + }, + { + "epoch": 0.01481019337797771, + "grad_norm": 1.2317379713058472, + "learning_rate": 3.2910080331692153e-06, + "loss": 1.5661, + "step": 635 + }, + { + "epoch": 0.014833516517155627, + "grad_norm": 1.400969386100769, + "learning_rate": 3.2961907229852297e-06, + "loss": 1.462, + "step": 636 + }, + { + "epoch": 0.014856839656333545, + "grad_norm": 2.060718059539795, + "learning_rate": 3.3013734128012444e-06, + "loss": 1.7522, + "step": 637 + }, + { + "epoch": 0.014880162795511461, + "grad_norm": 1.138715386390686, + "learning_rate": 3.3065561026172588e-06, + "loss": 1.4923, + "step": 638 + }, + { + "epoch": 0.014903485934689379, + "grad_norm": 1.1973599195480347, + "learning_rate": 3.3117387924332735e-06, + "loss": 1.4462, + "step": 639 + }, + { + "epoch": 0.014926809073867297, + "grad_norm": 1.266867756843567, + "learning_rate": 3.316921482249288e-06, + "loss": 1.3159, + "step": 640 + }, + { + "epoch": 0.014950132213045215, + "grad_norm": 3.4681708812713623, + "learning_rate": 3.322104172065302e-06, + "loss": 1.3566, + "step": 641 + }, + { + "epoch": 0.014973455352223133, + "grad_norm": 1.248502492904663, + "learning_rate": 3.3272868618813166e-06, + "loss": 1.6299, + "step": 642 + }, + { + "epoch": 0.01499677849140105, + "grad_norm": 1.561563491821289, + "learning_rate": 3.332469551697331e-06, + "loss": 1.3246, + "step": 643 + }, + { + "epoch": 0.015020101630578968, + "grad_norm": 1.1922053098678589, + "learning_rate": 3.3376522415133457e-06, + "loss": 1.6847, + "step": 644 + }, + { + "epoch": 0.015043424769756885, + "grad_norm": 1.0779014825820923, + "learning_rate": 3.34283493132936e-06, + "loss": 1.8025, + "step": 645 + }, + { + "epoch": 0.015066747908934803, + "grad_norm": 1.5236597061157227, + "learning_rate": 3.348017621145375e-06, + "loss": 1.3894, + "step": 646 + }, + { + "epoch": 0.01509007104811272, + "grad_norm": 1.2087934017181396, + "learning_rate": 3.353200310961389e-06, + "loss": 1.9119, + "step": 647 + }, + { + "epoch": 0.015113394187290638, + "grad_norm": 1.435085654258728, + "learning_rate": 3.358383000777404e-06, + "loss": 1.4334, + "step": 648 + }, + { + "epoch": 0.015136717326468556, + "grad_norm": 1.3662467002868652, + "learning_rate": 3.3635656905934183e-06, + "loss": 1.6717, + "step": 649 + }, + { + "epoch": 0.015160040465646474, + "grad_norm": 1.379262924194336, + "learning_rate": 3.368748380409433e-06, + "loss": 1.0914, + "step": 650 + }, + { + "epoch": 0.015183363604824392, + "grad_norm": 1.436503529548645, + "learning_rate": 3.3739310702254474e-06, + "loss": 1.296, + "step": 651 + }, + { + "epoch": 0.015206686744002308, + "grad_norm": 1.0189919471740723, + "learning_rate": 3.379113760041462e-06, + "loss": 1.5578, + "step": 652 + }, + { + "epoch": 0.015230009883180226, + "grad_norm": 1.3371915817260742, + "learning_rate": 3.3842964498574765e-06, + "loss": 1.3883, + "step": 653 + }, + { + "epoch": 0.015253333022358144, + "grad_norm": 1.152949333190918, + "learning_rate": 3.389479139673491e-06, + "loss": 1.3408, + "step": 654 + }, + { + "epoch": 0.015276656161536062, + "grad_norm": 0.865856945514679, + "learning_rate": 3.3946618294895052e-06, + "loss": 1.8154, + "step": 655 + }, + { + "epoch": 0.01529997930071398, + "grad_norm": 1.3607538938522339, + "learning_rate": 3.3998445193055196e-06, + "loss": 1.5139, + "step": 656 + }, + { + "epoch": 0.015323302439891898, + "grad_norm": 1.0469399690628052, + "learning_rate": 3.4050272091215343e-06, + "loss": 1.4246, + "step": 657 + }, + { + "epoch": 0.015346625579069816, + "grad_norm": 1.2417982816696167, + "learning_rate": 3.4102098989375487e-06, + "loss": 1.4392, + "step": 658 + }, + { + "epoch": 0.015369948718247732, + "grad_norm": 2.018418073654175, + "learning_rate": 3.4153925887535634e-06, + "loss": 1.5175, + "step": 659 + }, + { + "epoch": 0.01539327185742565, + "grad_norm": 1.2593055963516235, + "learning_rate": 3.420575278569578e-06, + "loss": 1.6338, + "step": 660 + }, + { + "epoch": 0.015416594996603568, + "grad_norm": 1.0297298431396484, + "learning_rate": 3.4257579683855926e-06, + "loss": 1.6309, + "step": 661 + }, + { + "epoch": 0.015439918135781485, + "grad_norm": 1.2963732481002808, + "learning_rate": 3.430940658201607e-06, + "loss": 1.3099, + "step": 662 + }, + { + "epoch": 0.015463241274959403, + "grad_norm": 1.0868266820907593, + "learning_rate": 3.4361233480176217e-06, + "loss": 1.4949, + "step": 663 + }, + { + "epoch": 0.015486564414137321, + "grad_norm": 1.156296968460083, + "learning_rate": 3.441306037833636e-06, + "loss": 1.7845, + "step": 664 + }, + { + "epoch": 0.015509887553315239, + "grad_norm": 1.412965178489685, + "learning_rate": 3.446488727649651e-06, + "loss": 1.19, + "step": 665 + }, + { + "epoch": 0.015533210692493155, + "grad_norm": 1.0419931411743164, + "learning_rate": 3.451671417465665e-06, + "loss": 1.7125, + "step": 666 + }, + { + "epoch": 0.015556533831671073, + "grad_norm": 1.035372018814087, + "learning_rate": 3.4568541072816795e-06, + "loss": 1.7003, + "step": 667 + }, + { + "epoch": 0.015579856970848991, + "grad_norm": 1.1559805870056152, + "learning_rate": 3.4620367970976943e-06, + "loss": 1.981, + "step": 668 + }, + { + "epoch": 0.015603180110026909, + "grad_norm": 0.8634515404701233, + "learning_rate": 3.467219486913708e-06, + "loss": 1.2609, + "step": 669 + }, + { + "epoch": 0.015626503249204827, + "grad_norm": 1.1953692436218262, + "learning_rate": 3.472402176729723e-06, + "loss": 1.3956, + "step": 670 + }, + { + "epoch": 0.015649826388382745, + "grad_norm": 0.9668301939964294, + "learning_rate": 3.4775848665457373e-06, + "loss": 1.0568, + "step": 671 + }, + { + "epoch": 0.015673149527560663, + "grad_norm": 2.4868035316467285, + "learning_rate": 3.482767556361752e-06, + "loss": 1.364, + "step": 672 + }, + { + "epoch": 0.01569647266673858, + "grad_norm": 1.4255839586257935, + "learning_rate": 3.4879502461777664e-06, + "loss": 1.5207, + "step": 673 + }, + { + "epoch": 0.0157197958059165, + "grad_norm": 1.2752389907836914, + "learning_rate": 3.493132935993781e-06, + "loss": 1.5141, + "step": 674 + }, + { + "epoch": 0.015743118945094416, + "grad_norm": 1.2186245918273926, + "learning_rate": 3.4983156258097955e-06, + "loss": 1.3655, + "step": 675 + }, + { + "epoch": 0.015766442084272334, + "grad_norm": 1.3544304370880127, + "learning_rate": 3.5034983156258103e-06, + "loss": 1.7428, + "step": 676 + }, + { + "epoch": 0.01578976522345025, + "grad_norm": 1.0968130826950073, + "learning_rate": 3.5086810054418247e-06, + "loss": 1.3491, + "step": 677 + }, + { + "epoch": 0.015813088362628167, + "grad_norm": 1.1593806743621826, + "learning_rate": 3.513863695257839e-06, + "loss": 1.6708, + "step": 678 + }, + { + "epoch": 0.015836411501806084, + "grad_norm": 1.0408954620361328, + "learning_rate": 3.5190463850738538e-06, + "loss": 1.6977, + "step": 679 + }, + { + "epoch": 0.015859734640984002, + "grad_norm": 1.196632742881775, + "learning_rate": 3.524229074889868e-06, + "loss": 1.2019, + "step": 680 + }, + { + "epoch": 0.01588305778016192, + "grad_norm": 1.2698166370391846, + "learning_rate": 3.529411764705883e-06, + "loss": 1.8457, + "step": 681 + }, + { + "epoch": 0.015906380919339838, + "grad_norm": 0.9075011014938354, + "learning_rate": 3.5345944545218972e-06, + "loss": 1.2717, + "step": 682 + }, + { + "epoch": 0.015929704058517756, + "grad_norm": 1.0426501035690308, + "learning_rate": 3.5397771443379116e-06, + "loss": 1.6601, + "step": 683 + }, + { + "epoch": 0.015953027197695674, + "grad_norm": 1.4904205799102783, + "learning_rate": 3.544959834153926e-06, + "loss": 1.6324, + "step": 684 + }, + { + "epoch": 0.015976350336873592, + "grad_norm": 1.0664643049240112, + "learning_rate": 3.5501425239699407e-06, + "loss": 1.4896, + "step": 685 + }, + { + "epoch": 0.01599967347605151, + "grad_norm": 1.3758978843688965, + "learning_rate": 3.555325213785955e-06, + "loss": 1.5457, + "step": 686 + }, + { + "epoch": 0.016022996615229428, + "grad_norm": 1.4759879112243652, + "learning_rate": 3.56050790360197e-06, + "loss": 1.3865, + "step": 687 + }, + { + "epoch": 0.016046319754407345, + "grad_norm": 1.4678733348846436, + "learning_rate": 3.565690593417984e-06, + "loss": 1.223, + "step": 688 + }, + { + "epoch": 0.016069642893585263, + "grad_norm": 1.2057251930236816, + "learning_rate": 3.570873283233999e-06, + "loss": 1.4864, + "step": 689 + }, + { + "epoch": 0.01609296603276318, + "grad_norm": 1.3976320028305054, + "learning_rate": 3.5760559730500133e-06, + "loss": 1.3371, + "step": 690 + }, + { + "epoch": 0.016116289171941096, + "grad_norm": 1.0588197708129883, + "learning_rate": 3.5812386628660276e-06, + "loss": 1.264, + "step": 691 + }, + { + "epoch": 0.016139612311119014, + "grad_norm": 0.891678512096405, + "learning_rate": 3.5864213526820424e-06, + "loss": 1.6566, + "step": 692 + }, + { + "epoch": 0.01616293545029693, + "grad_norm": 1.1149228811264038, + "learning_rate": 3.5916040424980567e-06, + "loss": 1.6862, + "step": 693 + }, + { + "epoch": 0.01618625858947485, + "grad_norm": 1.463218331336975, + "learning_rate": 3.5967867323140715e-06, + "loss": 1.5771, + "step": 694 + }, + { + "epoch": 0.016209581728652767, + "grad_norm": 1.291648030281067, + "learning_rate": 3.601969422130086e-06, + "loss": 1.443, + "step": 695 + }, + { + "epoch": 0.016232904867830685, + "grad_norm": 1.1534149646759033, + "learning_rate": 3.6071521119461002e-06, + "loss": 1.76, + "step": 696 + }, + { + "epoch": 0.016256228007008603, + "grad_norm": 1.3349847793579102, + "learning_rate": 3.6123348017621146e-06, + "loss": 2.0584, + "step": 697 + }, + { + "epoch": 0.01627955114618652, + "grad_norm": 1.665682315826416, + "learning_rate": 3.6175174915781293e-06, + "loss": 1.5989, + "step": 698 + }, + { + "epoch": 0.01630287428536444, + "grad_norm": 1.6486263275146484, + "learning_rate": 3.6227001813941437e-06, + "loss": 1.7698, + "step": 699 + }, + { + "epoch": 0.016326197424542357, + "grad_norm": 1.5153722763061523, + "learning_rate": 3.6278828712101584e-06, + "loss": 1.3312, + "step": 700 + }, + { + "epoch": 0.016349520563720275, + "grad_norm": 1.3090248107910156, + "learning_rate": 3.633065561026173e-06, + "loss": 1.0735, + "step": 701 + }, + { + "epoch": 0.016372843702898193, + "grad_norm": 1.5462753772735596, + "learning_rate": 3.6382482508421876e-06, + "loss": 1.5408, + "step": 702 + }, + { + "epoch": 0.01639616684207611, + "grad_norm": 1.3447730541229248, + "learning_rate": 3.643430940658202e-06, + "loss": 1.5295, + "step": 703 + }, + { + "epoch": 0.01641948998125403, + "grad_norm": 1.232865571975708, + "learning_rate": 3.6486136304742163e-06, + "loss": 1.8686, + "step": 704 + }, + { + "epoch": 0.016442813120431946, + "grad_norm": 0.9742329120635986, + "learning_rate": 3.653796320290231e-06, + "loss": 1.5951, + "step": 705 + }, + { + "epoch": 0.01646613625960986, + "grad_norm": 1.1572047472000122, + "learning_rate": 3.6589790101062454e-06, + "loss": 1.5068, + "step": 706 + }, + { + "epoch": 0.01648945939878778, + "grad_norm": 1.2024304866790771, + "learning_rate": 3.66416169992226e-06, + "loss": 1.3933, + "step": 707 + }, + { + "epoch": 0.016512782537965696, + "grad_norm": 2.442342758178711, + "learning_rate": 3.6693443897382745e-06, + "loss": 1.0126, + "step": 708 + }, + { + "epoch": 0.016536105677143614, + "grad_norm": 1.2786589860916138, + "learning_rate": 3.6745270795542893e-06, + "loss": 1.6902, + "step": 709 + }, + { + "epoch": 0.016559428816321532, + "grad_norm": 0.9200882315635681, + "learning_rate": 3.679709769370303e-06, + "loss": 1.3918, + "step": 710 + }, + { + "epoch": 0.01658275195549945, + "grad_norm": 1.3768819570541382, + "learning_rate": 3.684892459186318e-06, + "loss": 1.6518, + "step": 711 + }, + { + "epoch": 0.016606075094677368, + "grad_norm": 1.274484395980835, + "learning_rate": 3.6900751490023323e-06, + "loss": 1.3728, + "step": 712 + }, + { + "epoch": 0.016629398233855286, + "grad_norm": 1.1752501726150513, + "learning_rate": 3.695257838818347e-06, + "loss": 1.4234, + "step": 713 + }, + { + "epoch": 0.016652721373033204, + "grad_norm": 1.4458903074264526, + "learning_rate": 3.7004405286343614e-06, + "loss": 1.5695, + "step": 714 + }, + { + "epoch": 0.01667604451221112, + "grad_norm": 1.2630547285079956, + "learning_rate": 3.705623218450376e-06, + "loss": 1.5334, + "step": 715 + }, + { + "epoch": 0.01669936765138904, + "grad_norm": 1.3754082918167114, + "learning_rate": 3.7108059082663905e-06, + "loss": 1.4807, + "step": 716 + }, + { + "epoch": 0.016722690790566958, + "grad_norm": 1.4704689979553223, + "learning_rate": 3.715988598082405e-06, + "loss": 1.5409, + "step": 717 + }, + { + "epoch": 0.016746013929744875, + "grad_norm": 1.4692633152008057, + "learning_rate": 3.7211712878984197e-06, + "loss": 1.5922, + "step": 718 + }, + { + "epoch": 0.016769337068922793, + "grad_norm": 1.2148405313491821, + "learning_rate": 3.726353977714434e-06, + "loss": 1.8115, + "step": 719 + }, + { + "epoch": 0.016792660208100708, + "grad_norm": 1.5564905405044556, + "learning_rate": 3.7315366675304488e-06, + "loss": 1.4189, + "step": 720 + }, + { + "epoch": 0.016815983347278626, + "grad_norm": 1.130292296409607, + "learning_rate": 3.736719357346463e-06, + "loss": 1.4455, + "step": 721 + }, + { + "epoch": 0.016839306486456544, + "grad_norm": 2.0609545707702637, + "learning_rate": 3.741902047162478e-06, + "loss": 1.6052, + "step": 722 + }, + { + "epoch": 0.01686262962563446, + "grad_norm": 1.0422543287277222, + "learning_rate": 3.7470847369784922e-06, + "loss": 1.5889, + "step": 723 + }, + { + "epoch": 0.01688595276481238, + "grad_norm": 1.7926782369613647, + "learning_rate": 3.7522674267945066e-06, + "loss": 1.2304, + "step": 724 + }, + { + "epoch": 0.016909275903990297, + "grad_norm": 1.2486250400543213, + "learning_rate": 3.757450116610521e-06, + "loss": 1.7512, + "step": 725 + }, + { + "epoch": 0.016932599043168215, + "grad_norm": 1.6907048225402832, + "learning_rate": 3.7626328064265357e-06, + "loss": 1.2031, + "step": 726 + }, + { + "epoch": 0.016955922182346133, + "grad_norm": 1.2899296283721924, + "learning_rate": 3.76781549624255e-06, + "loss": 1.3111, + "step": 727 + }, + { + "epoch": 0.01697924532152405, + "grad_norm": 2.320288896560669, + "learning_rate": 3.7729981860585644e-06, + "loss": 1.2764, + "step": 728 + }, + { + "epoch": 0.01700256846070197, + "grad_norm": 1.4165383577346802, + "learning_rate": 3.778180875874579e-06, + "loss": 1.2847, + "step": 729 + }, + { + "epoch": 0.017025891599879887, + "grad_norm": 1.1537601947784424, + "learning_rate": 3.7833635656905935e-06, + "loss": 1.6002, + "step": 730 + }, + { + "epoch": 0.017049214739057805, + "grad_norm": 1.3128899335861206, + "learning_rate": 3.7885462555066083e-06, + "loss": 1.4159, + "step": 731 + }, + { + "epoch": 0.017072537878235722, + "grad_norm": 0.9494642615318298, + "learning_rate": 3.7937289453226226e-06, + "loss": 1.5425, + "step": 732 + }, + { + "epoch": 0.01709586101741364, + "grad_norm": 1.8949923515319824, + "learning_rate": 3.7989116351386374e-06, + "loss": 1.109, + "step": 733 + }, + { + "epoch": 0.017119184156591555, + "grad_norm": 1.3136776685714722, + "learning_rate": 3.8040943249546517e-06, + "loss": 1.4208, + "step": 734 + }, + { + "epoch": 0.017142507295769473, + "grad_norm": 1.0108048915863037, + "learning_rate": 3.8092770147706665e-06, + "loss": 1.3101, + "step": 735 + }, + { + "epoch": 0.01716583043494739, + "grad_norm": 1.1397989988327026, + "learning_rate": 3.814459704586681e-06, + "loss": 1.6643, + "step": 736 + }, + { + "epoch": 0.01718915357412531, + "grad_norm": 0.9662717580795288, + "learning_rate": 3.819642394402696e-06, + "loss": 1.5524, + "step": 737 + }, + { + "epoch": 0.017212476713303226, + "grad_norm": 1.5264514684677124, + "learning_rate": 3.82482508421871e-06, + "loss": 1.6702, + "step": 738 + }, + { + "epoch": 0.017235799852481144, + "grad_norm": 1.1797709465026855, + "learning_rate": 3.830007774034724e-06, + "loss": 1.5751, + "step": 739 + }, + { + "epoch": 0.017259122991659062, + "grad_norm": 1.3964486122131348, + "learning_rate": 3.835190463850739e-06, + "loss": 1.3497, + "step": 740 + }, + { + "epoch": 0.01728244613083698, + "grad_norm": 1.0540798902511597, + "learning_rate": 3.840373153666753e-06, + "loss": 1.623, + "step": 741 + }, + { + "epoch": 0.017305769270014898, + "grad_norm": 1.8619107007980347, + "learning_rate": 3.845555843482767e-06, + "loss": 1.836, + "step": 742 + }, + { + "epoch": 0.017329092409192816, + "grad_norm": 1.190048098564148, + "learning_rate": 3.8507385332987826e-06, + "loss": 1.6031, + "step": 743 + }, + { + "epoch": 0.017352415548370734, + "grad_norm": 1.32784903049469, + "learning_rate": 3.855921223114797e-06, + "loss": 1.6144, + "step": 744 + }, + { + "epoch": 0.01737573868754865, + "grad_norm": 1.7393810749053955, + "learning_rate": 3.861103912930811e-06, + "loss": 1.4898, + "step": 745 + }, + { + "epoch": 0.01739906182672657, + "grad_norm": 1.008122444152832, + "learning_rate": 3.866286602746826e-06, + "loss": 1.6506, + "step": 746 + }, + { + "epoch": 0.017422384965904487, + "grad_norm": 1.3282239437103271, + "learning_rate": 3.871469292562841e-06, + "loss": 1.5178, + "step": 747 + }, + { + "epoch": 0.017445708105082402, + "grad_norm": 1.4479358196258545, + "learning_rate": 3.876651982378855e-06, + "loss": 1.5896, + "step": 748 + }, + { + "epoch": 0.01746903124426032, + "grad_norm": 1.9100661277770996, + "learning_rate": 3.8818346721948695e-06, + "loss": 1.2946, + "step": 749 + }, + { + "epoch": 0.017492354383438238, + "grad_norm": 1.269235610961914, + "learning_rate": 3.887017362010884e-06, + "loss": 1.5707, + "step": 750 + }, + { + "epoch": 0.017515677522616156, + "grad_norm": 1.3187369108200073, + "learning_rate": 3.892200051826899e-06, + "loss": 1.8153, + "step": 751 + }, + { + "epoch": 0.017539000661794073, + "grad_norm": 1.3091131448745728, + "learning_rate": 3.8973827416429125e-06, + "loss": 1.5973, + "step": 752 + }, + { + "epoch": 0.01756232380097199, + "grad_norm": 1.4826890230178833, + "learning_rate": 3.902565431458927e-06, + "loss": 1.3277, + "step": 753 + }, + { + "epoch": 0.01758564694014991, + "grad_norm": 1.2626949548721313, + "learning_rate": 3.907748121274942e-06, + "loss": 1.5531, + "step": 754 + }, + { + "epoch": 0.017608970079327827, + "grad_norm": 1.1990412473678589, + "learning_rate": 3.912930811090956e-06, + "loss": 1.349, + "step": 755 + }, + { + "epoch": 0.017632293218505745, + "grad_norm": 1.3036906719207764, + "learning_rate": 3.918113500906971e-06, + "loss": 1.5648, + "step": 756 + }, + { + "epoch": 0.017655616357683663, + "grad_norm": 1.3129525184631348, + "learning_rate": 3.923296190722985e-06, + "loss": 1.7147, + "step": 757 + }, + { + "epoch": 0.01767893949686158, + "grad_norm": 1.4686280488967896, + "learning_rate": 3.928478880539e-06, + "loss": 1.6136, + "step": 758 + }, + { + "epoch": 0.0177022626360395, + "grad_norm": 1.6845604181289673, + "learning_rate": 3.933661570355015e-06, + "loss": 1.763, + "step": 759 + }, + { + "epoch": 0.017725585775217417, + "grad_norm": 2.019049644470215, + "learning_rate": 3.938844260171029e-06, + "loss": 1.2543, + "step": 760 + }, + { + "epoch": 0.017748908914395334, + "grad_norm": 1.4184072017669678, + "learning_rate": 3.944026949987043e-06, + "loss": 1.596, + "step": 761 + }, + { + "epoch": 0.017772232053573252, + "grad_norm": 1.127982497215271, + "learning_rate": 3.9492096398030585e-06, + "loss": 1.5485, + "step": 762 + }, + { + "epoch": 0.017795555192751167, + "grad_norm": 1.5097321271896362, + "learning_rate": 3.954392329619073e-06, + "loss": 1.5452, + "step": 763 + }, + { + "epoch": 0.017818878331929085, + "grad_norm": 1.3832807540893555, + "learning_rate": 3.959575019435087e-06, + "loss": 1.3865, + "step": 764 + }, + { + "epoch": 0.017842201471107003, + "grad_norm": 1.065623164176941, + "learning_rate": 3.964757709251102e-06, + "loss": 1.2218, + "step": 765 + }, + { + "epoch": 0.01786552461028492, + "grad_norm": 1.2190065383911133, + "learning_rate": 3.969940399067116e-06, + "loss": 1.2169, + "step": 766 + }, + { + "epoch": 0.01788884774946284, + "grad_norm": 1.741749882698059, + "learning_rate": 3.97512308888313e-06, + "loss": 1.7316, + "step": 767 + }, + { + "epoch": 0.017912170888640756, + "grad_norm": 1.2072060108184814, + "learning_rate": 3.980305778699145e-06, + "loss": 1.815, + "step": 768 + }, + { + "epoch": 0.017935494027818674, + "grad_norm": 1.4645625352859497, + "learning_rate": 3.98548846851516e-06, + "loss": 1.2218, + "step": 769 + }, + { + "epoch": 0.017958817166996592, + "grad_norm": 1.4466350078582764, + "learning_rate": 3.990671158331174e-06, + "loss": 1.7291, + "step": 770 + }, + { + "epoch": 0.01798214030617451, + "grad_norm": 1.364358901977539, + "learning_rate": 3.9958538481471885e-06, + "loss": 1.6527, + "step": 771 + }, + { + "epoch": 0.018005463445352428, + "grad_norm": 1.2262394428253174, + "learning_rate": 4.001036537963203e-06, + "loss": 1.5522, + "step": 772 + }, + { + "epoch": 0.018028786584530346, + "grad_norm": 1.694001317024231, + "learning_rate": 4.006219227779218e-06, + "loss": 1.5791, + "step": 773 + }, + { + "epoch": 0.018052109723708264, + "grad_norm": 0.7941157817840576, + "learning_rate": 4.011401917595232e-06, + "loss": 1.23, + "step": 774 + }, + { + "epoch": 0.01807543286288618, + "grad_norm": 1.1942747831344604, + "learning_rate": 4.016584607411247e-06, + "loss": 1.4316, + "step": 775 + }, + { + "epoch": 0.0180987560020641, + "grad_norm": 1.5809072256088257, + "learning_rate": 4.021767297227261e-06, + "loss": 1.7361, + "step": 776 + }, + { + "epoch": 0.018122079141242014, + "grad_norm": 1.2918401956558228, + "learning_rate": 4.026949987043276e-06, + "loss": 1.3285, + "step": 777 + }, + { + "epoch": 0.018145402280419932, + "grad_norm": 1.966123342514038, + "learning_rate": 4.032132676859291e-06, + "loss": 1.2037, + "step": 778 + }, + { + "epoch": 0.01816872541959785, + "grad_norm": 1.3362590074539185, + "learning_rate": 4.037315366675304e-06, + "loss": 1.3811, + "step": 779 + }, + { + "epoch": 0.018192048558775768, + "grad_norm": 1.0375605821609497, + "learning_rate": 4.042498056491319e-06, + "loss": 1.481, + "step": 780 + }, + { + "epoch": 0.018215371697953685, + "grad_norm": 2.414684295654297, + "learning_rate": 4.047680746307334e-06, + "loss": 1.773, + "step": 781 + }, + { + "epoch": 0.018238694837131603, + "grad_norm": 1.2252676486968994, + "learning_rate": 4.052863436123348e-06, + "loss": 1.514, + "step": 782 + }, + { + "epoch": 0.01826201797630952, + "grad_norm": 1.517791748046875, + "learning_rate": 4.058046125939362e-06, + "loss": 1.3442, + "step": 783 + }, + { + "epoch": 0.01828534111548744, + "grad_norm": 1.0303611755371094, + "learning_rate": 4.0632288157553776e-06, + "loss": 1.5593, + "step": 784 + }, + { + "epoch": 0.018308664254665357, + "grad_norm": 1.3615033626556396, + "learning_rate": 4.068411505571392e-06, + "loss": 1.6971, + "step": 785 + }, + { + "epoch": 0.018331987393843275, + "grad_norm": 1.1224147081375122, + "learning_rate": 4.073594195387406e-06, + "loss": 1.2134, + "step": 786 + }, + { + "epoch": 0.018355310533021193, + "grad_norm": 1.3592679500579834, + "learning_rate": 4.078776885203421e-06, + "loss": 1.7391, + "step": 787 + }, + { + "epoch": 0.01837863367219911, + "grad_norm": 1.6286187171936035, + "learning_rate": 4.083959575019436e-06, + "loss": 1.7279, + "step": 788 + }, + { + "epoch": 0.01840195681137703, + "grad_norm": 1.2597742080688477, + "learning_rate": 4.08914226483545e-06, + "loss": 1.5227, + "step": 789 + }, + { + "epoch": 0.018425279950554947, + "grad_norm": 1.2776849269866943, + "learning_rate": 4.0943249546514645e-06, + "loss": 1.3575, + "step": 790 + }, + { + "epoch": 0.01844860308973286, + "grad_norm": 1.2529163360595703, + "learning_rate": 4.099507644467479e-06, + "loss": 1.6356, + "step": 791 + }, + { + "epoch": 0.01847192622891078, + "grad_norm": 1.184187650680542, + "learning_rate": 4.104690334283494e-06, + "loss": 1.734, + "step": 792 + }, + { + "epoch": 0.018495249368088697, + "grad_norm": 1.176222562789917, + "learning_rate": 4.1098730240995075e-06, + "loss": 1.5206, + "step": 793 + }, + { + "epoch": 0.018518572507266615, + "grad_norm": 1.0694701671600342, + "learning_rate": 4.115055713915522e-06, + "loss": 1.1824, + "step": 794 + }, + { + "epoch": 0.018541895646444533, + "grad_norm": 1.5169551372528076, + "learning_rate": 4.120238403731537e-06, + "loss": 1.3817, + "step": 795 + }, + { + "epoch": 0.01856521878562245, + "grad_norm": 1.0996246337890625, + "learning_rate": 4.125421093547551e-06, + "loss": 1.0921, + "step": 796 + }, + { + "epoch": 0.01858854192480037, + "grad_norm": 1.0202140808105469, + "learning_rate": 4.130603783363566e-06, + "loss": 1.2687, + "step": 797 + }, + { + "epoch": 0.018611865063978286, + "grad_norm": 2.089864730834961, + "learning_rate": 4.13578647317958e-06, + "loss": 1.5417, + "step": 798 + }, + { + "epoch": 0.018635188203156204, + "grad_norm": 1.1465847492218018, + "learning_rate": 4.140969162995595e-06, + "loss": 1.3415, + "step": 799 + }, + { + "epoch": 0.018658511342334122, + "grad_norm": 1.1085565090179443, + "learning_rate": 4.14615185281161e-06, + "loss": 1.4662, + "step": 800 + }, + { + "epoch": 0.01868183448151204, + "grad_norm": 1.2206768989562988, + "learning_rate": 4.151334542627624e-06, + "loss": 1.4954, + "step": 801 + }, + { + "epoch": 0.018705157620689958, + "grad_norm": 1.1540756225585938, + "learning_rate": 4.156517232443638e-06, + "loss": 1.4953, + "step": 802 + }, + { + "epoch": 0.018728480759867876, + "grad_norm": 1.9667025804519653, + "learning_rate": 4.1616999222596535e-06, + "loss": 1.1834, + "step": 803 + }, + { + "epoch": 0.018751803899045794, + "grad_norm": 1.2202988862991333, + "learning_rate": 4.166882612075668e-06, + "loss": 1.7045, + "step": 804 + }, + { + "epoch": 0.018775127038223708, + "grad_norm": 1.2399123907089233, + "learning_rate": 4.172065301891682e-06, + "loss": 1.4937, + "step": 805 + }, + { + "epoch": 0.018798450177401626, + "grad_norm": 1.5780203342437744, + "learning_rate": 4.177247991707697e-06, + "loss": 1.6386, + "step": 806 + }, + { + "epoch": 0.018821773316579544, + "grad_norm": 1.524564266204834, + "learning_rate": 4.182430681523711e-06, + "loss": 1.4951, + "step": 807 + }, + { + "epoch": 0.01884509645575746, + "grad_norm": 1.342991590499878, + "learning_rate": 4.187613371339725e-06, + "loss": 1.3007, + "step": 808 + }, + { + "epoch": 0.01886841959493538, + "grad_norm": 1.320813775062561, + "learning_rate": 4.19279606115574e-06, + "loss": 1.2112, + "step": 809 + }, + { + "epoch": 0.018891742734113297, + "grad_norm": 1.2329927682876587, + "learning_rate": 4.197978750971755e-06, + "loss": 1.333, + "step": 810 + }, + { + "epoch": 0.018915065873291215, + "grad_norm": 1.3429094552993774, + "learning_rate": 4.203161440787769e-06, + "loss": 1.4805, + "step": 811 + }, + { + "epoch": 0.018938389012469133, + "grad_norm": 1.643641710281372, + "learning_rate": 4.2083441306037835e-06, + "loss": 1.5665, + "step": 812 + }, + { + "epoch": 0.01896171215164705, + "grad_norm": 1.111887812614441, + "learning_rate": 4.213526820419798e-06, + "loss": 1.6087, + "step": 813 + }, + { + "epoch": 0.01898503529082497, + "grad_norm": 1.3594610691070557, + "learning_rate": 4.218709510235813e-06, + "loss": 1.7666, + "step": 814 + }, + { + "epoch": 0.019008358430002887, + "grad_norm": 1.2298046350479126, + "learning_rate": 4.223892200051827e-06, + "loss": 1.5032, + "step": 815 + }, + { + "epoch": 0.019031681569180805, + "grad_norm": 1.2679171562194824, + "learning_rate": 4.229074889867842e-06, + "loss": 1.4375, + "step": 816 + }, + { + "epoch": 0.019055004708358723, + "grad_norm": 1.0543935298919678, + "learning_rate": 4.234257579683856e-06, + "loss": 1.6645, + "step": 817 + }, + { + "epoch": 0.01907832784753664, + "grad_norm": 1.2821168899536133, + "learning_rate": 4.239440269499871e-06, + "loss": 1.1945, + "step": 818 + }, + { + "epoch": 0.01910165098671456, + "grad_norm": 1.5575084686279297, + "learning_rate": 4.244622959315886e-06, + "loss": 1.3262, + "step": 819 + }, + { + "epoch": 0.019124974125892473, + "grad_norm": 1.2359989881515503, + "learning_rate": 4.2498056491319e-06, + "loss": 1.4127, + "step": 820 + }, + { + "epoch": 0.01914829726507039, + "grad_norm": 1.0559273958206177, + "learning_rate": 4.254988338947914e-06, + "loss": 1.4455, + "step": 821 + }, + { + "epoch": 0.01917162040424831, + "grad_norm": 1.3651732206344604, + "learning_rate": 4.260171028763929e-06, + "loss": 1.245, + "step": 822 + }, + { + "epoch": 0.019194943543426227, + "grad_norm": 1.0067932605743408, + "learning_rate": 4.265353718579943e-06, + "loss": 1.4954, + "step": 823 + }, + { + "epoch": 0.019218266682604145, + "grad_norm": 1.7477822303771973, + "learning_rate": 4.270536408395957e-06, + "loss": 1.8164, + "step": 824 + }, + { + "epoch": 0.019241589821782062, + "grad_norm": 1.1976604461669922, + "learning_rate": 4.2757190982119726e-06, + "loss": 1.4552, + "step": 825 + }, + { + "epoch": 0.01926491296095998, + "grad_norm": 1.306269884109497, + "learning_rate": 4.280901788027987e-06, + "loss": 1.6348, + "step": 826 + }, + { + "epoch": 0.019288236100137898, + "grad_norm": 1.5786314010620117, + "learning_rate": 4.286084477844001e-06, + "loss": 1.4592, + "step": 827 + }, + { + "epoch": 0.019311559239315816, + "grad_norm": 1.4481762647628784, + "learning_rate": 4.291267167660016e-06, + "loss": 1.3409, + "step": 828 + }, + { + "epoch": 0.019334882378493734, + "grad_norm": 1.1410714387893677, + "learning_rate": 4.296449857476031e-06, + "loss": 1.5746, + "step": 829 + }, + { + "epoch": 0.019358205517671652, + "grad_norm": 1.363434076309204, + "learning_rate": 4.301632547292045e-06, + "loss": 1.0836, + "step": 830 + }, + { + "epoch": 0.01938152865684957, + "grad_norm": 1.1413646936416626, + "learning_rate": 4.3068152371080595e-06, + "loss": 1.8687, + "step": 831 + }, + { + "epoch": 0.019404851796027488, + "grad_norm": 1.9734309911727905, + "learning_rate": 4.311997926924074e-06, + "loss": 1.3295, + "step": 832 + }, + { + "epoch": 0.019428174935205406, + "grad_norm": 1.5119333267211914, + "learning_rate": 4.317180616740089e-06, + "loss": 1.6817, + "step": 833 + }, + { + "epoch": 0.01945149807438332, + "grad_norm": 1.3933395147323608, + "learning_rate": 4.3223633065561025e-06, + "loss": 1.5288, + "step": 834 + }, + { + "epoch": 0.019474821213561238, + "grad_norm": 1.3713746070861816, + "learning_rate": 4.327545996372117e-06, + "loss": 1.6361, + "step": 835 + }, + { + "epoch": 0.019498144352739156, + "grad_norm": 1.1849229335784912, + "learning_rate": 4.332728686188132e-06, + "loss": 1.6611, + "step": 836 + }, + { + "epoch": 0.019521467491917074, + "grad_norm": 2.122307777404785, + "learning_rate": 4.337911376004146e-06, + "loss": 1.6258, + "step": 837 + }, + { + "epoch": 0.01954479063109499, + "grad_norm": 1.221781611442566, + "learning_rate": 4.343094065820161e-06, + "loss": 1.9081, + "step": 838 + }, + { + "epoch": 0.01956811377027291, + "grad_norm": 1.2895511388778687, + "learning_rate": 4.348276755636175e-06, + "loss": 1.2742, + "step": 839 + }, + { + "epoch": 0.019591436909450827, + "grad_norm": 1.1531336307525635, + "learning_rate": 4.35345944545219e-06, + "loss": 1.587, + "step": 840 + }, + { + "epoch": 0.019614760048628745, + "grad_norm": 1.3979135751724243, + "learning_rate": 4.358642135268205e-06, + "loss": 1.5208, + "step": 841 + }, + { + "epoch": 0.019638083187806663, + "grad_norm": 1.3758100271224976, + "learning_rate": 4.363824825084219e-06, + "loss": 1.246, + "step": 842 + }, + { + "epoch": 0.01966140632698458, + "grad_norm": 1.3759677410125732, + "learning_rate": 4.369007514900233e-06, + "loss": 1.7344, + "step": 843 + }, + { + "epoch": 0.0196847294661625, + "grad_norm": 1.5575461387634277, + "learning_rate": 4.3741902047162485e-06, + "loss": 1.5554, + "step": 844 + }, + { + "epoch": 0.019708052605340417, + "grad_norm": 1.5018088817596436, + "learning_rate": 4.379372894532263e-06, + "loss": 1.3433, + "step": 845 + }, + { + "epoch": 0.019731375744518335, + "grad_norm": 1.4393954277038574, + "learning_rate": 4.384555584348277e-06, + "loss": 1.7277, + "step": 846 + }, + { + "epoch": 0.019754698883696253, + "grad_norm": 1.0249360799789429, + "learning_rate": 4.389738274164292e-06, + "loss": 1.6538, + "step": 847 + }, + { + "epoch": 0.019778022022874167, + "grad_norm": 1.128587007522583, + "learning_rate": 4.394920963980306e-06, + "loss": 1.2935, + "step": 848 + }, + { + "epoch": 0.019801345162052085, + "grad_norm": 1.301287293434143, + "learning_rate": 4.40010365379632e-06, + "loss": 1.4193, + "step": 849 + }, + { + "epoch": 0.019824668301230003, + "grad_norm": 1.5180747509002686, + "learning_rate": 4.405286343612335e-06, + "loss": 1.2061, + "step": 850 + }, + { + "epoch": 0.01984799144040792, + "grad_norm": 0.9110321402549744, + "learning_rate": 4.41046903342835e-06, + "loss": 1.2803, + "step": 851 + }, + { + "epoch": 0.01987131457958584, + "grad_norm": 1.68843674659729, + "learning_rate": 4.415651723244364e-06, + "loss": 1.2037, + "step": 852 + }, + { + "epoch": 0.019894637718763757, + "grad_norm": 1.2198610305786133, + "learning_rate": 4.4208344130603785e-06, + "loss": 1.6652, + "step": 853 + }, + { + "epoch": 0.019917960857941674, + "grad_norm": 1.579087257385254, + "learning_rate": 4.426017102876393e-06, + "loss": 1.5859, + "step": 854 + }, + { + "epoch": 0.019941283997119592, + "grad_norm": 1.7198874950408936, + "learning_rate": 4.431199792692408e-06, + "loss": 1.4662, + "step": 855 + }, + { + "epoch": 0.01996460713629751, + "grad_norm": 2.817178726196289, + "learning_rate": 4.436382482508422e-06, + "loss": 1.3427, + "step": 856 + }, + { + "epoch": 0.019987930275475428, + "grad_norm": 1.4508287906646729, + "learning_rate": 4.441565172324437e-06, + "loss": 1.2893, + "step": 857 + }, + { + "epoch": 0.020011253414653346, + "grad_norm": 1.29767644405365, + "learning_rate": 4.446747862140451e-06, + "loss": 1.5759, + "step": 858 + }, + { + "epoch": 0.020034576553831264, + "grad_norm": 1.84248685836792, + "learning_rate": 4.451930551956466e-06, + "loss": 2.1373, + "step": 859 + }, + { + "epoch": 0.020057899693009182, + "grad_norm": 1.6153839826583862, + "learning_rate": 4.457113241772481e-06, + "loss": 1.3915, + "step": 860 + }, + { + "epoch": 0.0200812228321871, + "grad_norm": 1.3203104734420776, + "learning_rate": 4.462295931588495e-06, + "loss": 1.569, + "step": 861 + }, + { + "epoch": 0.020104545971365014, + "grad_norm": 1.6475995779037476, + "learning_rate": 4.467478621404509e-06, + "loss": 1.6446, + "step": 862 + }, + { + "epoch": 0.020127869110542932, + "grad_norm": 1.165834665298462, + "learning_rate": 4.472661311220524e-06, + "loss": 1.7323, + "step": 863 + }, + { + "epoch": 0.02015119224972085, + "grad_norm": 1.3182172775268555, + "learning_rate": 4.477844001036538e-06, + "loss": 1.6265, + "step": 864 + }, + { + "epoch": 0.020174515388898768, + "grad_norm": 1.1236745119094849, + "learning_rate": 4.483026690852552e-06, + "loss": 1.2358, + "step": 865 + }, + { + "epoch": 0.020197838528076686, + "grad_norm": 1.2104893922805786, + "learning_rate": 4.4882093806685676e-06, + "loss": 1.4677, + "step": 866 + }, + { + "epoch": 0.020221161667254604, + "grad_norm": 1.6824678182601929, + "learning_rate": 4.493392070484582e-06, + "loss": 1.5802, + "step": 867 + }, + { + "epoch": 0.02024448480643252, + "grad_norm": 1.0679930448532104, + "learning_rate": 4.498574760300596e-06, + "loss": 1.4105, + "step": 868 + }, + { + "epoch": 0.02026780794561044, + "grad_norm": 1.3705253601074219, + "learning_rate": 4.503757450116611e-06, + "loss": 1.5095, + "step": 869 + }, + { + "epoch": 0.020291131084788357, + "grad_norm": 1.307491660118103, + "learning_rate": 4.508940139932626e-06, + "loss": 1.3987, + "step": 870 + }, + { + "epoch": 0.020314454223966275, + "grad_norm": 1.4814496040344238, + "learning_rate": 4.51412282974864e-06, + "loss": 1.635, + "step": 871 + }, + { + "epoch": 0.020337777363144193, + "grad_norm": 0.935867190361023, + "learning_rate": 4.5193055195646545e-06, + "loss": 1.6734, + "step": 872 + }, + { + "epoch": 0.02036110050232211, + "grad_norm": 1.3890215158462524, + "learning_rate": 4.524488209380669e-06, + "loss": 1.4458, + "step": 873 + }, + { + "epoch": 0.02038442364150003, + "grad_norm": 1.628081202507019, + "learning_rate": 4.529670899196684e-06, + "loss": 1.4814, + "step": 874 + }, + { + "epoch": 0.020407746780677947, + "grad_norm": 1.5255577564239502, + "learning_rate": 4.534853589012698e-06, + "loss": 1.3884, + "step": 875 + }, + { + "epoch": 0.020431069919855865, + "grad_norm": 2.09283185005188, + "learning_rate": 4.540036278828712e-06, + "loss": 1.7396, + "step": 876 + }, + { + "epoch": 0.02045439305903378, + "grad_norm": 0.9901561737060547, + "learning_rate": 4.545218968644727e-06, + "loss": 1.4941, + "step": 877 + }, + { + "epoch": 0.020477716198211697, + "grad_norm": 1.8444923162460327, + "learning_rate": 4.550401658460741e-06, + "loss": 1.2724, + "step": 878 + }, + { + "epoch": 0.020501039337389615, + "grad_norm": 1.414305567741394, + "learning_rate": 4.555584348276756e-06, + "loss": 1.5781, + "step": 879 + }, + { + "epoch": 0.020524362476567533, + "grad_norm": 1.1960091590881348, + "learning_rate": 4.56076703809277e-06, + "loss": 1.536, + "step": 880 + }, + { + "epoch": 0.02054768561574545, + "grad_norm": 2.241649627685547, + "learning_rate": 4.565949727908785e-06, + "loss": 1.6636, + "step": 881 + }, + { + "epoch": 0.02057100875492337, + "grad_norm": 1.0672343969345093, + "learning_rate": 4.5711324177248e-06, + "loss": 1.6369, + "step": 882 + }, + { + "epoch": 0.020594331894101287, + "grad_norm": 1.6761622428894043, + "learning_rate": 4.576315107540814e-06, + "loss": 1.2554, + "step": 883 + }, + { + "epoch": 0.020617655033279204, + "grad_norm": 1.1365658044815063, + "learning_rate": 4.581497797356828e-06, + "loss": 1.6271, + "step": 884 + }, + { + "epoch": 0.020640978172457122, + "grad_norm": 1.0631389617919922, + "learning_rate": 4.5866804871728435e-06, + "loss": 1.6393, + "step": 885 + }, + { + "epoch": 0.02066430131163504, + "grad_norm": 3.27304744720459, + "learning_rate": 4.591863176988858e-06, + "loss": 1.3521, + "step": 886 + }, + { + "epoch": 0.020687624450812958, + "grad_norm": 1.3354477882385254, + "learning_rate": 4.597045866804872e-06, + "loss": 1.5137, + "step": 887 + }, + { + "epoch": 0.020710947589990876, + "grad_norm": 2.192812919616699, + "learning_rate": 4.602228556620887e-06, + "loss": 1.7294, + "step": 888 + }, + { + "epoch": 0.020734270729168794, + "grad_norm": 0.9716669321060181, + "learning_rate": 4.607411246436901e-06, + "loss": 1.4244, + "step": 889 + }, + { + "epoch": 0.020757593868346712, + "grad_norm": 1.0377227067947388, + "learning_rate": 4.612593936252915e-06, + "loss": 1.3041, + "step": 890 + }, + { + "epoch": 0.020780917007524626, + "grad_norm": 1.971074104309082, + "learning_rate": 4.61777662606893e-06, + "loss": 1.4917, + "step": 891 + }, + { + "epoch": 0.020804240146702544, + "grad_norm": 1.3108222484588623, + "learning_rate": 4.622959315884945e-06, + "loss": 1.5923, + "step": 892 + }, + { + "epoch": 0.020827563285880462, + "grad_norm": 1.4194189310073853, + "learning_rate": 4.628142005700959e-06, + "loss": 1.2378, + "step": 893 + }, + { + "epoch": 0.02085088642505838, + "grad_norm": 1.5872682332992554, + "learning_rate": 4.6333246955169735e-06, + "loss": 1.3573, + "step": 894 + }, + { + "epoch": 0.020874209564236298, + "grad_norm": 1.351704716682434, + "learning_rate": 4.638507385332988e-06, + "loss": 1.8374, + "step": 895 + }, + { + "epoch": 0.020897532703414216, + "grad_norm": 1.15986168384552, + "learning_rate": 4.643690075149003e-06, + "loss": 1.4303, + "step": 896 + }, + { + "epoch": 0.020920855842592134, + "grad_norm": 1.912819743156433, + "learning_rate": 4.648872764965017e-06, + "loss": 1.7733, + "step": 897 + }, + { + "epoch": 0.02094417898177005, + "grad_norm": 1.6582539081573486, + "learning_rate": 4.654055454781032e-06, + "loss": 1.4696, + "step": 898 + }, + { + "epoch": 0.02096750212094797, + "grad_norm": 1.147661805152893, + "learning_rate": 4.659238144597046e-06, + "loss": 1.5037, + "step": 899 + }, + { + "epoch": 0.020990825260125887, + "grad_norm": 1.1773402690887451, + "learning_rate": 4.664420834413061e-06, + "loss": 1.604, + "step": 900 + }, + { + "epoch": 0.021014148399303805, + "grad_norm": 1.9128248691558838, + "learning_rate": 4.669603524229076e-06, + "loss": 1.3081, + "step": 901 + }, + { + "epoch": 0.021037471538481723, + "grad_norm": 1.0742683410644531, + "learning_rate": 4.67478621404509e-06, + "loss": 1.5619, + "step": 902 + }, + { + "epoch": 0.02106079467765964, + "grad_norm": 1.19862699508667, + "learning_rate": 4.679968903861104e-06, + "loss": 1.6896, + "step": 903 + }, + { + "epoch": 0.02108411781683756, + "grad_norm": 1.276283860206604, + "learning_rate": 4.685151593677119e-06, + "loss": 1.65, + "step": 904 + }, + { + "epoch": 0.021107440956015473, + "grad_norm": 1.3582435846328735, + "learning_rate": 4.690334283493133e-06, + "loss": 1.2686, + "step": 905 + }, + { + "epoch": 0.02113076409519339, + "grad_norm": 1.2145341634750366, + "learning_rate": 4.695516973309147e-06, + "loss": 1.8032, + "step": 906 + }, + { + "epoch": 0.02115408723437131, + "grad_norm": 1.1219233274459839, + "learning_rate": 4.7006996631251626e-06, + "loss": 1.7681, + "step": 907 + }, + { + "epoch": 0.021177410373549227, + "grad_norm": 1.0474015474319458, + "learning_rate": 4.705882352941177e-06, + "loss": 1.4555, + "step": 908 + }, + { + "epoch": 0.021200733512727145, + "grad_norm": 1.6325182914733887, + "learning_rate": 4.711065042757191e-06, + "loss": 1.432, + "step": 909 + }, + { + "epoch": 0.021224056651905063, + "grad_norm": 1.5804178714752197, + "learning_rate": 4.716247732573206e-06, + "loss": 1.7409, + "step": 910 + }, + { + "epoch": 0.02124737979108298, + "grad_norm": 1.226804256439209, + "learning_rate": 4.721430422389221e-06, + "loss": 1.8077, + "step": 911 + }, + { + "epoch": 0.0212707029302609, + "grad_norm": 1.0747625827789307, + "learning_rate": 4.726613112205235e-06, + "loss": 1.411, + "step": 912 + }, + { + "epoch": 0.021294026069438816, + "grad_norm": 1.2126623392105103, + "learning_rate": 4.7317958020212495e-06, + "loss": 1.6464, + "step": 913 + }, + { + "epoch": 0.021317349208616734, + "grad_norm": 1.196486473083496, + "learning_rate": 4.736978491837264e-06, + "loss": 1.4365, + "step": 914 + }, + { + "epoch": 0.021340672347794652, + "grad_norm": 1.4727115631103516, + "learning_rate": 4.742161181653279e-06, + "loss": 1.5059, + "step": 915 + }, + { + "epoch": 0.02136399548697257, + "grad_norm": 1.293938159942627, + "learning_rate": 4.747343871469293e-06, + "loss": 1.5508, + "step": 916 + }, + { + "epoch": 0.021387318626150488, + "grad_norm": 1.3074458837509155, + "learning_rate": 4.752526561285307e-06, + "loss": 1.364, + "step": 917 + }, + { + "epoch": 0.021410641765328406, + "grad_norm": 1.708522081375122, + "learning_rate": 4.757709251101322e-06, + "loss": 1.2891, + "step": 918 + }, + { + "epoch": 0.02143396490450632, + "grad_norm": 1.2926160097122192, + "learning_rate": 4.762891940917336e-06, + "loss": 1.1779, + "step": 919 + }, + { + "epoch": 0.021457288043684238, + "grad_norm": 1.7751168012619019, + "learning_rate": 4.768074630733351e-06, + "loss": 1.3136, + "step": 920 + }, + { + "epoch": 0.021480611182862156, + "grad_norm": 1.3698194026947021, + "learning_rate": 4.773257320549365e-06, + "loss": 1.5203, + "step": 921 + }, + { + "epoch": 0.021503934322040074, + "grad_norm": 1.4710402488708496, + "learning_rate": 4.77844001036538e-06, + "loss": 2.0632, + "step": 922 + }, + { + "epoch": 0.021527257461217992, + "grad_norm": 1.3340466022491455, + "learning_rate": 4.783622700181395e-06, + "loss": 0.9449, + "step": 923 + }, + { + "epoch": 0.02155058060039591, + "grad_norm": 1.990078330039978, + "learning_rate": 4.788805389997409e-06, + "loss": 1.4095, + "step": 924 + }, + { + "epoch": 0.021573903739573828, + "grad_norm": 2.6495463848114014, + "learning_rate": 4.793988079813423e-06, + "loss": 1.5914, + "step": 925 + }, + { + "epoch": 0.021597226878751746, + "grad_norm": 1.368868350982666, + "learning_rate": 4.7991707696294385e-06, + "loss": 1.8007, + "step": 926 + }, + { + "epoch": 0.021620550017929663, + "grad_norm": 1.3946820497512817, + "learning_rate": 4.804353459445453e-06, + "loss": 1.3846, + "step": 927 + }, + { + "epoch": 0.02164387315710758, + "grad_norm": 1.6035547256469727, + "learning_rate": 4.809536149261467e-06, + "loss": 1.6677, + "step": 928 + }, + { + "epoch": 0.0216671962962855, + "grad_norm": 1.29734468460083, + "learning_rate": 4.814718839077482e-06, + "loss": 1.3697, + "step": 929 + }, + { + "epoch": 0.021690519435463417, + "grad_norm": 1.1746439933776855, + "learning_rate": 4.819901528893497e-06, + "loss": 1.6134, + "step": 930 + }, + { + "epoch": 0.021713842574641335, + "grad_norm": 1.255861759185791, + "learning_rate": 4.82508421870951e-06, + "loss": 1.6253, + "step": 931 + }, + { + "epoch": 0.021737165713819253, + "grad_norm": 1.5499615669250488, + "learning_rate": 4.830266908525525e-06, + "loss": 1.2794, + "step": 932 + }, + { + "epoch": 0.02176048885299717, + "grad_norm": 1.6138273477554321, + "learning_rate": 4.83544959834154e-06, + "loss": 1.6365, + "step": 933 + }, + { + "epoch": 0.021783811992175085, + "grad_norm": 1.7135401964187622, + "learning_rate": 4.840632288157554e-06, + "loss": 1.509, + "step": 934 + }, + { + "epoch": 0.021807135131353003, + "grad_norm": 1.4290528297424316, + "learning_rate": 4.8458149779735685e-06, + "loss": 1.3415, + "step": 935 + }, + { + "epoch": 0.02183045827053092, + "grad_norm": 2.034870147705078, + "learning_rate": 4.850997667789583e-06, + "loss": 1.6834, + "step": 936 + }, + { + "epoch": 0.02185378140970884, + "grad_norm": 1.6626250743865967, + "learning_rate": 4.856180357605598e-06, + "loss": 1.3573, + "step": 937 + }, + { + "epoch": 0.021877104548886757, + "grad_norm": 1.2256288528442383, + "learning_rate": 4.861363047421612e-06, + "loss": 1.5497, + "step": 938 + }, + { + "epoch": 0.021900427688064675, + "grad_norm": 1.218955397605896, + "learning_rate": 4.866545737237627e-06, + "loss": 1.6823, + "step": 939 + }, + { + "epoch": 0.021923750827242593, + "grad_norm": 1.0629289150238037, + "learning_rate": 4.871728427053641e-06, + "loss": 1.3894, + "step": 940 + }, + { + "epoch": 0.02194707396642051, + "grad_norm": 2.6169822216033936, + "learning_rate": 4.876911116869656e-06, + "loss": 1.4063, + "step": 941 + }, + { + "epoch": 0.02197039710559843, + "grad_norm": 1.1517153978347778, + "learning_rate": 4.882093806685671e-06, + "loss": 1.3838, + "step": 942 + }, + { + "epoch": 0.021993720244776346, + "grad_norm": 1.6320403814315796, + "learning_rate": 4.887276496501685e-06, + "loss": 1.5752, + "step": 943 + }, + { + "epoch": 0.022017043383954264, + "grad_norm": 1.7344862222671509, + "learning_rate": 4.892459186317699e-06, + "loss": 1.3182, + "step": 944 + }, + { + "epoch": 0.022040366523132182, + "grad_norm": 1.2497214078903198, + "learning_rate": 4.897641876133714e-06, + "loss": 1.2266, + "step": 945 + }, + { + "epoch": 0.0220636896623101, + "grad_norm": 1.996893048286438, + "learning_rate": 4.902824565949728e-06, + "loss": 1.2708, + "step": 946 + }, + { + "epoch": 0.022087012801488018, + "grad_norm": 1.1130571365356445, + "learning_rate": 4.908007255765742e-06, + "loss": 1.4791, + "step": 947 + }, + { + "epoch": 0.022110335940665932, + "grad_norm": 1.2698702812194824, + "learning_rate": 4.9131899455817576e-06, + "loss": 1.3711, + "step": 948 + }, + { + "epoch": 0.02213365907984385, + "grad_norm": 1.0363445281982422, + "learning_rate": 4.918372635397772e-06, + "loss": 1.4153, + "step": 949 + }, + { + "epoch": 0.022156982219021768, + "grad_norm": 1.1418310403823853, + "learning_rate": 4.923555325213786e-06, + "loss": 1.3377, + "step": 950 + }, + { + "epoch": 0.022180305358199686, + "grad_norm": 1.3740698099136353, + "learning_rate": 4.928738015029801e-06, + "loss": 1.375, + "step": 951 + }, + { + "epoch": 0.022203628497377604, + "grad_norm": 1.5656532049179077, + "learning_rate": 4.933920704845816e-06, + "loss": 1.651, + "step": 952 + }, + { + "epoch": 0.022226951636555522, + "grad_norm": 1.209380865097046, + "learning_rate": 4.93910339466183e-06, + "loss": 1.6956, + "step": 953 + }, + { + "epoch": 0.02225027477573344, + "grad_norm": 1.9917747974395752, + "learning_rate": 4.9442860844778445e-06, + "loss": 1.2802, + "step": 954 + }, + { + "epoch": 0.022273597914911358, + "grad_norm": 2.168260097503662, + "learning_rate": 4.949468774293859e-06, + "loss": 1.9773, + "step": 955 + }, + { + "epoch": 0.022296921054089276, + "grad_norm": 1.113978624343872, + "learning_rate": 4.954651464109874e-06, + "loss": 1.8121, + "step": 956 + }, + { + "epoch": 0.022320244193267193, + "grad_norm": 1.4833635091781616, + "learning_rate": 4.959834153925888e-06, + "loss": 1.694, + "step": 957 + }, + { + "epoch": 0.02234356733244511, + "grad_norm": 1.3287935256958008, + "learning_rate": 4.965016843741902e-06, + "loss": 1.4865, + "step": 958 + }, + { + "epoch": 0.02236689047162303, + "grad_norm": 1.5515238046646118, + "learning_rate": 4.970199533557917e-06, + "loss": 1.6035, + "step": 959 + }, + { + "epoch": 0.022390213610800947, + "grad_norm": 1.2824245691299438, + "learning_rate": 4.975382223373931e-06, + "loss": 1.5124, + "step": 960 + }, + { + "epoch": 0.022413536749978865, + "grad_norm": 1.2062418460845947, + "learning_rate": 4.980564913189946e-06, + "loss": 1.5982, + "step": 961 + }, + { + "epoch": 0.02243685988915678, + "grad_norm": 1.2790741920471191, + "learning_rate": 4.98574760300596e-06, + "loss": 1.586, + "step": 962 + }, + { + "epoch": 0.022460183028334697, + "grad_norm": 1.202909231185913, + "learning_rate": 4.990930292821975e-06, + "loss": 1.7387, + "step": 963 + }, + { + "epoch": 0.022483506167512615, + "grad_norm": 1.328963041305542, + "learning_rate": 4.99611298263799e-06, + "loss": 1.5611, + "step": 964 + }, + { + "epoch": 0.022506829306690533, + "grad_norm": 1.3728841543197632, + "learning_rate": 5.001295672454004e-06, + "loss": 1.6887, + "step": 965 + }, + { + "epoch": 0.02253015244586845, + "grad_norm": 1.2474596500396729, + "learning_rate": 5.006478362270018e-06, + "loss": 1.7337, + "step": 966 + }, + { + "epoch": 0.02255347558504637, + "grad_norm": 1.4526808261871338, + "learning_rate": 5.0116610520860335e-06, + "loss": 1.4009, + "step": 967 + }, + { + "epoch": 0.022576798724224287, + "grad_norm": 1.74959397315979, + "learning_rate": 5.016843741902048e-06, + "loss": 1.4153, + "step": 968 + }, + { + "epoch": 0.022600121863402205, + "grad_norm": 1.7886738777160645, + "learning_rate": 5.022026431718062e-06, + "loss": 1.3897, + "step": 969 + }, + { + "epoch": 0.022623445002580123, + "grad_norm": 1.3122284412384033, + "learning_rate": 5.027209121534077e-06, + "loss": 1.6551, + "step": 970 + }, + { + "epoch": 0.02264676814175804, + "grad_norm": 1.5374927520751953, + "learning_rate": 5.032391811350092e-06, + "loss": 1.6396, + "step": 971 + }, + { + "epoch": 0.02267009128093596, + "grad_norm": 1.6476905345916748, + "learning_rate": 5.037574501166106e-06, + "loss": 1.733, + "step": 972 + }, + { + "epoch": 0.022693414420113876, + "grad_norm": 1.3407307863235474, + "learning_rate": 5.0427571909821205e-06, + "loss": 1.4984, + "step": 973 + }, + { + "epoch": 0.022716737559291794, + "grad_norm": 1.5565712451934814, + "learning_rate": 5.047939880798135e-06, + "loss": 1.6524, + "step": 974 + }, + { + "epoch": 0.022740060698469712, + "grad_norm": 1.381903052330017, + "learning_rate": 5.053122570614149e-06, + "loss": 1.5325, + "step": 975 + }, + { + "epoch": 0.022763383837647626, + "grad_norm": 1.916326880455017, + "learning_rate": 5.058305260430164e-06, + "loss": 1.2326, + "step": 976 + }, + { + "epoch": 0.022786706976825544, + "grad_norm": 1.1621575355529785, + "learning_rate": 5.063487950246179e-06, + "loss": 1.2568, + "step": 977 + }, + { + "epoch": 0.022810030116003462, + "grad_norm": 1.3575561046600342, + "learning_rate": 5.068670640062193e-06, + "loss": 1.3755, + "step": 978 + }, + { + "epoch": 0.02283335325518138, + "grad_norm": 1.482701063156128, + "learning_rate": 5.0738533298782065e-06, + "loss": 1.598, + "step": 979 + }, + { + "epoch": 0.022856676394359298, + "grad_norm": 1.2530887126922607, + "learning_rate": 5.079036019694221e-06, + "loss": 1.66, + "step": 980 + }, + { + "epoch": 0.022879999533537216, + "grad_norm": 1.4960439205169678, + "learning_rate": 5.084218709510236e-06, + "loss": 1.5341, + "step": 981 + }, + { + "epoch": 0.022903322672715134, + "grad_norm": 1.507735252380371, + "learning_rate": 5.0894013993262504e-06, + "loss": 1.3987, + "step": 982 + }, + { + "epoch": 0.022926645811893052, + "grad_norm": 2.0131475925445557, + "learning_rate": 5.094584089142265e-06, + "loss": 1.3134, + "step": 983 + }, + { + "epoch": 0.02294996895107097, + "grad_norm": 1.8096015453338623, + "learning_rate": 5.099766778958279e-06, + "loss": 1.3707, + "step": 984 + }, + { + "epoch": 0.022973292090248888, + "grad_norm": 1.0444198846817017, + "learning_rate": 5.104949468774294e-06, + "loss": 1.4119, + "step": 985 + }, + { + "epoch": 0.022996615229426805, + "grad_norm": 1.3110159635543823, + "learning_rate": 5.110132158590309e-06, + "loss": 1.2187, + "step": 986 + }, + { + "epoch": 0.023019938368604723, + "grad_norm": 1.3191614151000977, + "learning_rate": 5.115314848406323e-06, + "loss": 1.3691, + "step": 987 + }, + { + "epoch": 0.02304326150778264, + "grad_norm": 1.3888386487960815, + "learning_rate": 5.120497538222337e-06, + "loss": 1.1934, + "step": 988 + }, + { + "epoch": 0.02306658464696056, + "grad_norm": 1.2101585865020752, + "learning_rate": 5.1256802280383526e-06, + "loss": 1.4962, + "step": 989 + }, + { + "epoch": 0.023089907786138477, + "grad_norm": 1.2938464879989624, + "learning_rate": 5.130862917854367e-06, + "loss": 1.4601, + "step": 990 + }, + { + "epoch": 0.02311323092531639, + "grad_norm": 2.072444200515747, + "learning_rate": 5.136045607670381e-06, + "loss": 1.7241, + "step": 991 + }, + { + "epoch": 0.02313655406449431, + "grad_norm": 1.7139407396316528, + "learning_rate": 5.141228297486396e-06, + "loss": 1.394, + "step": 992 + }, + { + "epoch": 0.023159877203672227, + "grad_norm": 1.5825177431106567, + "learning_rate": 5.146410987302411e-06, + "loss": 1.4218, + "step": 993 + }, + { + "epoch": 0.023183200342850145, + "grad_norm": 1.2233787775039673, + "learning_rate": 5.151593677118425e-06, + "loss": 1.2882, + "step": 994 + }, + { + "epoch": 0.023206523482028063, + "grad_norm": 1.6474647521972656, + "learning_rate": 5.1567763669344395e-06, + "loss": 1.6499, + "step": 995 + }, + { + "epoch": 0.02322984662120598, + "grad_norm": 1.669651985168457, + "learning_rate": 5.161959056750454e-06, + "loss": 1.1727, + "step": 996 + }, + { + "epoch": 0.0232531697603839, + "grad_norm": 1.4976879358291626, + "learning_rate": 5.167141746566469e-06, + "loss": 1.2149, + "step": 997 + }, + { + "epoch": 0.023276492899561817, + "grad_norm": 1.4033470153808594, + "learning_rate": 5.172324436382483e-06, + "loss": 1.3004, + "step": 998 + }, + { + "epoch": 0.023299816038739735, + "grad_norm": 1.3042150735855103, + "learning_rate": 5.177507126198498e-06, + "loss": 1.3803, + "step": 999 + }, + { + "epoch": 0.023323139177917653, + "grad_norm": 1.4327346086502075, + "learning_rate": 5.182689816014512e-06, + "loss": 1.7267, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 128625, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.5429008193870234e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}