diff --git "a/limo/full/checkpoint-1545/trainer_state.json" "b/limo/full/checkpoint-1545/trainer_state.json" new file mode 100644--- /dev/null +++ "b/limo/full/checkpoint-1545/trainer_state.json" @@ -0,0 +1,10848 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.0, + "eval_steps": 500, + "global_step": 1545, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009708737864077669, + "grad_norm": 32.0991353240983, + "learning_rate": 4.999994831641374e-06, + "loss": 4.1052, + "step": 1 + }, + { + "epoch": 0.019417475728155338, + "grad_norm": 29.524023707455505, + "learning_rate": 4.9999793265868636e-06, + "loss": 3.2535, + "step": 2 + }, + { + "epoch": 0.02912621359223301, + "grad_norm": 25.915328826410054, + "learning_rate": 4.999953484900578e-06, + "loss": 3.0603, + "step": 3 + }, + { + "epoch": 0.038834951456310676, + "grad_norm": 26.925318996290773, + "learning_rate": 4.9999173066893655e-06, + "loss": 3.8736, + "step": 4 + }, + { + "epoch": 0.04854368932038835, + "grad_norm": 17.49179072124481, + "learning_rate": 4.9998707921028104e-06, + "loss": 3.2503, + "step": 5 + }, + { + "epoch": 0.05825242718446602, + "grad_norm": 11.174584909912863, + "learning_rate": 4.999813941333237e-06, + "loss": 2.0295, + "step": 6 + }, + { + "epoch": 0.06796116504854369, + "grad_norm": 9.90835381349434, + "learning_rate": 4.999746754615704e-06, + "loss": 1.9601, + "step": 7 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 2.252755813363907, + "learning_rate": 4.9996692322280085e-06, + "loss": 1.1375, + "step": 8 + }, + { + "epoch": 0.08737864077669903, + "grad_norm": 9.417040995705188, + "learning_rate": 4.999581374490681e-06, + "loss": 2.4889, + "step": 9 + }, + { + "epoch": 0.0970873786407767, + "grad_norm": 5.128297805941186, + "learning_rate": 4.999483181766986e-06, + "loss": 1.9038, + "step": 10 + }, + { + "epoch": 0.10679611650485436, + "grad_norm": 2.6424619563549627, + "learning_rate": 4.999374654462919e-06, + "loss": 1.3898, + "step": 11 + }, + { + "epoch": 0.11650485436893204, + "grad_norm": 5.089573735718794, + "learning_rate": 4.999255793027207e-06, + "loss": 2.1286, + "step": 12 + }, + { + "epoch": 0.1262135922330097, + "grad_norm": 2.251991573691883, + "learning_rate": 4.999126597951305e-06, + "loss": 1.0427, + "step": 13 + }, + { + "epoch": 0.13592233009708737, + "grad_norm": 2.2920823891389333, + "learning_rate": 4.998987069769394e-06, + "loss": 1.1071, + "step": 14 + }, + { + "epoch": 0.14563106796116504, + "grad_norm": 2.8377769456674247, + "learning_rate": 4.998837209058379e-06, + "loss": 1.6229, + "step": 15 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 7.727434690224115, + "learning_rate": 4.998677016437888e-06, + "loss": 1.3171, + "step": 16 + }, + { + "epoch": 0.1650485436893204, + "grad_norm": 3.74374589959768, + "learning_rate": 4.998506492570266e-06, + "loss": 1.7463, + "step": 17 + }, + { + "epoch": 0.17475728155339806, + "grad_norm": 3.837005912107839, + "learning_rate": 4.998325638160576e-06, + "loss": 1.3993, + "step": 18 + }, + { + "epoch": 0.18446601941747573, + "grad_norm": 5.797983985086783, + "learning_rate": 4.998134453956596e-06, + "loss": 1.1204, + "step": 19 + }, + { + "epoch": 0.1941747572815534, + "grad_norm": 2.7548345494912807, + "learning_rate": 4.997932940748811e-06, + "loss": 1.2081, + "step": 20 + }, + { + "epoch": 0.20388349514563106, + "grad_norm": 5.045371621372198, + "learning_rate": 4.997721099370416e-06, + "loss": 1.2124, + "step": 21 + }, + { + "epoch": 0.21359223300970873, + "grad_norm": 2.051110624952864, + "learning_rate": 4.997498930697308e-06, + "loss": 1.3137, + "step": 22 + }, + { + "epoch": 0.22330097087378642, + "grad_norm": 2.691454094294257, + "learning_rate": 4.997266435648086e-06, + "loss": 1.3098, + "step": 23 + }, + { + "epoch": 0.23300970873786409, + "grad_norm": 2.4865134695003017, + "learning_rate": 4.997023615184044e-06, + "loss": 1.6151, + "step": 24 + }, + { + "epoch": 0.24271844660194175, + "grad_norm": 2.8075788095017757, + "learning_rate": 4.996770470309167e-06, + "loss": 1.6531, + "step": 25 + }, + { + "epoch": 0.2524271844660194, + "grad_norm": 1.768040918463024, + "learning_rate": 4.996507002070131e-06, + "loss": 1.1902, + "step": 26 + }, + { + "epoch": 0.2621359223300971, + "grad_norm": 1.955363000769332, + "learning_rate": 4.996233211556295e-06, + "loss": 1.2454, + "step": 27 + }, + { + "epoch": 0.27184466019417475, + "grad_norm": 4.241739298469607, + "learning_rate": 4.9959490998996974e-06, + "loss": 0.8855, + "step": 28 + }, + { + "epoch": 0.2815533980582524, + "grad_norm": 7.222330025323884, + "learning_rate": 4.995654668275049e-06, + "loss": 1.2271, + "step": 29 + }, + { + "epoch": 0.2912621359223301, + "grad_norm": 2.3651169987619527, + "learning_rate": 4.995349917899735e-06, + "loss": 1.3329, + "step": 30 + }, + { + "epoch": 0.30097087378640774, + "grad_norm": 1.6948963634507868, + "learning_rate": 4.9950348500338005e-06, + "loss": 0.9562, + "step": 31 + }, + { + "epoch": 0.3106796116504854, + "grad_norm": 1.3136462562718183, + "learning_rate": 4.994709465979954e-06, + "loss": 0.9246, + "step": 32 + }, + { + "epoch": 0.32038834951456313, + "grad_norm": 3.6835006171200306, + "learning_rate": 4.994373767083556e-06, + "loss": 1.0724, + "step": 33 + }, + { + "epoch": 0.3300970873786408, + "grad_norm": 3.163430858725886, + "learning_rate": 4.994027754732616e-06, + "loss": 1.1139, + "step": 34 + }, + { + "epoch": 0.33980582524271846, + "grad_norm": 1.4746343819625023, + "learning_rate": 4.993671430357788e-06, + "loss": 0.9994, + "step": 35 + }, + { + "epoch": 0.34951456310679613, + "grad_norm": 1.9654224512814882, + "learning_rate": 4.99330479543236e-06, + "loss": 1.3667, + "step": 36 + }, + { + "epoch": 0.3592233009708738, + "grad_norm": 1.2495201306162087, + "learning_rate": 4.992927851472254e-06, + "loss": 1.1252, + "step": 37 + }, + { + "epoch": 0.36893203883495146, + "grad_norm": 1.5824371832462278, + "learning_rate": 4.992540600036014e-06, + "loss": 1.3831, + "step": 38 + }, + { + "epoch": 0.3786407766990291, + "grad_norm": 1.4702657605395504, + "learning_rate": 4.992143042724805e-06, + "loss": 1.1461, + "step": 39 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 1.4506087672413808, + "learning_rate": 4.991735181182401e-06, + "loss": 1.4195, + "step": 40 + }, + { + "epoch": 0.39805825242718446, + "grad_norm": 1.075865370590295, + "learning_rate": 4.991317017095182e-06, + "loss": 0.9813, + "step": 41 + }, + { + "epoch": 0.4077669902912621, + "grad_norm": 1.2265855895293323, + "learning_rate": 4.990888552192126e-06, + "loss": 1.0049, + "step": 42 + }, + { + "epoch": 0.4174757281553398, + "grad_norm": 2.4151552794703437, + "learning_rate": 4.9904497882448004e-06, + "loss": 1.1099, + "step": 43 + }, + { + "epoch": 0.42718446601941745, + "grad_norm": 1.6677746145821908, + "learning_rate": 4.990000727067357e-06, + "loss": 1.1481, + "step": 44 + }, + { + "epoch": 0.4368932038834951, + "grad_norm": 1.5396824114800838, + "learning_rate": 4.989541370516523e-06, + "loss": 0.9952, + "step": 45 + }, + { + "epoch": 0.44660194174757284, + "grad_norm": 1.3928969634273873, + "learning_rate": 4.989071720491595e-06, + "loss": 0.9147, + "step": 46 + }, + { + "epoch": 0.4563106796116505, + "grad_norm": 0.9492551299059091, + "learning_rate": 4.988591778934428e-06, + "loss": 0.6869, + "step": 47 + }, + { + "epoch": 0.46601941747572817, + "grad_norm": 1.2714456311245477, + "learning_rate": 4.9881015478294294e-06, + "loss": 0.8276, + "step": 48 + }, + { + "epoch": 0.47572815533980584, + "grad_norm": 2.6314825001613658, + "learning_rate": 4.987601029203553e-06, + "loss": 1.0247, + "step": 49 + }, + { + "epoch": 0.4854368932038835, + "grad_norm": 1.416971495316372, + "learning_rate": 4.987090225126285e-06, + "loss": 1.012, + "step": 50 + }, + { + "epoch": 0.49514563106796117, + "grad_norm": 1.361560829125586, + "learning_rate": 4.98656913770964e-06, + "loss": 0.8907, + "step": 51 + }, + { + "epoch": 0.5048543689320388, + "grad_norm": 1.3035984152044737, + "learning_rate": 4.986037769108154e-06, + "loss": 1.056, + "step": 52 + }, + { + "epoch": 0.5145631067961165, + "grad_norm": 1.0150205143363307, + "learning_rate": 4.9854961215188676e-06, + "loss": 1.0017, + "step": 53 + }, + { + "epoch": 0.5242718446601942, + "grad_norm": 0.9609277033036215, + "learning_rate": 4.984944197181324e-06, + "loss": 0.7601, + "step": 54 + }, + { + "epoch": 0.5339805825242718, + "grad_norm": 1.1898667428428673, + "learning_rate": 4.9843819983775575e-06, + "loss": 0.8858, + "step": 55 + }, + { + "epoch": 0.5436893203883495, + "grad_norm": 1.0139938825198491, + "learning_rate": 4.983809527432086e-06, + "loss": 0.8071, + "step": 56 + }, + { + "epoch": 0.5533980582524272, + "grad_norm": 1.247418732762796, + "learning_rate": 4.983226786711895e-06, + "loss": 0.9675, + "step": 57 + }, + { + "epoch": 0.5631067961165048, + "grad_norm": 0.8942637413037233, + "learning_rate": 4.982633778626437e-06, + "loss": 0.8187, + "step": 58 + }, + { + "epoch": 0.5728155339805825, + "grad_norm": 1.3943617986028647, + "learning_rate": 4.982030505627613e-06, + "loss": 1.0678, + "step": 59 + }, + { + "epoch": 0.5825242718446602, + "grad_norm": 0.8610131821728051, + "learning_rate": 4.98141697020977e-06, + "loss": 0.8306, + "step": 60 + }, + { + "epoch": 0.5922330097087378, + "grad_norm": 1.0768921632881472, + "learning_rate": 4.9807931749096836e-06, + "loss": 0.81, + "step": 61 + }, + { + "epoch": 0.6019417475728155, + "grad_norm": 0.9625381291373968, + "learning_rate": 4.980159122306551e-06, + "loss": 0.892, + "step": 62 + }, + { + "epoch": 0.6116504854368932, + "grad_norm": 0.9617828022691955, + "learning_rate": 4.979514815021984e-06, + "loss": 1.0243, + "step": 63 + }, + { + "epoch": 0.6213592233009708, + "grad_norm": 1.2606711145146943, + "learning_rate": 4.978860255719989e-06, + "loss": 0.7773, + "step": 64 + }, + { + "epoch": 0.6310679611650486, + "grad_norm": 0.8531956488376088, + "learning_rate": 4.978195447106965e-06, + "loss": 0.9458, + "step": 65 + }, + { + "epoch": 0.6407766990291263, + "grad_norm": 0.8576435230419439, + "learning_rate": 4.9775203919316864e-06, + "loss": 0.7812, + "step": 66 + }, + { + "epoch": 0.6504854368932039, + "grad_norm": 0.9030199864232913, + "learning_rate": 4.976835092985297e-06, + "loss": 0.9382, + "step": 67 + }, + { + "epoch": 0.6601941747572816, + "grad_norm": 0.8483972987027173, + "learning_rate": 4.976139553101291e-06, + "loss": 0.8671, + "step": 68 + }, + { + "epoch": 0.6699029126213593, + "grad_norm": 0.8960545150162272, + "learning_rate": 4.975433775155509e-06, + "loss": 0.8646, + "step": 69 + }, + { + "epoch": 0.6796116504854369, + "grad_norm": 1.0017538898162217, + "learning_rate": 4.974717762066123e-06, + "loss": 0.8805, + "step": 70 + }, + { + "epoch": 0.6893203883495146, + "grad_norm": 0.9212309895628247, + "learning_rate": 4.973991516793621e-06, + "loss": 1.0576, + "step": 71 + }, + { + "epoch": 0.6990291262135923, + "grad_norm": 0.9305642899936102, + "learning_rate": 4.973255042340801e-06, + "loss": 0.8486, + "step": 72 + }, + { + "epoch": 0.7087378640776699, + "grad_norm": 1.0773851273416222, + "learning_rate": 4.972508341752754e-06, + "loss": 1.0583, + "step": 73 + }, + { + "epoch": 0.7184466019417476, + "grad_norm": 0.892091799769842, + "learning_rate": 4.9717514181168534e-06, + "loss": 0.7527, + "step": 74 + }, + { + "epoch": 0.7281553398058253, + "grad_norm": 0.8481741026230679, + "learning_rate": 4.970984274562741e-06, + "loss": 0.7125, + "step": 75 + }, + { + "epoch": 0.7378640776699029, + "grad_norm": 0.9312066959231862, + "learning_rate": 4.970206914262315e-06, + "loss": 0.8687, + "step": 76 + }, + { + "epoch": 0.7475728155339806, + "grad_norm": 0.9215979104056321, + "learning_rate": 4.969419340429717e-06, + "loss": 0.7691, + "step": 77 + }, + { + "epoch": 0.7572815533980582, + "grad_norm": 1.227175476240732, + "learning_rate": 4.968621556321319e-06, + "loss": 0.92, + "step": 78 + }, + { + "epoch": 0.7669902912621359, + "grad_norm": 0.9330601553607981, + "learning_rate": 4.967813565235708e-06, + "loss": 0.8216, + "step": 79 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 0.8252798793274818, + "learning_rate": 4.966995370513675e-06, + "loss": 0.7061, + "step": 80 + }, + { + "epoch": 0.7864077669902912, + "grad_norm": 0.9922301648204198, + "learning_rate": 4.966166975538197e-06, + "loss": 1.0408, + "step": 81 + }, + { + "epoch": 0.7961165048543689, + "grad_norm": 0.7891923797199208, + "learning_rate": 4.965328383734429e-06, + "loss": 0.7595, + "step": 82 + }, + { + "epoch": 0.8058252427184466, + "grad_norm": 1.0034345889698408, + "learning_rate": 4.964479598569686e-06, + "loss": 1.0233, + "step": 83 + }, + { + "epoch": 0.8155339805825242, + "grad_norm": 0.9634692778615994, + "learning_rate": 4.963620623553428e-06, + "loss": 0.929, + "step": 84 + }, + { + "epoch": 0.8252427184466019, + "grad_norm": 0.8794808183269647, + "learning_rate": 4.962751462237248e-06, + "loss": 0.7247, + "step": 85 + }, + { + "epoch": 0.8349514563106796, + "grad_norm": 0.8991558039524955, + "learning_rate": 4.9618721182148564e-06, + "loss": 0.6248, + "step": 86 + }, + { + "epoch": 0.8446601941747572, + "grad_norm": 0.9291777621093169, + "learning_rate": 4.960982595122064e-06, + "loss": 0.7035, + "step": 87 + }, + { + "epoch": 0.8543689320388349, + "grad_norm": 0.85791569259972, + "learning_rate": 4.960082896636773e-06, + "loss": 0.9148, + "step": 88 + }, + { + "epoch": 0.8640776699029126, + "grad_norm": 0.9063445307146435, + "learning_rate": 4.959173026478952e-06, + "loss": 0.7805, + "step": 89 + }, + { + "epoch": 0.8737864077669902, + "grad_norm": 0.7587205124423355, + "learning_rate": 4.958252988410631e-06, + "loss": 0.6329, + "step": 90 + }, + { + "epoch": 0.883495145631068, + "grad_norm": 1.0713074090013364, + "learning_rate": 4.9573227862358794e-06, + "loss": 0.7955, + "step": 91 + }, + { + "epoch": 0.8932038834951457, + "grad_norm": 0.8477622625159322, + "learning_rate": 4.956382423800791e-06, + "loss": 0.8325, + "step": 92 + }, + { + "epoch": 0.9029126213592233, + "grad_norm": 0.9269059925838263, + "learning_rate": 4.955431904993471e-06, + "loss": 0.8194, + "step": 93 + }, + { + "epoch": 0.912621359223301, + "grad_norm": 1.0475028792580197, + "learning_rate": 4.954471233744015e-06, + "loss": 0.5835, + "step": 94 + }, + { + "epoch": 0.9223300970873787, + "grad_norm": 0.8985260034505762, + "learning_rate": 4.9535004140245005e-06, + "loss": 0.8063, + "step": 95 + }, + { + "epoch": 0.9320388349514563, + "grad_norm": 1.0048572531965805, + "learning_rate": 4.952519449848962e-06, + "loss": 0.8127, + "step": 96 + }, + { + "epoch": 0.941747572815534, + "grad_norm": 0.9062750156316196, + "learning_rate": 4.951528345273379e-06, + "loss": 0.7181, + "step": 97 + }, + { + "epoch": 0.9514563106796117, + "grad_norm": 0.8778949835317967, + "learning_rate": 4.950527104395659e-06, + "loss": 0.8103, + "step": 98 + }, + { + "epoch": 0.9611650485436893, + "grad_norm": 0.9728187484090823, + "learning_rate": 4.9495157313556185e-06, + "loss": 0.6329, + "step": 99 + }, + { + "epoch": 0.970873786407767, + "grad_norm": 0.7352370132544686, + "learning_rate": 4.94849423033497e-06, + "loss": 0.5231, + "step": 100 + }, + { + "epoch": 0.9805825242718447, + "grad_norm": 0.8479762742075609, + "learning_rate": 4.9474626055573e-06, + "loss": 0.9551, + "step": 101 + }, + { + "epoch": 0.9902912621359223, + "grad_norm": 0.7351142523348447, + "learning_rate": 4.946420861288051e-06, + "loss": 0.738, + "step": 102 + }, + { + "epoch": 1.0, + "grad_norm": 0.9009004817884025, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.7789, + "step": 103 + }, + { + "epoch": 1.0097087378640777, + "grad_norm": 0.8057060436062744, + "learning_rate": 4.944307031545797e-06, + "loss": 0.6629, + "step": 104 + }, + { + "epoch": 1.0194174757281553, + "grad_norm": 0.8977736266341005, + "learning_rate": 4.943234954812812e-06, + "loss": 0.9053, + "step": 105 + }, + { + "epoch": 1.029126213592233, + "grad_norm": 0.7717440949849311, + "learning_rate": 4.942152776068264e-06, + "loss": 0.6404, + "step": 106 + }, + { + "epoch": 1.0388349514563107, + "grad_norm": 0.890503065931168, + "learning_rate": 4.941060499786622e-06, + "loss": 0.9117, + "step": 107 + }, + { + "epoch": 1.0485436893203883, + "grad_norm": 0.8814259279520221, + "learning_rate": 4.939958130484106e-06, + "loss": 0.7055, + "step": 108 + }, + { + "epoch": 1.058252427184466, + "grad_norm": 0.671081647050562, + "learning_rate": 4.938845672718668e-06, + "loss": 0.6671, + "step": 109 + }, + { + "epoch": 1.0679611650485437, + "grad_norm": 0.8278909841687843, + "learning_rate": 4.937723131089974e-06, + "loss": 0.7318, + "step": 110 + }, + { + "epoch": 1.0776699029126213, + "grad_norm": 0.9269878913315724, + "learning_rate": 4.93659051023938e-06, + "loss": 0.7646, + "step": 111 + }, + { + "epoch": 1.087378640776699, + "grad_norm": 0.665235455755867, + "learning_rate": 4.93544781484992e-06, + "loss": 0.4998, + "step": 112 + }, + { + "epoch": 1.0970873786407767, + "grad_norm": 0.8487063874467249, + "learning_rate": 4.9342950496462815e-06, + "loss": 0.7435, + "step": 113 + }, + { + "epoch": 1.1067961165048543, + "grad_norm": 0.8430924624096062, + "learning_rate": 4.933132219394786e-06, + "loss": 0.6992, + "step": 114 + }, + { + "epoch": 1.116504854368932, + "grad_norm": 0.8993084000328078, + "learning_rate": 4.931959328903376e-06, + "loss": 0.8961, + "step": 115 + }, + { + "epoch": 1.1262135922330097, + "grad_norm": 0.7373642852662377, + "learning_rate": 4.930776383021584e-06, + "loss": 0.6722, + "step": 116 + }, + { + "epoch": 1.1359223300970873, + "grad_norm": 0.8356211654383496, + "learning_rate": 4.92958338664052e-06, + "loss": 0.528, + "step": 117 + }, + { + "epoch": 1.145631067961165, + "grad_norm": 0.8788366905889159, + "learning_rate": 4.928380344692853e-06, + "loss": 0.6369, + "step": 118 + }, + { + "epoch": 1.1553398058252426, + "grad_norm": 0.70400203786099, + "learning_rate": 4.927167262152784e-06, + "loss": 0.6961, + "step": 119 + }, + { + "epoch": 1.1650485436893203, + "grad_norm": 0.7933675823948615, + "learning_rate": 4.925944144036027e-06, + "loss": 0.7316, + "step": 120 + }, + { + "epoch": 1.174757281553398, + "grad_norm": 0.7413576987999055, + "learning_rate": 4.924710995399796e-06, + "loss": 0.6764, + "step": 121 + }, + { + "epoch": 1.1844660194174756, + "grad_norm": 0.8385780569550869, + "learning_rate": 4.923467821342773e-06, + "loss": 0.7602, + "step": 122 + }, + { + "epoch": 1.1941747572815533, + "grad_norm": 0.8531356062842188, + "learning_rate": 4.922214627005092e-06, + "loss": 0.749, + "step": 123 + }, + { + "epoch": 1.203883495145631, + "grad_norm": 0.8216291410900652, + "learning_rate": 4.920951417568323e-06, + "loss": 0.8079, + "step": 124 + }, + { + "epoch": 1.2135922330097086, + "grad_norm": 0.8320005263963047, + "learning_rate": 4.919678198255438e-06, + "loss": 0.6805, + "step": 125 + }, + { + "epoch": 1.2233009708737863, + "grad_norm": 0.8268979479586152, + "learning_rate": 4.918394974330801e-06, + "loss": 0.7583, + "step": 126 + }, + { + "epoch": 1.233009708737864, + "grad_norm": 0.7645562011859167, + "learning_rate": 4.917101751100142e-06, + "loss": 0.7109, + "step": 127 + }, + { + "epoch": 1.2427184466019416, + "grad_norm": 0.8421630378850258, + "learning_rate": 4.915798533910534e-06, + "loss": 0.714, + "step": 128 + }, + { + "epoch": 1.2524271844660193, + "grad_norm": 0.7552330522382976, + "learning_rate": 4.9144853281503715e-06, + "loss": 0.5679, + "step": 129 + }, + { + "epoch": 1.262135922330097, + "grad_norm": 0.852273134838057, + "learning_rate": 4.91316213924935e-06, + "loss": 0.7952, + "step": 130 + }, + { + "epoch": 1.2718446601941746, + "grad_norm": 0.8114602189230878, + "learning_rate": 4.911828972678441e-06, + "loss": 0.6176, + "step": 131 + }, + { + "epoch": 1.2815533980582523, + "grad_norm": 0.8164393262307038, + "learning_rate": 4.91048583394987e-06, + "loss": 0.6477, + "step": 132 + }, + { + "epoch": 1.29126213592233, + "grad_norm": 0.8581250231339308, + "learning_rate": 4.909132728617095e-06, + "loss": 0.9476, + "step": 133 + }, + { + "epoch": 1.3009708737864076, + "grad_norm": 0.766359434418812, + "learning_rate": 4.907769662274785e-06, + "loss": 0.6528, + "step": 134 + }, + { + "epoch": 1.3106796116504853, + "grad_norm": 0.6976692264719603, + "learning_rate": 4.90639664055879e-06, + "loss": 0.5637, + "step": 135 + }, + { + "epoch": 1.3203883495145632, + "grad_norm": 0.7314627468283406, + "learning_rate": 4.905013669146127e-06, + "loss": 0.6096, + "step": 136 + }, + { + "epoch": 1.3300970873786409, + "grad_norm": 0.7773436570942428, + "learning_rate": 4.903620753754949e-06, + "loss": 0.7461, + "step": 137 + }, + { + "epoch": 1.3398058252427185, + "grad_norm": 0.8921198866699622, + "learning_rate": 4.902217900144524e-06, + "loss": 0.9358, + "step": 138 + }, + { + "epoch": 1.3495145631067962, + "grad_norm": 0.7375485723614339, + "learning_rate": 4.900805114115214e-06, + "loss": 0.8942, + "step": 139 + }, + { + "epoch": 1.3592233009708738, + "grad_norm": 0.7942794591041569, + "learning_rate": 4.899382401508446e-06, + "loss": 0.5492, + "step": 140 + }, + { + "epoch": 1.3689320388349515, + "grad_norm": 0.8078627057423822, + "learning_rate": 4.8979497682066916e-06, + "loss": 0.7808, + "step": 141 + }, + { + "epoch": 1.3786407766990292, + "grad_norm": 0.6837805173092772, + "learning_rate": 4.89650722013344e-06, + "loss": 0.5391, + "step": 142 + }, + { + "epoch": 1.3883495145631068, + "grad_norm": 0.9832481462127313, + "learning_rate": 4.895054763253177e-06, + "loss": 0.7406, + "step": 143 + }, + { + "epoch": 1.3980582524271845, + "grad_norm": 0.769076719995811, + "learning_rate": 4.8935924035713564e-06, + "loss": 0.5929, + "step": 144 + }, + { + "epoch": 1.4077669902912622, + "grad_norm": 0.8430007967016218, + "learning_rate": 4.892120147134378e-06, + "loss": 0.699, + "step": 145 + }, + { + "epoch": 1.4174757281553398, + "grad_norm": 0.9122194953590461, + "learning_rate": 4.8906380000295615e-06, + "loss": 0.6895, + "step": 146 + }, + { + "epoch": 1.4271844660194175, + "grad_norm": 0.7898786206966036, + "learning_rate": 4.889145968385121e-06, + "loss": 0.6528, + "step": 147 + }, + { + "epoch": 1.4368932038834952, + "grad_norm": 0.7666616277046007, + "learning_rate": 4.887644058370139e-06, + "loss": 0.531, + "step": 148 + }, + { + "epoch": 1.4466019417475728, + "grad_norm": 0.7653718997690864, + "learning_rate": 4.886132276194544e-06, + "loss": 0.5768, + "step": 149 + }, + { + "epoch": 1.4563106796116505, + "grad_norm": 0.90831243558374, + "learning_rate": 4.884610628109082e-06, + "loss": 0.5652, + "step": 150 + }, + { + "epoch": 1.4660194174757282, + "grad_norm": 0.7442744457963555, + "learning_rate": 4.883079120405292e-06, + "loss": 0.6688, + "step": 151 + }, + { + "epoch": 1.4757281553398058, + "grad_norm": 0.7344457110605118, + "learning_rate": 4.881537759415478e-06, + "loss": 0.5314, + "step": 152 + }, + { + "epoch": 1.4854368932038835, + "grad_norm": 1.1695813268075903, + "learning_rate": 4.879986551512684e-06, + "loss": 0.7273, + "step": 153 + }, + { + "epoch": 1.4951456310679612, + "grad_norm": 0.6738881315688398, + "learning_rate": 4.878425503110672e-06, + "loss": 0.5607, + "step": 154 + }, + { + "epoch": 1.5048543689320388, + "grad_norm": 0.8581543167197856, + "learning_rate": 4.876854620663887e-06, + "loss": 0.6943, + "step": 155 + }, + { + "epoch": 1.5145631067961165, + "grad_norm": 0.8060496260215426, + "learning_rate": 4.875273910667434e-06, + "loss": 0.6212, + "step": 156 + }, + { + "epoch": 1.5242718446601942, + "grad_norm": 0.8453811238639173, + "learning_rate": 4.873683379657057e-06, + "loss": 0.5456, + "step": 157 + }, + { + "epoch": 1.5339805825242718, + "grad_norm": 0.8844708129131296, + "learning_rate": 4.8720830342091015e-06, + "loss": 0.9448, + "step": 158 + }, + { + "epoch": 1.5436893203883495, + "grad_norm": 0.861907269409231, + "learning_rate": 4.870472880940496e-06, + "loss": 0.7925, + "step": 159 + }, + { + "epoch": 1.5533980582524272, + "grad_norm": 1.0829307275890252, + "learning_rate": 4.868852926508721e-06, + "loss": 0.8343, + "step": 160 + }, + { + "epoch": 1.5631067961165048, + "grad_norm": 0.875424595240487, + "learning_rate": 4.867223177611779e-06, + "loss": 0.7158, + "step": 161 + }, + { + "epoch": 1.5728155339805825, + "grad_norm": 0.6659133214470974, + "learning_rate": 4.865583640988173e-06, + "loss": 0.5684, + "step": 162 + }, + { + "epoch": 1.5825242718446602, + "grad_norm": 0.772588673794732, + "learning_rate": 4.863934323416871e-06, + "loss": 0.6486, + "step": 163 + }, + { + "epoch": 1.5922330097087378, + "grad_norm": 0.741499702229279, + "learning_rate": 4.862275231717288e-06, + "loss": 0.5758, + "step": 164 + }, + { + "epoch": 1.6019417475728155, + "grad_norm": 0.920336013887185, + "learning_rate": 4.860606372749247e-06, + "loss": 0.5468, + "step": 165 + }, + { + "epoch": 1.6116504854368932, + "grad_norm": 0.7093065195416415, + "learning_rate": 4.858927753412958e-06, + "loss": 0.6127, + "step": 166 + }, + { + "epoch": 1.6213592233009708, + "grad_norm": 0.8110174812695655, + "learning_rate": 4.857239380648985e-06, + "loss": 0.6017, + "step": 167 + }, + { + "epoch": 1.6310679611650487, + "grad_norm": 0.7375215564893128, + "learning_rate": 4.855541261438223e-06, + "loss": 0.6753, + "step": 168 + }, + { + "epoch": 1.6407766990291264, + "grad_norm": 0.7500973220432647, + "learning_rate": 4.8538334028018605e-06, + "loss": 0.8343, + "step": 169 + }, + { + "epoch": 1.650485436893204, + "grad_norm": 0.6948713105727569, + "learning_rate": 4.8521158118013605e-06, + "loss": 0.5493, + "step": 170 + }, + { + "epoch": 1.6601941747572817, + "grad_norm": 0.773755912005905, + "learning_rate": 4.850388495538423e-06, + "loss": 0.6271, + "step": 171 + }, + { + "epoch": 1.6699029126213594, + "grad_norm": 0.8759327452228973, + "learning_rate": 4.84865146115496e-06, + "loss": 0.5641, + "step": 172 + }, + { + "epoch": 1.679611650485437, + "grad_norm": 0.8673872477886911, + "learning_rate": 4.846904715833066e-06, + "loss": 0.5295, + "step": 173 + }, + { + "epoch": 1.6893203883495147, + "grad_norm": 0.7435589108212891, + "learning_rate": 4.8451482667949836e-06, + "loss": 0.7122, + "step": 174 + }, + { + "epoch": 1.6990291262135924, + "grad_norm": 0.7510040594244859, + "learning_rate": 4.843382121303082e-06, + "loss": 0.6579, + "step": 175 + }, + { + "epoch": 1.70873786407767, + "grad_norm": 0.7254228155893363, + "learning_rate": 4.841606286659819e-06, + "loss": 0.5591, + "step": 176 + }, + { + "epoch": 1.7184466019417477, + "grad_norm": 0.7786787819553973, + "learning_rate": 4.839820770207714e-06, + "loss": 0.5417, + "step": 177 + }, + { + "epoch": 1.7281553398058254, + "grad_norm": 0.7424090664501856, + "learning_rate": 4.8380255793293195e-06, + "loss": 0.5679, + "step": 178 + }, + { + "epoch": 1.737864077669903, + "grad_norm": 0.880425961769788, + "learning_rate": 4.8362207214471864e-06, + "loss": 0.5504, + "step": 179 + }, + { + "epoch": 1.7475728155339807, + "grad_norm": 0.7692052821337514, + "learning_rate": 4.83440620402384e-06, + "loss": 0.6217, + "step": 180 + }, + { + "epoch": 1.7572815533980584, + "grad_norm": 0.7571562044181578, + "learning_rate": 4.832582034561738e-06, + "loss": 0.641, + "step": 181 + }, + { + "epoch": 1.766990291262136, + "grad_norm": 0.8286514373124609, + "learning_rate": 4.830748220603251e-06, + "loss": 0.7462, + "step": 182 + }, + { + "epoch": 1.7766990291262137, + "grad_norm": 0.8663863630421635, + "learning_rate": 4.828904769730628e-06, + "loss": 0.7598, + "step": 183 + }, + { + "epoch": 1.7864077669902914, + "grad_norm": 0.8392430546644409, + "learning_rate": 4.827051689565958e-06, + "loss": 0.6278, + "step": 184 + }, + { + "epoch": 1.796116504854369, + "grad_norm": 0.7431256674232911, + "learning_rate": 4.825188987771149e-06, + "loss": 0.7471, + "step": 185 + }, + { + "epoch": 1.8058252427184467, + "grad_norm": 0.775602420628486, + "learning_rate": 4.82331667204789e-06, + "loss": 0.4658, + "step": 186 + }, + { + "epoch": 1.8155339805825244, + "grad_norm": 0.8220707289832607, + "learning_rate": 4.821434750137619e-06, + "loss": 0.6212, + "step": 187 + }, + { + "epoch": 1.825242718446602, + "grad_norm": 0.8721039703571108, + "learning_rate": 4.819543229821494e-06, + "loss": 0.7135, + "step": 188 + }, + { + "epoch": 1.8349514563106797, + "grad_norm": 0.7196046947557964, + "learning_rate": 4.8176421189203605e-06, + "loss": 0.5464, + "step": 189 + }, + { + "epoch": 1.8446601941747574, + "grad_norm": 0.7844213665577071, + "learning_rate": 4.815731425294716e-06, + "loss": 0.6536, + "step": 190 + }, + { + "epoch": 1.854368932038835, + "grad_norm": 0.7921297235454372, + "learning_rate": 4.813811156844681e-06, + "loss": 0.7183, + "step": 191 + }, + { + "epoch": 1.8640776699029127, + "grad_norm": 0.863196612505686, + "learning_rate": 4.811881321509964e-06, + "loss": 0.6976, + "step": 192 + }, + { + "epoch": 1.8737864077669903, + "grad_norm": 0.7755881818234687, + "learning_rate": 4.809941927269829e-06, + "loss": 0.8491, + "step": 193 + }, + { + "epoch": 1.883495145631068, + "grad_norm": 0.8186186296199729, + "learning_rate": 4.807992982143064e-06, + "loss": 0.6343, + "step": 194 + }, + { + "epoch": 1.8932038834951457, + "grad_norm": 0.913496737962807, + "learning_rate": 4.806034494187949e-06, + "loss": 0.8436, + "step": 195 + }, + { + "epoch": 1.9029126213592233, + "grad_norm": 0.7590143443155779, + "learning_rate": 4.804066471502216e-06, + "loss": 0.603, + "step": 196 + }, + { + "epoch": 1.912621359223301, + "grad_norm": 0.766119646590294, + "learning_rate": 4.802088922223024e-06, + "loss": 0.5167, + "step": 197 + }, + { + "epoch": 1.9223300970873787, + "grad_norm": 0.7274017571399753, + "learning_rate": 4.80010185452692e-06, + "loss": 0.6725, + "step": 198 + }, + { + "epoch": 1.9320388349514563, + "grad_norm": 0.73780689049117, + "learning_rate": 4.798105276629806e-06, + "loss": 0.521, + "step": 199 + }, + { + "epoch": 1.941747572815534, + "grad_norm": 0.7791155049733209, + "learning_rate": 4.796099196786908e-06, + "loss": 0.583, + "step": 200 + }, + { + "epoch": 1.9514563106796117, + "grad_norm": 0.8004953963543238, + "learning_rate": 4.794083623292737e-06, + "loss": 0.7842, + "step": 201 + }, + { + "epoch": 1.9611650485436893, + "grad_norm": 0.7807030577892377, + "learning_rate": 4.792058564481058e-06, + "loss": 0.5534, + "step": 202 + }, + { + "epoch": 1.970873786407767, + "grad_norm": 0.7092633632036252, + "learning_rate": 4.7900240287248554e-06, + "loss": 0.7083, + "step": 203 + }, + { + "epoch": 1.9805825242718447, + "grad_norm": 0.8875925228430702, + "learning_rate": 4.7879800244362975e-06, + "loss": 0.7782, + "step": 204 + }, + { + "epoch": 1.9902912621359223, + "grad_norm": 0.8909791320483021, + "learning_rate": 4.785926560066703e-06, + "loss": 0.7681, + "step": 205 + }, + { + "epoch": 2.0, + "grad_norm": 0.7275875515555469, + "learning_rate": 4.783863644106502e-06, + "loss": 0.649, + "step": 206 + }, + { + "epoch": 2.0097087378640777, + "grad_norm": 0.802116606621558, + "learning_rate": 4.781791285085209e-06, + "loss": 0.5568, + "step": 207 + }, + { + "epoch": 2.0194174757281553, + "grad_norm": 0.7406594271373391, + "learning_rate": 4.779709491571378e-06, + "loss": 0.5144, + "step": 208 + }, + { + "epoch": 2.029126213592233, + "grad_norm": 0.6857989185878482, + "learning_rate": 4.777618272172573e-06, + "loss": 0.6449, + "step": 209 + }, + { + "epoch": 2.0388349514563107, + "grad_norm": 0.734748210095666, + "learning_rate": 4.775517635535332e-06, + "loss": 0.7377, + "step": 210 + }, + { + "epoch": 2.0485436893203883, + "grad_norm": 0.7708218856857236, + "learning_rate": 4.77340759034513e-06, + "loss": 0.6855, + "step": 211 + }, + { + "epoch": 2.058252427184466, + "grad_norm": 0.7279897754942517, + "learning_rate": 4.771288145326343e-06, + "loss": 0.6684, + "step": 212 + }, + { + "epoch": 2.0679611650485437, + "grad_norm": 0.674474791686522, + "learning_rate": 4.769159309242213e-06, + "loss": 0.438, + "step": 213 + }, + { + "epoch": 2.0776699029126213, + "grad_norm": 0.792505397680811, + "learning_rate": 4.767021090894809e-06, + "loss": 0.6831, + "step": 214 + }, + { + "epoch": 2.087378640776699, + "grad_norm": 0.744545194492276, + "learning_rate": 4.764873499124997e-06, + "loss": 0.6976, + "step": 215 + }, + { + "epoch": 2.0970873786407767, + "grad_norm": 0.7354909274219694, + "learning_rate": 4.762716542812395e-06, + "loss": 0.5495, + "step": 216 + }, + { + "epoch": 2.1067961165048543, + "grad_norm": 0.7133200950602842, + "learning_rate": 4.7605502308753415e-06, + "loss": 0.5687, + "step": 217 + }, + { + "epoch": 2.116504854368932, + "grad_norm": 0.7982098464573162, + "learning_rate": 4.758374572270859e-06, + "loss": 0.6886, + "step": 218 + }, + { + "epoch": 2.1262135922330097, + "grad_norm": 0.6913449166288441, + "learning_rate": 4.756189575994614e-06, + "loss": 0.3812, + "step": 219 + }, + { + "epoch": 2.1359223300970873, + "grad_norm": 0.7192629747712952, + "learning_rate": 4.753995251080884e-06, + "loss": 0.575, + "step": 220 + }, + { + "epoch": 2.145631067961165, + "grad_norm": 0.6248283732386594, + "learning_rate": 4.7517916066025126e-06, + "loss": 0.463, + "step": 221 + }, + { + "epoch": 2.1553398058252426, + "grad_norm": 0.8287445715621566, + "learning_rate": 4.7495786516708806e-06, + "loss": 0.7071, + "step": 222 + }, + { + "epoch": 2.1650485436893203, + "grad_norm": 0.8318311223138934, + "learning_rate": 4.747356395435865e-06, + "loss": 0.4322, + "step": 223 + }, + { + "epoch": 2.174757281553398, + "grad_norm": 0.8533840933951986, + "learning_rate": 4.745124847085799e-06, + "loss": 0.568, + "step": 224 + }, + { + "epoch": 2.1844660194174756, + "grad_norm": 0.7901361363585824, + "learning_rate": 4.742884015847436e-06, + "loss": 0.5393, + "step": 225 + }, + { + "epoch": 2.1941747572815533, + "grad_norm": 0.8111627610336898, + "learning_rate": 4.740633910985911e-06, + "loss": 0.5969, + "step": 226 + }, + { + "epoch": 2.203883495145631, + "grad_norm": 0.7671540334640228, + "learning_rate": 4.738374541804704e-06, + "loss": 0.4576, + "step": 227 + }, + { + "epoch": 2.2135922330097086, + "grad_norm": 0.7242562019069873, + "learning_rate": 4.7361059176456e-06, + "loss": 0.5186, + "step": 228 + }, + { + "epoch": 2.2233009708737863, + "grad_norm": 0.7763865691421442, + "learning_rate": 4.733828047888647e-06, + "loss": 0.6682, + "step": 229 + }, + { + "epoch": 2.233009708737864, + "grad_norm": 0.6906133600693856, + "learning_rate": 4.731540941952126e-06, + "loss": 0.4051, + "step": 230 + }, + { + "epoch": 2.2427184466019416, + "grad_norm": 0.7481490824065317, + "learning_rate": 4.7292446092925016e-06, + "loss": 0.4902, + "step": 231 + }, + { + "epoch": 2.2524271844660193, + "grad_norm": 0.7044106920537051, + "learning_rate": 4.726939059404392e-06, + "loss": 0.4499, + "step": 232 + }, + { + "epoch": 2.262135922330097, + "grad_norm": 0.6853728707124397, + "learning_rate": 4.724624301820524e-06, + "loss": 0.4757, + "step": 233 + }, + { + "epoch": 2.2718446601941746, + "grad_norm": 0.9364489830239873, + "learning_rate": 4.722300346111695e-06, + "loss": 0.6159, + "step": 234 + }, + { + "epoch": 2.2815533980582523, + "grad_norm": 0.8138300584150231, + "learning_rate": 4.719967201886734e-06, + "loss": 0.5365, + "step": 235 + }, + { + "epoch": 2.29126213592233, + "grad_norm": 0.7475372553230331, + "learning_rate": 4.717624878792461e-06, + "loss": 0.7347, + "step": 236 + }, + { + "epoch": 2.3009708737864076, + "grad_norm": 0.9795714505586022, + "learning_rate": 4.715273386513651e-06, + "loss": 0.7175, + "step": 237 + }, + { + "epoch": 2.3106796116504853, + "grad_norm": 0.7332975150286841, + "learning_rate": 4.712912734772988e-06, + "loss": 0.5204, + "step": 238 + }, + { + "epoch": 2.320388349514563, + "grad_norm": 0.6488936041709285, + "learning_rate": 4.710542933331025e-06, + "loss": 0.5825, + "step": 239 + }, + { + "epoch": 2.3300970873786406, + "grad_norm": 0.7371958257036496, + "learning_rate": 4.708163991986152e-06, + "loss": 0.6679, + "step": 240 + }, + { + "epoch": 2.3398058252427183, + "grad_norm": 0.8214266450639852, + "learning_rate": 4.705775920574546e-06, + "loss": 0.6434, + "step": 241 + }, + { + "epoch": 2.349514563106796, + "grad_norm": 0.7702926046174587, + "learning_rate": 4.703378728970134e-06, + "loss": 0.4755, + "step": 242 + }, + { + "epoch": 2.3592233009708736, + "grad_norm": 0.6996698003201337, + "learning_rate": 4.700972427084551e-06, + "loss": 0.592, + "step": 243 + }, + { + "epoch": 2.3689320388349513, + "grad_norm": 0.7560377791738715, + "learning_rate": 4.698557024867105e-06, + "loss": 0.4565, + "step": 244 + }, + { + "epoch": 2.378640776699029, + "grad_norm": 0.7633818347510686, + "learning_rate": 4.696132532304727e-06, + "loss": 0.6659, + "step": 245 + }, + { + "epoch": 2.3883495145631066, + "grad_norm": 0.7234976470337168, + "learning_rate": 4.693698959421935e-06, + "loss": 0.5678, + "step": 246 + }, + { + "epoch": 2.3980582524271843, + "grad_norm": 0.750666487544434, + "learning_rate": 4.691256316280789e-06, + "loss": 0.8332, + "step": 247 + }, + { + "epoch": 2.407766990291262, + "grad_norm": 0.7493376372555444, + "learning_rate": 4.688804612980855e-06, + "loss": 0.4837, + "step": 248 + }, + { + "epoch": 2.4174757281553396, + "grad_norm": 0.6751745421938132, + "learning_rate": 4.686343859659158e-06, + "loss": 0.6951, + "step": 249 + }, + { + "epoch": 2.4271844660194173, + "grad_norm": 0.734057362640619, + "learning_rate": 4.683874066490143e-06, + "loss": 0.5507, + "step": 250 + }, + { + "epoch": 2.436893203883495, + "grad_norm": 0.7227243519681933, + "learning_rate": 4.681395243685631e-06, + "loss": 0.56, + "step": 251 + }, + { + "epoch": 2.4466019417475726, + "grad_norm": 0.7751770457742055, + "learning_rate": 4.67890740149478e-06, + "loss": 0.518, + "step": 252 + }, + { + "epoch": 2.4563106796116507, + "grad_norm": 0.8010398356492873, + "learning_rate": 4.676410550204036e-06, + "loss": 0.6707, + "step": 253 + }, + { + "epoch": 2.466019417475728, + "grad_norm": 0.6322959904355947, + "learning_rate": 4.673904700137098e-06, + "loss": 0.3992, + "step": 254 + }, + { + "epoch": 2.475728155339806, + "grad_norm": 0.6951997535390679, + "learning_rate": 4.671389861654873e-06, + "loss": 0.4745, + "step": 255 + }, + { + "epoch": 2.4854368932038833, + "grad_norm": 0.7106909584689424, + "learning_rate": 4.668866045155428e-06, + "loss": 0.596, + "step": 256 + }, + { + "epoch": 2.4951456310679614, + "grad_norm": 0.8847918134161639, + "learning_rate": 4.666333261073956e-06, + "loss": 0.6131, + "step": 257 + }, + { + "epoch": 2.5048543689320386, + "grad_norm": 0.7875255376725511, + "learning_rate": 4.6637915198827265e-06, + "loss": 0.6808, + "step": 258 + }, + { + "epoch": 2.5145631067961167, + "grad_norm": 0.74457845575171, + "learning_rate": 4.661240832091042e-06, + "loss": 0.534, + "step": 259 + }, + { + "epoch": 2.524271844660194, + "grad_norm": 0.850799682933559, + "learning_rate": 4.658681208245198e-06, + "loss": 0.6302, + "step": 260 + }, + { + "epoch": 2.533980582524272, + "grad_norm": 0.7621063847057238, + "learning_rate": 4.65611265892844e-06, + "loss": 0.5924, + "step": 261 + }, + { + "epoch": 2.5436893203883493, + "grad_norm": 0.8162383044494845, + "learning_rate": 4.653535194760912e-06, + "loss": 0.5497, + "step": 262 + }, + { + "epoch": 2.5533980582524274, + "grad_norm": 0.7920894927810362, + "learning_rate": 4.650948826399624e-06, + "loss": 0.392, + "step": 263 + }, + { + "epoch": 2.5631067961165046, + "grad_norm": 0.7383821620603919, + "learning_rate": 4.648353564538397e-06, + "loss": 0.5049, + "step": 264 + }, + { + "epoch": 2.5728155339805827, + "grad_norm": 0.7650056257507512, + "learning_rate": 4.645749419907829e-06, + "loss": 0.4973, + "step": 265 + }, + { + "epoch": 2.58252427184466, + "grad_norm": 0.7120510594433922, + "learning_rate": 4.64313640327524e-06, + "loss": 0.406, + "step": 266 + }, + { + "epoch": 2.592233009708738, + "grad_norm": 0.7835448578972172, + "learning_rate": 4.640514525444637e-06, + "loss": 0.6122, + "step": 267 + }, + { + "epoch": 2.6019417475728153, + "grad_norm": 0.9134409217464837, + "learning_rate": 4.637883797256663e-06, + "loss": 0.845, + "step": 268 + }, + { + "epoch": 2.6116504854368934, + "grad_norm": 0.7387942613013898, + "learning_rate": 4.635244229588558e-06, + "loss": 0.5296, + "step": 269 + }, + { + "epoch": 2.6213592233009706, + "grad_norm": 0.6722442278200302, + "learning_rate": 4.632595833354105e-06, + "loss": 0.6111, + "step": 270 + }, + { + "epoch": 2.6310679611650487, + "grad_norm": 0.790100355706828, + "learning_rate": 4.629938619503593e-06, + "loss": 0.4931, + "step": 271 + }, + { + "epoch": 2.6407766990291264, + "grad_norm": 0.7457720631344418, + "learning_rate": 4.627272599023772e-06, + "loss": 0.6932, + "step": 272 + }, + { + "epoch": 2.650485436893204, + "grad_norm": 0.7742836603714502, + "learning_rate": 4.6245977829378e-06, + "loss": 0.8069, + "step": 273 + }, + { + "epoch": 2.6601941747572817, + "grad_norm": 0.7522977861605561, + "learning_rate": 4.6219141823052035e-06, + "loss": 0.4594, + "step": 274 + }, + { + "epoch": 2.6699029126213594, + "grad_norm": 0.7897785740037921, + "learning_rate": 4.619221808221833e-06, + "loss": 0.7502, + "step": 275 + }, + { + "epoch": 2.679611650485437, + "grad_norm": 0.6542147168655412, + "learning_rate": 4.616520671819812e-06, + "loss": 0.455, + "step": 276 + }, + { + "epoch": 2.6893203883495147, + "grad_norm": 1.3324292561191493, + "learning_rate": 4.613810784267492e-06, + "loss": 0.615, + "step": 277 + }, + { + "epoch": 2.6990291262135924, + "grad_norm": 0.6366824455259196, + "learning_rate": 4.61109215676941e-06, + "loss": 0.4398, + "step": 278 + }, + { + "epoch": 2.70873786407767, + "grad_norm": 0.7464337608282651, + "learning_rate": 4.608364800566241e-06, + "loss": 0.587, + "step": 279 + }, + { + "epoch": 2.7184466019417477, + "grad_norm": 0.6895928222340304, + "learning_rate": 4.605628726934747e-06, + "loss": 0.6703, + "step": 280 + }, + { + "epoch": 2.7281553398058254, + "grad_norm": 0.6878653541621508, + "learning_rate": 4.602883947187738e-06, + "loss": 0.5948, + "step": 281 + }, + { + "epoch": 2.737864077669903, + "grad_norm": 0.6926203162726917, + "learning_rate": 4.600130472674017e-06, + "loss": 0.4607, + "step": 282 + }, + { + "epoch": 2.7475728155339807, + "grad_norm": 0.7781936564011823, + "learning_rate": 4.5973683147783405e-06, + "loss": 0.7486, + "step": 283 + }, + { + "epoch": 2.7572815533980584, + "grad_norm": 0.7501081998810354, + "learning_rate": 4.594597484921365e-06, + "loss": 0.5782, + "step": 284 + }, + { + "epoch": 2.766990291262136, + "grad_norm": 0.7647372977005877, + "learning_rate": 4.5918179945596055e-06, + "loss": 0.5159, + "step": 285 + }, + { + "epoch": 2.7766990291262137, + "grad_norm": 0.6977238852528742, + "learning_rate": 4.589029855185384e-06, + "loss": 0.5334, + "step": 286 + }, + { + "epoch": 2.7864077669902914, + "grad_norm": 0.9130101610006348, + "learning_rate": 4.586233078326785e-06, + "loss": 0.8354, + "step": 287 + }, + { + "epoch": 2.796116504854369, + "grad_norm": 0.8318292434477851, + "learning_rate": 4.583427675547602e-06, + "loss": 0.6258, + "step": 288 + }, + { + "epoch": 2.8058252427184467, + "grad_norm": 0.7324981921403528, + "learning_rate": 4.580613658447301e-06, + "loss": 0.709, + "step": 289 + }, + { + "epoch": 2.8155339805825244, + "grad_norm": 0.7523751933345019, + "learning_rate": 4.577791038660959e-06, + "loss": 0.4851, + "step": 290 + }, + { + "epoch": 2.825242718446602, + "grad_norm": 0.7542503570491638, + "learning_rate": 4.574959827859226e-06, + "loss": 0.4493, + "step": 291 + }, + { + "epoch": 2.8349514563106797, + "grad_norm": 0.741187265691656, + "learning_rate": 4.572120037748273e-06, + "loss": 0.4682, + "step": 292 + }, + { + "epoch": 2.8446601941747574, + "grad_norm": 0.7141919069039983, + "learning_rate": 4.5692716800697415e-06, + "loss": 0.6235, + "step": 293 + }, + { + "epoch": 2.854368932038835, + "grad_norm": 0.7414161387983556, + "learning_rate": 4.566414766600698e-06, + "loss": 0.5613, + "step": 294 + }, + { + "epoch": 2.8640776699029127, + "grad_norm": 0.700849509216115, + "learning_rate": 4.563549309153589e-06, + "loss": 0.443, + "step": 295 + }, + { + "epoch": 2.8737864077669903, + "grad_norm": 0.7854807058221909, + "learning_rate": 4.56067531957618e-06, + "loss": 0.4657, + "step": 296 + }, + { + "epoch": 2.883495145631068, + "grad_norm": 0.7744368603433763, + "learning_rate": 4.557792809751519e-06, + "loss": 0.6192, + "step": 297 + }, + { + "epoch": 2.8932038834951457, + "grad_norm": 0.811186979862698, + "learning_rate": 4.554901791597883e-06, + "loss": 0.5432, + "step": 298 + }, + { + "epoch": 2.9029126213592233, + "grad_norm": 0.7610230989372687, + "learning_rate": 4.552002277068725e-06, + "loss": 0.5689, + "step": 299 + }, + { + "epoch": 2.912621359223301, + "grad_norm": 0.738268676455508, + "learning_rate": 4.549094278152631e-06, + "loss": 0.6102, + "step": 300 + }, + { + "epoch": 2.9223300970873787, + "grad_norm": 0.8265276954113333, + "learning_rate": 4.546177806873266e-06, + "loss": 0.4803, + "step": 301 + }, + { + "epoch": 2.9320388349514563, + "grad_norm": 0.7994330788168778, + "learning_rate": 4.543252875289326e-06, + "loss": 0.5232, + "step": 302 + }, + { + "epoch": 2.941747572815534, + "grad_norm": 0.7803228028067196, + "learning_rate": 4.540319495494486e-06, + "loss": 0.5785, + "step": 303 + }, + { + "epoch": 2.9514563106796117, + "grad_norm": 0.812661521852206, + "learning_rate": 4.537377679617353e-06, + "loss": 0.5857, + "step": 304 + }, + { + "epoch": 2.9611650485436893, + "grad_norm": 0.8017682940418599, + "learning_rate": 4.534427439821416e-06, + "loss": 0.4679, + "step": 305 + }, + { + "epoch": 2.970873786407767, + "grad_norm": 0.7557388320666087, + "learning_rate": 4.531468788304992e-06, + "loss": 0.4511, + "step": 306 + }, + { + "epoch": 2.9805825242718447, + "grad_norm": 0.7990082356636408, + "learning_rate": 4.5285017373011784e-06, + "loss": 0.4999, + "step": 307 + }, + { + "epoch": 2.9902912621359223, + "grad_norm": 0.8410650831835784, + "learning_rate": 4.5255262990778024e-06, + "loss": 0.4279, + "step": 308 + }, + { + "epoch": 3.0, + "grad_norm": 0.67842206115732, + "learning_rate": 4.522542485937369e-06, + "loss": 0.4778, + "step": 309 + }, + { + "epoch": 3.0097087378640777, + "grad_norm": 0.7208972107613384, + "learning_rate": 4.519550310217013e-06, + "loss": 0.5166, + "step": 310 + }, + { + "epoch": 3.0194174757281553, + "grad_norm": 0.73349262960244, + "learning_rate": 4.516549784288442e-06, + "loss": 0.4455, + "step": 311 + }, + { + "epoch": 3.029126213592233, + "grad_norm": 0.6956083097152967, + "learning_rate": 4.513540920557892e-06, + "loss": 0.4371, + "step": 312 + }, + { + "epoch": 3.0388349514563107, + "grad_norm": 0.7915417702341847, + "learning_rate": 4.510523731466072e-06, + "loss": 0.6448, + "step": 313 + }, + { + "epoch": 3.0485436893203883, + "grad_norm": 0.5990172469379881, + "learning_rate": 4.507498229488116e-06, + "loss": 0.3384, + "step": 314 + }, + { + "epoch": 3.058252427184466, + "grad_norm": 0.8318855981134111, + "learning_rate": 4.504464427133527e-06, + "loss": 0.4508, + "step": 315 + }, + { + "epoch": 3.0679611650485437, + "grad_norm": 0.8758812451211815, + "learning_rate": 4.501422336946126e-06, + "loss": 0.4328, + "step": 316 + }, + { + "epoch": 3.0776699029126213, + "grad_norm": 0.7844299628970943, + "learning_rate": 4.498371971504005e-06, + "loss": 0.4222, + "step": 317 + }, + { + "epoch": 3.087378640776699, + "grad_norm": 0.7195150581975005, + "learning_rate": 4.49531334341947e-06, + "loss": 0.4417, + "step": 318 + }, + { + "epoch": 3.0970873786407767, + "grad_norm": 0.7671602770728359, + "learning_rate": 4.49224646533899e-06, + "loss": 0.42, + "step": 319 + }, + { + "epoch": 3.1067961165048543, + "grad_norm": 0.6929615407642132, + "learning_rate": 4.489171349943144e-06, + "loss": 0.4332, + "step": 320 + }, + { + "epoch": 3.116504854368932, + "grad_norm": 0.7086258123510658, + "learning_rate": 4.486088009946575e-06, + "loss": 0.3632, + "step": 321 + }, + { + "epoch": 3.1262135922330097, + "grad_norm": 0.7708151250786913, + "learning_rate": 4.482996458097926e-06, + "loss": 0.4975, + "step": 322 + }, + { + "epoch": 3.1359223300970873, + "grad_norm": 0.846155524161367, + "learning_rate": 4.479896707179796e-06, + "loss": 0.6871, + "step": 323 + }, + { + "epoch": 3.145631067961165, + "grad_norm": 0.8231242232551517, + "learning_rate": 4.476788770008685e-06, + "loss": 0.6574, + "step": 324 + }, + { + "epoch": 3.1553398058252426, + "grad_norm": 0.8549632502448101, + "learning_rate": 4.473672659434941e-06, + "loss": 0.5856, + "step": 325 + }, + { + "epoch": 3.1650485436893203, + "grad_norm": 0.7228128426745009, + "learning_rate": 4.470548388342704e-06, + "loss": 0.4776, + "step": 326 + }, + { + "epoch": 3.174757281553398, + "grad_norm": 0.8108949680823984, + "learning_rate": 4.467415969649858e-06, + "loss": 0.4874, + "step": 327 + }, + { + "epoch": 3.1844660194174756, + "grad_norm": 0.6851857911673899, + "learning_rate": 4.464275416307973e-06, + "loss": 0.4994, + "step": 328 + }, + { + "epoch": 3.1941747572815533, + "grad_norm": 0.761950009953729, + "learning_rate": 4.461126741302253e-06, + "loss": 0.4929, + "step": 329 + }, + { + "epoch": 3.203883495145631, + "grad_norm": 0.8432338612654883, + "learning_rate": 4.457969957651485e-06, + "loss": 0.4137, + "step": 330 + }, + { + "epoch": 3.2135922330097086, + "grad_norm": 0.7741138798177974, + "learning_rate": 4.454805078407979e-06, + "loss": 0.6696, + "step": 331 + }, + { + "epoch": 3.2233009708737863, + "grad_norm": 0.71765697779996, + "learning_rate": 4.451632116657521e-06, + "loss": 0.4506, + "step": 332 + }, + { + "epoch": 3.233009708737864, + "grad_norm": 0.6253829454649015, + "learning_rate": 4.448451085519314e-06, + "loss": 0.3586, + "step": 333 + }, + { + "epoch": 3.2427184466019416, + "grad_norm": 0.9686569714580476, + "learning_rate": 4.445261998145927e-06, + "loss": 0.4832, + "step": 334 + }, + { + "epoch": 3.2524271844660193, + "grad_norm": 0.682456331535341, + "learning_rate": 4.442064867723236e-06, + "loss": 0.4737, + "step": 335 + }, + { + "epoch": 3.262135922330097, + "grad_norm": 0.801934212380896, + "learning_rate": 4.438859707470376e-06, + "loss": 0.4988, + "step": 336 + }, + { + "epoch": 3.2718446601941746, + "grad_norm": 0.6210847995583545, + "learning_rate": 4.435646530639679e-06, + "loss": 0.3549, + "step": 337 + }, + { + "epoch": 3.2815533980582523, + "grad_norm": 0.7693142736109597, + "learning_rate": 4.432425350516627e-06, + "loss": 0.4612, + "step": 338 + }, + { + "epoch": 3.29126213592233, + "grad_norm": 0.7469011823723339, + "learning_rate": 4.42919618041979e-06, + "loss": 0.3343, + "step": 339 + }, + { + "epoch": 3.3009708737864076, + "grad_norm": 0.6080757362122666, + "learning_rate": 4.425959033700776e-06, + "loss": 0.2422, + "step": 340 + }, + { + "epoch": 3.3106796116504853, + "grad_norm": 0.7696858723921102, + "learning_rate": 4.422713923744174e-06, + "loss": 0.5103, + "step": 341 + }, + { + "epoch": 3.320388349514563, + "grad_norm": 0.8778656215024975, + "learning_rate": 4.419460863967496e-06, + "loss": 0.5335, + "step": 342 + }, + { + "epoch": 3.3300970873786406, + "grad_norm": 0.6994532036876416, + "learning_rate": 4.416199867821126e-06, + "loss": 0.5022, + "step": 343 + }, + { + "epoch": 3.3398058252427183, + "grad_norm": 0.7969656894134531, + "learning_rate": 4.412930948788263e-06, + "loss": 0.4246, + "step": 344 + }, + { + "epoch": 3.349514563106796, + "grad_norm": 0.8154555970057127, + "learning_rate": 4.409654120384863e-06, + "loss": 0.6664, + "step": 345 + }, + { + "epoch": 3.3592233009708736, + "grad_norm": 0.7113779335682774, + "learning_rate": 4.406369396159585e-06, + "loss": 0.6024, + "step": 346 + }, + { + "epoch": 3.3689320388349513, + "grad_norm": 0.7946176349594999, + "learning_rate": 4.403076789693735e-06, + "loss": 0.6273, + "step": 347 + }, + { + "epoch": 3.378640776699029, + "grad_norm": 0.6725243345900541, + "learning_rate": 4.399776314601212e-06, + "loss": 0.422, + "step": 348 + }, + { + "epoch": 3.3883495145631066, + "grad_norm": 0.8474457763132652, + "learning_rate": 4.396467984528445e-06, + "loss": 0.5515, + "step": 349 + }, + { + "epoch": 3.3980582524271843, + "grad_norm": 0.7635758657443303, + "learning_rate": 4.393151813154345e-06, + "loss": 0.417, + "step": 350 + }, + { + "epoch": 3.407766990291262, + "grad_norm": 0.7301687492460268, + "learning_rate": 4.3898278141902396e-06, + "loss": 0.4335, + "step": 351 + }, + { + "epoch": 3.4174757281553396, + "grad_norm": 0.8217888155242994, + "learning_rate": 4.386496001379826e-06, + "loss": 0.5301, + "step": 352 + }, + { + "epoch": 3.4271844660194173, + "grad_norm": 0.700440895441813, + "learning_rate": 4.383156388499106e-06, + "loss": 0.5289, + "step": 353 + }, + { + "epoch": 3.436893203883495, + "grad_norm": 0.676777993038823, + "learning_rate": 4.3798089893563335e-06, + "loss": 0.4079, + "step": 354 + }, + { + "epoch": 3.4466019417475726, + "grad_norm": 0.7359945465771945, + "learning_rate": 4.3764538177919555e-06, + "loss": 0.3024, + "step": 355 + }, + { + "epoch": 3.4563106796116507, + "grad_norm": 0.7431315486816701, + "learning_rate": 4.3730908876785574e-06, + "loss": 0.4715, + "step": 356 + }, + { + "epoch": 3.466019417475728, + "grad_norm": 0.858728130994808, + "learning_rate": 4.3697202129208e-06, + "loss": 0.5126, + "step": 357 + }, + { + "epoch": 3.475728155339806, + "grad_norm": 0.7006328621067651, + "learning_rate": 4.36634180745537e-06, + "loss": 0.3219, + "step": 358 + }, + { + "epoch": 3.4854368932038833, + "grad_norm": 0.8347983820993932, + "learning_rate": 4.3629556852509145e-06, + "loss": 0.7038, + "step": 359 + }, + { + "epoch": 3.4951456310679614, + "grad_norm": 0.6736438089942041, + "learning_rate": 4.35956186030799e-06, + "loss": 0.388, + "step": 360 + }, + { + "epoch": 3.5048543689320386, + "grad_norm": 0.7313432635941632, + "learning_rate": 4.356160346659001e-06, + "loss": 0.4803, + "step": 361 + }, + { + "epoch": 3.5145631067961167, + "grad_norm": 0.914378556446873, + "learning_rate": 4.3527511583681384e-06, + "loss": 0.9472, + "step": 362 + }, + { + "epoch": 3.524271844660194, + "grad_norm": 0.7928555788899265, + "learning_rate": 4.34933430953133e-06, + "loss": 0.5195, + "step": 363 + }, + { + "epoch": 3.533980582524272, + "grad_norm": 0.8552957566999786, + "learning_rate": 4.345909814276177e-06, + "loss": 0.5432, + "step": 364 + }, + { + "epoch": 3.5436893203883493, + "grad_norm": 0.6593786360114855, + "learning_rate": 4.3424776867618935e-06, + "loss": 0.5773, + "step": 365 + }, + { + "epoch": 3.5533980582524274, + "grad_norm": 0.7453073470450546, + "learning_rate": 4.339037941179253e-06, + "loss": 0.5414, + "step": 366 + }, + { + "epoch": 3.5631067961165046, + "grad_norm": 0.7285347424646024, + "learning_rate": 4.335590591750526e-06, + "loss": 0.4336, + "step": 367 + }, + { + "epoch": 3.5728155339805827, + "grad_norm": 0.7229430966497608, + "learning_rate": 4.332135652729423e-06, + "loss": 0.4226, + "step": 368 + }, + { + "epoch": 3.58252427184466, + "grad_norm": 0.7264295125746805, + "learning_rate": 4.328673138401036e-06, + "loss": 0.5502, + "step": 369 + }, + { + "epoch": 3.592233009708738, + "grad_norm": 0.7134753381323626, + "learning_rate": 4.325203063081776e-06, + "loss": 0.2876, + "step": 370 + }, + { + "epoch": 3.6019417475728153, + "grad_norm": 0.7636004423408037, + "learning_rate": 4.32172544111932e-06, + "loss": 0.3651, + "step": 371 + }, + { + "epoch": 3.6116504854368934, + "grad_norm": 0.731718208729643, + "learning_rate": 4.318240286892544e-06, + "loss": 0.4391, + "step": 372 + }, + { + "epoch": 3.6213592233009706, + "grad_norm": 0.6392786785401195, + "learning_rate": 4.314747614811471e-06, + "loss": 0.4575, + "step": 373 + }, + { + "epoch": 3.6310679611650487, + "grad_norm": 0.7082359191028809, + "learning_rate": 4.3112474393172055e-06, + "loss": 0.3473, + "step": 374 + }, + { + "epoch": 3.6407766990291264, + "grad_norm": 0.704495139070046, + "learning_rate": 4.307739774881878e-06, + "loss": 0.4346, + "step": 375 + }, + { + "epoch": 3.650485436893204, + "grad_norm": 0.7500124236230435, + "learning_rate": 4.304224636008582e-06, + "loss": 0.2937, + "step": 376 + }, + { + "epoch": 3.6601941747572817, + "grad_norm": 0.7473030765489458, + "learning_rate": 4.300702037231318e-06, + "loss": 0.5837, + "step": 377 + }, + { + "epoch": 3.6699029126213594, + "grad_norm": 0.6924606297437474, + "learning_rate": 4.297171993114927e-06, + "loss": 0.2863, + "step": 378 + }, + { + "epoch": 3.679611650485437, + "grad_norm": 0.7683874913737846, + "learning_rate": 4.2936345182550365e-06, + "loss": 0.5933, + "step": 379 + }, + { + "epoch": 3.6893203883495147, + "grad_norm": 0.738704129251105, + "learning_rate": 4.290089627277998e-06, + "loss": 0.3695, + "step": 380 + }, + { + "epoch": 3.6990291262135924, + "grad_norm": 0.730122979591364, + "learning_rate": 4.286537334840825e-06, + "loss": 0.5314, + "step": 381 + }, + { + "epoch": 3.70873786407767, + "grad_norm": 0.8561211215862192, + "learning_rate": 4.2829776556311355e-06, + "loss": 0.4077, + "step": 382 + }, + { + "epoch": 3.7184466019417477, + "grad_norm": 0.8670647072409673, + "learning_rate": 4.279410604367088e-06, + "loss": 0.6157, + "step": 383 + }, + { + "epoch": 3.7281553398058254, + "grad_norm": 0.8439911656034701, + "learning_rate": 4.275836195797323e-06, + "loss": 0.5611, + "step": 384 + }, + { + "epoch": 3.737864077669903, + "grad_norm": 0.8111642418779228, + "learning_rate": 4.2722544447008995e-06, + "loss": 0.6242, + "step": 385 + }, + { + "epoch": 3.7475728155339807, + "grad_norm": 0.8234444876560004, + "learning_rate": 4.268665365887238e-06, + "loss": 0.626, + "step": 386 + }, + { + "epoch": 3.7572815533980584, + "grad_norm": 0.7499486492298939, + "learning_rate": 4.265068974196056e-06, + "loss": 0.3372, + "step": 387 + }, + { + "epoch": 3.766990291262136, + "grad_norm": 0.8486583520088408, + "learning_rate": 4.261465284497307e-06, + "loss": 0.3682, + "step": 388 + }, + { + "epoch": 3.7766990291262137, + "grad_norm": 0.8004979648644168, + "learning_rate": 4.257854311691118e-06, + "loss": 0.5395, + "step": 389 + }, + { + "epoch": 3.7864077669902914, + "grad_norm": 0.7151044131463972, + "learning_rate": 4.254236070707734e-06, + "loss": 0.3921, + "step": 390 + }, + { + "epoch": 3.796116504854369, + "grad_norm": 0.6664683762506176, + "learning_rate": 4.250610576507445e-06, + "loss": 0.357, + "step": 391 + }, + { + "epoch": 3.8058252427184467, + "grad_norm": 0.8605859568498817, + "learning_rate": 4.246977844080537e-06, + "loss": 0.5304, + "step": 392 + }, + { + "epoch": 3.8155339805825244, + "grad_norm": 0.8012964647317763, + "learning_rate": 4.24333788844722e-06, + "loss": 0.4708, + "step": 393 + }, + { + "epoch": 3.825242718446602, + "grad_norm": 0.8135374110450408, + "learning_rate": 4.239690724657571e-06, + "loss": 0.4547, + "step": 394 + }, + { + "epoch": 3.8349514563106797, + "grad_norm": 1.1239630061318624, + "learning_rate": 4.236036367791471e-06, + "loss": 0.5222, + "step": 395 + }, + { + "epoch": 3.8446601941747574, + "grad_norm": 0.8761219541832255, + "learning_rate": 4.23237483295854e-06, + "loss": 0.584, + "step": 396 + }, + { + "epoch": 3.854368932038835, + "grad_norm": 0.7205334687878558, + "learning_rate": 4.228706135298081e-06, + "loss": 0.383, + "step": 397 + }, + { + "epoch": 3.8640776699029127, + "grad_norm": 0.778638883459709, + "learning_rate": 4.225030289979006e-06, + "loss": 0.3611, + "step": 398 + }, + { + "epoch": 3.8737864077669903, + "grad_norm": 0.775480657637448, + "learning_rate": 4.221347312199788e-06, + "loss": 0.3474, + "step": 399 + }, + { + "epoch": 3.883495145631068, + "grad_norm": 0.9124181162931588, + "learning_rate": 4.2176572171883865e-06, + "loss": 0.5554, + "step": 400 + }, + { + "epoch": 3.8932038834951457, + "grad_norm": 0.7674226802327085, + "learning_rate": 4.213960020202187e-06, + "loss": 0.5775, + "step": 401 + }, + { + "epoch": 3.9029126213592233, + "grad_norm": 0.8374874478064829, + "learning_rate": 4.2102557365279435e-06, + "loss": 0.6738, + "step": 402 + }, + { + "epoch": 3.912621359223301, + "grad_norm": 0.789430401614793, + "learning_rate": 4.206544381481708e-06, + "loss": 0.5645, + "step": 403 + }, + { + "epoch": 3.9223300970873787, + "grad_norm": 0.8301680910378548, + "learning_rate": 4.202825970408772e-06, + "loss": 0.3362, + "step": 404 + }, + { + "epoch": 3.9320388349514563, + "grad_norm": 0.7434161207431128, + "learning_rate": 4.199100518683601e-06, + "loss": 0.3865, + "step": 405 + }, + { + "epoch": 3.941747572815534, + "grad_norm": 0.8023276174001709, + "learning_rate": 4.195368041709772e-06, + "loss": 0.557, + "step": 406 + }, + { + "epoch": 3.9514563106796117, + "grad_norm": 0.6897944866344006, + "learning_rate": 4.191628554919907e-06, + "loss": 0.4306, + "step": 407 + }, + { + "epoch": 3.9611650485436893, + "grad_norm": 0.7587292046885276, + "learning_rate": 4.187882073775615e-06, + "loss": 0.4791, + "step": 408 + }, + { + "epoch": 3.970873786407767, + "grad_norm": 0.7402776540544128, + "learning_rate": 4.184128613767422e-06, + "loss": 0.4587, + "step": 409 + }, + { + "epoch": 3.9805825242718447, + "grad_norm": 0.918829002078532, + "learning_rate": 4.18036819041471e-06, + "loss": 0.4834, + "step": 410 + }, + { + "epoch": 3.9902912621359223, + "grad_norm": 0.6951873113715458, + "learning_rate": 4.17660081926565e-06, + "loss": 0.3421, + "step": 411 + }, + { + "epoch": 4.0, + "grad_norm": 0.6863501093124053, + "learning_rate": 4.172826515897146e-06, + "loss": 0.3388, + "step": 412 + }, + { + "epoch": 4.009708737864078, + "grad_norm": 0.7114971026737045, + "learning_rate": 4.169045295914757e-06, + "loss": 0.5117, + "step": 413 + }, + { + "epoch": 4.019417475728155, + "grad_norm": 0.669639982894413, + "learning_rate": 4.165257174952647e-06, + "loss": 0.2925, + "step": 414 + }, + { + "epoch": 4.029126213592233, + "grad_norm": 0.7192571477195556, + "learning_rate": 4.161462168673508e-06, + "loss": 0.4189, + "step": 415 + }, + { + "epoch": 4.038834951456311, + "grad_norm": 0.7186614597357849, + "learning_rate": 4.157660292768502e-06, + "loss": 0.5147, + "step": 416 + }, + { + "epoch": 4.048543689320389, + "grad_norm": 0.7615450499237213, + "learning_rate": 4.1538515629571985e-06, + "loss": 0.3839, + "step": 417 + }, + { + "epoch": 4.058252427184466, + "grad_norm": 0.7825364795958354, + "learning_rate": 4.1500359949875e-06, + "loss": 0.5142, + "step": 418 + }, + { + "epoch": 4.067961165048544, + "grad_norm": 0.7567301120413946, + "learning_rate": 4.1462136046355864e-06, + "loss": 0.3761, + "step": 419 + }, + { + "epoch": 4.077669902912621, + "grad_norm": 0.7496382866591231, + "learning_rate": 4.142384407705846e-06, + "loss": 0.6578, + "step": 420 + }, + { + "epoch": 4.087378640776699, + "grad_norm": 0.6832493076636806, + "learning_rate": 4.138548420030808e-06, + "loss": 0.2888, + "step": 421 + }, + { + "epoch": 4.097087378640777, + "grad_norm": 0.8082055763247596, + "learning_rate": 4.13470565747108e-06, + "loss": 0.4464, + "step": 422 + }, + { + "epoch": 4.106796116504855, + "grad_norm": 0.6922107618500224, + "learning_rate": 4.130856135915282e-06, + "loss": 0.414, + "step": 423 + }, + { + "epoch": 4.116504854368932, + "grad_norm": 0.8568390451993608, + "learning_rate": 4.126999871279982e-06, + "loss": 0.3524, + "step": 424 + }, + { + "epoch": 4.12621359223301, + "grad_norm": 0.7555701643934601, + "learning_rate": 4.123136879509626e-06, + "loss": 0.4294, + "step": 425 + }, + { + "epoch": 4.135922330097087, + "grad_norm": 0.7172542927002353, + "learning_rate": 4.119267176576475e-06, + "loss": 0.464, + "step": 426 + }, + { + "epoch": 4.145631067961165, + "grad_norm": 0.6682558575648311, + "learning_rate": 4.11539077848054e-06, + "loss": 0.4873, + "step": 427 + }, + { + "epoch": 4.155339805825243, + "grad_norm": 0.8598473815632454, + "learning_rate": 4.111507701249513e-06, + "loss": 0.4819, + "step": 428 + }, + { + "epoch": 4.165048543689321, + "grad_norm": 0.9167731886550053, + "learning_rate": 4.107617960938702e-06, + "loss": 0.1954, + "step": 429 + }, + { + "epoch": 4.174757281553398, + "grad_norm": 0.8392679998978078, + "learning_rate": 4.103721573630965e-06, + "loss": 0.3716, + "step": 430 + }, + { + "epoch": 4.184466019417476, + "grad_norm": 0.7039689710893157, + "learning_rate": 4.099818555436645e-06, + "loss": 0.399, + "step": 431 + }, + { + "epoch": 4.194174757281553, + "grad_norm": 0.8962803885867973, + "learning_rate": 4.095908922493499e-06, + "loss": 0.6665, + "step": 432 + }, + { + "epoch": 4.203883495145631, + "grad_norm": 1.919580266091257, + "learning_rate": 4.091992690966636e-06, + "loss": 0.2248, + "step": 433 + }, + { + "epoch": 4.213592233009709, + "grad_norm": 0.7602103224056724, + "learning_rate": 4.088069877048447e-06, + "loss": 0.3179, + "step": 434 + }, + { + "epoch": 4.223300970873787, + "grad_norm": 0.6467738319616958, + "learning_rate": 4.084140496958539e-06, + "loss": 0.2963, + "step": 435 + }, + { + "epoch": 4.233009708737864, + "grad_norm": 0.7220900194096829, + "learning_rate": 4.080204566943668e-06, + "loss": 0.2475, + "step": 436 + }, + { + "epoch": 4.242718446601942, + "grad_norm": 1.189054603182352, + "learning_rate": 4.076262103277673e-06, + "loss": 0.2943, + "step": 437 + }, + { + "epoch": 4.252427184466019, + "grad_norm": 0.7169713980642173, + "learning_rate": 4.072313122261406e-06, + "loss": 0.6236, + "step": 438 + }, + { + "epoch": 4.262135922330097, + "grad_norm": 0.631689118975417, + "learning_rate": 4.068357640222668e-06, + "loss": 0.2852, + "step": 439 + }, + { + "epoch": 4.271844660194175, + "grad_norm": 0.756930350000182, + "learning_rate": 4.06439567351614e-06, + "loss": 0.3246, + "step": 440 + }, + { + "epoch": 4.281553398058253, + "grad_norm": 0.6706899017753607, + "learning_rate": 4.0604272385233105e-06, + "loss": 0.3887, + "step": 441 + }, + { + "epoch": 4.29126213592233, + "grad_norm": 0.6775487389857546, + "learning_rate": 4.056452351652418e-06, + "loss": 0.2223, + "step": 442 + }, + { + "epoch": 4.300970873786408, + "grad_norm": 0.8187523228643651, + "learning_rate": 4.052471029338375e-06, + "loss": 0.4288, + "step": 443 + }, + { + "epoch": 4.310679611650485, + "grad_norm": 0.6200886929947741, + "learning_rate": 4.048483288042703e-06, + "loss": 0.24, + "step": 444 + }, + { + "epoch": 4.320388349514563, + "grad_norm": 0.7372457408545906, + "learning_rate": 4.0444891442534615e-06, + "loss": 0.3116, + "step": 445 + }, + { + "epoch": 4.330097087378641, + "grad_norm": 0.7260528356835414, + "learning_rate": 4.040488614485187e-06, + "loss": 0.4287, + "step": 446 + }, + { + "epoch": 4.339805825242719, + "grad_norm": 0.8887283733391886, + "learning_rate": 4.036481715278818e-06, + "loss": 0.4366, + "step": 447 + }, + { + "epoch": 4.349514563106796, + "grad_norm": 0.7417768044969674, + "learning_rate": 4.032468463201626e-06, + "loss": 0.4489, + "step": 448 + }, + { + "epoch": 4.359223300970874, + "grad_norm": 0.767167243913028, + "learning_rate": 4.028448874847152e-06, + "loss": 0.1889, + "step": 449 + }, + { + "epoch": 4.368932038834951, + "grad_norm": 0.7548738530081098, + "learning_rate": 4.024422966835137e-06, + "loss": 0.3836, + "step": 450 + }, + { + "epoch": 4.378640776699029, + "grad_norm": 0.7267980574762086, + "learning_rate": 4.0203907558114475e-06, + "loss": 0.3113, + "step": 451 + }, + { + "epoch": 4.388349514563107, + "grad_norm": 0.7384964389802123, + "learning_rate": 4.016352258448016e-06, + "loss": 0.2558, + "step": 452 + }, + { + "epoch": 4.398058252427185, + "grad_norm": 0.7354848571749719, + "learning_rate": 4.0123074914427635e-06, + "loss": 0.2631, + "step": 453 + }, + { + "epoch": 4.407766990291262, + "grad_norm": 0.6121502281910712, + "learning_rate": 4.008256471519536e-06, + "loss": 0.2496, + "step": 454 + }, + { + "epoch": 4.41747572815534, + "grad_norm": 0.8179123108880375, + "learning_rate": 4.004199215428032e-06, + "loss": 0.3859, + "step": 455 + }, + { + "epoch": 4.427184466019417, + "grad_norm": 0.9831209055765248, + "learning_rate": 4.000135739943735e-06, + "loss": 0.4455, + "step": 456 + }, + { + "epoch": 4.436893203883495, + "grad_norm": 0.8368630657095354, + "learning_rate": 3.996066061867844e-06, + "loss": 0.4183, + "step": 457 + }, + { + "epoch": 4.446601941747573, + "grad_norm": 0.7510366020026782, + "learning_rate": 3.991990198027203e-06, + "loss": 0.3243, + "step": 458 + }, + { + "epoch": 4.456310679611651, + "grad_norm": 0.7739088378814717, + "learning_rate": 3.987908165274233e-06, + "loss": 0.3937, + "step": 459 + }, + { + "epoch": 4.466019417475728, + "grad_norm": 0.7451064795923457, + "learning_rate": 3.9838199804868635e-06, + "loss": 0.4762, + "step": 460 + }, + { + "epoch": 4.475728155339806, + "grad_norm": 0.8005653610588523, + "learning_rate": 3.979725660568456e-06, + "loss": 0.4203, + "step": 461 + }, + { + "epoch": 4.485436893203883, + "grad_norm": 0.7571415862327683, + "learning_rate": 3.975625222447742e-06, + "loss": 0.2966, + "step": 462 + }, + { + "epoch": 4.495145631067961, + "grad_norm": 0.9407682824067278, + "learning_rate": 3.97151868307875e-06, + "loss": 0.4631, + "step": 463 + }, + { + "epoch": 4.504854368932039, + "grad_norm": 0.8280322987446443, + "learning_rate": 3.9674060594407345e-06, + "loss": 0.3439, + "step": 464 + }, + { + "epoch": 4.514563106796117, + "grad_norm": 0.7056881797171722, + "learning_rate": 3.963287368538105e-06, + "loss": 0.2049, + "step": 465 + }, + { + "epoch": 4.524271844660194, + "grad_norm": 0.9332990417689767, + "learning_rate": 3.959162627400361e-06, + "loss": 0.4755, + "step": 466 + }, + { + "epoch": 4.533980582524272, + "grad_norm": 1.1403012923588804, + "learning_rate": 3.9550318530820145e-06, + "loss": 0.4185, + "step": 467 + }, + { + "epoch": 4.543689320388349, + "grad_norm": 0.689075963184153, + "learning_rate": 3.9508950626625244e-06, + "loss": 0.2702, + "step": 468 + }, + { + "epoch": 4.553398058252427, + "grad_norm": 0.8020665017742961, + "learning_rate": 3.946752273246224e-06, + "loss": 0.4359, + "step": 469 + }, + { + "epoch": 4.563106796116505, + "grad_norm": 0.8212257599019372, + "learning_rate": 3.942603501962249e-06, + "loss": 0.352, + "step": 470 + }, + { + "epoch": 4.572815533980583, + "grad_norm": 0.7742300518258042, + "learning_rate": 3.9384487659644716e-06, + "loss": 0.2489, + "step": 471 + }, + { + "epoch": 4.58252427184466, + "grad_norm": 0.7529500735779854, + "learning_rate": 3.934288082431423e-06, + "loss": 0.4064, + "step": 472 + }, + { + "epoch": 4.592233009708738, + "grad_norm": 0.7285209333659659, + "learning_rate": 3.930121468566227e-06, + "loss": 0.4903, + "step": 473 + }, + { + "epoch": 4.601941747572815, + "grad_norm": 0.702088268251986, + "learning_rate": 3.925948941596528e-06, + "loss": 0.3524, + "step": 474 + }, + { + "epoch": 4.611650485436893, + "grad_norm": 0.6123252794922119, + "learning_rate": 3.92177051877442e-06, + "loss": 0.291, + "step": 475 + }, + { + "epoch": 4.621359223300971, + "grad_norm": 0.743156646498327, + "learning_rate": 3.917586217376369e-06, + "loss": 0.435, + "step": 476 + }, + { + "epoch": 4.631067961165049, + "grad_norm": 0.704106227231604, + "learning_rate": 3.913396054703155e-06, + "loss": 0.4408, + "step": 477 + }, + { + "epoch": 4.640776699029126, + "grad_norm": 0.7127662571747935, + "learning_rate": 3.909200048079786e-06, + "loss": 0.4177, + "step": 478 + }, + { + "epoch": 4.650485436893204, + "grad_norm": 0.748418825692663, + "learning_rate": 3.9049982148554384e-06, + "loss": 0.5271, + "step": 479 + }, + { + "epoch": 4.660194174757281, + "grad_norm": 1.0250685486452722, + "learning_rate": 3.900790572403376e-06, + "loss": 0.3461, + "step": 480 + }, + { + "epoch": 4.669902912621359, + "grad_norm": 0.7829872249544201, + "learning_rate": 3.896577138120881e-06, + "loss": 0.4164, + "step": 481 + }, + { + "epoch": 4.679611650485437, + "grad_norm": 0.9232004605724133, + "learning_rate": 3.892357929429187e-06, + "loss": 0.3579, + "step": 482 + }, + { + "epoch": 4.689320388349515, + "grad_norm": 0.784529648802751, + "learning_rate": 3.8881329637734e-06, + "loss": 0.2236, + "step": 483 + }, + { + "epoch": 4.699029126213592, + "grad_norm": 0.7262690017605888, + "learning_rate": 3.883902258622431e-06, + "loss": 0.3756, + "step": 484 + }, + { + "epoch": 4.70873786407767, + "grad_norm": 0.8102856355194789, + "learning_rate": 3.8796658314689205e-06, + "loss": 0.4178, + "step": 485 + }, + { + "epoch": 4.718446601941747, + "grad_norm": 0.7486005625149885, + "learning_rate": 3.875423699829168e-06, + "loss": 0.2835, + "step": 486 + }, + { + "epoch": 4.728155339805825, + "grad_norm": 0.7379335654656654, + "learning_rate": 3.871175881243061e-06, + "loss": 0.3489, + "step": 487 + }, + { + "epoch": 4.737864077669903, + "grad_norm": 0.8104226180405947, + "learning_rate": 3.866922393273999e-06, + "loss": 0.4974, + "step": 488 + }, + { + "epoch": 4.747572815533981, + "grad_norm": 0.7845427197820135, + "learning_rate": 3.862663253508822e-06, + "loss": 0.3453, + "step": 489 + }, + { + "epoch": 4.757281553398058, + "grad_norm": 0.7436234031119154, + "learning_rate": 3.858398479557739e-06, + "loss": 0.3273, + "step": 490 + }, + { + "epoch": 4.766990291262136, + "grad_norm": 2.217227869082823, + "learning_rate": 3.8541280890542565e-06, + "loss": 0.3229, + "step": 491 + }, + { + "epoch": 4.776699029126213, + "grad_norm": 1.0428680689060519, + "learning_rate": 3.849852099655102e-06, + "loss": 0.4476, + "step": 492 + }, + { + "epoch": 4.786407766990291, + "grad_norm": 0.7093952303779135, + "learning_rate": 3.845570529040151e-06, + "loss": 0.3531, + "step": 493 + }, + { + "epoch": 4.796116504854369, + "grad_norm": 0.7919634533614799, + "learning_rate": 3.841283394912361e-06, + "loss": 0.3435, + "step": 494 + }, + { + "epoch": 4.805825242718447, + "grad_norm": 0.7136080403274071, + "learning_rate": 3.836990714997686e-06, + "loss": 0.4444, + "step": 495 + }, + { + "epoch": 4.815533980582524, + "grad_norm": 0.8376006583714606, + "learning_rate": 3.832692507045015e-06, + "loss": 0.4478, + "step": 496 + }, + { + "epoch": 4.825242718446602, + "grad_norm": 0.6616123052279387, + "learning_rate": 3.828388788826091e-06, + "loss": 0.5166, + "step": 497 + }, + { + "epoch": 4.834951456310679, + "grad_norm": 0.7483368464194841, + "learning_rate": 3.824079578135442e-06, + "loss": 0.4151, + "step": 498 + }, + { + "epoch": 4.844660194174757, + "grad_norm": 0.646917604155803, + "learning_rate": 3.819764892790307e-06, + "loss": 0.4058, + "step": 499 + }, + { + "epoch": 4.854368932038835, + "grad_norm": 0.8267691480427513, + "learning_rate": 3.815444750630555e-06, + "loss": 0.3406, + "step": 500 + }, + { + "epoch": 4.864077669902913, + "grad_norm": 0.8928434754303194, + "learning_rate": 3.811119169518624e-06, + "loss": 0.541, + "step": 501 + }, + { + "epoch": 4.87378640776699, + "grad_norm": 0.9334290516881728, + "learning_rate": 3.8067881673394363e-06, + "loss": 0.6994, + "step": 502 + }, + { + "epoch": 4.883495145631068, + "grad_norm": 0.7272706791055442, + "learning_rate": 3.802451762000331e-06, + "loss": 0.3112, + "step": 503 + }, + { + "epoch": 4.893203883495145, + "grad_norm": 0.6753535996733946, + "learning_rate": 3.7981099714309856e-06, + "loss": 0.2749, + "step": 504 + }, + { + "epoch": 4.902912621359223, + "grad_norm": 0.8446207256750283, + "learning_rate": 3.7937628135833453e-06, + "loss": 0.354, + "step": 505 + }, + { + "epoch": 4.9126213592233015, + "grad_norm": 0.8427902134243664, + "learning_rate": 3.7894103064315463e-06, + "loss": 0.3765, + "step": 506 + }, + { + "epoch": 4.922330097087379, + "grad_norm": 0.7801637629685254, + "learning_rate": 3.7850524679718424e-06, + "loss": 0.3014, + "step": 507 + }, + { + "epoch": 4.932038834951456, + "grad_norm": 0.7448345484733, + "learning_rate": 3.7806893162225328e-06, + "loss": 0.2862, + "step": 508 + }, + { + "epoch": 4.941747572815534, + "grad_norm": 0.7506722271057215, + "learning_rate": 3.7763208692238818e-06, + "loss": 0.4005, + "step": 509 + }, + { + "epoch": 4.951456310679612, + "grad_norm": 0.7507341810173735, + "learning_rate": 3.7719471450380518e-06, + "loss": 0.3154, + "step": 510 + }, + { + "epoch": 4.961165048543689, + "grad_norm": 0.7509927539126146, + "learning_rate": 3.7675681617490212e-06, + "loss": 0.5283, + "step": 511 + }, + { + "epoch": 4.970873786407767, + "grad_norm": 0.7780851180094496, + "learning_rate": 3.7631839374625167e-06, + "loss": 0.5371, + "step": 512 + }, + { + "epoch": 4.980582524271845, + "grad_norm": 0.7269269443801724, + "learning_rate": 3.758794490305932e-06, + "loss": 0.3471, + "step": 513 + }, + { + "epoch": 4.990291262135923, + "grad_norm": 0.7023346099879195, + "learning_rate": 3.7543998384282565e-06, + "loss": 0.3844, + "step": 514 + }, + { + "epoch": 5.0, + "grad_norm": 0.6668852497704912, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.4468, + "step": 515 + }, + { + "epoch": 5.009708737864078, + "grad_norm": 0.7476449787043878, + "learning_rate": 3.745594993213118e-06, + "loss": 0.3995, + "step": 516 + }, + { + "epoch": 5.019417475728155, + "grad_norm": 0.6983005205382378, + "learning_rate": 3.7411848362809324e-06, + "loss": 0.5712, + "step": 517 + }, + { + "epoch": 5.029126213592233, + "grad_norm": 0.7248404467665612, + "learning_rate": 3.7367695474380623e-06, + "loss": 0.3509, + "step": 518 + }, + { + "epoch": 5.038834951456311, + "grad_norm": 0.8246975738832559, + "learning_rate": 3.7323491449403444e-06, + "loss": 0.344, + "step": 519 + }, + { + "epoch": 5.048543689320389, + "grad_norm": 0.6636977636917745, + "learning_rate": 3.7279236470647593e-06, + "loss": 0.1996, + "step": 520 + }, + { + "epoch": 5.058252427184466, + "grad_norm": 0.8867501754473929, + "learning_rate": 3.723493072109355e-06, + "loss": 0.5021, + "step": 521 + }, + { + "epoch": 5.067961165048544, + "grad_norm": 0.8626723097108108, + "learning_rate": 3.719057438393172e-06, + "loss": 0.2895, + "step": 522 + }, + { + "epoch": 5.077669902912621, + "grad_norm": 1.0193146429465378, + "learning_rate": 3.714616764256166e-06, + "loss": 0.4664, + "step": 523 + }, + { + "epoch": 5.087378640776699, + "grad_norm": 0.8843905485635875, + "learning_rate": 3.7101710680591353e-06, + "loss": 0.3435, + "step": 524 + }, + { + "epoch": 5.097087378640777, + "grad_norm": 0.9512264521315206, + "learning_rate": 3.7057203681836407e-06, + "loss": 0.3871, + "step": 525 + }, + { + "epoch": 5.106796116504855, + "grad_norm": 0.8490880861462835, + "learning_rate": 3.701264683031934e-06, + "loss": 0.4215, + "step": 526 + }, + { + "epoch": 5.116504854368932, + "grad_norm": 0.7923161136499023, + "learning_rate": 3.6968040310268766e-06, + "loss": 0.3106, + "step": 527 + }, + { + "epoch": 5.12621359223301, + "grad_norm": 0.7158609870667556, + "learning_rate": 3.692338430611869e-06, + "loss": 0.3569, + "step": 528 + }, + { + "epoch": 5.135922330097087, + "grad_norm": 0.793510070177626, + "learning_rate": 3.687867900250771e-06, + "loss": 0.2498, + "step": 529 + }, + { + "epoch": 5.145631067961165, + "grad_norm": 0.789606227818232, + "learning_rate": 3.683392458427825e-06, + "loss": 0.2286, + "step": 530 + }, + { + "epoch": 5.155339805825243, + "grad_norm": 0.8813406233305229, + "learning_rate": 3.6789121236475818e-06, + "loss": 0.3946, + "step": 531 + }, + { + "epoch": 5.165048543689321, + "grad_norm": 0.7309365109221931, + "learning_rate": 3.674426914434824e-06, + "loss": 0.36, + "step": 532 + }, + { + "epoch": 5.174757281553398, + "grad_norm": 0.7734597901253387, + "learning_rate": 3.6699368493344856e-06, + "loss": 0.3541, + "step": 533 + }, + { + "epoch": 5.184466019417476, + "grad_norm": 0.684738131548144, + "learning_rate": 3.665441946911582e-06, + "loss": 0.2127, + "step": 534 + }, + { + "epoch": 5.194174757281553, + "grad_norm": 0.7131381186080148, + "learning_rate": 3.660942225751126e-06, + "loss": 0.2997, + "step": 535 + }, + { + "epoch": 5.203883495145631, + "grad_norm": 0.8304343144105206, + "learning_rate": 3.6564377044580558e-06, + "loss": 0.2777, + "step": 536 + }, + { + "epoch": 5.213592233009709, + "grad_norm": 0.7512779127156971, + "learning_rate": 3.6519284016571567e-06, + "loss": 0.2193, + "step": 537 + }, + { + "epoch": 5.223300970873787, + "grad_norm": 0.8246283116758232, + "learning_rate": 3.647414335992985e-06, + "loss": 0.3837, + "step": 538 + }, + { + "epoch": 5.233009708737864, + "grad_norm": 0.8396621277026458, + "learning_rate": 3.642895526129787e-06, + "loss": 0.3746, + "step": 539 + }, + { + "epoch": 5.242718446601942, + "grad_norm": 0.7935778168524156, + "learning_rate": 3.638371990751428e-06, + "loss": 0.3837, + "step": 540 + }, + { + "epoch": 5.252427184466019, + "grad_norm": 0.8991486533621065, + "learning_rate": 3.63384374856131e-06, + "loss": 0.3581, + "step": 541 + }, + { + "epoch": 5.262135922330097, + "grad_norm": 0.9432724876796261, + "learning_rate": 3.629310818282297e-06, + "loss": 0.4725, + "step": 542 + }, + { + "epoch": 5.271844660194175, + "grad_norm": 0.7506118004771117, + "learning_rate": 3.6247732186566365e-06, + "loss": 0.3419, + "step": 543 + }, + { + "epoch": 5.281553398058253, + "grad_norm": 0.7598462241955665, + "learning_rate": 3.6202309684458813e-06, + "loss": 0.2133, + "step": 544 + }, + { + "epoch": 5.29126213592233, + "grad_norm": 0.8062442517121977, + "learning_rate": 3.615684086430815e-06, + "loss": 0.2294, + "step": 545 + }, + { + "epoch": 5.300970873786408, + "grad_norm": 0.9107115654245546, + "learning_rate": 3.61113259141137e-06, + "loss": 0.3629, + "step": 546 + }, + { + "epoch": 5.310679611650485, + "grad_norm": 0.6755535448056335, + "learning_rate": 3.606576502206554e-06, + "loss": 0.1603, + "step": 547 + }, + { + "epoch": 5.320388349514563, + "grad_norm": 0.7340467593460501, + "learning_rate": 3.602015837654369e-06, + "loss": 0.1743, + "step": 548 + }, + { + "epoch": 5.330097087378641, + "grad_norm": 0.7628965161854933, + "learning_rate": 3.5974506166117355e-06, + "loss": 0.1826, + "step": 549 + }, + { + "epoch": 5.339805825242719, + "grad_norm": 0.8396283760259973, + "learning_rate": 3.592880857954413e-06, + "loss": 0.4369, + "step": 550 + }, + { + "epoch": 5.349514563106796, + "grad_norm": 0.7344763407362891, + "learning_rate": 3.588306580576922e-06, + "loss": 0.1936, + "step": 551 + }, + { + "epoch": 5.359223300970874, + "grad_norm": 0.6947637843679245, + "learning_rate": 3.583727803392468e-06, + "loss": 0.1258, + "step": 552 + }, + { + "epoch": 5.368932038834951, + "grad_norm": 0.8331318012858376, + "learning_rate": 3.57914454533286e-06, + "loss": 0.3562, + "step": 553 + }, + { + "epoch": 5.378640776699029, + "grad_norm": 1.0821814432994845, + "learning_rate": 3.5745568253484363e-06, + "loss": 0.2078, + "step": 554 + }, + { + "epoch": 5.388349514563107, + "grad_norm": 0.7957795333648919, + "learning_rate": 3.5699646624079824e-06, + "loss": 0.2882, + "step": 555 + }, + { + "epoch": 5.398058252427185, + "grad_norm": 0.6525938882878057, + "learning_rate": 3.5653680754986543e-06, + "loss": 0.1807, + "step": 556 + }, + { + "epoch": 5.407766990291262, + "grad_norm": 0.76383658803048, + "learning_rate": 3.560767083625899e-06, + "loss": 0.4813, + "step": 557 + }, + { + "epoch": 5.41747572815534, + "grad_norm": 0.787353335592656, + "learning_rate": 3.556161705813378e-06, + "loss": 0.228, + "step": 558 + }, + { + "epoch": 5.427184466019417, + "grad_norm": 0.94797988039084, + "learning_rate": 3.5515519611028863e-06, + "loss": 0.3629, + "step": 559 + }, + { + "epoch": 5.436893203883495, + "grad_norm": 0.8381630759676166, + "learning_rate": 3.5469378685542742e-06, + "loss": 0.3928, + "step": 560 + }, + { + "epoch": 5.446601941747573, + "grad_norm": 0.8908509257555599, + "learning_rate": 3.542319447245372e-06, + "loss": 0.5126, + "step": 561 + }, + { + "epoch": 5.456310679611651, + "grad_norm": 0.9904764252647209, + "learning_rate": 3.537696716271904e-06, + "loss": 0.4614, + "step": 562 + }, + { + "epoch": 5.466019417475728, + "grad_norm": 0.8971864824848211, + "learning_rate": 3.533069694747415e-06, + "loss": 0.2269, + "step": 563 + }, + { + "epoch": 5.475728155339806, + "grad_norm": 0.7345775221122297, + "learning_rate": 3.528438401803192e-06, + "loss": 0.2796, + "step": 564 + }, + { + "epoch": 5.485436893203883, + "grad_norm": 0.7565695930856697, + "learning_rate": 3.52380285658818e-06, + "loss": 0.2396, + "step": 565 + }, + { + "epoch": 5.495145631067961, + "grad_norm": 0.9700827286254838, + "learning_rate": 3.5191630782689074e-06, + "loss": 0.272, + "step": 566 + }, + { + "epoch": 5.504854368932039, + "grad_norm": 0.8209558296520447, + "learning_rate": 3.5145190860294043e-06, + "loss": 0.2344, + "step": 567 + }, + { + "epoch": 5.514563106796117, + "grad_norm": 0.6284068735301757, + "learning_rate": 3.5098708990711254e-06, + "loss": 0.1831, + "step": 568 + }, + { + "epoch": 5.524271844660194, + "grad_norm": 0.7757264445876239, + "learning_rate": 3.505218536612869e-06, + "loss": 0.3549, + "step": 569 + }, + { + "epoch": 5.533980582524272, + "grad_norm": 0.7719191434722412, + "learning_rate": 3.500562017890695e-06, + "loss": 0.17, + "step": 570 + }, + { + "epoch": 5.543689320388349, + "grad_norm": 0.8121655179279875, + "learning_rate": 3.495901362157853e-06, + "loss": 0.1626, + "step": 571 + }, + { + "epoch": 5.553398058252427, + "grad_norm": 0.9163623739345025, + "learning_rate": 3.4912365886846934e-06, + "loss": 0.5342, + "step": 572 + }, + { + "epoch": 5.563106796116505, + "grad_norm": 0.8394373837788445, + "learning_rate": 3.4865677167585942e-06, + "loss": 0.3415, + "step": 573 + }, + { + "epoch": 5.572815533980583, + "grad_norm": 0.7873980583937358, + "learning_rate": 3.4818947656838796e-06, + "loss": 0.224, + "step": 574 + }, + { + "epoch": 5.58252427184466, + "grad_norm": 0.7467839551701586, + "learning_rate": 3.4772177547817387e-06, + "loss": 0.16, + "step": 575 + }, + { + "epoch": 5.592233009708738, + "grad_norm": 0.8007193223538516, + "learning_rate": 3.472536703390148e-06, + "loss": 0.2182, + "step": 576 + }, + { + "epoch": 5.601941747572815, + "grad_norm": 0.6853648803562354, + "learning_rate": 3.467851630863789e-06, + "loss": 0.2209, + "step": 577 + }, + { + "epoch": 5.611650485436893, + "grad_norm": 0.7720375849124096, + "learning_rate": 3.463162556573969e-06, + "loss": 0.3613, + "step": 578 + }, + { + "epoch": 5.621359223300971, + "grad_norm": 0.7587365273730499, + "learning_rate": 3.4584694999085424e-06, + "loss": 0.4704, + "step": 579 + }, + { + "epoch": 5.631067961165049, + "grad_norm": 0.7050203026843053, + "learning_rate": 3.4537724802718294e-06, + "loss": 0.246, + "step": 580 + }, + { + "epoch": 5.640776699029126, + "grad_norm": 0.7213230006573722, + "learning_rate": 3.4490715170845356e-06, + "loss": 0.2496, + "step": 581 + }, + { + "epoch": 5.650485436893204, + "grad_norm": 1.5911224887444453, + "learning_rate": 3.4443666297836715e-06, + "loss": 0.2489, + "step": 582 + }, + { + "epoch": 5.660194174757281, + "grad_norm": 0.8993220255740552, + "learning_rate": 3.4396578378224734e-06, + "loss": 0.2876, + "step": 583 + }, + { + "epoch": 5.669902912621359, + "grad_norm": 0.8319084775768927, + "learning_rate": 3.4349451606703214e-06, + "loss": 0.3687, + "step": 584 + }, + { + "epoch": 5.679611650485437, + "grad_norm": 0.7249535982488255, + "learning_rate": 3.430228617812661e-06, + "loss": 0.199, + "step": 585 + }, + { + "epoch": 5.689320388349515, + "grad_norm": 0.5875254159252349, + "learning_rate": 3.4255082287509183e-06, + "loss": 0.1397, + "step": 586 + }, + { + "epoch": 5.699029126213592, + "grad_norm": 1.7482143348929224, + "learning_rate": 3.420784013002426e-06, + "loss": 0.2072, + "step": 587 + }, + { + "epoch": 5.70873786407767, + "grad_norm": 0.6823640493082458, + "learning_rate": 3.416055990100336e-06, + "loss": 0.2213, + "step": 588 + }, + { + "epoch": 5.718446601941747, + "grad_norm": 0.7955612652181657, + "learning_rate": 3.4113241795935427e-06, + "loss": 0.2398, + "step": 589 + }, + { + "epoch": 5.728155339805825, + "grad_norm": 0.7758046902122557, + "learning_rate": 3.4065886010466014e-06, + "loss": 0.3072, + "step": 590 + }, + { + "epoch": 5.737864077669903, + "grad_norm": 0.7533825958833359, + "learning_rate": 3.401849274039647e-06, + "loss": 0.3601, + "step": 591 + }, + { + "epoch": 5.747572815533981, + "grad_norm": 0.6544575762213413, + "learning_rate": 3.3971062181683117e-06, + "loss": 0.2311, + "step": 592 + }, + { + "epoch": 5.757281553398058, + "grad_norm": 0.8250888056688914, + "learning_rate": 3.3923594530436477e-06, + "loss": 0.3673, + "step": 593 + }, + { + "epoch": 5.766990291262136, + "grad_norm": 0.8332226268708626, + "learning_rate": 3.387608998292041e-06, + "loss": 0.29, + "step": 594 + }, + { + "epoch": 5.776699029126213, + "grad_norm": 0.9912332225583187, + "learning_rate": 3.382854873555137e-06, + "loss": 0.191, + "step": 595 + }, + { + "epoch": 5.786407766990291, + "grad_norm": 0.7564530094127416, + "learning_rate": 3.3780970984897504e-06, + "loss": 0.1793, + "step": 596 + }, + { + "epoch": 5.796116504854369, + "grad_norm": 0.8710313622557714, + "learning_rate": 3.373335692767793e-06, + "loss": 0.395, + "step": 597 + }, + { + "epoch": 5.805825242718447, + "grad_norm": 0.7534705198649296, + "learning_rate": 3.3685706760761865e-06, + "loss": 0.3471, + "step": 598 + }, + { + "epoch": 5.815533980582524, + "grad_norm": 0.7266924212415772, + "learning_rate": 3.3638020681167827e-06, + "loss": 0.2073, + "step": 599 + }, + { + "epoch": 5.825242718446602, + "grad_norm": 0.6145017524574724, + "learning_rate": 3.3590298886062833e-06, + "loss": 0.2466, + "step": 600 + }, + { + "epoch": 5.834951456310679, + "grad_norm": 0.8159007931540191, + "learning_rate": 3.354254157276155e-06, + "loss": 0.4331, + "step": 601 + }, + { + "epoch": 5.844660194174757, + "grad_norm": 0.9049844421997624, + "learning_rate": 3.3494748938725525e-06, + "loss": 0.2736, + "step": 602 + }, + { + "epoch": 5.854368932038835, + "grad_norm": 0.8353175945697331, + "learning_rate": 3.3446921181562326e-06, + "loss": 0.2368, + "step": 603 + }, + { + "epoch": 5.864077669902913, + "grad_norm": 0.8795663444794419, + "learning_rate": 3.3399058499024767e-06, + "loss": 0.138, + "step": 604 + }, + { + "epoch": 5.87378640776699, + "grad_norm": 0.9133299820724685, + "learning_rate": 3.3351161089010055e-06, + "loss": 0.375, + "step": 605 + }, + { + "epoch": 5.883495145631068, + "grad_norm": 0.8571212547288527, + "learning_rate": 3.330322914955897e-06, + "loss": 0.3399, + "step": 606 + }, + { + "epoch": 5.893203883495145, + "grad_norm": 0.8491851116183573, + "learning_rate": 3.325526287885509e-06, + "loss": 0.3133, + "step": 607 + }, + { + "epoch": 5.902912621359223, + "grad_norm": 0.7068426870475476, + "learning_rate": 3.3207262475223913e-06, + "loss": 0.3358, + "step": 608 + }, + { + "epoch": 5.9126213592233015, + "grad_norm": 0.8823539443693329, + "learning_rate": 3.315922813713209e-06, + "loss": 0.463, + "step": 609 + }, + { + "epoch": 5.922330097087379, + "grad_norm": 0.8082936904142045, + "learning_rate": 3.3111160063186553e-06, + "loss": 0.2456, + "step": 610 + }, + { + "epoch": 5.932038834951456, + "grad_norm": 0.8575417238390962, + "learning_rate": 3.3063058452133756e-06, + "loss": 0.356, + "step": 611 + }, + { + "epoch": 5.941747572815534, + "grad_norm": 0.9195204085165103, + "learning_rate": 3.301492350285879e-06, + "loss": 0.3163, + "step": 612 + }, + { + "epoch": 5.951456310679612, + "grad_norm": 0.7507019258704335, + "learning_rate": 3.296675541438461e-06, + "loss": 0.1877, + "step": 613 + }, + { + "epoch": 5.961165048543689, + "grad_norm": 0.8316963047488544, + "learning_rate": 3.2918554385871163e-06, + "loss": 0.408, + "step": 614 + }, + { + "epoch": 5.970873786407767, + "grad_norm": 0.7669332051739475, + "learning_rate": 3.2870320616614626e-06, + "loss": 0.1992, + "step": 615 + }, + { + "epoch": 5.980582524271845, + "grad_norm": 0.8022721375687455, + "learning_rate": 3.282205430604653e-06, + "loss": 0.2359, + "step": 616 + }, + { + "epoch": 5.990291262135923, + "grad_norm": 0.8261290006148897, + "learning_rate": 3.2773755653732954e-06, + "loss": 0.2569, + "step": 617 + }, + { + "epoch": 6.0, + "grad_norm": 0.6791133651060106, + "learning_rate": 3.272542485937369e-06, + "loss": 0.2842, + "step": 618 + }, + { + "epoch": 6.009708737864078, + "grad_norm": 0.8707791160221153, + "learning_rate": 3.267706212280146e-06, + "loss": 0.1839, + "step": 619 + }, + { + "epoch": 6.019417475728155, + "grad_norm": 0.7963631564153827, + "learning_rate": 3.2628667643981036e-06, + "loss": 0.1891, + "step": 620 + }, + { + "epoch": 6.029126213592233, + "grad_norm": 1.0279730453071867, + "learning_rate": 3.2580241623008426e-06, + "loss": 0.2358, + "step": 621 + }, + { + "epoch": 6.038834951456311, + "grad_norm": 1.397048986070453, + "learning_rate": 3.2531784260110067e-06, + "loss": 0.2829, + "step": 622 + }, + { + "epoch": 6.048543689320389, + "grad_norm": 0.7957544850491932, + "learning_rate": 3.2483295755641986e-06, + "loss": 0.2457, + "step": 623 + }, + { + "epoch": 6.058252427184466, + "grad_norm": 1.465656842900919, + "learning_rate": 3.243477631008897e-06, + "loss": 0.3518, + "step": 624 + }, + { + "epoch": 6.067961165048544, + "grad_norm": 0.9566010742642381, + "learning_rate": 3.238622612406373e-06, + "loss": 0.1091, + "step": 625 + }, + { + "epoch": 6.077669902912621, + "grad_norm": 1.094395509090259, + "learning_rate": 3.233764539830608e-06, + "loss": 0.2517, + "step": 626 + }, + { + "epoch": 6.087378640776699, + "grad_norm": 0.786465657330707, + "learning_rate": 3.228903433368212e-06, + "loss": 0.1099, + "step": 627 + }, + { + "epoch": 6.097087378640777, + "grad_norm": 0.7448290029077708, + "learning_rate": 3.224039313118338e-06, + "loss": 0.0839, + "step": 628 + }, + { + "epoch": 6.106796116504855, + "grad_norm": 0.8159852737100751, + "learning_rate": 3.2191721991925993e-06, + "loss": 0.1697, + "step": 629 + }, + { + "epoch": 6.116504854368932, + "grad_norm": 0.783092292843629, + "learning_rate": 3.21430211171499e-06, + "loss": 0.2721, + "step": 630 + }, + { + "epoch": 6.12621359223301, + "grad_norm": 0.909457312270043, + "learning_rate": 3.209429070821795e-06, + "loss": 0.4595, + "step": 631 + }, + { + "epoch": 6.135922330097087, + "grad_norm": 0.7768261343108038, + "learning_rate": 3.2045530966615136e-06, + "loss": 0.2819, + "step": 632 + }, + { + "epoch": 6.145631067961165, + "grad_norm": 0.6646429241432731, + "learning_rate": 3.1996742093947724e-06, + "loss": 0.2084, + "step": 633 + }, + { + "epoch": 6.155339805825243, + "grad_norm": 1.8192215128102225, + "learning_rate": 3.1947924291942423e-06, + "loss": 0.2926, + "step": 634 + }, + { + "epoch": 6.165048543689321, + "grad_norm": 0.9265469467709787, + "learning_rate": 3.189907776244556e-06, + "loss": 0.1355, + "step": 635 + }, + { + "epoch": 6.174757281553398, + "grad_norm": 0.8092435744627986, + "learning_rate": 3.185020270742225e-06, + "loss": 0.2611, + "step": 636 + }, + { + "epoch": 6.184466019417476, + "grad_norm": 0.9280566002154651, + "learning_rate": 3.180129932895553e-06, + "loss": 0.1779, + "step": 637 + }, + { + "epoch": 6.194174757281553, + "grad_norm": 0.790228426680879, + "learning_rate": 3.1752367829245563e-06, + "loss": 0.3931, + "step": 638 + }, + { + "epoch": 6.203883495145631, + "grad_norm": 1.0755642442690179, + "learning_rate": 3.1703408410608777e-06, + "loss": 0.4567, + "step": 639 + }, + { + "epoch": 6.213592233009709, + "grad_norm": 0.755742369789303, + "learning_rate": 3.1654421275477045e-06, + "loss": 0.2269, + "step": 640 + }, + { + "epoch": 6.223300970873787, + "grad_norm": 0.9315816211452175, + "learning_rate": 3.1605406626396826e-06, + "loss": 0.2207, + "step": 641 + }, + { + "epoch": 6.233009708737864, + "grad_norm": 0.9545007552402858, + "learning_rate": 3.155636466602836e-06, + "loss": 0.2818, + "step": 642 + }, + { + "epoch": 6.242718446601942, + "grad_norm": 0.8741584596148888, + "learning_rate": 3.150729559714478e-06, + "loss": 0.376, + "step": 643 + }, + { + "epoch": 6.252427184466019, + "grad_norm": 0.8956284681200907, + "learning_rate": 3.145819962263134e-06, + "loss": 0.2309, + "step": 644 + }, + { + "epoch": 6.262135922330097, + "grad_norm": 0.9302001043992213, + "learning_rate": 3.1409076945484513e-06, + "loss": 0.1773, + "step": 645 + }, + { + "epoch": 6.271844660194175, + "grad_norm": 0.7020688549614059, + "learning_rate": 3.135992776881119e-06, + "loss": 0.1055, + "step": 646 + }, + { + "epoch": 6.281553398058253, + "grad_norm": 0.8984373280427396, + "learning_rate": 3.1310752295827818e-06, + "loss": 0.1185, + "step": 647 + }, + { + "epoch": 6.29126213592233, + "grad_norm": 0.7618424845029574, + "learning_rate": 3.1261550729859602e-06, + "loss": 0.2409, + "step": 648 + }, + { + "epoch": 6.300970873786408, + "grad_norm": 0.8351950915081744, + "learning_rate": 3.12123232743396e-06, + "loss": 0.1645, + "step": 649 + }, + { + "epoch": 6.310679611650485, + "grad_norm": 0.8458831464212704, + "learning_rate": 3.116307013280793e-06, + "loss": 0.4059, + "step": 650 + }, + { + "epoch": 6.320388349514563, + "grad_norm": 0.6691280560723778, + "learning_rate": 3.1113791508910913e-06, + "loss": 0.2279, + "step": 651 + }, + { + "epoch": 6.330097087378641, + "grad_norm": 0.7854776986942449, + "learning_rate": 3.106448760640022e-06, + "loss": 0.3258, + "step": 652 + }, + { + "epoch": 6.339805825242719, + "grad_norm": 0.6901204342705916, + "learning_rate": 3.1015158629132066e-06, + "loss": 0.1222, + "step": 653 + }, + { + "epoch": 6.349514563106796, + "grad_norm": 0.7644991091087948, + "learning_rate": 3.096580478106631e-06, + "loss": 0.2448, + "step": 654 + }, + { + "epoch": 6.359223300970874, + "grad_norm": 0.8458191662291118, + "learning_rate": 3.0916426266265676e-06, + "loss": 0.1768, + "step": 655 + }, + { + "epoch": 6.368932038834951, + "grad_norm": 0.8114001055076686, + "learning_rate": 3.086702328889486e-06, + "loss": 0.1425, + "step": 656 + }, + { + "epoch": 6.378640776699029, + "grad_norm": 1.056470365811026, + "learning_rate": 3.0817596053219697e-06, + "loss": 0.192, + "step": 657 + }, + { + "epoch": 6.388349514563107, + "grad_norm": 0.8235670285844128, + "learning_rate": 3.076814476360634e-06, + "loss": 0.1868, + "step": 658 + }, + { + "epoch": 6.398058252427185, + "grad_norm": 0.5428827520625176, + "learning_rate": 3.071866962452038e-06, + "loss": 0.0962, + "step": 659 + }, + { + "epoch": 6.407766990291262, + "grad_norm": 1.008018313016207, + "learning_rate": 3.066917084052603e-06, + "loss": 0.3406, + "step": 660 + }, + { + "epoch": 6.41747572815534, + "grad_norm": 0.8178626481888325, + "learning_rate": 3.061964861628527e-06, + "loss": 0.3335, + "step": 661 + }, + { + "epoch": 6.427184466019417, + "grad_norm": 0.8894315711207563, + "learning_rate": 3.057010315655698e-06, + "loss": 0.1577, + "step": 662 + }, + { + "epoch": 6.436893203883495, + "grad_norm": 0.8286795084689285, + "learning_rate": 3.0520534666196134e-06, + "loss": 0.2201, + "step": 663 + }, + { + "epoch": 6.446601941747573, + "grad_norm": 0.6036483911933364, + "learning_rate": 3.0470943350152914e-06, + "loss": 0.2444, + "step": 664 + }, + { + "epoch": 6.456310679611651, + "grad_norm": 0.6709778515906393, + "learning_rate": 3.042132941347189e-06, + "loss": 0.3274, + "step": 665 + }, + { + "epoch": 6.466019417475728, + "grad_norm": 0.7976957124768677, + "learning_rate": 3.037169306129115e-06, + "loss": 0.1896, + "step": 666 + }, + { + "epoch": 6.475728155339806, + "grad_norm": 0.7689359311211758, + "learning_rate": 3.0322034498841475e-06, + "loss": 0.2606, + "step": 667 + }, + { + "epoch": 6.485436893203883, + "grad_norm": 0.6978195308947686, + "learning_rate": 3.027235393144547e-06, + "loss": 0.2605, + "step": 668 + }, + { + "epoch": 6.495145631067961, + "grad_norm": 0.7202725995379047, + "learning_rate": 3.0222651564516715e-06, + "loss": 0.3485, + "step": 669 + }, + { + "epoch": 6.504854368932039, + "grad_norm": 0.8910034600894114, + "learning_rate": 3.017292760355896e-06, + "loss": 0.1461, + "step": 670 + }, + { + "epoch": 6.514563106796117, + "grad_norm": 0.9239014497460577, + "learning_rate": 3.0123182254165194e-06, + "loss": 0.0908, + "step": 671 + }, + { + "epoch": 6.524271844660194, + "grad_norm": 1.237740412456114, + "learning_rate": 3.0073415722016875e-06, + "loss": 0.4193, + "step": 672 + }, + { + "epoch": 6.533980582524272, + "grad_norm": 0.7735186784296664, + "learning_rate": 3.002362821288302e-06, + "loss": 0.1549, + "step": 673 + }, + { + "epoch": 6.543689320388349, + "grad_norm": 0.7673008373922907, + "learning_rate": 2.9973819932619404e-06, + "loss": 0.258, + "step": 674 + }, + { + "epoch": 6.553398058252427, + "grad_norm": 0.9318778367739176, + "learning_rate": 2.9923991087167657e-06, + "loss": 0.23, + "step": 675 + }, + { + "epoch": 6.563106796116505, + "grad_norm": 0.7309274196915427, + "learning_rate": 2.987414188255446e-06, + "loss": 0.3513, + "step": 676 + }, + { + "epoch": 6.572815533980583, + "grad_norm": 1.3271998778860932, + "learning_rate": 2.9824272524890664e-06, + "loss": 0.2651, + "step": 677 + }, + { + "epoch": 6.58252427184466, + "grad_norm": 0.8381939012078518, + "learning_rate": 2.977438322037046e-06, + "loss": 0.1581, + "step": 678 + }, + { + "epoch": 6.592233009708738, + "grad_norm": 0.6919842843107336, + "learning_rate": 2.9724474175270485e-06, + "loss": 0.1908, + "step": 679 + }, + { + "epoch": 6.601941747572815, + "grad_norm": 0.7191226845250073, + "learning_rate": 2.967454559594903e-06, + "loss": 0.1725, + "step": 680 + }, + { + "epoch": 6.611650485436893, + "grad_norm": 0.7901314352404131, + "learning_rate": 2.9624597688845126e-06, + "loss": 0.2598, + "step": 681 + }, + { + "epoch": 6.621359223300971, + "grad_norm": 0.8001240477856122, + "learning_rate": 2.957463066047773e-06, + "loss": 0.2253, + "step": 682 + }, + { + "epoch": 6.631067961165049, + "grad_norm": 0.9336908746395779, + "learning_rate": 2.9524644717444866e-06, + "loss": 0.1203, + "step": 683 + }, + { + "epoch": 6.640776699029126, + "grad_norm": 0.902680866331855, + "learning_rate": 2.9474640066422757e-06, + "loss": 0.3975, + "step": 684 + }, + { + "epoch": 6.650485436893204, + "grad_norm": 0.9361390212501749, + "learning_rate": 2.9424616914164982e-06, + "loss": 0.1326, + "step": 685 + }, + { + "epoch": 6.660194174757281, + "grad_norm": 1.1810674711179745, + "learning_rate": 2.9374575467501605e-06, + "loss": 0.1679, + "step": 686 + }, + { + "epoch": 6.669902912621359, + "grad_norm": 0.8359120548897934, + "learning_rate": 2.9324515933338343e-06, + "loss": 0.1959, + "step": 687 + }, + { + "epoch": 6.679611650485437, + "grad_norm": 0.8188298340548763, + "learning_rate": 2.9274438518655703e-06, + "loss": 0.333, + "step": 688 + }, + { + "epoch": 6.689320388349515, + "grad_norm": 0.630837189067903, + "learning_rate": 2.9224343430508105e-06, + "loss": 0.1625, + "step": 689 + }, + { + "epoch": 6.699029126213592, + "grad_norm": 0.6939282578828986, + "learning_rate": 2.917423087602306e-06, + "loss": 0.2996, + "step": 690 + }, + { + "epoch": 6.70873786407767, + "grad_norm": 0.8998041887412566, + "learning_rate": 2.9124101062400283e-06, + "loss": 0.2907, + "step": 691 + }, + { + "epoch": 6.718446601941747, + "grad_norm": 0.8389800760003714, + "learning_rate": 2.907395419691087e-06, + "loss": 0.3268, + "step": 692 + }, + { + "epoch": 6.728155339805825, + "grad_norm": 0.9441856769971552, + "learning_rate": 2.9023790486896404e-06, + "loss": 0.1834, + "step": 693 + }, + { + "epoch": 6.737864077669903, + "grad_norm": 0.7207079417413059, + "learning_rate": 2.8973610139768114e-06, + "loss": 0.1713, + "step": 694 + }, + { + "epoch": 6.747572815533981, + "grad_norm": 0.9416472237882838, + "learning_rate": 2.8923413363006038e-06, + "loss": 0.3246, + "step": 695 + }, + { + "epoch": 6.757281553398058, + "grad_norm": 0.7806422775042208, + "learning_rate": 2.887320036415811e-06, + "loss": 0.1888, + "step": 696 + }, + { + "epoch": 6.766990291262136, + "grad_norm": 0.9389934047803904, + "learning_rate": 2.882297135083937e-06, + "loss": 0.1117, + "step": 697 + }, + { + "epoch": 6.776699029126213, + "grad_norm": 1.0816268250392724, + "learning_rate": 2.877272653073107e-06, + "loss": 0.1527, + "step": 698 + }, + { + "epoch": 6.786407766990291, + "grad_norm": 0.803771712642974, + "learning_rate": 2.87224661115798e-06, + "loss": 0.1918, + "step": 699 + }, + { + "epoch": 6.796116504854369, + "grad_norm": 0.76795552970201, + "learning_rate": 2.8672190301196655e-06, + "loss": 0.2738, + "step": 700 + }, + { + "epoch": 6.805825242718447, + "grad_norm": 0.891590775155618, + "learning_rate": 2.8621899307456376e-06, + "loss": 0.2631, + "step": 701 + }, + { + "epoch": 6.815533980582524, + "grad_norm": 0.9199990875863763, + "learning_rate": 2.8571593338296473e-06, + "loss": 0.213, + "step": 702 + }, + { + "epoch": 6.825242718446602, + "grad_norm": 0.5739691108494726, + "learning_rate": 2.8521272601716376e-06, + "loss": 0.1101, + "step": 703 + }, + { + "epoch": 6.834951456310679, + "grad_norm": 1.0392563744404688, + "learning_rate": 2.8470937305776567e-06, + "loss": 0.4097, + "step": 704 + }, + { + "epoch": 6.844660194174757, + "grad_norm": 0.8306609969259794, + "learning_rate": 2.842058765859776e-06, + "loss": 0.1619, + "step": 705 + }, + { + "epoch": 6.854368932038835, + "grad_norm": 0.8615685770113111, + "learning_rate": 2.837022386835996e-06, + "loss": 0.1667, + "step": 706 + }, + { + "epoch": 6.864077669902913, + "grad_norm": 0.6463762969432196, + "learning_rate": 2.8319846143301676e-06, + "loss": 0.1625, + "step": 707 + }, + { + "epoch": 6.87378640776699, + "grad_norm": 0.7927082014544528, + "learning_rate": 2.826945469171903e-06, + "loss": 0.1654, + "step": 708 + }, + { + "epoch": 6.883495145631068, + "grad_norm": 0.6577362367813564, + "learning_rate": 2.82190497219649e-06, + "loss": 0.1615, + "step": 709 + }, + { + "epoch": 6.893203883495145, + "grad_norm": 1.02468338437526, + "learning_rate": 2.8168631442448046e-06, + "loss": 0.2807, + "step": 710 + }, + { + "epoch": 6.902912621359223, + "grad_norm": 0.7697195438915208, + "learning_rate": 2.8118200061632273e-06, + "loss": 0.1068, + "step": 711 + }, + { + "epoch": 6.9126213592233015, + "grad_norm": 0.8385493264196137, + "learning_rate": 2.8067755788035544e-06, + "loss": 0.3729, + "step": 712 + }, + { + "epoch": 6.922330097087379, + "grad_norm": 1.0076993590471206, + "learning_rate": 2.801729883022915e-06, + "loss": 0.172, + "step": 713 + }, + { + "epoch": 6.932038834951456, + "grad_norm": 0.7627948189362331, + "learning_rate": 2.7966829396836804e-06, + "loss": 0.2992, + "step": 714 + }, + { + "epoch": 6.941747572815534, + "grad_norm": 0.8903733137540231, + "learning_rate": 2.791634769653381e-06, + "loss": 0.1731, + "step": 715 + }, + { + "epoch": 6.951456310679612, + "grad_norm": 0.8121494855617393, + "learning_rate": 2.78658539380462e-06, + "loss": 0.3445, + "step": 716 + }, + { + "epoch": 6.961165048543689, + "grad_norm": 0.6831177053115758, + "learning_rate": 2.781534833014985e-06, + "loss": 0.2196, + "step": 717 + }, + { + "epoch": 6.970873786407767, + "grad_norm": 0.8060126813685334, + "learning_rate": 2.7764831081669635e-06, + "loss": 0.3198, + "step": 718 + }, + { + "epoch": 6.980582524271845, + "grad_norm": 0.559970759452349, + "learning_rate": 2.771430240147856e-06, + "loss": 0.1326, + "step": 719 + }, + { + "epoch": 6.990291262135923, + "grad_norm": 1.018187535764793, + "learning_rate": 2.7663762498496905e-06, + "loss": 0.2745, + "step": 720 + }, + { + "epoch": 7.0, + "grad_norm": 0.7403875608736046, + "learning_rate": 2.761321158169134e-06, + "loss": 0.1285, + "step": 721 + }, + { + "epoch": 7.009708737864078, + "grad_norm": 0.836665956792195, + "learning_rate": 2.7562649860074077e-06, + "loss": 0.1438, + "step": 722 + }, + { + "epoch": 7.019417475728155, + "grad_norm": 0.9057591504597606, + "learning_rate": 2.7512077542702005e-06, + "loss": 0.4176, + "step": 723 + }, + { + "epoch": 7.029126213592233, + "grad_norm": 0.7310746968198095, + "learning_rate": 2.746149483867582e-06, + "loss": 0.125, + "step": 724 + }, + { + "epoch": 7.038834951456311, + "grad_norm": 0.8094466402699106, + "learning_rate": 2.741090195713917e-06, + "loss": 0.1358, + "step": 725 + }, + { + "epoch": 7.048543689320389, + "grad_norm": 0.8397086363403945, + "learning_rate": 2.736029910727777e-06, + "loss": 0.1356, + "step": 726 + }, + { + "epoch": 7.058252427184466, + "grad_norm": 1.3541284896158823, + "learning_rate": 2.730968649831858e-06, + "loss": 0.1502, + "step": 727 + }, + { + "epoch": 7.067961165048544, + "grad_norm": 1.1229775978036463, + "learning_rate": 2.7259064339528875e-06, + "loss": 0.3025, + "step": 728 + }, + { + "epoch": 7.077669902912621, + "grad_norm": 0.7963179988105072, + "learning_rate": 2.720843284021543e-06, + "loss": 0.2234, + "step": 729 + }, + { + "epoch": 7.087378640776699, + "grad_norm": 0.9155996246197351, + "learning_rate": 2.7157792209723654e-06, + "loss": 0.1298, + "step": 730 + }, + { + "epoch": 7.097087378640777, + "grad_norm": 0.8846875801793995, + "learning_rate": 2.7107142657436696e-06, + "loss": 0.234, + "step": 731 + }, + { + "epoch": 7.106796116504855, + "grad_norm": 0.7661475643829027, + "learning_rate": 2.705648439277459e-06, + "loss": 0.1238, + "step": 732 + }, + { + "epoch": 7.116504854368932, + "grad_norm": 0.6344540911037735, + "learning_rate": 2.7005817625193398e-06, + "loss": 0.1464, + "step": 733 + }, + { + "epoch": 7.12621359223301, + "grad_norm": 0.9875032525499696, + "learning_rate": 2.695514256418435e-06, + "loss": 0.1332, + "step": 734 + }, + { + "epoch": 7.135922330097087, + "grad_norm": 0.8841032967053356, + "learning_rate": 2.6904459419272955e-06, + "loss": 0.1499, + "step": 735 + }, + { + "epoch": 7.145631067961165, + "grad_norm": 1.0727410414660654, + "learning_rate": 2.685376840001814e-06, + "loss": 0.1845, + "step": 736 + }, + { + "epoch": 7.155339805825243, + "grad_norm": 0.7603950939388796, + "learning_rate": 2.6803069716011405e-06, + "loss": 0.0635, + "step": 737 + }, + { + "epoch": 7.165048543689321, + "grad_norm": 0.8222924068539089, + "learning_rate": 2.6752363576875933e-06, + "loss": 0.1862, + "step": 738 + }, + { + "epoch": 7.174757281553398, + "grad_norm": 0.8437247717724511, + "learning_rate": 2.6701650192265734e-06, + "loss": 0.1873, + "step": 739 + }, + { + "epoch": 7.184466019417476, + "grad_norm": 0.9780684483534852, + "learning_rate": 2.6650929771864776e-06, + "loss": 0.1166, + "step": 740 + }, + { + "epoch": 7.194174757281553, + "grad_norm": 0.8398834065613339, + "learning_rate": 2.660020252538611e-06, + "loss": 0.2882, + "step": 741 + }, + { + "epoch": 7.203883495145631, + "grad_norm": 0.6876195874715586, + "learning_rate": 2.6549468662571026e-06, + "loss": 0.1951, + "step": 742 + }, + { + "epoch": 7.213592233009709, + "grad_norm": 1.0489870539764072, + "learning_rate": 2.6498728393188157e-06, + "loss": 0.2443, + "step": 743 + }, + { + "epoch": 7.223300970873787, + "grad_norm": 0.9997066200403729, + "learning_rate": 2.6447981927032634e-06, + "loss": 0.1229, + "step": 744 + }, + { + "epoch": 7.233009708737864, + "grad_norm": 0.7857265262109651, + "learning_rate": 2.639722947392521e-06, + "loss": 0.3217, + "step": 745 + }, + { + "epoch": 7.242718446601942, + "grad_norm": 0.8361751064417774, + "learning_rate": 2.6346471243711376e-06, + "loss": 0.1699, + "step": 746 + }, + { + "epoch": 7.252427184466019, + "grad_norm": 0.6354672587304362, + "learning_rate": 2.629570744626052e-06, + "loss": 0.2213, + "step": 747 + }, + { + "epoch": 7.262135922330097, + "grad_norm": 0.691741853784474, + "learning_rate": 2.624493829146507e-06, + "loss": 0.2447, + "step": 748 + }, + { + "epoch": 7.271844660194175, + "grad_norm": 0.9444925946366645, + "learning_rate": 2.619416398923957e-06, + "loss": 0.3385, + "step": 749 + }, + { + "epoch": 7.281553398058253, + "grad_norm": 0.8079469525410382, + "learning_rate": 2.614338474951987e-06, + "loss": 0.0788, + "step": 750 + }, + { + "epoch": 7.29126213592233, + "grad_norm": 0.6978663980764732, + "learning_rate": 2.6092600782262213e-06, + "loss": 0.1165, + "step": 751 + }, + { + "epoch": 7.300970873786408, + "grad_norm": 0.7118818478635398, + "learning_rate": 2.6041812297442417e-06, + "loss": 0.1376, + "step": 752 + }, + { + "epoch": 7.310679611650485, + "grad_norm": 0.776609030527654, + "learning_rate": 2.5991019505054965e-06, + "loss": 0.1688, + "step": 753 + }, + { + "epoch": 7.320388349514563, + "grad_norm": 0.6957555306995469, + "learning_rate": 2.5940222615112143e-06, + "loss": 0.112, + "step": 754 + }, + { + "epoch": 7.330097087378641, + "grad_norm": 0.6104826494631463, + "learning_rate": 2.5889421837643186e-06, + "loss": 0.1296, + "step": 755 + }, + { + "epoch": 7.339805825242719, + "grad_norm": 0.7019412036534555, + "learning_rate": 2.5838617382693415e-06, + "loss": 0.2872, + "step": 756 + }, + { + "epoch": 7.349514563106796, + "grad_norm": 0.91073647902368, + "learning_rate": 2.5787809460323337e-06, + "loss": 0.1448, + "step": 757 + }, + { + "epoch": 7.359223300970874, + "grad_norm": 0.7283515289799285, + "learning_rate": 2.57369982806078e-06, + "loss": 0.2016, + "step": 758 + }, + { + "epoch": 7.368932038834951, + "grad_norm": 0.8223274511720547, + "learning_rate": 2.5686184053635127e-06, + "loss": 0.1389, + "step": 759 + }, + { + "epoch": 7.378640776699029, + "grad_norm": 0.7050953772126068, + "learning_rate": 2.563536698950624e-06, + "loss": 0.1134, + "step": 760 + }, + { + "epoch": 7.388349514563107, + "grad_norm": 0.5692667770218556, + "learning_rate": 2.5584547298333772e-06, + "loss": 0.0578, + "step": 761 + }, + { + "epoch": 7.398058252427185, + "grad_norm": 0.6216050157913948, + "learning_rate": 2.5533725190241255e-06, + "loss": 0.0397, + "step": 762 + }, + { + "epoch": 7.407766990291262, + "grad_norm": 0.6761069814061764, + "learning_rate": 2.5482900875362184e-06, + "loss": 0.1089, + "step": 763 + }, + { + "epoch": 7.41747572815534, + "grad_norm": 1.0199372568203338, + "learning_rate": 2.543207456383919e-06, + "loss": 0.1276, + "step": 764 + }, + { + "epoch": 7.427184466019417, + "grad_norm": 0.8577131476418797, + "learning_rate": 2.538124646582315e-06, + "loss": 0.2917, + "step": 765 + }, + { + "epoch": 7.436893203883495, + "grad_norm": 0.7214685312852196, + "learning_rate": 2.533041679147235e-06, + "loss": 0.1534, + "step": 766 + }, + { + "epoch": 7.446601941747573, + "grad_norm": 0.7949192422368898, + "learning_rate": 2.527958575095157e-06, + "loss": 0.1386, + "step": 767 + }, + { + "epoch": 7.456310679611651, + "grad_norm": 0.9275307940564848, + "learning_rate": 2.522875355443124e-06, + "loss": 0.2946, + "step": 768 + }, + { + "epoch": 7.466019417475728, + "grad_norm": 0.9634584569650436, + "learning_rate": 2.5177920412086586e-06, + "loss": 0.258, + "step": 769 + }, + { + "epoch": 7.475728155339806, + "grad_norm": 0.8797775717303168, + "learning_rate": 2.512708653409674e-06, + "loss": 0.1895, + "step": 770 + }, + { + "epoch": 7.485436893203883, + "grad_norm": 0.7020792385043, + "learning_rate": 2.507625213064386e-06, + "loss": 0.1305, + "step": 771 + }, + { + "epoch": 7.495145631067961, + "grad_norm": 0.6721656885922416, + "learning_rate": 2.5025417411912307e-06, + "loss": 0.0923, + "step": 772 + }, + { + "epoch": 7.504854368932039, + "grad_norm": 0.9046829126351137, + "learning_rate": 2.4974582588087697e-06, + "loss": 0.2464, + "step": 773 + }, + { + "epoch": 7.514563106796117, + "grad_norm": 0.8690288633052726, + "learning_rate": 2.492374786935614e-06, + "loss": 0.2224, + "step": 774 + }, + { + "epoch": 7.524271844660194, + "grad_norm": 0.8553378382148052, + "learning_rate": 2.487291346590326e-06, + "loss": 0.2824, + "step": 775 + }, + { + "epoch": 7.533980582524272, + "grad_norm": 0.6651089727018585, + "learning_rate": 2.4822079587913414e-06, + "loss": 0.197, + "step": 776 + }, + { + "epoch": 7.543689320388349, + "grad_norm": 0.7135997471244006, + "learning_rate": 2.4771246445568763e-06, + "loss": 0.098, + "step": 777 + }, + { + "epoch": 7.553398058252427, + "grad_norm": 1.0277044305758947, + "learning_rate": 2.472041424904844e-06, + "loss": 0.123, + "step": 778 + }, + { + "epoch": 7.563106796116505, + "grad_norm": 0.7865144489394965, + "learning_rate": 2.466958320852766e-06, + "loss": 0.1981, + "step": 779 + }, + { + "epoch": 7.572815533980583, + "grad_norm": 0.987488048755019, + "learning_rate": 2.4618753534176854e-06, + "loss": 0.169, + "step": 780 + }, + { + "epoch": 7.58252427184466, + "grad_norm": 0.8166300006729655, + "learning_rate": 2.4567925436160823e-06, + "loss": 0.1999, + "step": 781 + }, + { + "epoch": 7.592233009708738, + "grad_norm": 0.81581841653359, + "learning_rate": 2.4517099124637824e-06, + "loss": 0.2586, + "step": 782 + }, + { + "epoch": 7.601941747572815, + "grad_norm": 1.108761709333979, + "learning_rate": 2.4466274809758757e-06, + "loss": 0.1429, + "step": 783 + }, + { + "epoch": 7.611650485436893, + "grad_norm": 0.7640567025062378, + "learning_rate": 2.4415452701666236e-06, + "loss": 0.1963, + "step": 784 + }, + { + "epoch": 7.621359223300971, + "grad_norm": 0.7929183080114451, + "learning_rate": 2.436463301049378e-06, + "loss": 0.2358, + "step": 785 + }, + { + "epoch": 7.631067961165049, + "grad_norm": 0.9165311722428774, + "learning_rate": 2.431381594636488e-06, + "loss": 0.2054, + "step": 786 + }, + { + "epoch": 7.640776699029126, + "grad_norm": 0.911092735481885, + "learning_rate": 2.42630017193922e-06, + "loss": 0.3815, + "step": 787 + }, + { + "epoch": 7.650485436893204, + "grad_norm": 0.7386605111856521, + "learning_rate": 2.4212190539676667e-06, + "loss": 0.055, + "step": 788 + }, + { + "epoch": 7.660194174757281, + "grad_norm": 0.7852123838015137, + "learning_rate": 2.4161382617306585e-06, + "loss": 0.2142, + "step": 789 + }, + { + "epoch": 7.669902912621359, + "grad_norm": 0.8671129620068961, + "learning_rate": 2.4110578162356814e-06, + "loss": 0.1295, + "step": 790 + }, + { + "epoch": 7.679611650485437, + "grad_norm": 0.8379544581414955, + "learning_rate": 2.405977738488786e-06, + "loss": 0.1811, + "step": 791 + }, + { + "epoch": 7.689320388349515, + "grad_norm": 0.8279537297736859, + "learning_rate": 2.4008980494945044e-06, + "loss": 0.1764, + "step": 792 + }, + { + "epoch": 7.699029126213592, + "grad_norm": 0.7533139895873169, + "learning_rate": 2.3958187702557587e-06, + "loss": 0.0803, + "step": 793 + }, + { + "epoch": 7.70873786407767, + "grad_norm": 0.7827211446349897, + "learning_rate": 2.39073992177378e-06, + "loss": 0.0769, + "step": 794 + }, + { + "epoch": 7.718446601941747, + "grad_norm": 0.870755479127296, + "learning_rate": 2.385661525048014e-06, + "loss": 0.1638, + "step": 795 + }, + { + "epoch": 7.728155339805825, + "grad_norm": 0.9060633189944168, + "learning_rate": 2.3805836010760435e-06, + "loss": 0.1948, + "step": 796 + }, + { + "epoch": 7.737864077669903, + "grad_norm": 0.785817720430874, + "learning_rate": 2.375506170853494e-06, + "loss": 0.1063, + "step": 797 + }, + { + "epoch": 7.747572815533981, + "grad_norm": 0.6084463077588024, + "learning_rate": 2.3704292553739487e-06, + "loss": 0.0805, + "step": 798 + }, + { + "epoch": 7.757281553398058, + "grad_norm": 0.6284328864307693, + "learning_rate": 2.3653528756288636e-06, + "loss": 0.1343, + "step": 799 + }, + { + "epoch": 7.766990291262136, + "grad_norm": 1.1072324614685045, + "learning_rate": 2.3602770526074804e-06, + "loss": 0.238, + "step": 800 + }, + { + "epoch": 7.776699029126213, + "grad_norm": 0.8771044918281113, + "learning_rate": 2.3552018072967375e-06, + "loss": 0.1482, + "step": 801 + }, + { + "epoch": 7.786407766990291, + "grad_norm": 0.6690650637514127, + "learning_rate": 2.3501271606811848e-06, + "loss": 0.0995, + "step": 802 + }, + { + "epoch": 7.796116504854369, + "grad_norm": 0.7709052345206647, + "learning_rate": 2.345053133742898e-06, + "loss": 0.2694, + "step": 803 + }, + { + "epoch": 7.805825242718447, + "grad_norm": 0.6349998040433816, + "learning_rate": 2.3399797474613894e-06, + "loss": 0.0663, + "step": 804 + }, + { + "epoch": 7.815533980582524, + "grad_norm": 1.0261288250181526, + "learning_rate": 2.334907022813523e-06, + "loss": 0.1044, + "step": 805 + }, + { + "epoch": 7.825242718446602, + "grad_norm": 0.6267009972409442, + "learning_rate": 2.329834980773427e-06, + "loss": 0.2633, + "step": 806 + }, + { + "epoch": 7.834951456310679, + "grad_norm": 0.918811037432451, + "learning_rate": 2.324763642312407e-06, + "loss": 0.1622, + "step": 807 + }, + { + "epoch": 7.844660194174757, + "grad_norm": 0.739956492123394, + "learning_rate": 2.3196930283988603e-06, + "loss": 0.2448, + "step": 808 + }, + { + "epoch": 7.854368932038835, + "grad_norm": 0.7010116121906006, + "learning_rate": 2.3146231599981865e-06, + "loss": 0.2406, + "step": 809 + }, + { + "epoch": 7.864077669902913, + "grad_norm": 0.7415696256419213, + "learning_rate": 2.3095540580727054e-06, + "loss": 0.279, + "step": 810 + }, + { + "epoch": 7.87378640776699, + "grad_norm": 0.7935876878721858, + "learning_rate": 2.304485743581566e-06, + "loss": 0.2569, + "step": 811 + }, + { + "epoch": 7.883495145631068, + "grad_norm": 1.2683791645513949, + "learning_rate": 2.299418237480661e-06, + "loss": 0.0492, + "step": 812 + }, + { + "epoch": 7.893203883495145, + "grad_norm": 0.8300472882833771, + "learning_rate": 2.294351560722542e-06, + "loss": 0.2097, + "step": 813 + }, + { + "epoch": 7.902912621359223, + "grad_norm": 0.770677712112457, + "learning_rate": 2.2892857342563316e-06, + "loss": 0.0987, + "step": 814 + }, + { + "epoch": 7.9126213592233015, + "grad_norm": 0.810806375980374, + "learning_rate": 2.2842207790276355e-06, + "loss": 0.2034, + "step": 815 + }, + { + "epoch": 7.922330097087379, + "grad_norm": 0.865025284529186, + "learning_rate": 2.279156715978457e-06, + "loss": 0.2431, + "step": 816 + }, + { + "epoch": 7.932038834951456, + "grad_norm": 0.7151367709362322, + "learning_rate": 2.274093566047113e-06, + "loss": 0.1682, + "step": 817 + }, + { + "epoch": 7.941747572815534, + "grad_norm": 0.7316758258717895, + "learning_rate": 2.2690313501681426e-06, + "loss": 0.1433, + "step": 818 + }, + { + "epoch": 7.951456310679612, + "grad_norm": 0.7394666643705587, + "learning_rate": 2.263970089272223e-06, + "loss": 0.178, + "step": 819 + }, + { + "epoch": 7.961165048543689, + "grad_norm": 0.654876224823624, + "learning_rate": 2.2589098042860838e-06, + "loss": 0.1848, + "step": 820 + }, + { + "epoch": 7.970873786407767, + "grad_norm": 0.6696537239231014, + "learning_rate": 2.2538505161324186e-06, + "loss": 0.0858, + "step": 821 + }, + { + "epoch": 7.980582524271845, + "grad_norm": 0.9132395771066292, + "learning_rate": 2.2487922457298007e-06, + "loss": 0.086, + "step": 822 + }, + { + "epoch": 7.990291262135923, + "grad_norm": 0.8888174000325908, + "learning_rate": 2.243735013992593e-06, + "loss": 0.2508, + "step": 823 + }, + { + "epoch": 8.0, + "grad_norm": 0.7061956513505121, + "learning_rate": 2.238678841830867e-06, + "loss": 0.1326, + "step": 824 + }, + { + "epoch": 8.009708737864077, + "grad_norm": 0.6096642773859534, + "learning_rate": 2.2336237501503103e-06, + "loss": 0.0859, + "step": 825 + }, + { + "epoch": 8.019417475728156, + "grad_norm": 0.874150385218761, + "learning_rate": 2.2285697598521446e-06, + "loss": 0.0705, + "step": 826 + }, + { + "epoch": 8.029126213592233, + "grad_norm": 0.6120565251058548, + "learning_rate": 2.2235168918330374e-06, + "loss": 0.147, + "step": 827 + }, + { + "epoch": 8.03883495145631, + "grad_norm": 0.7460242405023572, + "learning_rate": 2.2184651669850164e-06, + "loss": 0.1725, + "step": 828 + }, + { + "epoch": 8.048543689320388, + "grad_norm": 0.47300009134877163, + "learning_rate": 2.2134146061953814e-06, + "loss": 0.0186, + "step": 829 + }, + { + "epoch": 8.058252427184467, + "grad_norm": 0.978149051086085, + "learning_rate": 2.2083652303466196e-06, + "loss": 0.2288, + "step": 830 + }, + { + "epoch": 8.067961165048544, + "grad_norm": 0.9177514365909158, + "learning_rate": 2.20331706031632e-06, + "loss": 0.1992, + "step": 831 + }, + { + "epoch": 8.077669902912621, + "grad_norm": 0.6550516508961114, + "learning_rate": 2.1982701169770853e-06, + "loss": 0.1029, + "step": 832 + }, + { + "epoch": 8.087378640776699, + "grad_norm": 0.7730840644031025, + "learning_rate": 2.1932244211964456e-06, + "loss": 0.144, + "step": 833 + }, + { + "epoch": 8.097087378640778, + "grad_norm": 0.5752559068978315, + "learning_rate": 2.1881799938367735e-06, + "loss": 0.0994, + "step": 834 + }, + { + "epoch": 8.106796116504855, + "grad_norm": 0.697423153476562, + "learning_rate": 2.1831368557551962e-06, + "loss": 0.0829, + "step": 835 + }, + { + "epoch": 8.116504854368932, + "grad_norm": 0.9600930670561553, + "learning_rate": 2.1780950278035114e-06, + "loss": 0.1914, + "step": 836 + }, + { + "epoch": 8.12621359223301, + "grad_norm": 0.8777974546702457, + "learning_rate": 2.173054530828098e-06, + "loss": 0.0827, + "step": 837 + }, + { + "epoch": 8.135922330097088, + "grad_norm": 0.6781280331911594, + "learning_rate": 2.168015385669833e-06, + "loss": 0.0882, + "step": 838 + }, + { + "epoch": 8.145631067961165, + "grad_norm": 1.002740237182081, + "learning_rate": 2.162977613164005e-06, + "loss": 0.2869, + "step": 839 + }, + { + "epoch": 8.155339805825243, + "grad_norm": 1.1050895301986552, + "learning_rate": 2.157941234140225e-06, + "loss": 0.2794, + "step": 840 + }, + { + "epoch": 8.16504854368932, + "grad_norm": 0.6988453726767133, + "learning_rate": 2.1529062694223437e-06, + "loss": 0.153, + "step": 841 + }, + { + "epoch": 8.174757281553399, + "grad_norm": 0.4102856314598248, + "learning_rate": 2.147872739828364e-06, + "loss": 0.0422, + "step": 842 + }, + { + "epoch": 8.184466019417476, + "grad_norm": 0.6732421099515787, + "learning_rate": 2.142840666170354e-06, + "loss": 0.0614, + "step": 843 + }, + { + "epoch": 8.194174757281553, + "grad_norm": 0.5179106274827621, + "learning_rate": 2.1378100692543637e-06, + "loss": 0.075, + "step": 844 + }, + { + "epoch": 8.20388349514563, + "grad_norm": 0.6248672879860213, + "learning_rate": 2.1327809698803354e-06, + "loss": 0.1265, + "step": 845 + }, + { + "epoch": 8.21359223300971, + "grad_norm": 0.6770539867953491, + "learning_rate": 2.1277533888420203e-06, + "loss": 0.1378, + "step": 846 + }, + { + "epoch": 8.223300970873787, + "grad_norm": 0.5868543420862828, + "learning_rate": 2.1227273469268932e-06, + "loss": 0.0833, + "step": 847 + }, + { + "epoch": 8.233009708737864, + "grad_norm": 0.6864008955143607, + "learning_rate": 2.117702864916063e-06, + "loss": 0.0868, + "step": 848 + }, + { + "epoch": 8.242718446601941, + "grad_norm": 1.103909018912092, + "learning_rate": 2.1126799635841897e-06, + "loss": 0.1473, + "step": 849 + }, + { + "epoch": 8.25242718446602, + "grad_norm": 0.8183458395730451, + "learning_rate": 2.1076586636993975e-06, + "loss": 0.1373, + "step": 850 + }, + { + "epoch": 8.262135922330097, + "grad_norm": 0.715445629716324, + "learning_rate": 2.102638986023189e-06, + "loss": 0.105, + "step": 851 + }, + { + "epoch": 8.271844660194175, + "grad_norm": 0.7455938407063636, + "learning_rate": 2.0976209513103604e-06, + "loss": 0.1504, + "step": 852 + }, + { + "epoch": 8.281553398058252, + "grad_norm": 0.7023437585403879, + "learning_rate": 2.0926045803089135e-06, + "loss": 0.0817, + "step": 853 + }, + { + "epoch": 8.29126213592233, + "grad_norm": 1.2518758042906253, + "learning_rate": 2.087589893759972e-06, + "loss": 0.0936, + "step": 854 + }, + { + "epoch": 8.300970873786408, + "grad_norm": 0.5636684222747409, + "learning_rate": 2.0825769123976954e-06, + "loss": 0.1198, + "step": 855 + }, + { + "epoch": 8.310679611650485, + "grad_norm": 0.7849653948615573, + "learning_rate": 2.077565656949191e-06, + "loss": 0.1976, + "step": 856 + }, + { + "epoch": 8.320388349514563, + "grad_norm": 0.785519727376682, + "learning_rate": 2.072556148134431e-06, + "loss": 0.1825, + "step": 857 + }, + { + "epoch": 8.330097087378642, + "grad_norm": 0.7359895734584899, + "learning_rate": 2.0675484066661666e-06, + "loss": 0.1522, + "step": 858 + }, + { + "epoch": 8.339805825242719, + "grad_norm": 0.5593109629965097, + "learning_rate": 2.0625424532498407e-06, + "loss": 0.0722, + "step": 859 + }, + { + "epoch": 8.349514563106796, + "grad_norm": 0.6111935300174707, + "learning_rate": 2.057538308583502e-06, + "loss": 0.0648, + "step": 860 + }, + { + "epoch": 8.359223300970873, + "grad_norm": 1.1576569133730004, + "learning_rate": 2.0525359933577243e-06, + "loss": 0.2763, + "step": 861 + }, + { + "epoch": 8.368932038834952, + "grad_norm": 0.8296811567162217, + "learning_rate": 2.047535528255514e-06, + "loss": 0.1847, + "step": 862 + }, + { + "epoch": 8.37864077669903, + "grad_norm": 0.8205871431038276, + "learning_rate": 2.0425369339522276e-06, + "loss": 0.0166, + "step": 863 + }, + { + "epoch": 8.388349514563107, + "grad_norm": 0.6447713146205751, + "learning_rate": 2.0375402311154886e-06, + "loss": 0.1415, + "step": 864 + }, + { + "epoch": 8.398058252427184, + "grad_norm": 0.4030106740717843, + "learning_rate": 2.0325454404050983e-06, + "loss": 0.0142, + "step": 865 + }, + { + "epoch": 8.407766990291263, + "grad_norm": 1.0171802020707985, + "learning_rate": 2.0275525824729523e-06, + "loss": 0.2016, + "step": 866 + }, + { + "epoch": 8.41747572815534, + "grad_norm": 0.6994124188590608, + "learning_rate": 2.022561677962955e-06, + "loss": 0.1537, + "step": 867 + }, + { + "epoch": 8.427184466019417, + "grad_norm": 0.7402451386749993, + "learning_rate": 2.017572747510934e-06, + "loss": 0.1046, + "step": 868 + }, + { + "epoch": 8.436893203883495, + "grad_norm": 0.7846549321219498, + "learning_rate": 2.012585811744555e-06, + "loss": 0.1425, + "step": 869 + }, + { + "epoch": 8.446601941747574, + "grad_norm": 1.440584552164248, + "learning_rate": 2.0076008912832355e-06, + "loss": 0.2161, + "step": 870 + }, + { + "epoch": 8.45631067961165, + "grad_norm": 0.9407034033302534, + "learning_rate": 2.002618006738061e-06, + "loss": 0.2049, + "step": 871 + }, + { + "epoch": 8.466019417475728, + "grad_norm": 0.742221931800229, + "learning_rate": 1.9976371787116992e-06, + "loss": 0.1399, + "step": 872 + }, + { + "epoch": 8.475728155339805, + "grad_norm": 0.5517495585998253, + "learning_rate": 1.9926584277983134e-06, + "loss": 0.0586, + "step": 873 + }, + { + "epoch": 8.485436893203884, + "grad_norm": 0.9431771754400559, + "learning_rate": 1.9876817745834805e-06, + "loss": 0.1939, + "step": 874 + }, + { + "epoch": 8.495145631067961, + "grad_norm": 0.6915916778894676, + "learning_rate": 1.9827072396441044e-06, + "loss": 0.1943, + "step": 875 + }, + { + "epoch": 8.504854368932039, + "grad_norm": 0.7708115028681229, + "learning_rate": 1.9777348435483285e-06, + "loss": 0.1584, + "step": 876 + }, + { + "epoch": 8.514563106796116, + "grad_norm": 1.105190714246728, + "learning_rate": 1.972764606855454e-06, + "loss": 0.2334, + "step": 877 + }, + { + "epoch": 8.524271844660195, + "grad_norm": 0.6628366560401108, + "learning_rate": 1.9677965501158534e-06, + "loss": 0.1279, + "step": 878 + }, + { + "epoch": 8.533980582524272, + "grad_norm": 0.6913345579869764, + "learning_rate": 1.9628306938708857e-06, + "loss": 0.1757, + "step": 879 + }, + { + "epoch": 8.54368932038835, + "grad_norm": 0.6844557174447111, + "learning_rate": 1.957867058652812e-06, + "loss": 0.125, + "step": 880 + }, + { + "epoch": 8.553398058252426, + "grad_norm": 0.6999128351223273, + "learning_rate": 1.952905664984709e-06, + "loss": 0.2437, + "step": 881 + }, + { + "epoch": 8.563106796116505, + "grad_norm": 0.575971990243284, + "learning_rate": 1.947946533380387e-06, + "loss": 0.0885, + "step": 882 + }, + { + "epoch": 8.572815533980583, + "grad_norm": 0.5928962221916307, + "learning_rate": 1.9429896843443025e-06, + "loss": 0.0689, + "step": 883 + }, + { + "epoch": 8.58252427184466, + "grad_norm": 0.7868027361400175, + "learning_rate": 1.938035138371474e-06, + "loss": 0.3051, + "step": 884 + }, + { + "epoch": 8.592233009708737, + "grad_norm": 0.5972483182228643, + "learning_rate": 1.933082915947398e-06, + "loss": 0.1386, + "step": 885 + }, + { + "epoch": 8.601941747572816, + "grad_norm": 0.6175357677063205, + "learning_rate": 1.928133037547963e-06, + "loss": 0.1086, + "step": 886 + }, + { + "epoch": 8.611650485436893, + "grad_norm": 0.7543183263678916, + "learning_rate": 1.9231855236393677e-06, + "loss": 0.1329, + "step": 887 + }, + { + "epoch": 8.62135922330097, + "grad_norm": 0.6446977874615103, + "learning_rate": 1.9182403946780316e-06, + "loss": 0.1087, + "step": 888 + }, + { + "epoch": 8.631067961165048, + "grad_norm": 0.79165417035794, + "learning_rate": 1.9132976711105146e-06, + "loss": 0.2162, + "step": 889 + }, + { + "epoch": 8.640776699029127, + "grad_norm": 0.6474212890207318, + "learning_rate": 1.9083573733734328e-06, + "loss": 0.1667, + "step": 890 + }, + { + "epoch": 8.650485436893204, + "grad_norm": 0.938194675661121, + "learning_rate": 1.903419521893369e-06, + "loss": 0.1331, + "step": 891 + }, + { + "epoch": 8.660194174757281, + "grad_norm": 0.7444633617064056, + "learning_rate": 1.898484137086794e-06, + "loss": 0.1258, + "step": 892 + }, + { + "epoch": 8.669902912621358, + "grad_norm": 0.7271455868832031, + "learning_rate": 1.8935512393599784e-06, + "loss": 0.0347, + "step": 893 + }, + { + "epoch": 8.679611650485437, + "grad_norm": 1.0122920027200322, + "learning_rate": 1.8886208491089095e-06, + "loss": 0.0897, + "step": 894 + }, + { + "epoch": 8.689320388349515, + "grad_norm": 0.610405675841603, + "learning_rate": 1.8836929867192077e-06, + "loss": 0.0614, + "step": 895 + }, + { + "epoch": 8.699029126213592, + "grad_norm": 0.7295850031852491, + "learning_rate": 1.8787676725660405e-06, + "loss": 0.0909, + "step": 896 + }, + { + "epoch": 8.70873786407767, + "grad_norm": 0.832667949302075, + "learning_rate": 1.8738449270140404e-06, + "loss": 0.2159, + "step": 897 + }, + { + "epoch": 8.718446601941748, + "grad_norm": 0.6861487310656976, + "learning_rate": 1.8689247704172187e-06, + "loss": 0.1567, + "step": 898 + }, + { + "epoch": 8.728155339805825, + "grad_norm": 0.7588351556503447, + "learning_rate": 1.8640072231188825e-06, + "loss": 0.1879, + "step": 899 + }, + { + "epoch": 8.737864077669903, + "grad_norm": 0.7201360467114077, + "learning_rate": 1.8590923054515504e-06, + "loss": 0.1627, + "step": 900 + }, + { + "epoch": 8.74757281553398, + "grad_norm": 0.7254000831754608, + "learning_rate": 1.8541800377368673e-06, + "loss": 0.1887, + "step": 901 + }, + { + "epoch": 8.757281553398059, + "grad_norm": 0.8622921617060856, + "learning_rate": 1.8492704402855229e-06, + "loss": 0.1916, + "step": 902 + }, + { + "epoch": 8.766990291262136, + "grad_norm": 0.693010331212834, + "learning_rate": 1.8443635333971643e-06, + "loss": 0.0764, + "step": 903 + }, + { + "epoch": 8.776699029126213, + "grad_norm": 0.9849662506213444, + "learning_rate": 1.8394593373603173e-06, + "loss": 0.1249, + "step": 904 + }, + { + "epoch": 8.78640776699029, + "grad_norm": 0.603276080839406, + "learning_rate": 1.8345578724522957e-06, + "loss": 0.1143, + "step": 905 + }, + { + "epoch": 8.79611650485437, + "grad_norm": 0.8844794167233913, + "learning_rate": 1.8296591589391227e-06, + "loss": 0.1003, + "step": 906 + }, + { + "epoch": 8.805825242718447, + "grad_norm": 0.6706572895742536, + "learning_rate": 1.8247632170754443e-06, + "loss": 0.1094, + "step": 907 + }, + { + "epoch": 8.815533980582524, + "grad_norm": 0.7048555162130203, + "learning_rate": 1.8198700671044477e-06, + "loss": 0.0615, + "step": 908 + }, + { + "epoch": 8.825242718446601, + "grad_norm": 0.7951541866582822, + "learning_rate": 1.8149797292577757e-06, + "loss": 0.1881, + "step": 909 + }, + { + "epoch": 8.83495145631068, + "grad_norm": 1.0779476607088172, + "learning_rate": 1.8100922237554442e-06, + "loss": 0.2077, + "step": 910 + }, + { + "epoch": 8.844660194174757, + "grad_norm": 0.5376687871944611, + "learning_rate": 1.8052075708057581e-06, + "loss": 0.0418, + "step": 911 + }, + { + "epoch": 8.854368932038835, + "grad_norm": 0.5817902698395092, + "learning_rate": 1.8003257906052284e-06, + "loss": 0.0272, + "step": 912 + }, + { + "epoch": 8.864077669902912, + "grad_norm": 0.9047000568519542, + "learning_rate": 1.7954469033384868e-06, + "loss": 0.1319, + "step": 913 + }, + { + "epoch": 8.87378640776699, + "grad_norm": 0.7285928029702656, + "learning_rate": 1.790570929178206e-06, + "loss": 0.1669, + "step": 914 + }, + { + "epoch": 8.883495145631068, + "grad_norm": 0.6615354057172695, + "learning_rate": 1.7856978882850112e-06, + "loss": 0.1078, + "step": 915 + }, + { + "epoch": 8.893203883495145, + "grad_norm": 0.684988992100525, + "learning_rate": 1.780827800807401e-06, + "loss": 0.1277, + "step": 916 + }, + { + "epoch": 8.902912621359224, + "grad_norm": 1.0329628068804662, + "learning_rate": 1.7759606868816623e-06, + "loss": 0.1635, + "step": 917 + }, + { + "epoch": 8.912621359223301, + "grad_norm": 0.519382288792405, + "learning_rate": 1.771096566631788e-06, + "loss": 0.0937, + "step": 918 + }, + { + "epoch": 8.922330097087379, + "grad_norm": 0.8421607606627751, + "learning_rate": 1.766235460169392e-06, + "loss": 0.1776, + "step": 919 + }, + { + "epoch": 8.932038834951456, + "grad_norm": 1.1862589863385231, + "learning_rate": 1.7613773875936274e-06, + "loss": 0.053, + "step": 920 + }, + { + "epoch": 8.941747572815533, + "grad_norm": 0.8864371486994366, + "learning_rate": 1.7565223689911038e-06, + "loss": 0.0515, + "step": 921 + }, + { + "epoch": 8.951456310679612, + "grad_norm": 0.5967041565663332, + "learning_rate": 1.7516704244358018e-06, + "loss": 0.1032, + "step": 922 + }, + { + "epoch": 8.96116504854369, + "grad_norm": 0.6128809380818757, + "learning_rate": 1.7468215739889941e-06, + "loss": 0.0798, + "step": 923 + }, + { + "epoch": 8.970873786407767, + "grad_norm": 0.9034318595552594, + "learning_rate": 1.741975837699158e-06, + "loss": 0.2227, + "step": 924 + }, + { + "epoch": 8.980582524271846, + "grad_norm": 0.5952541990579747, + "learning_rate": 1.7371332356018972e-06, + "loss": 0.1644, + "step": 925 + }, + { + "epoch": 8.990291262135923, + "grad_norm": 0.8057774859794358, + "learning_rate": 1.7322937877198545e-06, + "loss": 0.1789, + "step": 926 + }, + { + "epoch": 9.0, + "grad_norm": 0.5973155402909465, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.0662, + "step": 927 + }, + { + "epoch": 9.009708737864077, + "grad_norm": 0.46982841443298656, + "learning_rate": 1.7226244346267063e-06, + "loss": 0.1126, + "step": 928 + }, + { + "epoch": 9.019417475728156, + "grad_norm": 0.6395071722994249, + "learning_rate": 1.7177945693953486e-06, + "loss": 0.1169, + "step": 929 + }, + { + "epoch": 9.029126213592233, + "grad_norm": 0.6000833565715967, + "learning_rate": 1.7129679383385384e-06, + "loss": 0.1407, + "step": 930 + }, + { + "epoch": 9.03883495145631, + "grad_norm": 0.5328547699863273, + "learning_rate": 1.7081445614128845e-06, + "loss": 0.0644, + "step": 931 + }, + { + "epoch": 9.048543689320388, + "grad_norm": 0.698674577006318, + "learning_rate": 1.7033244585615393e-06, + "loss": 0.2139, + "step": 932 + }, + { + "epoch": 9.058252427184467, + "grad_norm": 0.9070203866078601, + "learning_rate": 1.698507649714121e-06, + "loss": 0.1712, + "step": 933 + }, + { + "epoch": 9.067961165048544, + "grad_norm": 0.5121865723375197, + "learning_rate": 1.6936941547866248e-06, + "loss": 0.0602, + "step": 934 + }, + { + "epoch": 9.077669902912621, + "grad_norm": 0.6197111730604409, + "learning_rate": 1.688883993681345e-06, + "loss": 0.1063, + "step": 935 + }, + { + "epoch": 9.087378640776699, + "grad_norm": 0.7859446210738993, + "learning_rate": 1.6840771862867922e-06, + "loss": 0.0857, + "step": 936 + }, + { + "epoch": 9.097087378640778, + "grad_norm": 1.1642241783373757, + "learning_rate": 1.6792737524776093e-06, + "loss": 0.1344, + "step": 937 + }, + { + "epoch": 9.106796116504855, + "grad_norm": 0.4302682219192299, + "learning_rate": 1.674473712114492e-06, + "loss": 0.0379, + "step": 938 + }, + { + "epoch": 9.116504854368932, + "grad_norm": 0.621250635600905, + "learning_rate": 1.6696770850441036e-06, + "loss": 0.0401, + "step": 939 + }, + { + "epoch": 9.12621359223301, + "grad_norm": 0.9740610618310332, + "learning_rate": 1.6648838910989955e-06, + "loss": 0.2171, + "step": 940 + }, + { + "epoch": 9.135922330097088, + "grad_norm": 0.8966783169812266, + "learning_rate": 1.6600941500975237e-06, + "loss": 0.0965, + "step": 941 + }, + { + "epoch": 9.145631067961165, + "grad_norm": 0.7400396070349328, + "learning_rate": 1.6553078818437678e-06, + "loss": 0.1748, + "step": 942 + }, + { + "epoch": 9.155339805825243, + "grad_norm": 0.8321559134472875, + "learning_rate": 1.6505251061274492e-06, + "loss": 0.1614, + "step": 943 + }, + { + "epoch": 9.16504854368932, + "grad_norm": 0.8790366855460469, + "learning_rate": 1.6457458427238464e-06, + "loss": 0.2559, + "step": 944 + }, + { + "epoch": 9.174757281553399, + "grad_norm": 0.8881109198480825, + "learning_rate": 1.6409701113937182e-06, + "loss": 0.0951, + "step": 945 + }, + { + "epoch": 9.184466019417476, + "grad_norm": 0.8440100111279388, + "learning_rate": 1.6361979318832173e-06, + "loss": 0.0804, + "step": 946 + }, + { + "epoch": 9.194174757281553, + "grad_norm": 0.6968010735521492, + "learning_rate": 1.6314293239238134e-06, + "loss": 0.1062, + "step": 947 + }, + { + "epoch": 9.20388349514563, + "grad_norm": 0.5054689570887443, + "learning_rate": 1.626664307232207e-06, + "loss": 0.0801, + "step": 948 + }, + { + "epoch": 9.21359223300971, + "grad_norm": 0.7768640631942998, + "learning_rate": 1.62190290151025e-06, + "loss": 0.1664, + "step": 949 + }, + { + "epoch": 9.223300970873787, + "grad_norm": 0.5024211551473686, + "learning_rate": 1.617145126444864e-06, + "loss": 0.0903, + "step": 950 + }, + { + "epoch": 9.233009708737864, + "grad_norm": 0.6883186340673217, + "learning_rate": 1.6123910017079591e-06, + "loss": 0.1663, + "step": 951 + }, + { + "epoch": 9.242718446601941, + "grad_norm": 0.4722355032472164, + "learning_rate": 1.6076405469563533e-06, + "loss": 0.0675, + "step": 952 + }, + { + "epoch": 9.25242718446602, + "grad_norm": 0.6836772775774017, + "learning_rate": 1.6028937818316889e-06, + "loss": 0.1429, + "step": 953 + }, + { + "epoch": 9.262135922330097, + "grad_norm": 0.5257068429565035, + "learning_rate": 1.598150725960354e-06, + "loss": 0.0209, + "step": 954 + }, + { + "epoch": 9.271844660194175, + "grad_norm": 0.7877874466415006, + "learning_rate": 1.5934113989533992e-06, + "loss": 0.0966, + "step": 955 + }, + { + "epoch": 9.281553398058252, + "grad_norm": 1.044912299647016, + "learning_rate": 1.5886758204064582e-06, + "loss": 0.0712, + "step": 956 + }, + { + "epoch": 9.29126213592233, + "grad_norm": 0.33787344891092924, + "learning_rate": 1.583944009899665e-06, + "loss": 0.0273, + "step": 957 + }, + { + "epoch": 9.300970873786408, + "grad_norm": 0.7332496066271933, + "learning_rate": 1.579215986997575e-06, + "loss": 0.0734, + "step": 958 + }, + { + "epoch": 9.310679611650485, + "grad_norm": 0.9742544872879907, + "learning_rate": 1.5744917712490821e-06, + "loss": 0.1911, + "step": 959 + }, + { + "epoch": 9.320388349514563, + "grad_norm": 0.421873246338127, + "learning_rate": 1.5697713821873401e-06, + "loss": 0.0244, + "step": 960 + }, + { + "epoch": 9.330097087378642, + "grad_norm": 0.5145290760788898, + "learning_rate": 1.5650548393296788e-06, + "loss": 0.0457, + "step": 961 + }, + { + "epoch": 9.339805825242719, + "grad_norm": 0.7881806168450413, + "learning_rate": 1.5603421621775273e-06, + "loss": 0.1336, + "step": 962 + }, + { + "epoch": 9.349514563106796, + "grad_norm": 0.7928543632419307, + "learning_rate": 1.555633370216329e-06, + "loss": 0.0845, + "step": 963 + }, + { + "epoch": 9.359223300970873, + "grad_norm": 0.5771454392471547, + "learning_rate": 1.5509284829154652e-06, + "loss": 0.1106, + "step": 964 + }, + { + "epoch": 9.368932038834952, + "grad_norm": 0.6558019594502399, + "learning_rate": 1.5462275197281717e-06, + "loss": 0.0633, + "step": 965 + }, + { + "epoch": 9.37864077669903, + "grad_norm": 1.1584092476213597, + "learning_rate": 1.5415305000914587e-06, + "loss": 0.1725, + "step": 966 + }, + { + "epoch": 9.388349514563107, + "grad_norm": 0.5452649508861512, + "learning_rate": 1.536837443426032e-06, + "loss": 0.0297, + "step": 967 + }, + { + "epoch": 9.398058252427184, + "grad_norm": 0.4314536564363454, + "learning_rate": 1.5321483691362121e-06, + "loss": 0.0797, + "step": 968 + }, + { + "epoch": 9.407766990291263, + "grad_norm": 0.6324718727678328, + "learning_rate": 1.5274632966098527e-06, + "loss": 0.0332, + "step": 969 + }, + { + "epoch": 9.41747572815534, + "grad_norm": 0.4549599787302605, + "learning_rate": 1.5227822452182617e-06, + "loss": 0.0556, + "step": 970 + }, + { + "epoch": 9.427184466019417, + "grad_norm": 0.7287131128875295, + "learning_rate": 1.5181052343161212e-06, + "loss": 0.1135, + "step": 971 + }, + { + "epoch": 9.436893203883495, + "grad_norm": 0.3758449427962483, + "learning_rate": 1.5134322832414066e-06, + "loss": 0.0328, + "step": 972 + }, + { + "epoch": 9.446601941747574, + "grad_norm": 0.629439947427426, + "learning_rate": 1.508763411315308e-06, + "loss": 0.1158, + "step": 973 + }, + { + "epoch": 9.45631067961165, + "grad_norm": 0.5492590601077479, + "learning_rate": 1.5040986378421485e-06, + "loss": 0.0621, + "step": 974 + }, + { + "epoch": 9.466019417475728, + "grad_norm": 0.47227042867230984, + "learning_rate": 1.499437982109305e-06, + "loss": 0.0823, + "step": 975 + }, + { + "epoch": 9.475728155339805, + "grad_norm": 0.8874051976671528, + "learning_rate": 1.4947814633871316e-06, + "loss": 0.0789, + "step": 976 + }, + { + "epoch": 9.485436893203884, + "grad_norm": 0.5313571637739412, + "learning_rate": 1.4901291009288748e-06, + "loss": 0.048, + "step": 977 + }, + { + "epoch": 9.495145631067961, + "grad_norm": 0.6634950979669227, + "learning_rate": 1.4854809139705961e-06, + "loss": 0.129, + "step": 978 + }, + { + "epoch": 9.504854368932039, + "grad_norm": 0.7211690230550263, + "learning_rate": 1.4808369217310937e-06, + "loss": 0.1095, + "step": 979 + }, + { + "epoch": 9.514563106796116, + "grad_norm": 0.8721245712399799, + "learning_rate": 1.4761971434118207e-06, + "loss": 0.1667, + "step": 980 + }, + { + "epoch": 9.524271844660195, + "grad_norm": 0.7538810617432105, + "learning_rate": 1.4715615981968088e-06, + "loss": 0.082, + "step": 981 + }, + { + "epoch": 9.533980582524272, + "grad_norm": 0.6613308701670603, + "learning_rate": 1.4669303052525852e-06, + "loss": 0.12, + "step": 982 + }, + { + "epoch": 9.54368932038835, + "grad_norm": 0.40582479242542924, + "learning_rate": 1.4623032837280971e-06, + "loss": 0.012, + "step": 983 + }, + { + "epoch": 9.553398058252426, + "grad_norm": 0.9306885236611109, + "learning_rate": 1.4576805527546293e-06, + "loss": 0.1777, + "step": 984 + }, + { + "epoch": 9.563106796116505, + "grad_norm": 1.0023614165143215, + "learning_rate": 1.4530621314457255e-06, + "loss": 0.0978, + "step": 985 + }, + { + "epoch": 9.572815533980583, + "grad_norm": 0.9617385737325488, + "learning_rate": 1.4484480388971141e-06, + "loss": 0.2374, + "step": 986 + }, + { + "epoch": 9.58252427184466, + "grad_norm": 0.9212729138833889, + "learning_rate": 1.4438382941866224e-06, + "loss": 0.1527, + "step": 987 + }, + { + "epoch": 9.592233009708737, + "grad_norm": 0.6306564260185002, + "learning_rate": 1.4392329163741015e-06, + "loss": 0.1009, + "step": 988 + }, + { + "epoch": 9.601941747572816, + "grad_norm": 0.5734759604323216, + "learning_rate": 1.4346319245013463e-06, + "loss": 0.1903, + "step": 989 + }, + { + "epoch": 9.611650485436893, + "grad_norm": 0.9291804725028308, + "learning_rate": 1.430035337592018e-06, + "loss": 0.0629, + "step": 990 + }, + { + "epoch": 9.62135922330097, + "grad_norm": 0.5720126721481944, + "learning_rate": 1.425443174651564e-06, + "loss": 0.0699, + "step": 991 + }, + { + "epoch": 9.631067961165048, + "grad_norm": 0.6827408883331223, + "learning_rate": 1.4208554546671407e-06, + "loss": 0.1166, + "step": 992 + }, + { + "epoch": 9.640776699029127, + "grad_norm": 0.9468184547358578, + "learning_rate": 1.4162721966075323e-06, + "loss": 0.1508, + "step": 993 + }, + { + "epoch": 9.650485436893204, + "grad_norm": 0.5185651892093444, + "learning_rate": 1.411693419423078e-06, + "loss": 0.0747, + "step": 994 + }, + { + "epoch": 9.660194174757281, + "grad_norm": 0.7182852834221573, + "learning_rate": 1.4071191420455873e-06, + "loss": 0.0868, + "step": 995 + }, + { + "epoch": 9.669902912621358, + "grad_norm": 0.5124393893502427, + "learning_rate": 1.4025493833882645e-06, + "loss": 0.0544, + "step": 996 + }, + { + "epoch": 9.679611650485437, + "grad_norm": 0.4519760606619813, + "learning_rate": 1.3979841623456309e-06, + "loss": 0.0298, + "step": 997 + }, + { + "epoch": 9.689320388349515, + "grad_norm": 0.7723673765353795, + "learning_rate": 1.3934234977934463e-06, + "loss": 0.1117, + "step": 998 + }, + { + "epoch": 9.699029126213592, + "grad_norm": 0.5591540435943934, + "learning_rate": 1.3888674085886302e-06, + "loss": 0.0856, + "step": 999 + }, + { + "epoch": 9.70873786407767, + "grad_norm": 0.5572779776561083, + "learning_rate": 1.3843159135691859e-06, + "loss": 0.1138, + "step": 1000 + }, + { + "epoch": 9.718446601941748, + "grad_norm": 0.579120584588862, + "learning_rate": 1.3797690315541193e-06, + "loss": 0.091, + "step": 1001 + }, + { + "epoch": 9.728155339805825, + "grad_norm": 0.7351017325830426, + "learning_rate": 1.3752267813433645e-06, + "loss": 0.1019, + "step": 1002 + }, + { + "epoch": 9.737864077669903, + "grad_norm": 0.44019226102987263, + "learning_rate": 1.3706891817177036e-06, + "loss": 0.0632, + "step": 1003 + }, + { + "epoch": 9.74757281553398, + "grad_norm": 0.6462349810299945, + "learning_rate": 1.3661562514386895e-06, + "loss": 0.1481, + "step": 1004 + }, + { + "epoch": 9.757281553398059, + "grad_norm": 0.8965376719847379, + "learning_rate": 1.3616280092485719e-06, + "loss": 0.1183, + "step": 1005 + }, + { + "epoch": 9.766990291262136, + "grad_norm": 0.7108013520860924, + "learning_rate": 1.357104473870213e-06, + "loss": 0.168, + "step": 1006 + }, + { + "epoch": 9.776699029126213, + "grad_norm": 0.6653262047892645, + "learning_rate": 1.3525856640070156e-06, + "loss": 0.0664, + "step": 1007 + }, + { + "epoch": 9.78640776699029, + "grad_norm": 0.5630299546987264, + "learning_rate": 1.3480715983428433e-06, + "loss": 0.07, + "step": 1008 + }, + { + "epoch": 9.79611650485437, + "grad_norm": 0.5795784482057218, + "learning_rate": 1.3435622955419447e-06, + "loss": 0.1229, + "step": 1009 + }, + { + "epoch": 9.805825242718447, + "grad_norm": 0.45648098304255846, + "learning_rate": 1.3390577742488747e-06, + "loss": 0.0891, + "step": 1010 + }, + { + "epoch": 9.815533980582524, + "grad_norm": 0.5330321441449665, + "learning_rate": 1.334558053088419e-06, + "loss": 0.0854, + "step": 1011 + }, + { + "epoch": 9.825242718446601, + "grad_norm": 0.8383784644182197, + "learning_rate": 1.3300631506655148e-06, + "loss": 0.1271, + "step": 1012 + }, + { + "epoch": 9.83495145631068, + "grad_norm": 0.7994557312933233, + "learning_rate": 1.3255730855651772e-06, + "loss": 0.154, + "step": 1013 + }, + { + "epoch": 9.844660194174757, + "grad_norm": 0.6121104973654957, + "learning_rate": 1.3210878763524186e-06, + "loss": 0.1244, + "step": 1014 + }, + { + "epoch": 9.854368932038835, + "grad_norm": 0.8704001110194098, + "learning_rate": 1.3166075415721762e-06, + "loss": 0.1757, + "step": 1015 + }, + { + "epoch": 9.864077669902912, + "grad_norm": 0.5125263843444212, + "learning_rate": 1.3121320997492305e-06, + "loss": 0.0905, + "step": 1016 + }, + { + "epoch": 9.87378640776699, + "grad_norm": 0.6892296957249096, + "learning_rate": 1.307661569388132e-06, + "loss": 0.0987, + "step": 1017 + }, + { + "epoch": 9.883495145631068, + "grad_norm": 0.7269487551833638, + "learning_rate": 1.3031959689731236e-06, + "loss": 0.0787, + "step": 1018 + }, + { + "epoch": 9.893203883495145, + "grad_norm": 0.6176952721891387, + "learning_rate": 1.2987353169680667e-06, + "loss": 0.1318, + "step": 1019 + }, + { + "epoch": 9.902912621359224, + "grad_norm": 0.6021673456770801, + "learning_rate": 1.2942796318163595e-06, + "loss": 0.0492, + "step": 1020 + }, + { + "epoch": 9.912621359223301, + "grad_norm": 0.6496396475871585, + "learning_rate": 1.2898289319408653e-06, + "loss": 0.0956, + "step": 1021 + }, + { + "epoch": 9.922330097087379, + "grad_norm": 0.9117599759177222, + "learning_rate": 1.2853832357438346e-06, + "loss": 0.0815, + "step": 1022 + }, + { + "epoch": 9.932038834951456, + "grad_norm": 0.6343515859693896, + "learning_rate": 1.2809425616068288e-06, + "loss": 0.1379, + "step": 1023 + }, + { + "epoch": 9.941747572815533, + "grad_norm": 0.8668978005843916, + "learning_rate": 1.2765069278906456e-06, + "loss": 0.1431, + "step": 1024 + }, + { + "epoch": 9.951456310679612, + "grad_norm": 0.5567206920970875, + "learning_rate": 1.2720763529352415e-06, + "loss": 0.0251, + "step": 1025 + }, + { + "epoch": 9.96116504854369, + "grad_norm": 0.7044628563483055, + "learning_rate": 1.2676508550596562e-06, + "loss": 0.0398, + "step": 1026 + }, + { + "epoch": 9.970873786407767, + "grad_norm": 0.7421745735260384, + "learning_rate": 1.2632304525619388e-06, + "loss": 0.0626, + "step": 1027 + }, + { + "epoch": 9.980582524271846, + "grad_norm": 1.1608591744618904, + "learning_rate": 1.2588151637190687e-06, + "loss": 0.2441, + "step": 1028 + }, + { + "epoch": 9.990291262135923, + "grad_norm": 0.5136678107573567, + "learning_rate": 1.2544050067868834e-06, + "loss": 0.0588, + "step": 1029 + }, + { + "epoch": 10.0, + "grad_norm": 0.44097012341303254, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.098, + "step": 1030 + }, + { + "epoch": 10.009708737864077, + "grad_norm": 0.7139105592958839, + "learning_rate": 1.2456001615717445e-06, + "loss": 0.0817, + "step": 1031 + }, + { + "epoch": 10.019417475728156, + "grad_norm": 0.5817012295383788, + "learning_rate": 1.2412055096940692e-06, + "loss": 0.0855, + "step": 1032 + }, + { + "epoch": 10.029126213592233, + "grad_norm": 0.9943377613107669, + "learning_rate": 1.2368160625374835e-06, + "loss": 0.139, + "step": 1033 + }, + { + "epoch": 10.03883495145631, + "grad_norm": 0.3686137208464406, + "learning_rate": 1.2324318382509787e-06, + "loss": 0.0559, + "step": 1034 + }, + { + "epoch": 10.048543689320388, + "grad_norm": 0.38577477799819054, + "learning_rate": 1.2280528549619487e-06, + "loss": 0.0288, + "step": 1035 + }, + { + "epoch": 10.058252427184467, + "grad_norm": 0.6582129917764945, + "learning_rate": 1.2236791307761184e-06, + "loss": 0.1647, + "step": 1036 + }, + { + "epoch": 10.067961165048544, + "grad_norm": 0.7512464872879958, + "learning_rate": 1.2193106837774678e-06, + "loss": 0.0545, + "step": 1037 + }, + { + "epoch": 10.077669902912621, + "grad_norm": 0.5425451526026376, + "learning_rate": 1.2149475320281578e-06, + "loss": 0.0398, + "step": 1038 + }, + { + "epoch": 10.087378640776699, + "grad_norm": 0.7960650124670342, + "learning_rate": 1.2105896935684545e-06, + "loss": 0.082, + "step": 1039 + }, + { + "epoch": 10.097087378640778, + "grad_norm": 0.5338834907010502, + "learning_rate": 1.2062371864166553e-06, + "loss": 0.082, + "step": 1040 + }, + { + "epoch": 10.106796116504855, + "grad_norm": 0.5672377297057393, + "learning_rate": 1.2018900285690148e-06, + "loss": 0.0588, + "step": 1041 + }, + { + "epoch": 10.116504854368932, + "grad_norm": 0.5383809127022262, + "learning_rate": 1.1975482379996697e-06, + "loss": 0.0963, + "step": 1042 + }, + { + "epoch": 10.12621359223301, + "grad_norm": 0.554544672201788, + "learning_rate": 1.1932118326605644e-06, + "loss": 0.0802, + "step": 1043 + }, + { + "epoch": 10.135922330097088, + "grad_norm": 0.40501491454406263, + "learning_rate": 1.188880830481377e-06, + "loss": 0.0155, + "step": 1044 + }, + { + "epoch": 10.145631067961165, + "grad_norm": 0.4982365542620892, + "learning_rate": 1.1845552493694462e-06, + "loss": 0.0624, + "step": 1045 + }, + { + "epoch": 10.155339805825243, + "grad_norm": 0.44284165252569085, + "learning_rate": 1.1802351072096948e-06, + "loss": 0.0389, + "step": 1046 + }, + { + "epoch": 10.16504854368932, + "grad_norm": 0.43688194002614283, + "learning_rate": 1.1759204218645577e-06, + "loss": 0.0331, + "step": 1047 + }, + { + "epoch": 10.174757281553399, + "grad_norm": 0.9610954492065917, + "learning_rate": 1.1716112111739095e-06, + "loss": 0.057, + "step": 1048 + }, + { + "epoch": 10.184466019417476, + "grad_norm": 0.7894221279495568, + "learning_rate": 1.167307492954986e-06, + "loss": 0.1321, + "step": 1049 + }, + { + "epoch": 10.194174757281553, + "grad_norm": 0.7105486043207161, + "learning_rate": 1.1630092850023148e-06, + "loss": 0.117, + "step": 1050 + }, + { + "epoch": 10.20388349514563, + "grad_norm": 0.8709137289348258, + "learning_rate": 1.15871660508764e-06, + "loss": 0.1173, + "step": 1051 + }, + { + "epoch": 10.21359223300971, + "grad_norm": 0.7568948585913298, + "learning_rate": 1.1544294709598491e-06, + "loss": 0.1474, + "step": 1052 + }, + { + "epoch": 10.223300970873787, + "grad_norm": 0.6724147094520684, + "learning_rate": 1.1501479003448992e-06, + "loss": 0.0697, + "step": 1053 + }, + { + "epoch": 10.233009708737864, + "grad_norm": 0.4949654633371321, + "learning_rate": 1.1458719109457445e-06, + "loss": 0.0578, + "step": 1054 + }, + { + "epoch": 10.242718446601941, + "grad_norm": 0.8068088269521827, + "learning_rate": 1.141601520442262e-06, + "loss": 0.1246, + "step": 1055 + }, + { + "epoch": 10.25242718446602, + "grad_norm": 0.43316042424228557, + "learning_rate": 1.1373367464911798e-06, + "loss": 0.0454, + "step": 1056 + }, + { + "epoch": 10.262135922330097, + "grad_norm": 0.6652579848799286, + "learning_rate": 1.1330776067260026e-06, + "loss": 0.1283, + "step": 1057 + }, + { + "epoch": 10.271844660194175, + "grad_norm": 0.4042774504841769, + "learning_rate": 1.12882411875694e-06, + "loss": 0.0252, + "step": 1058 + }, + { + "epoch": 10.281553398058252, + "grad_norm": 0.5385885477070604, + "learning_rate": 1.1245763001708326e-06, + "loss": 0.0388, + "step": 1059 + }, + { + "epoch": 10.29126213592233, + "grad_norm": 0.6052457558453441, + "learning_rate": 1.120334168531081e-06, + "loss": 0.0734, + "step": 1060 + }, + { + "epoch": 10.300970873786408, + "grad_norm": 0.5293111516776242, + "learning_rate": 1.1160977413775704e-06, + "loss": 0.1185, + "step": 1061 + }, + { + "epoch": 10.310679611650485, + "grad_norm": 0.43792304060112175, + "learning_rate": 1.1118670362266003e-06, + "loss": 0.0504, + "step": 1062 + }, + { + "epoch": 10.320388349514563, + "grad_norm": 0.5208009686668422, + "learning_rate": 1.1076420705708137e-06, + "loss": 0.049, + "step": 1063 + }, + { + "epoch": 10.330097087378642, + "grad_norm": 0.32151178708408873, + "learning_rate": 1.1034228618791197e-06, + "loss": 0.0048, + "step": 1064 + }, + { + "epoch": 10.339805825242719, + "grad_norm": 0.40783412915581874, + "learning_rate": 1.0992094275966256e-06, + "loss": 0.0396, + "step": 1065 + }, + { + "epoch": 10.349514563106796, + "grad_norm": 0.8301501331889736, + "learning_rate": 1.0950017851445624e-06, + "loss": 0.0771, + "step": 1066 + }, + { + "epoch": 10.359223300970873, + "grad_norm": 0.656158454275523, + "learning_rate": 1.0907999519202142e-06, + "loss": 0.1427, + "step": 1067 + }, + { + "epoch": 10.368932038834952, + "grad_norm": 0.6226103543328013, + "learning_rate": 1.0866039452968464e-06, + "loss": 0.0849, + "step": 1068 + }, + { + "epoch": 10.37864077669903, + "grad_norm": 0.30026497224539017, + "learning_rate": 1.0824137826236318e-06, + "loss": 0.0053, + "step": 1069 + }, + { + "epoch": 10.388349514563107, + "grad_norm": 0.33866413080844127, + "learning_rate": 1.078229481225582e-06, + "loss": 0.0183, + "step": 1070 + }, + { + "epoch": 10.398058252427184, + "grad_norm": 0.5566432505296565, + "learning_rate": 1.074051058403472e-06, + "loss": 0.0759, + "step": 1071 + }, + { + "epoch": 10.407766990291263, + "grad_norm": 0.49851774518490277, + "learning_rate": 1.069878531433773e-06, + "loss": 0.0528, + "step": 1072 + }, + { + "epoch": 10.41747572815534, + "grad_norm": 0.4860505947440488, + "learning_rate": 1.0657119175685776e-06, + "loss": 0.0484, + "step": 1073 + }, + { + "epoch": 10.427184466019417, + "grad_norm": 0.3145673776783807, + "learning_rate": 1.061551234035529e-06, + "loss": 0.0153, + "step": 1074 + }, + { + "epoch": 10.436893203883495, + "grad_norm": 0.6816619685019172, + "learning_rate": 1.0573964980377517e-06, + "loss": 0.1271, + "step": 1075 + }, + { + "epoch": 10.446601941747574, + "grad_norm": 0.5845770900591787, + "learning_rate": 1.0532477267537772e-06, + "loss": 0.1469, + "step": 1076 + }, + { + "epoch": 10.45631067961165, + "grad_norm": 0.7233633949979615, + "learning_rate": 1.0491049373374762e-06, + "loss": 0.2243, + "step": 1077 + }, + { + "epoch": 10.466019417475728, + "grad_norm": 0.5464884216114049, + "learning_rate": 1.044968146917986e-06, + "loss": 0.0812, + "step": 1078 + }, + { + "epoch": 10.475728155339805, + "grad_norm": 0.3391211976990657, + "learning_rate": 1.0408373725996386e-06, + "loss": 0.0188, + "step": 1079 + }, + { + "epoch": 10.485436893203884, + "grad_norm": 0.6029803423460801, + "learning_rate": 1.0367126314618946e-06, + "loss": 0.0608, + "step": 1080 + }, + { + "epoch": 10.495145631067961, + "grad_norm": 0.8359118873064847, + "learning_rate": 1.0325939405592661e-06, + "loss": 0.1468, + "step": 1081 + }, + { + "epoch": 10.504854368932039, + "grad_norm": 0.4613807440413381, + "learning_rate": 1.0284813169212502e-06, + "loss": 0.0419, + "step": 1082 + }, + { + "epoch": 10.514563106796116, + "grad_norm": 0.7117820265098361, + "learning_rate": 1.024374777552258e-06, + "loss": 0.0602, + "step": 1083 + }, + { + "epoch": 10.524271844660195, + "grad_norm": 0.673361722656393, + "learning_rate": 1.0202743394315444e-06, + "loss": 0.0572, + "step": 1084 + }, + { + "epoch": 10.533980582524272, + "grad_norm": 0.718842425297297, + "learning_rate": 1.0161800195131372e-06, + "loss": 0.0709, + "step": 1085 + }, + { + "epoch": 10.54368932038835, + "grad_norm": 0.519489444086323, + "learning_rate": 1.0120918347257669e-06, + "loss": 0.0652, + "step": 1086 + }, + { + "epoch": 10.553398058252426, + "grad_norm": 0.531214341067294, + "learning_rate": 1.0080098019727979e-06, + "loss": 0.0405, + "step": 1087 + }, + { + "epoch": 10.563106796116505, + "grad_norm": 0.8237012418593747, + "learning_rate": 1.0039339381321572e-06, + "loss": 0.1696, + "step": 1088 + }, + { + "epoch": 10.572815533980583, + "grad_norm": 0.6702768856937782, + "learning_rate": 9.998642600562664e-07, + "loss": 0.1737, + "step": 1089 + }, + { + "epoch": 10.58252427184466, + "grad_norm": 0.6967567433213261, + "learning_rate": 9.95800784571969e-07, + "loss": 0.1259, + "step": 1090 + }, + { + "epoch": 10.592233009708737, + "grad_norm": 0.7311688346400493, + "learning_rate": 9.91743528480464e-07, + "loss": 0.1139, + "step": 1091 + }, + { + "epoch": 10.601941747572816, + "grad_norm": 0.8124930399760492, + "learning_rate": 9.876925085572365e-07, + "loss": 0.0526, + "step": 1092 + }, + { + "epoch": 10.611650485436893, + "grad_norm": 0.6821859039678345, + "learning_rate": 9.836477415519843e-07, + "loss": 0.0798, + "step": 1093 + }, + { + "epoch": 10.62135922330097, + "grad_norm": 0.5388868436932894, + "learning_rate": 9.79609244188553e-07, + "loss": 0.0595, + "step": 1094 + }, + { + "epoch": 10.631067961165048, + "grad_norm": 0.46754323131853315, + "learning_rate": 9.755770331648642e-07, + "loss": 0.0724, + "step": 1095 + }, + { + "epoch": 10.640776699029127, + "grad_norm": 0.6883957401661763, + "learning_rate": 9.715511251528486e-07, + "loss": 0.1118, + "step": 1096 + }, + { + "epoch": 10.650485436893204, + "grad_norm": 0.6882504391474683, + "learning_rate": 9.67531536798375e-07, + "loss": 0.1209, + "step": 1097 + }, + { + "epoch": 10.660194174757281, + "grad_norm": 0.5983944209437926, + "learning_rate": 9.635182847211827e-07, + "loss": 0.1064, + "step": 1098 + }, + { + "epoch": 10.669902912621358, + "grad_norm": 0.6811521493632275, + "learning_rate": 9.595113855148128e-07, + "loss": 0.0937, + "step": 1099 + }, + { + "epoch": 10.679611650485437, + "grad_norm": 0.6779393838141433, + "learning_rate": 9.555108557465383e-07, + "loss": 0.1937, + "step": 1100 + }, + { + "epoch": 10.689320388349515, + "grad_norm": 0.6498698531590674, + "learning_rate": 9.51516711957298e-07, + "loss": 0.1226, + "step": 1101 + }, + { + "epoch": 10.699029126213592, + "grad_norm": 0.530378508456918, + "learning_rate": 9.475289706616256e-07, + "loss": 0.0828, + "step": 1102 + }, + { + "epoch": 10.70873786407767, + "grad_norm": 0.7058714411604254, + "learning_rate": 9.435476483475825e-07, + "loss": 0.0793, + "step": 1103 + }, + { + "epoch": 10.718446601941748, + "grad_norm": 0.45348419126683187, + "learning_rate": 9.395727614766903e-07, + "loss": 0.0799, + "step": 1104 + }, + { + "epoch": 10.728155339805825, + "grad_norm": 0.6579676572313675, + "learning_rate": 9.356043264838607e-07, + "loss": 0.0686, + "step": 1105 + }, + { + "epoch": 10.737864077669903, + "grad_norm": 0.5367393390599688, + "learning_rate": 9.316423597773316e-07, + "loss": 0.0398, + "step": 1106 + }, + { + "epoch": 10.74757281553398, + "grad_norm": 0.6485846910490585, + "learning_rate": 9.276868777385942e-07, + "loss": 0.1605, + "step": 1107 + }, + { + "epoch": 10.757281553398059, + "grad_norm": 0.4379559794587696, + "learning_rate": 9.237378967223279e-07, + "loss": 0.0705, + "step": 1108 + }, + { + "epoch": 10.766990291262136, + "grad_norm": 0.5300271333343034, + "learning_rate": 9.197954330563327e-07, + "loss": 0.0732, + "step": 1109 + }, + { + "epoch": 10.776699029126213, + "grad_norm": 0.43694024379416824, + "learning_rate": 9.158595030414621e-07, + "loss": 0.0752, + "step": 1110 + }, + { + "epoch": 10.78640776699029, + "grad_norm": 0.5706126142150842, + "learning_rate": 9.11930122951554e-07, + "loss": 0.052, + "step": 1111 + }, + { + "epoch": 10.79611650485437, + "grad_norm": 0.4968147443898269, + "learning_rate": 9.080073090333646e-07, + "loss": 0.0618, + "step": 1112 + }, + { + "epoch": 10.805825242718447, + "grad_norm": 0.7309923946940867, + "learning_rate": 9.040910775065015e-07, + "loss": 0.1172, + "step": 1113 + }, + { + "epoch": 10.815533980582524, + "grad_norm": 0.4875038724359515, + "learning_rate": 9.001814445633558e-07, + "loss": 0.03, + "step": 1114 + }, + { + "epoch": 10.825242718446601, + "grad_norm": 0.8623721100042515, + "learning_rate": 8.962784263690358e-07, + "loss": 0.13, + "step": 1115 + }, + { + "epoch": 10.83495145631068, + "grad_norm": 0.6094686102538099, + "learning_rate": 8.923820390612991e-07, + "loss": 0.0742, + "step": 1116 + }, + { + "epoch": 10.844660194174757, + "grad_norm": 0.492347526443207, + "learning_rate": 8.884922987504882e-07, + "loss": 0.0564, + "step": 1117 + }, + { + "epoch": 10.854368932038835, + "grad_norm": 0.6685413110043293, + "learning_rate": 8.846092215194607e-07, + "loss": 0.1173, + "step": 1118 + }, + { + "epoch": 10.864077669902912, + "grad_norm": 0.34671831539541903, + "learning_rate": 8.807328234235254e-07, + "loss": 0.0343, + "step": 1119 + }, + { + "epoch": 10.87378640776699, + "grad_norm": 0.8551886410623927, + "learning_rate": 8.768631204903738e-07, + "loss": 0.1335, + "step": 1120 + }, + { + "epoch": 10.883495145631068, + "grad_norm": 0.8845986465738103, + "learning_rate": 8.730001287200177e-07, + "loss": 0.1715, + "step": 1121 + }, + { + "epoch": 10.893203883495145, + "grad_norm": 0.4515371919319129, + "learning_rate": 8.691438640847177e-07, + "loss": 0.0566, + "step": 1122 + }, + { + "epoch": 10.902912621359224, + "grad_norm": 0.8524620005123392, + "learning_rate": 8.652943425289206e-07, + "loss": 0.1265, + "step": 1123 + }, + { + "epoch": 10.912621359223301, + "grad_norm": 0.45736310100320554, + "learning_rate": 8.61451579969193e-07, + "loss": 0.0841, + "step": 1124 + }, + { + "epoch": 10.922330097087379, + "grad_norm": 0.528927869401848, + "learning_rate": 8.576155922941548e-07, + "loss": 0.0996, + "step": 1125 + }, + { + "epoch": 10.932038834951456, + "grad_norm": 0.5638408786011265, + "learning_rate": 8.537863953644138e-07, + "loss": 0.0894, + "step": 1126 + }, + { + "epoch": 10.941747572815533, + "grad_norm": 0.706509560577999, + "learning_rate": 8.499640050125007e-07, + "loss": 0.0952, + "step": 1127 + }, + { + "epoch": 10.951456310679612, + "grad_norm": 0.5818787000984781, + "learning_rate": 8.461484370428025e-07, + "loss": 0.0874, + "step": 1128 + }, + { + "epoch": 10.96116504854369, + "grad_norm": 0.5492742060252098, + "learning_rate": 8.423397072314985e-07, + "loss": 0.0736, + "step": 1129 + }, + { + "epoch": 10.970873786407767, + "grad_norm": 0.6223368740910458, + "learning_rate": 8.385378313264933e-07, + "loss": 0.0789, + "step": 1130 + }, + { + "epoch": 10.980582524271846, + "grad_norm": 0.7563283702541171, + "learning_rate": 8.347428250473541e-07, + "loss": 0.0494, + "step": 1131 + }, + { + "epoch": 10.990291262135923, + "grad_norm": 0.8246887369704925, + "learning_rate": 8.309547040852434e-07, + "loss": 0.0263, + "step": 1132 + }, + { + "epoch": 11.0, + "grad_norm": 0.6209593471191105, + "learning_rate": 8.271734841028553e-07, + "loss": 0.0908, + "step": 1133 + }, + { + "epoch": 11.009708737864077, + "grad_norm": 0.6181167641935551, + "learning_rate": 8.233991807343497e-07, + "loss": 0.101, + "step": 1134 + }, + { + "epoch": 11.019417475728156, + "grad_norm": 0.3294479029172337, + "learning_rate": 8.196318095852909e-07, + "loss": 0.0329, + "step": 1135 + }, + { + "epoch": 11.029126213592233, + "grad_norm": 0.5181029223146717, + "learning_rate": 8.158713862325782e-07, + "loss": 0.0914, + "step": 1136 + }, + { + "epoch": 11.03883495145631, + "grad_norm": 0.38511569448960764, + "learning_rate": 8.12117926224385e-07, + "loss": 0.0414, + "step": 1137 + }, + { + "epoch": 11.048543689320388, + "grad_norm": 0.3120588007580457, + "learning_rate": 8.08371445080093e-07, + "loss": 0.0287, + "step": 1138 + }, + { + "epoch": 11.058252427184467, + "grad_norm": 0.35699123657831555, + "learning_rate": 8.04631958290229e-07, + "loss": 0.0224, + "step": 1139 + }, + { + "epoch": 11.067961165048544, + "grad_norm": 0.6874946805767921, + "learning_rate": 8.008994813163995e-07, + "loss": 0.142, + "step": 1140 + }, + { + "epoch": 11.077669902912621, + "grad_norm": 0.3712955806485938, + "learning_rate": 7.971740295912289e-07, + "loss": 0.0484, + "step": 1141 + }, + { + "epoch": 11.087378640776699, + "grad_norm": 0.4014946057982372, + "learning_rate": 7.934556185182928e-07, + "loss": 0.0498, + "step": 1142 + }, + { + "epoch": 11.097087378640778, + "grad_norm": 0.3599712596998677, + "learning_rate": 7.897442634720576e-07, + "loss": 0.0478, + "step": 1143 + }, + { + "epoch": 11.106796116504855, + "grad_norm": 0.6865673878538356, + "learning_rate": 7.860399797978138e-07, + "loss": 0.19, + "step": 1144 + }, + { + "epoch": 11.116504854368932, + "grad_norm": 0.6849682241558045, + "learning_rate": 7.823427828116148e-07, + "loss": 0.1168, + "step": 1145 + }, + { + "epoch": 11.12621359223301, + "grad_norm": 0.33934326981903534, + "learning_rate": 7.786526878002126e-07, + "loss": 0.0274, + "step": 1146 + }, + { + "epoch": 11.135922330097088, + "grad_norm": 0.5005017045058894, + "learning_rate": 7.749697100209947e-07, + "loss": 0.0795, + "step": 1147 + }, + { + "epoch": 11.145631067961165, + "grad_norm": 0.5020843300505056, + "learning_rate": 7.7129386470192e-07, + "loss": 0.0436, + "step": 1148 + }, + { + "epoch": 11.155339805825243, + "grad_norm": 0.4480201862020612, + "learning_rate": 7.6762516704146e-07, + "loss": 0.0674, + "step": 1149 + }, + { + "epoch": 11.16504854368932, + "grad_norm": 1.0455476424219226, + "learning_rate": 7.6396363220853e-07, + "loss": 0.1638, + "step": 1150 + }, + { + "epoch": 11.174757281553399, + "grad_norm": 0.576435965844432, + "learning_rate": 7.603092753424298e-07, + "loss": 0.0766, + "step": 1151 + }, + { + "epoch": 11.184466019417476, + "grad_norm": 0.32921646457176373, + "learning_rate": 7.566621115527811e-07, + "loss": 0.0348, + "step": 1152 + }, + { + "epoch": 11.194174757281553, + "grad_norm": 0.3367784456418941, + "learning_rate": 7.530221559194643e-07, + "loss": 0.0243, + "step": 1153 + }, + { + "epoch": 11.20388349514563, + "grad_norm": 0.38545867844467285, + "learning_rate": 7.493894234925558e-07, + "loss": 0.039, + "step": 1154 + }, + { + "epoch": 11.21359223300971, + "grad_norm": 0.45936186805463514, + "learning_rate": 7.457639292922675e-07, + "loss": 0.0729, + "step": 1155 + }, + { + "epoch": 11.223300970873787, + "grad_norm": 0.4905506201117723, + "learning_rate": 7.421456883088826e-07, + "loss": 0.0171, + "step": 1156 + }, + { + "epoch": 11.233009708737864, + "grad_norm": 0.6665752104633995, + "learning_rate": 7.385347155026934e-07, + "loss": 0.1204, + "step": 1157 + }, + { + "epoch": 11.242718446601941, + "grad_norm": 0.6519589690169562, + "learning_rate": 7.349310258039441e-07, + "loss": 0.1025, + "step": 1158 + }, + { + "epoch": 11.25242718446602, + "grad_norm": 0.6646274396055756, + "learning_rate": 7.31334634112762e-07, + "loss": 0.1342, + "step": 1159 + }, + { + "epoch": 11.262135922330097, + "grad_norm": 0.6617933047697685, + "learning_rate": 7.277455552991011e-07, + "loss": 0.1056, + "step": 1160 + }, + { + "epoch": 11.271844660194175, + "grad_norm": 0.28557786336015395, + "learning_rate": 7.241638042026783e-07, + "loss": 0.033, + "step": 1161 + }, + { + "epoch": 11.281553398058252, + "grad_norm": 0.3674990070012525, + "learning_rate": 7.20589395632913e-07, + "loss": 0.047, + "step": 1162 + }, + { + "epoch": 11.29126213592233, + "grad_norm": 0.7990437129788919, + "learning_rate": 7.170223443688654e-07, + "loss": 0.1382, + "step": 1163 + }, + { + "epoch": 11.300970873786408, + "grad_norm": 0.519463264590884, + "learning_rate": 7.134626651591758e-07, + "loss": 0.1124, + "step": 1164 + }, + { + "epoch": 11.310679611650485, + "grad_norm": 0.36441443966860854, + "learning_rate": 7.099103727220024e-07, + "loss": 0.0158, + "step": 1165 + }, + { + "epoch": 11.320388349514563, + "grad_norm": 1.0816424946810301, + "learning_rate": 7.063654817449638e-07, + "loss": 0.1172, + "step": 1166 + }, + { + "epoch": 11.330097087378642, + "grad_norm": 0.2813942915433234, + "learning_rate": 7.028280068850734e-07, + "loss": 0.0191, + "step": 1167 + }, + { + "epoch": 11.339805825242719, + "grad_norm": 0.5559575454325191, + "learning_rate": 6.992979627686821e-07, + "loss": 0.071, + "step": 1168 + }, + { + "epoch": 11.349514563106796, + "grad_norm": 0.6406408723152291, + "learning_rate": 6.957753639914175e-07, + "loss": 0.1288, + "step": 1169 + }, + { + "epoch": 11.359223300970873, + "grad_norm": 0.5616743245613947, + "learning_rate": 6.922602251181221e-07, + "loss": 0.0625, + "step": 1170 + }, + { + "epoch": 11.368932038834952, + "grad_norm": 0.6733872123620726, + "learning_rate": 6.887525606827947e-07, + "loss": 0.0958, + "step": 1171 + }, + { + "epoch": 11.37864077669903, + "grad_norm": 0.34814787820573256, + "learning_rate": 6.852523851885295e-07, + "loss": 0.0136, + "step": 1172 + }, + { + "epoch": 11.388349514563107, + "grad_norm": 0.45047065458077723, + "learning_rate": 6.817597131074566e-07, + "loss": 0.0339, + "step": 1173 + }, + { + "epoch": 11.398058252427184, + "grad_norm": 0.6006060313215769, + "learning_rate": 6.782745588806811e-07, + "loss": 0.0393, + "step": 1174 + }, + { + "epoch": 11.407766990291263, + "grad_norm": 0.4142872047428767, + "learning_rate": 6.747969369182248e-07, + "loss": 0.0461, + "step": 1175 + }, + { + "epoch": 11.41747572815534, + "grad_norm": 0.6117430165933601, + "learning_rate": 6.713268615989654e-07, + "loss": 0.08, + "step": 1176 + }, + { + "epoch": 11.427184466019417, + "grad_norm": 0.5949255359920863, + "learning_rate": 6.678643472705773e-07, + "loss": 0.1192, + "step": 1177 + }, + { + "epoch": 11.436893203883495, + "grad_norm": 0.46402567424048835, + "learning_rate": 6.644094082494746e-07, + "loss": 0.0223, + "step": 1178 + }, + { + "epoch": 11.446601941747574, + "grad_norm": 0.47547059564197874, + "learning_rate": 6.609620588207474e-07, + "loss": 0.0596, + "step": 1179 + }, + { + "epoch": 11.45631067961165, + "grad_norm": 0.5177539890623492, + "learning_rate": 6.575223132381067e-07, + "loss": 0.0997, + "step": 1180 + }, + { + "epoch": 11.466019417475728, + "grad_norm": 0.3197005241753569, + "learning_rate": 6.540901857238233e-07, + "loss": 0.03, + "step": 1181 + }, + { + "epoch": 11.475728155339805, + "grad_norm": 0.45523175279424344, + "learning_rate": 6.506656904686698e-07, + "loss": 0.0629, + "step": 1182 + }, + { + "epoch": 11.485436893203884, + "grad_norm": 0.4295816991757006, + "learning_rate": 6.472488416318621e-07, + "loss": 0.0689, + "step": 1183 + }, + { + "epoch": 11.495145631067961, + "grad_norm": 0.5455657421626049, + "learning_rate": 6.438396533410002e-07, + "loss": 0.069, + "step": 1184 + }, + { + "epoch": 11.504854368932039, + "grad_norm": 0.5744593180389926, + "learning_rate": 6.4043813969201e-07, + "loss": 0.034, + "step": 1185 + }, + { + "epoch": 11.514563106796116, + "grad_norm": 0.9974821460316966, + "learning_rate": 6.370443147490857e-07, + "loss": 0.0938, + "step": 1186 + }, + { + "epoch": 11.524271844660195, + "grad_norm": 0.37997934488556223, + "learning_rate": 6.336581925446309e-07, + "loss": 0.052, + "step": 1187 + }, + { + "epoch": 11.533980582524272, + "grad_norm": 0.5888434151251862, + "learning_rate": 6.302797870792007e-07, + "loss": 0.0605, + "step": 1188 + }, + { + "epoch": 11.54368932038835, + "grad_norm": 0.4110534807034854, + "learning_rate": 6.269091123214438e-07, + "loss": 0.0465, + "step": 1189 + }, + { + "epoch": 11.553398058252426, + "grad_norm": 0.6312639822083126, + "learning_rate": 6.235461822080449e-07, + "loss": 0.0984, + "step": 1190 + }, + { + "epoch": 11.563106796116505, + "grad_norm": 0.4184886778766774, + "learning_rate": 6.201910106436673e-07, + "loss": 0.0488, + "step": 1191 + }, + { + "epoch": 11.572815533980583, + "grad_norm": 0.5720300965783884, + "learning_rate": 6.168436115008941e-07, + "loss": 0.1142, + "step": 1192 + }, + { + "epoch": 11.58252427184466, + "grad_norm": 0.34247681006710967, + "learning_rate": 6.135039986201744e-07, + "loss": 0.0212, + "step": 1193 + }, + { + "epoch": 11.592233009708737, + "grad_norm": 0.8696585733772886, + "learning_rate": 6.101721858097606e-07, + "loss": 0.1344, + "step": 1194 + }, + { + "epoch": 11.601941747572816, + "grad_norm": 0.5733831310709949, + "learning_rate": 6.068481868456558e-07, + "loss": 0.1033, + "step": 1195 + }, + { + "epoch": 11.611650485436893, + "grad_norm": 0.3726614583489184, + "learning_rate": 6.035320154715549e-07, + "loss": 0.0509, + "step": 1196 + }, + { + "epoch": 11.62135922330097, + "grad_norm": 0.4551557088282536, + "learning_rate": 6.00223685398788e-07, + "loss": 0.027, + "step": 1197 + }, + { + "epoch": 11.631067961165048, + "grad_norm": 0.3944998808707215, + "learning_rate": 5.969232103062647e-07, + "loss": 0.0715, + "step": 1198 + }, + { + "epoch": 11.640776699029127, + "grad_norm": 0.12933923216594015, + "learning_rate": 5.936306038404158e-07, + "loss": 0.0041, + "step": 1199 + }, + { + "epoch": 11.650485436893204, + "grad_norm": 0.7601718028278395, + "learning_rate": 5.903458796151382e-07, + "loss": 0.2222, + "step": 1200 + }, + { + "epoch": 11.660194174757281, + "grad_norm": 0.33435340501447186, + "learning_rate": 5.870690512117377e-07, + "loss": 0.0368, + "step": 1201 + }, + { + "epoch": 11.669902912621358, + "grad_norm": 0.4520959141289102, + "learning_rate": 5.838001321788744e-07, + "loss": 0.0293, + "step": 1202 + }, + { + "epoch": 11.679611650485437, + "grad_norm": 0.3901390181712294, + "learning_rate": 5.80539136032505e-07, + "loss": 0.0468, + "step": 1203 + }, + { + "epoch": 11.689320388349515, + "grad_norm": 0.675514183694151, + "learning_rate": 5.772860762558269e-07, + "loss": 0.1079, + "step": 1204 + }, + { + "epoch": 11.699029126213592, + "grad_norm": 0.4486143842474179, + "learning_rate": 5.740409662992244e-07, + "loss": 0.0563, + "step": 1205 + }, + { + "epoch": 11.70873786407767, + "grad_norm": 0.543813104820745, + "learning_rate": 5.708038195802098e-07, + "loss": 0.0363, + "step": 1206 + }, + { + "epoch": 11.718446601941748, + "grad_norm": 0.6554627134569425, + "learning_rate": 5.675746494833733e-07, + "loss": 0.0514, + "step": 1207 + }, + { + "epoch": 11.728155339805825, + "grad_norm": 0.4505479059998464, + "learning_rate": 5.643534693603214e-07, + "loss": 0.027, + "step": 1208 + }, + { + "epoch": 11.737864077669903, + "grad_norm": 0.5220937013677023, + "learning_rate": 5.61140292529625e-07, + "loss": 0.0843, + "step": 1209 + }, + { + "epoch": 11.74757281553398, + "grad_norm": 0.3609261316739773, + "learning_rate": 5.579351322767643e-07, + "loss": 0.0334, + "step": 1210 + }, + { + "epoch": 11.757281553398059, + "grad_norm": 0.5351464600044358, + "learning_rate": 5.547380018540735e-07, + "loss": 0.1097, + "step": 1211 + }, + { + "epoch": 11.766990291262136, + "grad_norm": 0.277458376852476, + "learning_rate": 5.515489144806862e-07, + "loss": 0.0319, + "step": 1212 + }, + { + "epoch": 11.776699029126213, + "grad_norm": 0.8927290578987768, + "learning_rate": 5.483678833424796e-07, + "loss": 0.1248, + "step": 1213 + }, + { + "epoch": 11.78640776699029, + "grad_norm": 0.48393508803090785, + "learning_rate": 5.451949215920221e-07, + "loss": 0.0574, + "step": 1214 + }, + { + "epoch": 11.79611650485437, + "grad_norm": 0.5362198231627286, + "learning_rate": 5.420300423485167e-07, + "loss": 0.0403, + "step": 1215 + }, + { + "epoch": 11.805825242718447, + "grad_norm": 0.6831834341558954, + "learning_rate": 5.38873258697748e-07, + "loss": 0.0537, + "step": 1216 + }, + { + "epoch": 11.815533980582524, + "grad_norm": 0.7235561973898877, + "learning_rate": 5.357245836920286e-07, + "loss": 0.0887, + "step": 1217 + }, + { + "epoch": 11.825242718446601, + "grad_norm": 0.5011302176239788, + "learning_rate": 5.325840303501431e-07, + "loss": 0.0639, + "step": 1218 + }, + { + "epoch": 11.83495145631068, + "grad_norm": 0.48804341236318166, + "learning_rate": 5.29451611657297e-07, + "loss": 0.0798, + "step": 1219 + }, + { + "epoch": 11.844660194174757, + "grad_norm": 0.5695494446583874, + "learning_rate": 5.263273405650601e-07, + "loss": 0.0955, + "step": 1220 + }, + { + "epoch": 11.854368932038835, + "grad_norm": 0.5618369954982938, + "learning_rate": 5.232112299913151e-07, + "loss": 0.0821, + "step": 1221 + }, + { + "epoch": 11.864077669902912, + "grad_norm": 0.4770004457987746, + "learning_rate": 5.201032928202043e-07, + "loss": 0.0906, + "step": 1222 + }, + { + "epoch": 11.87378640776699, + "grad_norm": 0.7242922898548877, + "learning_rate": 5.17003541902075e-07, + "loss": 0.1936, + "step": 1223 + }, + { + "epoch": 11.883495145631068, + "grad_norm": 0.50876950009869, + "learning_rate": 5.139119900534259e-07, + "loss": 0.1009, + "step": 1224 + }, + { + "epoch": 11.893203883495145, + "grad_norm": 0.4007899567960424, + "learning_rate": 5.108286500568562e-07, + "loss": 0.0333, + "step": 1225 + }, + { + "epoch": 11.902912621359224, + "grad_norm": 0.5343766514104662, + "learning_rate": 5.077535346610115e-07, + "loss": 0.0554, + "step": 1226 + }, + { + "epoch": 11.912621359223301, + "grad_norm": 0.535666198151551, + "learning_rate": 5.046866565805311e-07, + "loss": 0.0292, + "step": 1227 + }, + { + "epoch": 11.922330097087379, + "grad_norm": 0.4527597562080739, + "learning_rate": 5.016280284959957e-07, + "loss": 0.043, + "step": 1228 + }, + { + "epoch": 11.932038834951456, + "grad_norm": 0.6442865775092774, + "learning_rate": 4.985776630538746e-07, + "loss": 0.0828, + "step": 1229 + }, + { + "epoch": 11.941747572815533, + "grad_norm": 0.3977555677630664, + "learning_rate": 4.95535572866474e-07, + "loss": 0.0296, + "step": 1230 + }, + { + "epoch": 11.951456310679612, + "grad_norm": 0.27529002957180204, + "learning_rate": 4.925017705118843e-07, + "loss": 0.0157, + "step": 1231 + }, + { + "epoch": 11.96116504854369, + "grad_norm": 0.7429628409554702, + "learning_rate": 4.89476268533928e-07, + "loss": 0.0759, + "step": 1232 + }, + { + "epoch": 11.970873786407767, + "grad_norm": 0.6138000975659442, + "learning_rate": 4.864590794421092e-07, + "loss": 0.0529, + "step": 1233 + }, + { + "epoch": 11.980582524271846, + "grad_norm": 0.6182825584807163, + "learning_rate": 4.834502157115597e-07, + "loss": 0.092, + "step": 1234 + }, + { + "epoch": 11.990291262135923, + "grad_norm": 0.24435544743564855, + "learning_rate": 4.804496897829883e-07, + "loss": 0.0125, + "step": 1235 + }, + { + "epoch": 12.0, + "grad_norm": 0.8424484319909388, + "learning_rate": 4.774575140626317e-07, + "loss": 0.1078, + "step": 1236 + }, + { + "epoch": 12.009708737864077, + "grad_norm": 0.5129391571852897, + "learning_rate": 4.744737009221986e-07, + "loss": 0.0749, + "step": 1237 + }, + { + "epoch": 12.019417475728156, + "grad_norm": 0.5919150233331142, + "learning_rate": 4.7149826269882294e-07, + "loss": 0.0702, + "step": 1238 + }, + { + "epoch": 12.029126213592233, + "grad_norm": 0.44284697506749743, + "learning_rate": 4.6853121169500914e-07, + "loss": 0.0745, + "step": 1239 + }, + { + "epoch": 12.03883495145631, + "grad_norm": 0.2865977034088059, + "learning_rate": 4.6557256017858485e-07, + "loss": 0.0215, + "step": 1240 + }, + { + "epoch": 12.048543689320388, + "grad_norm": 0.4160486540359403, + "learning_rate": 4.626223203826477e-07, + "loss": 0.0407, + "step": 1241 + }, + { + "epoch": 12.058252427184467, + "grad_norm": 0.2749010600950671, + "learning_rate": 4.5968050450551527e-07, + "loss": 0.0199, + "step": 1242 + }, + { + "epoch": 12.067961165048544, + "grad_norm": 0.45897302010048824, + "learning_rate": 4.56747124710675e-07, + "loss": 0.0625, + "step": 1243 + }, + { + "epoch": 12.077669902912621, + "grad_norm": 0.5477002655080845, + "learning_rate": 4.5382219312673364e-07, + "loss": 0.0808, + "step": 1244 + }, + { + "epoch": 12.087378640776699, + "grad_norm": 0.2698283023459079, + "learning_rate": 4.5090572184736863e-07, + "loss": 0.0212, + "step": 1245 + }, + { + "epoch": 12.097087378640778, + "grad_norm": 0.7558827780843274, + "learning_rate": 4.4799772293127486e-07, + "loss": 0.1129, + "step": 1246 + }, + { + "epoch": 12.106796116504855, + "grad_norm": 0.5006893727164596, + "learning_rate": 4.4509820840211745e-07, + "loss": 0.0552, + "step": 1247 + }, + { + "epoch": 12.116504854368932, + "grad_norm": 0.41471815721736377, + "learning_rate": 4.422071902484812e-07, + "loss": 0.0522, + "step": 1248 + }, + { + "epoch": 12.12621359223301, + "grad_norm": 0.34642625579244757, + "learning_rate": 4.3932468042382075e-07, + "loss": 0.0473, + "step": 1249 + }, + { + "epoch": 12.135922330097088, + "grad_norm": 0.4724063216096118, + "learning_rate": 4.3645069084641195e-07, + "loss": 0.0693, + "step": 1250 + }, + { + "epoch": 12.145631067961165, + "grad_norm": 0.5018916517574505, + "learning_rate": 4.335852333993018e-07, + "loss": 0.0828, + "step": 1251 + }, + { + "epoch": 12.155339805825243, + "grad_norm": 0.47914801086777986, + "learning_rate": 4.3072831993025895e-07, + "loss": 0.0661, + "step": 1252 + }, + { + "epoch": 12.16504854368932, + "grad_norm": 0.47409307088022173, + "learning_rate": 4.278799622517274e-07, + "loss": 0.0635, + "step": 1253 + }, + { + "epoch": 12.174757281553399, + "grad_norm": 0.4932946423501068, + "learning_rate": 4.2504017214077374e-07, + "loss": 0.0719, + "step": 1254 + }, + { + "epoch": 12.184466019417476, + "grad_norm": 0.2552904501885662, + "learning_rate": 4.222089613390412e-07, + "loss": 0.0125, + "step": 1255 + }, + { + "epoch": 12.194174757281553, + "grad_norm": 0.3765642214308931, + "learning_rate": 4.1938634155269944e-07, + "loss": 0.0429, + "step": 1256 + }, + { + "epoch": 12.20388349514563, + "grad_norm": 0.47304984449216975, + "learning_rate": 4.165723244523978e-07, + "loss": 0.0658, + "step": 1257 + }, + { + "epoch": 12.21359223300971, + "grad_norm": 0.5467448402302595, + "learning_rate": 4.1376692167321626e-07, + "loss": 0.082, + "step": 1258 + }, + { + "epoch": 12.223300970873787, + "grad_norm": 0.4153879006735531, + "learning_rate": 4.109701448146164e-07, + "loss": 0.0567, + "step": 1259 + }, + { + "epoch": 12.233009708737864, + "grad_norm": 0.5149132605681401, + "learning_rate": 4.0818200544039484e-07, + "loss": 0.0642, + "step": 1260 + }, + { + "epoch": 12.242718446601941, + "grad_norm": 0.3664582297237133, + "learning_rate": 4.054025150786356e-07, + "loss": 0.0207, + "step": 1261 + }, + { + "epoch": 12.25242718446602, + "grad_norm": 0.4704662984536062, + "learning_rate": 4.026316852216605e-07, + "loss": 0.0571, + "step": 1262 + }, + { + "epoch": 12.262135922330097, + "grad_norm": 0.4316859066370346, + "learning_rate": 3.998695273259834e-07, + "loss": 0.0687, + "step": 1263 + }, + { + "epoch": 12.271844660194175, + "grad_norm": 0.5327787979259686, + "learning_rate": 3.971160528122622e-07, + "loss": 0.0856, + "step": 1264 + }, + { + "epoch": 12.281553398058252, + "grad_norm": 0.5018036756970643, + "learning_rate": 3.9437127306525295e-07, + "loss": 0.1052, + "step": 1265 + }, + { + "epoch": 12.29126213592233, + "grad_norm": 0.3177685723026178, + "learning_rate": 3.9163519943375973e-07, + "loss": 0.0283, + "step": 1266 + }, + { + "epoch": 12.300970873786408, + "grad_norm": 0.5165156442833573, + "learning_rate": 3.889078432305904e-07, + "loss": 0.118, + "step": 1267 + }, + { + "epoch": 12.310679611650485, + "grad_norm": 0.5068137023886546, + "learning_rate": 3.8618921573250896e-07, + "loss": 0.0775, + "step": 1268 + }, + { + "epoch": 12.320388349514563, + "grad_norm": 0.48634984455464964, + "learning_rate": 3.834793281801891e-07, + "loss": 0.0344, + "step": 1269 + }, + { + "epoch": 12.330097087378642, + "grad_norm": 0.5145224870057581, + "learning_rate": 3.8077819177816695e-07, + "loss": 0.1188, + "step": 1270 + }, + { + "epoch": 12.339805825242719, + "grad_norm": 0.7335439939955951, + "learning_rate": 3.780858176947963e-07, + "loss": 0.0506, + "step": 1271 + }, + { + "epoch": 12.349514563106796, + "grad_norm": 0.2890103308208032, + "learning_rate": 3.754022170622007e-07, + "loss": 0.0195, + "step": 1272 + }, + { + "epoch": 12.359223300970873, + "grad_norm": 0.5274645907133224, + "learning_rate": 3.7272740097622884e-07, + "loss": 0.1081, + "step": 1273 + }, + { + "epoch": 12.368932038834952, + "grad_norm": 0.5229710405073345, + "learning_rate": 3.700613804964073e-07, + "loss": 0.0967, + "step": 1274 + }, + { + "epoch": 12.37864077669903, + "grad_norm": 0.5402500562513454, + "learning_rate": 3.6740416664589634e-07, + "loss": 0.0806, + "step": 1275 + }, + { + "epoch": 12.388349514563107, + "grad_norm": 0.6932722573513473, + "learning_rate": 3.6475577041144324e-07, + "loss": 0.1607, + "step": 1276 + }, + { + "epoch": 12.398058252427184, + "grad_norm": 0.13364800561868725, + "learning_rate": 3.6211620274333727e-07, + "loss": 0.0039, + "step": 1277 + }, + { + "epoch": 12.407766990291263, + "grad_norm": 0.5635217291309577, + "learning_rate": 3.594854745553636e-07, + "loss": 0.089, + "step": 1278 + }, + { + "epoch": 12.41747572815534, + "grad_norm": 0.8039998796146036, + "learning_rate": 3.568635967247605e-07, + "loss": 0.1989, + "step": 1279 + }, + { + "epoch": 12.427184466019417, + "grad_norm": 0.44452718171739464, + "learning_rate": 3.5425058009217193e-07, + "loss": 0.0505, + "step": 1280 + }, + { + "epoch": 12.436893203883495, + "grad_norm": 0.4779630622559031, + "learning_rate": 3.516464354616031e-07, + "loss": 0.0523, + "step": 1281 + }, + { + "epoch": 12.446601941747574, + "grad_norm": 0.5081805906166073, + "learning_rate": 3.4905117360037683e-07, + "loss": 0.0823, + "step": 1282 + }, + { + "epoch": 12.45631067961165, + "grad_norm": 0.3588244687459001, + "learning_rate": 3.4646480523908813e-07, + "loss": 0.0347, + "step": 1283 + }, + { + "epoch": 12.466019417475728, + "grad_norm": 0.6863092097683883, + "learning_rate": 3.43887341071561e-07, + "loss": 0.1559, + "step": 1284 + }, + { + "epoch": 12.475728155339805, + "grad_norm": 0.20582305371798984, + "learning_rate": 3.413187917548019e-07, + "loss": 0.0101, + "step": 1285 + }, + { + "epoch": 12.485436893203884, + "grad_norm": 0.6067333614215648, + "learning_rate": 3.3875916790895883e-07, + "loss": 0.1478, + "step": 1286 + }, + { + "epoch": 12.495145631067961, + "grad_norm": 0.3103903276686346, + "learning_rate": 3.3620848011727437e-07, + "loss": 0.0157, + "step": 1287 + }, + { + "epoch": 12.504854368932039, + "grad_norm": 0.2398685670975744, + "learning_rate": 3.336667389260445e-07, + "loss": 0.0115, + "step": 1288 + }, + { + "epoch": 12.514563106796116, + "grad_norm": 0.34028815868720313, + "learning_rate": 3.311339548445727e-07, + "loss": 0.0188, + "step": 1289 + }, + { + "epoch": 12.524271844660195, + "grad_norm": 0.5281529937098564, + "learning_rate": 3.2861013834512844e-07, + "loss": 0.0705, + "step": 1290 + }, + { + "epoch": 12.533980582524272, + "grad_norm": 0.44922395870868254, + "learning_rate": 3.2609529986290246e-07, + "loss": 0.0594, + "step": 1291 + }, + { + "epoch": 12.54368932038835, + "grad_norm": 0.41450177685889456, + "learning_rate": 3.235894497959649e-07, + "loss": 0.0323, + "step": 1292 + }, + { + "epoch": 12.553398058252426, + "grad_norm": 0.4860873761111312, + "learning_rate": 3.2109259850522045e-07, + "loss": 0.0639, + "step": 1293 + }, + { + "epoch": 12.563106796116505, + "grad_norm": 0.5025904279404237, + "learning_rate": 3.186047563143685e-07, + "loss": 0.0705, + "step": 1294 + }, + { + "epoch": 12.572815533980583, + "grad_norm": 0.375714081361163, + "learning_rate": 3.161259335098571e-07, + "loss": 0.035, + "step": 1295 + }, + { + "epoch": 12.58252427184466, + "grad_norm": 0.2256329375444976, + "learning_rate": 3.1365614034084224e-07, + "loss": 0.012, + "step": 1296 + }, + { + "epoch": 12.592233009708737, + "grad_norm": 0.2881991746431682, + "learning_rate": 3.111953870191459e-07, + "loss": 0.0255, + "step": 1297 + }, + { + "epoch": 12.601941747572816, + "grad_norm": 0.6335108729127409, + "learning_rate": 3.087436837192118e-07, + "loss": 0.1165, + "step": 1298 + }, + { + "epoch": 12.611650485436893, + "grad_norm": 0.47028695989481234, + "learning_rate": 3.0630104057806616e-07, + "loss": 0.1114, + "step": 1299 + }, + { + "epoch": 12.62135922330097, + "grad_norm": 0.5121407246325066, + "learning_rate": 3.0386746769527323e-07, + "loss": 0.0878, + "step": 1300 + }, + { + "epoch": 12.631067961165048, + "grad_norm": 0.31821815756972943, + "learning_rate": 3.0144297513289483e-07, + "loss": 0.0224, + "step": 1301 + }, + { + "epoch": 12.640776699029127, + "grad_norm": 0.36131410103922523, + "learning_rate": 2.9902757291544905e-07, + "loss": 0.0338, + "step": 1302 + }, + { + "epoch": 12.650485436893204, + "grad_norm": 0.18546100236750176, + "learning_rate": 2.966212710298674e-07, + "loss": 0.0077, + "step": 1303 + }, + { + "epoch": 12.660194174757281, + "grad_norm": 0.4403213642112429, + "learning_rate": 2.94224079425455e-07, + "loss": 0.0444, + "step": 1304 + }, + { + "epoch": 12.669902912621358, + "grad_norm": 0.30588085097166257, + "learning_rate": 2.9183600801384853e-07, + "loss": 0.0166, + "step": 1305 + }, + { + "epoch": 12.679611650485437, + "grad_norm": 0.19655227972846973, + "learning_rate": 2.8945706666897555e-07, + "loss": 0.0067, + "step": 1306 + }, + { + "epoch": 12.689320388349515, + "grad_norm": 0.4034524699481664, + "learning_rate": 2.870872652270129e-07, + "loss": 0.0485, + "step": 1307 + }, + { + "epoch": 12.699029126213592, + "grad_norm": 0.3342688517721637, + "learning_rate": 2.8472661348634883e-07, + "loss": 0.0224, + "step": 1308 + }, + { + "epoch": 12.70873786407767, + "grad_norm": 0.35443206447047615, + "learning_rate": 2.82375121207539e-07, + "loss": 0.051, + "step": 1309 + }, + { + "epoch": 12.718446601941748, + "grad_norm": 0.413534023109782, + "learning_rate": 2.8003279811326724e-07, + "loss": 0.0508, + "step": 1310 + }, + { + "epoch": 12.728155339805825, + "grad_norm": 0.19635527159953645, + "learning_rate": 2.776996538883062e-07, + "loss": 0.0085, + "step": 1311 + }, + { + "epoch": 12.737864077669903, + "grad_norm": 0.5416519429825231, + "learning_rate": 2.7537569817947694e-07, + "loss": 0.0942, + "step": 1312 + }, + { + "epoch": 12.74757281553398, + "grad_norm": 0.40253806362920896, + "learning_rate": 2.730609405956083e-07, + "loss": 0.0523, + "step": 1313 + }, + { + "epoch": 12.757281553398059, + "grad_norm": 0.49734033038996395, + "learning_rate": 2.707553907074989e-07, + "loss": 0.0739, + "step": 1314 + }, + { + "epoch": 12.766990291262136, + "grad_norm": 0.5385383009087314, + "learning_rate": 2.684590580478749e-07, + "loss": 0.0546, + "step": 1315 + }, + { + "epoch": 12.776699029126213, + "grad_norm": 0.6286671447796734, + "learning_rate": 2.6617195211135343e-07, + "loss": 0.0952, + "step": 1316 + }, + { + "epoch": 12.78640776699029, + "grad_norm": 0.5855989296835651, + "learning_rate": 2.638940823544012e-07, + "loss": 0.0667, + "step": 1317 + }, + { + "epoch": 12.79611650485437, + "grad_norm": 0.33409793915536806, + "learning_rate": 2.6162545819529624e-07, + "loss": 0.036, + "step": 1318 + }, + { + "epoch": 12.805825242718447, + "grad_norm": 0.5781144234706445, + "learning_rate": 2.593660890140895e-07, + "loss": 0.0608, + "step": 1319 + }, + { + "epoch": 12.815533980582524, + "grad_norm": 0.5362480167302143, + "learning_rate": 2.57115984152565e-07, + "loss": 0.0719, + "step": 1320 + }, + { + "epoch": 12.825242718446601, + "grad_norm": 0.4294606474451055, + "learning_rate": 2.548751529142018e-07, + "loss": 0.0856, + "step": 1321 + }, + { + "epoch": 12.83495145631068, + "grad_norm": 0.6234632237849053, + "learning_rate": 2.526436045641351e-07, + "loss": 0.1454, + "step": 1322 + }, + { + "epoch": 12.844660194174757, + "grad_norm": 0.4658449714656103, + "learning_rate": 2.504213483291193e-07, + "loss": 0.0475, + "step": 1323 + }, + { + "epoch": 12.854368932038835, + "grad_norm": 0.23326161655035757, + "learning_rate": 2.482083933974883e-07, + "loss": 0.0105, + "step": 1324 + }, + { + "epoch": 12.864077669902912, + "grad_norm": 0.5872406960043914, + "learning_rate": 2.4600474891911696e-07, + "loss": 0.0925, + "step": 1325 + }, + { + "epoch": 12.87378640776699, + "grad_norm": 0.2975400185848686, + "learning_rate": 2.43810424005386e-07, + "loss": 0.0382, + "step": 1326 + }, + { + "epoch": 12.883495145631068, + "grad_norm": 0.40439938966679095, + "learning_rate": 2.416254277291416e-07, + "loss": 0.0424, + "step": 1327 + }, + { + "epoch": 12.893203883495145, + "grad_norm": 0.14455878990664323, + "learning_rate": 2.3944976912465916e-07, + "loss": 0.0043, + "step": 1328 + }, + { + "epoch": 12.902912621359224, + "grad_norm": 0.5413563478617743, + "learning_rate": 2.3728345718760622e-07, + "loss": 0.0858, + "step": 1329 + }, + { + "epoch": 12.912621359223301, + "grad_norm": 0.6114421008718739, + "learning_rate": 2.3512650087500338e-07, + "loss": 0.1262, + "step": 1330 + }, + { + "epoch": 12.922330097087379, + "grad_norm": 0.40434143898326247, + "learning_rate": 2.3297890910519093e-07, + "loss": 0.0438, + "step": 1331 + }, + { + "epoch": 12.932038834951456, + "grad_norm": 0.15674358972140642, + "learning_rate": 2.3084069075778758e-07, + "loss": 0.0042, + "step": 1332 + }, + { + "epoch": 12.941747572815533, + "grad_norm": 0.4385566852023257, + "learning_rate": 2.287118546736572e-07, + "loss": 0.0527, + "step": 1333 + }, + { + "epoch": 12.951456310679612, + "grad_norm": 0.726154070219994, + "learning_rate": 2.2659240965487023e-07, + "loss": 0.0914, + "step": 1334 + }, + { + "epoch": 12.96116504854369, + "grad_norm": 0.6620758511943872, + "learning_rate": 2.2448236446466847e-07, + "loss": 0.0509, + "step": 1335 + }, + { + "epoch": 12.970873786407767, + "grad_norm": 0.6769976034708189, + "learning_rate": 2.2238172782742763e-07, + "loss": 0.1015, + "step": 1336 + }, + { + "epoch": 12.980582524271846, + "grad_norm": 0.24635745023823036, + "learning_rate": 2.2029050842862277e-07, + "loss": 0.0112, + "step": 1337 + }, + { + "epoch": 12.990291262135923, + "grad_norm": 0.5211584464733092, + "learning_rate": 2.1820871491479102e-07, + "loss": 0.0667, + "step": 1338 + }, + { + "epoch": 13.0, + "grad_norm": 0.5913539447878209, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.0904, + "step": 1339 + }, + { + "epoch": 13.009708737864077, + "grad_norm": 0.3101489157097074, + "learning_rate": 2.140734399332975e-07, + "loss": 0.0542, + "step": 1340 + }, + { + "epoch": 13.019417475728156, + "grad_norm": 0.5351936690061951, + "learning_rate": 2.1201997556370284e-07, + "loss": 0.0846, + "step": 1341 + }, + { + "epoch": 13.029126213592233, + "grad_norm": 0.3577628010330391, + "learning_rate": 2.0997597127514507e-07, + "loss": 0.0232, + "step": 1342 + }, + { + "epoch": 13.03883495145631, + "grad_norm": 0.4025795427933299, + "learning_rate": 2.079414355189427e-07, + "loss": 0.0669, + "step": 1343 + }, + { + "epoch": 13.048543689320388, + "grad_norm": 0.48650418922975114, + "learning_rate": 2.059163767072639e-07, + "loss": 0.0605, + "step": 1344 + }, + { + "epoch": 13.058252427184467, + "grad_norm": 0.3135425232894008, + "learning_rate": 2.0390080321309236e-07, + "loss": 0.0269, + "step": 1345 + }, + { + "epoch": 13.067961165048544, + "grad_norm": 0.08376709705022621, + "learning_rate": 2.01894723370194e-07, + "loss": 0.0022, + "step": 1346 + }, + { + "epoch": 13.077669902912621, + "grad_norm": 0.33362664095446987, + "learning_rate": 1.9989814547308056e-07, + "loss": 0.037, + "step": 1347 + }, + { + "epoch": 13.087378640776699, + "grad_norm": 0.6004311575355316, + "learning_rate": 1.9791107777697633e-07, + "loss": 0.1411, + "step": 1348 + }, + { + "epoch": 13.097087378640778, + "grad_norm": 0.34278443971860567, + "learning_rate": 1.9593352849778453e-07, + "loss": 0.0254, + "step": 1349 + }, + { + "epoch": 13.106796116504855, + "grad_norm": 0.3286684875471459, + "learning_rate": 1.9396550581205208e-07, + "loss": 0.0228, + "step": 1350 + }, + { + "epoch": 13.116504854368932, + "grad_norm": 0.34666070981368075, + "learning_rate": 1.920070178569361e-07, + "loss": 0.0311, + "step": 1351 + }, + { + "epoch": 13.12621359223301, + "grad_norm": 0.49387258406448203, + "learning_rate": 1.900580727301718e-07, + "loss": 0.0555, + "step": 1352 + }, + { + "epoch": 13.135922330097088, + "grad_norm": 0.5181298337375579, + "learning_rate": 1.8811867849003684e-07, + "loss": 0.0442, + "step": 1353 + }, + { + "epoch": 13.145631067961165, + "grad_norm": 0.5372907594532009, + "learning_rate": 1.8618884315531939e-07, + "loss": 0.0758, + "step": 1354 + }, + { + "epoch": 13.155339805825243, + "grad_norm": 0.3637821275916941, + "learning_rate": 1.8426857470528414e-07, + "loss": 0.0646, + "step": 1355 + }, + { + "epoch": 13.16504854368932, + "grad_norm": 0.5236428238911527, + "learning_rate": 1.8235788107963948e-07, + "loss": 0.1017, + "step": 1356 + }, + { + "epoch": 13.174757281553399, + "grad_norm": 0.16816534480055276, + "learning_rate": 1.8045677017850595e-07, + "loss": 0.0062, + "step": 1357 + }, + { + "epoch": 13.184466019417476, + "grad_norm": 0.31536745363842367, + "learning_rate": 1.785652498623816e-07, + "loss": 0.0448, + "step": 1358 + }, + { + "epoch": 13.194174757281553, + "grad_norm": 0.5779882605598496, + "learning_rate": 1.7668332795211074e-07, + "loss": 0.0749, + "step": 1359 + }, + { + "epoch": 13.20388349514563, + "grad_norm": 0.2541991656984927, + "learning_rate": 1.7481101222885126e-07, + "loss": 0.0157, + "step": 1360 + }, + { + "epoch": 13.21359223300971, + "grad_norm": 0.3365028596139971, + "learning_rate": 1.7294831043404264e-07, + "loss": 0.0388, + "step": 1361 + }, + { + "epoch": 13.223300970873787, + "grad_norm": 0.48615199324448427, + "learning_rate": 1.7109523026937302e-07, + "loss": 0.0977, + "step": 1362 + }, + { + "epoch": 13.233009708737864, + "grad_norm": 0.4739890157105656, + "learning_rate": 1.6925177939674936e-07, + "loss": 0.0525, + "step": 1363 + }, + { + "epoch": 13.242718446601941, + "grad_norm": 0.5274812968875578, + "learning_rate": 1.6741796543826321e-07, + "loss": 0.0833, + "step": 1364 + }, + { + "epoch": 13.25242718446602, + "grad_norm": 0.5190394218310534, + "learning_rate": 1.6559379597616136e-07, + "loss": 0.0458, + "step": 1365 + }, + { + "epoch": 13.262135922330097, + "grad_norm": 0.5907235640627251, + "learning_rate": 1.6377927855281362e-07, + "loss": 0.1027, + "step": 1366 + }, + { + "epoch": 13.271844660194175, + "grad_norm": 0.5009182999514356, + "learning_rate": 1.6197442067068136e-07, + "loss": 0.0922, + "step": 1367 + }, + { + "epoch": 13.281553398058252, + "grad_norm": 0.2994761075980326, + "learning_rate": 1.6017922979228662e-07, + "loss": 0.0286, + "step": 1368 + }, + { + "epoch": 13.29126213592233, + "grad_norm": 0.4167528295855181, + "learning_rate": 1.5839371334018193e-07, + "loss": 0.048, + "step": 1369 + }, + { + "epoch": 13.300970873786408, + "grad_norm": 0.568993358454377, + "learning_rate": 1.5661787869691858e-07, + "loss": 0.0872, + "step": 1370 + }, + { + "epoch": 13.310679611650485, + "grad_norm": 0.6080611595176447, + "learning_rate": 1.5485173320501673e-07, + "loss": 0.0818, + "step": 1371 + }, + { + "epoch": 13.320388349514563, + "grad_norm": 0.4176513366927112, + "learning_rate": 1.5309528416693503e-07, + "loss": 0.0689, + "step": 1372 + }, + { + "epoch": 13.330097087378642, + "grad_norm": 0.38487885624551355, + "learning_rate": 1.513485388450403e-07, + "loss": 0.0369, + "step": 1373 + }, + { + "epoch": 13.339805825242719, + "grad_norm": 0.3808252984699954, + "learning_rate": 1.4961150446157759e-07, + "loss": 0.0523, + "step": 1374 + }, + { + "epoch": 13.349514563106796, + "grad_norm": 0.43465823014950594, + "learning_rate": 1.4788418819864037e-07, + "loss": 0.0417, + "step": 1375 + }, + { + "epoch": 13.359223300970873, + "grad_norm": 0.6908558471861086, + "learning_rate": 1.461665971981402e-07, + "loss": 0.1031, + "step": 1376 + }, + { + "epoch": 13.368932038834952, + "grad_norm": 0.22220724572717085, + "learning_rate": 1.444587385617785e-07, + "loss": 0.0108, + "step": 1377 + }, + { + "epoch": 13.37864077669903, + "grad_norm": 0.6187017283324233, + "learning_rate": 1.4276061935101586e-07, + "loss": 0.0797, + "step": 1378 + }, + { + "epoch": 13.388349514563107, + "grad_norm": 0.35673209366579156, + "learning_rate": 1.4107224658704288e-07, + "loss": 0.0278, + "step": 1379 + }, + { + "epoch": 13.398058252427184, + "grad_norm": 0.2088978860127839, + "learning_rate": 1.3939362725075344e-07, + "loss": 0.0096, + "step": 1380 + }, + { + "epoch": 13.407766990291263, + "grad_norm": 0.5737431049471658, + "learning_rate": 1.3772476828271236e-07, + "loss": 0.146, + "step": 1381 + }, + { + "epoch": 13.41747572815534, + "grad_norm": 0.5013268774854202, + "learning_rate": 1.360656765831289e-07, + "loss": 0.0851, + "step": 1382 + }, + { + "epoch": 13.427184466019417, + "grad_norm": 0.3552375372327559, + "learning_rate": 1.3441635901182803e-07, + "loss": 0.0332, + "step": 1383 + }, + { + "epoch": 13.436893203883495, + "grad_norm": 0.4105442051073008, + "learning_rate": 1.3277682238822142e-07, + "loss": 0.0427, + "step": 1384 + }, + { + "epoch": 13.446601941747574, + "grad_norm": 0.43977774703292793, + "learning_rate": 1.3114707349127954e-07, + "loss": 0.0495, + "step": 1385 + }, + { + "epoch": 13.45631067961165, + "grad_norm": 0.42583520614554343, + "learning_rate": 1.2952711905950377e-07, + "loss": 0.046, + "step": 1386 + }, + { + "epoch": 13.466019417475728, + "grad_norm": 0.4654975107942523, + "learning_rate": 1.279169657908988e-07, + "loss": 0.0662, + "step": 1387 + }, + { + "epoch": 13.475728155339805, + "grad_norm": 0.316054598646831, + "learning_rate": 1.263166203429439e-07, + "loss": 0.0362, + "step": 1388 + }, + { + "epoch": 13.485436893203884, + "grad_norm": 0.44344850299453137, + "learning_rate": 1.2472608933256637e-07, + "loss": 0.0266, + "step": 1389 + }, + { + "epoch": 13.495145631067961, + "grad_norm": 0.5686958760472558, + "learning_rate": 1.2314537933611425e-07, + "loss": 0.0526, + "step": 1390 + }, + { + "epoch": 13.504854368932039, + "grad_norm": 0.3714309782597695, + "learning_rate": 1.2157449688932872e-07, + "loss": 0.051, + "step": 1391 + }, + { + "epoch": 13.514563106796116, + "grad_norm": 0.43841752239263004, + "learning_rate": 1.2001344848731612e-07, + "loss": 0.0416, + "step": 1392 + }, + { + "epoch": 13.524271844660195, + "grad_norm": 0.27187015113253077, + "learning_rate": 1.1846224058452316e-07, + "loss": 0.0286, + "step": 1393 + }, + { + "epoch": 13.533980582524272, + "grad_norm": 0.36456437143328213, + "learning_rate": 1.1692087959470882e-07, + "loss": 0.0362, + "step": 1394 + }, + { + "epoch": 13.54368932038835, + "grad_norm": 0.5055952692825199, + "learning_rate": 1.1538937189091825e-07, + "loss": 0.0755, + "step": 1395 + }, + { + "epoch": 13.553398058252426, + "grad_norm": 0.3439908449532863, + "learning_rate": 1.1386772380545669e-07, + "loss": 0.0434, + "step": 1396 + }, + { + "epoch": 13.563106796116505, + "grad_norm": 0.3739766241535124, + "learning_rate": 1.1235594162986168e-07, + "loss": 0.0494, + "step": 1397 + }, + { + "epoch": 13.572815533980583, + "grad_norm": 0.568010631241099, + "learning_rate": 1.1085403161488012e-07, + "loss": 0.1044, + "step": 1398 + }, + { + "epoch": 13.58252427184466, + "grad_norm": 0.17116162913172464, + "learning_rate": 1.09361999970439e-07, + "loss": 0.0067, + "step": 1399 + }, + { + "epoch": 13.592233009708737, + "grad_norm": 0.3321360792703964, + "learning_rate": 1.0787985286562219e-07, + "loss": 0.0363, + "step": 1400 + }, + { + "epoch": 13.601941747572816, + "grad_norm": 0.47803315897026527, + "learning_rate": 1.0640759642864401e-07, + "loss": 0.0494, + "step": 1401 + }, + { + "epoch": 13.611650485436893, + "grad_norm": 0.5771243658845532, + "learning_rate": 1.0494523674682372e-07, + "loss": 0.0916, + "step": 1402 + }, + { + "epoch": 13.62135922330097, + "grad_norm": 0.5059429937647754, + "learning_rate": 1.0349277986656081e-07, + "loss": 0.0721, + "step": 1403 + }, + { + "epoch": 13.631067961165048, + "grad_norm": 0.24550599941178855, + "learning_rate": 1.0205023179330975e-07, + "loss": 0.0214, + "step": 1404 + }, + { + "epoch": 13.640776699029127, + "grad_norm": 0.6067893059454117, + "learning_rate": 1.00617598491555e-07, + "loss": 0.1443, + "step": 1405 + }, + { + "epoch": 13.650485436893204, + "grad_norm": 0.3674398742489763, + "learning_rate": 9.919488588478715e-08, + "loss": 0.0268, + "step": 1406 + }, + { + "epoch": 13.660194174757281, + "grad_norm": 0.5471451347636245, + "learning_rate": 9.778209985547682e-08, + "loss": 0.0937, + "step": 1407 + }, + { + "epoch": 13.669902912621358, + "grad_norm": 0.40045155813855254, + "learning_rate": 9.637924624505191e-08, + "loss": 0.0511, + "step": 1408 + }, + { + "epoch": 13.679611650485437, + "grad_norm": 0.3763845426376623, + "learning_rate": 9.498633085387343e-08, + "loss": 0.0407, + "step": 1409 + }, + { + "epoch": 13.689320388349515, + "grad_norm": 0.3101617655771819, + "learning_rate": 9.360335944121029e-08, + "loss": 0.0322, + "step": 1410 + }, + { + "epoch": 13.699029126213592, + "grad_norm": 0.3418703008016437, + "learning_rate": 9.223033772521594e-08, + "loss": 0.0257, + "step": 1411 + }, + { + "epoch": 13.70873786407767, + "grad_norm": 0.44907257872282763, + "learning_rate": 9.086727138290535e-08, + "loss": 0.0514, + "step": 1412 + }, + { + "epoch": 13.718446601941748, + "grad_norm": 0.3842567901939408, + "learning_rate": 8.951416605013114e-08, + "loss": 0.0429, + "step": 1413 + }, + { + "epoch": 13.728155339805825, + "grad_norm": 0.2966384701948387, + "learning_rate": 8.817102732155996e-08, + "loss": 0.0245, + "step": 1414 + }, + { + "epoch": 13.737864077669903, + "grad_norm": 0.5763576401759796, + "learning_rate": 8.683786075065065e-08, + "loss": 0.0675, + "step": 1415 + }, + { + "epoch": 13.74757281553398, + "grad_norm": 0.5002820834330115, + "learning_rate": 8.55146718496283e-08, + "loss": 0.0994, + "step": 1416 + }, + { + "epoch": 13.757281553398059, + "grad_norm": 0.42837675868305464, + "learning_rate": 8.420146608946605e-08, + "loss": 0.0633, + "step": 1417 + }, + { + "epoch": 13.766990291262136, + "grad_norm": 0.35423588956574104, + "learning_rate": 8.28982488998581e-08, + "loss": 0.0263, + "step": 1418 + }, + { + "epoch": 13.776699029126213, + "grad_norm": 0.5133064138909689, + "learning_rate": 8.160502566919942e-08, + "loss": 0.08, + "step": 1419 + }, + { + "epoch": 13.78640776699029, + "grad_norm": 0.42893161447189443, + "learning_rate": 8.032180174456283e-08, + "loss": 0.0423, + "step": 1420 + }, + { + "epoch": 13.79611650485437, + "grad_norm": 0.3124843126356028, + "learning_rate": 7.904858243167806e-08, + "loss": 0.0408, + "step": 1421 + }, + { + "epoch": 13.805825242718447, + "grad_norm": 0.4065666684492401, + "learning_rate": 7.778537299490796e-08, + "loss": 0.0637, + "step": 1422 + }, + { + "epoch": 13.815533980582524, + "grad_norm": 0.4457701399914273, + "learning_rate": 7.653217865722817e-08, + "loss": 0.0678, + "step": 1423 + }, + { + "epoch": 13.825242718446601, + "grad_norm": 0.3569532201060994, + "learning_rate": 7.528900460020444e-08, + "loss": 0.0383, + "step": 1424 + }, + { + "epoch": 13.83495145631068, + "grad_norm": 0.46545111674165585, + "learning_rate": 7.405585596397314e-08, + "loss": 0.0559, + "step": 1425 + }, + { + "epoch": 13.844660194174757, + "grad_norm": 0.5259399357202312, + "learning_rate": 7.283273784721739e-08, + "loss": 0.1231, + "step": 1426 + }, + { + "epoch": 13.854368932038835, + "grad_norm": 0.3144082807257404, + "learning_rate": 7.161965530714743e-08, + "loss": 0.0257, + "step": 1427 + }, + { + "epoch": 13.864077669902912, + "grad_norm": 0.4046497084629934, + "learning_rate": 7.041661335948024e-08, + "loss": 0.037, + "step": 1428 + }, + { + "epoch": 13.87378640776699, + "grad_norm": 0.6242338725168409, + "learning_rate": 6.92236169784169e-08, + "loss": 0.0617, + "step": 1429 + }, + { + "epoch": 13.883495145631068, + "grad_norm": 0.4927064101341263, + "learning_rate": 6.804067109662443e-08, + "loss": 0.0615, + "step": 1430 + }, + { + "epoch": 13.893203883495145, + "grad_norm": 0.4815006421883216, + "learning_rate": 6.68677806052137e-08, + "loss": 0.0757, + "step": 1431 + }, + { + "epoch": 13.902912621359224, + "grad_norm": 0.38866205557135447, + "learning_rate": 6.57049503537191e-08, + "loss": 0.0523, + "step": 1432 + }, + { + "epoch": 13.912621359223301, + "grad_norm": 0.45644868532380084, + "learning_rate": 6.455218515008049e-08, + "loss": 0.0538, + "step": 1433 + }, + { + "epoch": 13.922330097087379, + "grad_norm": 0.6472398646291064, + "learning_rate": 6.340948976062023e-08, + "loss": 0.0763, + "step": 1434 + }, + { + "epoch": 13.932038834951456, + "grad_norm": 0.6919248428459206, + "learning_rate": 6.227686891002671e-08, + "loss": 0.1206, + "step": 1435 + }, + { + "epoch": 13.941747572815533, + "grad_norm": 0.5771683769037051, + "learning_rate": 6.115432728133198e-08, + "loss": 0.0981, + "step": 1436 + }, + { + "epoch": 13.951456310679612, + "grad_norm": 0.4918068717506031, + "learning_rate": 6.004186951589414e-08, + "loss": 0.0886, + "step": 1437 + }, + { + "epoch": 13.96116504854369, + "grad_norm": 0.18476223002745681, + "learning_rate": 5.8939500213378296e-08, + "loss": 0.0061, + "step": 1438 + }, + { + "epoch": 13.970873786407767, + "grad_norm": 0.5073076174865154, + "learning_rate": 5.7847223931735974e-08, + "loss": 0.0667, + "step": 1439 + }, + { + "epoch": 13.980582524271846, + "grad_norm": 0.4571146557509417, + "learning_rate": 5.6765045187187614e-08, + "loss": 0.0726, + "step": 1440 + }, + { + "epoch": 13.990291262135923, + "grad_norm": 0.5620996547576711, + "learning_rate": 5.569296845420375e-08, + "loss": 0.0942, + "step": 1441 + }, + { + "epoch": 14.0, + "grad_norm": 0.37701892224712136, + "learning_rate": 5.463099816548578e-08, + "loss": 0.0525, + "step": 1442 + }, + { + "epoch": 14.009708737864077, + "grad_norm": 0.27852037202811797, + "learning_rate": 5.3579138711948587e-08, + "loss": 0.0142, + "step": 1443 + }, + { + "epoch": 14.019417475728156, + "grad_norm": 0.5002120793072816, + "learning_rate": 5.253739444270128e-08, + "loss": 0.0811, + "step": 1444 + }, + { + "epoch": 14.029126213592233, + "grad_norm": 0.4954344423768613, + "learning_rate": 5.150576966503063e-08, + "loss": 0.0648, + "step": 1445 + }, + { + "epoch": 14.03883495145631, + "grad_norm": 0.8070824057658714, + "learning_rate": 5.048426864438183e-08, + "loss": 0.1415, + "step": 1446 + }, + { + "epoch": 14.048543689320388, + "grad_norm": 0.2485266305298399, + "learning_rate": 4.9472895604341655e-08, + "loss": 0.0173, + "step": 1447 + }, + { + "epoch": 14.058252427184467, + "grad_norm": 0.5002219465557189, + "learning_rate": 4.8471654726621464e-08, + "loss": 0.1107, + "step": 1448 + }, + { + "epoch": 14.067961165048544, + "grad_norm": 0.48859206182415127, + "learning_rate": 4.7480550151038365e-08, + "loss": 0.0607, + "step": 1449 + }, + { + "epoch": 14.077669902912621, + "grad_norm": 0.4401242445739508, + "learning_rate": 4.649958597549964e-08, + "loss": 0.0418, + "step": 1450 + }, + { + "epoch": 14.087378640776699, + "grad_norm": 0.5063356236988111, + "learning_rate": 4.552876625598501e-08, + "loss": 0.0795, + "step": 1451 + }, + { + "epoch": 14.097087378640778, + "grad_norm": 0.46075177267272516, + "learning_rate": 4.4568095006529975e-08, + "loss": 0.0687, + "step": 1452 + }, + { + "epoch": 14.106796116504855, + "grad_norm": 0.2271681561635824, + "learning_rate": 4.361757619920942e-08, + "loss": 0.0095, + "step": 1453 + }, + { + "epoch": 14.116504854368932, + "grad_norm": 0.22316895376169477, + "learning_rate": 4.2677213764120986e-08, + "loss": 0.0104, + "step": 1454 + }, + { + "epoch": 14.12621359223301, + "grad_norm": 0.40179318592167806, + "learning_rate": 4.174701158936895e-08, + "loss": 0.0273, + "step": 1455 + }, + { + "epoch": 14.135922330097088, + "grad_norm": 0.4980320711294089, + "learning_rate": 4.082697352104814e-08, + "loss": 0.0628, + "step": 1456 + }, + { + "epoch": 14.145631067961165, + "grad_norm": 0.443797410910131, + "learning_rate": 3.991710336322757e-08, + "loss": 0.0529, + "step": 1457 + }, + { + "epoch": 14.155339805825243, + "grad_norm": 0.4147350657878042, + "learning_rate": 3.9017404877935986e-08, + "loss": 0.0422, + "step": 1458 + }, + { + "epoch": 14.16504854368932, + "grad_norm": 0.6320743323439815, + "learning_rate": 3.812788178514437e-08, + "loss": 0.1207, + "step": 1459 + }, + { + "epoch": 14.174757281553399, + "grad_norm": 0.2851768686756557, + "learning_rate": 3.7248537762752666e-08, + "loss": 0.0156, + "step": 1460 + }, + { + "epoch": 14.184466019417476, + "grad_norm": 0.4898600075169955, + "learning_rate": 3.637937644657308e-08, + "loss": 0.0609, + "step": 1461 + }, + { + "epoch": 14.194174757281553, + "grad_norm": 0.6112591760337616, + "learning_rate": 3.55204014303151e-08, + "loss": 0.1028, + "step": 1462 + }, + { + "epoch": 14.20388349514563, + "grad_norm": 0.5361184519943515, + "learning_rate": 3.467161626557164e-08, + "loss": 0.1553, + "step": 1463 + }, + { + "epoch": 14.21359223300971, + "grad_norm": 0.3749624794722601, + "learning_rate": 3.3833024461803756e-08, + "loss": 0.0425, + "step": 1464 + }, + { + "epoch": 14.223300970873787, + "grad_norm": 0.3687071096568528, + "learning_rate": 3.300462948632593e-08, + "loss": 0.0528, + "step": 1465 + }, + { + "epoch": 14.233009708737864, + "grad_norm": 0.5065834596804688, + "learning_rate": 3.218643476429167e-08, + "loss": 0.1048, + "step": 1466 + }, + { + "epoch": 14.242718446601941, + "grad_norm": 0.2498819248286375, + "learning_rate": 3.1378443678680706e-08, + "loss": 0.0256, + "step": 1467 + }, + { + "epoch": 14.25242718446602, + "grad_norm": 0.4454945892622417, + "learning_rate": 3.0580659570282886e-08, + "loss": 0.0552, + "step": 1468 + }, + { + "epoch": 14.262135922330097, + "grad_norm": 0.46476529726448046, + "learning_rate": 2.979308573768547e-08, + "loss": 0.0736, + "step": 1469 + }, + { + "epoch": 14.271844660194175, + "grad_norm": 0.3614514478408299, + "learning_rate": 2.9015725437259724e-08, + "loss": 0.0444, + "step": 1470 + }, + { + "epoch": 14.281553398058252, + "grad_norm": 0.23856721195733988, + "learning_rate": 2.8248581883147387e-08, + "loss": 0.0134, + "step": 1471 + }, + { + "epoch": 14.29126213592233, + "grad_norm": 0.4462682057295648, + "learning_rate": 2.7491658247246478e-08, + "loss": 0.0674, + "step": 1472 + }, + { + "epoch": 14.300970873786408, + "grad_norm": 0.4735946001772513, + "learning_rate": 2.6744957659199376e-08, + "loss": 0.0662, + "step": 1473 + }, + { + "epoch": 14.310679611650485, + "grad_norm": 0.401590281873634, + "learning_rate": 2.6008483206379497e-08, + "loss": 0.0656, + "step": 1474 + }, + { + "epoch": 14.320388349514563, + "grad_norm": 0.5538260679883152, + "learning_rate": 2.5282237933877962e-08, + "loss": 0.0609, + "step": 1475 + }, + { + "epoch": 14.330097087378642, + "grad_norm": 0.36741254932703427, + "learning_rate": 2.4566224844491393e-08, + "loss": 0.0389, + "step": 1476 + }, + { + "epoch": 14.339805825242719, + "grad_norm": 0.47615607691212797, + "learning_rate": 2.38604468987097e-08, + "loss": 0.0685, + "step": 1477 + }, + { + "epoch": 14.349514563106796, + "grad_norm": 0.2774257621570567, + "learning_rate": 2.316490701470414e-08, + "loss": 0.0174, + "step": 1478 + }, + { + "epoch": 14.359223300970873, + "grad_norm": 0.4715962885247446, + "learning_rate": 2.247960806831373e-08, + "loss": 0.0725, + "step": 1479 + }, + { + "epoch": 14.368932038834952, + "grad_norm": 0.41298032350196945, + "learning_rate": 2.180455289303579e-08, + "loss": 0.0657, + "step": 1480 + }, + { + "epoch": 14.37864077669903, + "grad_norm": 0.4392490725579392, + "learning_rate": 2.113974428001153e-08, + "loss": 0.0629, + "step": 1481 + }, + { + "epoch": 14.388349514563107, + "grad_norm": 0.33456818961908225, + "learning_rate": 2.0485184978016604e-08, + "loss": 0.0247, + "step": 1482 + }, + { + "epoch": 14.398058252427184, + "grad_norm": 0.4210642043933347, + "learning_rate": 1.984087769344889e-08, + "loss": 0.0578, + "step": 1483 + }, + { + "epoch": 14.407766990291263, + "grad_norm": 0.4547431665479732, + "learning_rate": 1.9206825090317126e-08, + "loss": 0.0644, + "step": 1484 + }, + { + "epoch": 14.41747572815534, + "grad_norm": 0.4137452093457974, + "learning_rate": 1.8583029790230356e-08, + "loss": 0.0684, + "step": 1485 + }, + { + "epoch": 14.427184466019417, + "grad_norm": 0.5768594770321711, + "learning_rate": 1.796949437238682e-08, + "loss": 0.096, + "step": 1486 + }, + { + "epoch": 14.436893203883495, + "grad_norm": 0.5188349954764778, + "learning_rate": 1.736622137356342e-08, + "loss": 0.0814, + "step": 1487 + }, + { + "epoch": 14.446601941747574, + "grad_norm": 0.3045994084525141, + "learning_rate": 1.677321328810516e-08, + "loss": 0.0114, + "step": 1488 + }, + { + "epoch": 14.45631067961165, + "grad_norm": 0.42995889830896394, + "learning_rate": 1.6190472567914617e-08, + "loss": 0.0794, + "step": 1489 + }, + { + "epoch": 14.466019417475728, + "grad_norm": 0.44410933161737604, + "learning_rate": 1.561800162244248e-08, + "loss": 0.0538, + "step": 1490 + }, + { + "epoch": 14.475728155339805, + "grad_norm": 0.5574574290398137, + "learning_rate": 1.5055802818676745e-08, + "loss": 0.1046, + "step": 1491 + }, + { + "epoch": 14.485436893203884, + "grad_norm": 0.33854090541431436, + "learning_rate": 1.450387848113327e-08, + "loss": 0.0422, + "step": 1492 + }, + { + "epoch": 14.495145631067961, + "grad_norm": 0.3988422398212394, + "learning_rate": 1.3962230891846618e-08, + "loss": 0.043, + "step": 1493 + }, + { + "epoch": 14.504854368932039, + "grad_norm": 0.49523592862180527, + "learning_rate": 1.3430862290359781e-08, + "loss": 0.0687, + "step": 1494 + }, + { + "epoch": 14.514563106796116, + "grad_norm": 0.5839454104438518, + "learning_rate": 1.2909774873715585e-08, + "loss": 0.1464, + "step": 1495 + }, + { + "epoch": 14.524271844660195, + "grad_norm": 0.5374831629152403, + "learning_rate": 1.2398970796447807e-08, + "loss": 0.0745, + "step": 1496 + }, + { + "epoch": 14.533980582524272, + "grad_norm": 0.3781775887894092, + "learning_rate": 1.1898452170570618e-08, + "loss": 0.072, + "step": 1497 + }, + { + "epoch": 14.54368932038835, + "grad_norm": 0.2968854268436536, + "learning_rate": 1.140822106557249e-08, + "loss": 0.0205, + "step": 1498 + }, + { + "epoch": 14.553398058252426, + "grad_norm": 0.23045972300528747, + "learning_rate": 1.0928279508405082e-08, + "loss": 0.011, + "step": 1499 + }, + { + "epoch": 14.563106796116505, + "grad_norm": 0.4716850829610772, + "learning_rate": 1.0458629483476868e-08, + "loss": 0.0827, + "step": 1500 + }, + { + "epoch": 14.572815533980583, + "grad_norm": 0.4419550519402443, + "learning_rate": 9.999272932643134e-09, + "loss": 0.0346, + "step": 1501 + }, + { + "epoch": 14.58252427184466, + "grad_norm": 0.8818612325287888, + "learning_rate": 9.550211755199879e-09, + "loss": 0.0911, + "step": 1502 + }, + { + "epoch": 14.592233009708737, + "grad_norm": 0.5946985929189472, + "learning_rate": 9.111447807874374e-09, + "loss": 0.0219, + "step": 1503 + }, + { + "epoch": 14.601941747572816, + "grad_norm": 0.10701016268942182, + "learning_rate": 8.682982904817948e-09, + "loss": 0.0028, + "step": 1504 + }, + { + "epoch": 14.611650485436893, + "grad_norm": 0.6231038615943904, + "learning_rate": 8.264818817599052e-09, + "loss": 0.1465, + "step": 1505 + }, + { + "epoch": 14.62135922330097, + "grad_norm": 0.32493446734913417, + "learning_rate": 7.856957275194921e-09, + "loss": 0.0239, + "step": 1506 + }, + { + "epoch": 14.631067961165048, + "grad_norm": 0.30621128115752955, + "learning_rate": 7.459399963985758e-09, + "loss": 0.0219, + "step": 1507 + }, + { + "epoch": 14.640776699029127, + "grad_norm": 0.5681331507606439, + "learning_rate": 7.072148527746403e-09, + "loss": 0.0612, + "step": 1508 + }, + { + "epoch": 14.650485436893204, + "grad_norm": 0.4678809532098972, + "learning_rate": 6.6952045676405005e-09, + "loss": 0.0734, + "step": 1509 + }, + { + "epoch": 14.660194174757281, + "grad_norm": 0.42191270347256216, + "learning_rate": 6.328569642212734e-09, + "loss": 0.042, + "step": 1510 + }, + { + "epoch": 14.669902912621358, + "grad_norm": 0.6089098976587644, + "learning_rate": 5.972245267384102e-09, + "loss": 0.1034, + "step": 1511 + }, + { + "epoch": 14.679611650485437, + "grad_norm": 0.6347427788645178, + "learning_rate": 5.62623291644443e-09, + "loss": 0.0419, + "step": 1512 + }, + { + "epoch": 14.689320388349515, + "grad_norm": 0.21563629066961087, + "learning_rate": 5.290534020046256e-09, + "loss": 0.0139, + "step": 1513 + }, + { + "epoch": 14.699029126213592, + "grad_norm": 0.5066555604026627, + "learning_rate": 4.965149966199567e-09, + "loss": 0.046, + "step": 1514 + }, + { + "epoch": 14.70873786407767, + "grad_norm": 0.40413935811547114, + "learning_rate": 4.6500821002654075e-09, + "loss": 0.0652, + "step": 1515 + }, + { + "epoch": 14.718446601941748, + "grad_norm": 0.5986384312203215, + "learning_rate": 4.345331724950885e-09, + "loss": 0.0543, + "step": 1516 + }, + { + "epoch": 14.728155339805825, + "grad_norm": 0.4450958848994586, + "learning_rate": 4.050900100303068e-09, + "loss": 0.0459, + "step": 1517 + }, + { + "epoch": 14.737864077669903, + "grad_norm": 0.3924298698415564, + "learning_rate": 3.766788443705094e-09, + "loss": 0.0361, + "step": 1518 + }, + { + "epoch": 14.74757281553398, + "grad_norm": 0.35509867963290737, + "learning_rate": 3.492997929869235e-09, + "loss": 0.0375, + "step": 1519 + }, + { + "epoch": 14.757281553398059, + "grad_norm": 0.32188506207739537, + "learning_rate": 3.2295296908338437e-09, + "loss": 0.0254, + "step": 1520 + }, + { + "epoch": 14.766990291262136, + "grad_norm": 0.3947000336245188, + "learning_rate": 2.976384815957245e-09, + "loss": 0.03, + "step": 1521 + }, + { + "epoch": 14.776699029126213, + "grad_norm": 0.4224276282074339, + "learning_rate": 2.7335643519144086e-09, + "loss": 0.0383, + "step": 1522 + }, + { + "epoch": 14.78640776699029, + "grad_norm": 0.5089503895148835, + "learning_rate": 2.5010693026922273e-09, + "loss": 0.1007, + "step": 1523 + }, + { + "epoch": 14.79611650485437, + "grad_norm": 0.18020715011955366, + "learning_rate": 2.278900629584524e-09, + "loss": 0.0066, + "step": 1524 + }, + { + "epoch": 14.805825242718447, + "grad_norm": 0.6643910364446401, + "learning_rate": 2.067059251189274e-09, + "loss": 0.1364, + "step": 1525 + }, + { + "epoch": 14.815533980582524, + "grad_norm": 0.14798425026090903, + "learning_rate": 1.8655460434044427e-09, + "loss": 0.0044, + "step": 1526 + }, + { + "epoch": 14.825242718446601, + "grad_norm": 0.5631198394596968, + "learning_rate": 1.6743618394238215e-09, + "loss": 0.0981, + "step": 1527 + }, + { + "epoch": 14.83495145631068, + "grad_norm": 0.45409258209552955, + "learning_rate": 1.493507429734531e-09, + "loss": 0.0465, + "step": 1528 + }, + { + "epoch": 14.844660194174757, + "grad_norm": 0.42087852066764153, + "learning_rate": 1.3229835621125786e-09, + "loss": 0.0783, + "step": 1529 + }, + { + "epoch": 14.854368932038835, + "grad_norm": 0.36289372361154304, + "learning_rate": 1.1627909416211947e-09, + "loss": 0.0504, + "step": 1530 + }, + { + "epoch": 14.864077669902912, + "grad_norm": 0.35522890887727715, + "learning_rate": 1.0129302306061128e-09, + "loss": 0.0414, + "step": 1531 + }, + { + "epoch": 14.87378640776699, + "grad_norm": 0.4579376664607033, + "learning_rate": 8.734020486950157e-10, + "loss": 0.0432, + "step": 1532 + }, + { + "epoch": 14.883495145631068, + "grad_norm": 0.13707830522982126, + "learning_rate": 7.442069727930934e-10, + "loss": 0.0048, + "step": 1533 + }, + { + "epoch": 14.893203883495145, + "grad_norm": 0.4002252720770533, + "learning_rate": 6.253455370811012e-10, + "loss": 0.0371, + "step": 1534 + }, + { + "epoch": 14.902912621359224, + "grad_norm": 0.3147483793059573, + "learning_rate": 5.168182330145266e-10, + "loss": 0.0295, + "step": 1535 + }, + { + "epoch": 14.912621359223301, + "grad_norm": 0.3919323958341479, + "learning_rate": 4.186255093194258e-10, + "loss": 0.0575, + "step": 1536 + }, + { + "epoch": 14.922330097087379, + "grad_norm": 0.39816059669182213, + "learning_rate": 3.3076777199186894e-10, + "loss": 0.0455, + "step": 1537 + }, + { + "epoch": 14.932038834951456, + "grad_norm": 0.3752851863862166, + "learning_rate": 2.532453842965521e-10, + "loss": 0.0503, + "step": 1538 + }, + { + "epoch": 14.941747572815533, + "grad_norm": 0.35630558512507376, + "learning_rate": 1.8605866676374428e-10, + "loss": 0.0323, + "step": 1539 + }, + { + "epoch": 14.951456310679612, + "grad_norm": 0.39981192379176855, + "learning_rate": 1.292078971898425e-10, + "loss": 0.0875, + "step": 1540 + }, + { + "epoch": 14.96116504854369, + "grad_norm": 0.06618155040773933, + "learning_rate": 8.269331063459618e-11, + "loss": 0.0014, + "step": 1541 + }, + { + "epoch": 14.970873786407767, + "grad_norm": 0.6044643732379602, + "learning_rate": 4.651509942193988e-11, + "loss": 0.0481, + "step": 1542 + }, + { + "epoch": 14.980582524271846, + "grad_norm": 0.4479730592528947, + "learning_rate": 2.06734131366626e-11, + "loss": 0.0538, + "step": 1543 + }, + { + "epoch": 14.990291262135923, + "grad_norm": 0.3639756177331702, + "learning_rate": 5.168358626628234e-12, + "loss": 0.037, + "step": 1544 + }, + { + "epoch": 15.0, + "grad_norm": 0.26971244188279725, + "learning_rate": 0.0, + "loss": 0.0139, + "step": 1545 + } + ], + "logging_steps": 1, + "max_steps": 1545, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 171438419165184.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}