diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.7987685418415897, + "epoch": 3.0, "eval_steps": 500, - "global_step": 10000, + "global_step": 10719, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -70008,6 +70008,5039 @@ "learning_rate": 1.2284763283065159e-06, "loss": 0.4598, "step": 10000 + }, + { + "epoch": 2.799048418695774, + "grad_norm": 0.2623289816675507, + "learning_rate": 1.225080257965755e-06, + "loss": 0.4304, + "step": 10001 + }, + { + "epoch": 2.799328295549958, + "grad_norm": 0.25573096800671996, + "learning_rate": 1.2216888300491559e-06, + "loss": 0.4428, + "step": 10002 + }, + { + "epoch": 2.799608172404142, + "grad_norm": 0.2668555762116913, + "learning_rate": 1.2183020448795212e-06, + "loss": 0.4618, + "step": 10003 + }, + { + "epoch": 2.7998880492583265, + "grad_norm": 0.26229813824137216, + "learning_rate": 1.2149199027791992e-06, + "loss": 0.4494, + "step": 10004 + }, + { + "epoch": 2.8001679261125103, + "grad_norm": 0.25880574709037374, + "learning_rate": 1.211542404070104e-06, + "loss": 0.4527, + "step": 10005 + }, + { + "epoch": 2.8004478029666946, + "grad_norm": 0.24930135750690222, + "learning_rate": 1.2081695490737178e-06, + "loss": 0.4423, + "step": 10006 + }, + { + "epoch": 2.800727679820879, + "grad_norm": 0.25562693305388845, + "learning_rate": 1.2048013381110611e-06, + "loss": 0.4355, + "step": 10007 + }, + { + "epoch": 2.8010075566750627, + "grad_norm": 0.2573744662139402, + "learning_rate": 1.2014377715027225e-06, + "loss": 0.4669, + "step": 10008 + }, + { + "epoch": 2.801287433529247, + "grad_norm": 0.2521840692937912, + "learning_rate": 1.198078849568851e-06, + "loss": 0.4362, + "step": 10009 + }, + { + "epoch": 2.8015673103834313, + "grad_norm": 0.25530231186014696, + "learning_rate": 1.1947245726291523e-06, + "loss": 0.418, + "step": 10010 + }, + { + "epoch": 2.8018471872376156, + "grad_norm": 0.25835305322672464, + "learning_rate": 1.1913749410028873e-06, + "loss": 0.4524, + "step": 10011 + }, + { + "epoch": 2.8021270640917995, + "grad_norm": 0.25387526679496275, + "learning_rate": 1.188029955008868e-06, + "loss": 0.4566, + "step": 10012 + }, + { + "epoch": 2.8024069409459837, + "grad_norm": 0.25688080755971887, + "learning_rate": 1.1846896149654785e-06, + "loss": 0.4442, + "step": 10013 + }, + { + "epoch": 2.802686817800168, + "grad_norm": 0.2509635911430038, + "learning_rate": 1.1813539211906587e-06, + "loss": 0.4494, + "step": 10014 + }, + { + "epoch": 2.8029666946543523, + "grad_norm": 0.2552682125098111, + "learning_rate": 1.1780228740018994e-06, + "loss": 0.4557, + "step": 10015 + }, + { + "epoch": 2.803246571508536, + "grad_norm": 0.26217435155468116, + "learning_rate": 1.174696473716247e-06, + "loss": 0.4307, + "step": 10016 + }, + { + "epoch": 2.8035264483627205, + "grad_norm": 0.2523713093152466, + "learning_rate": 1.1713747206503145e-06, + "loss": 0.4498, + "step": 10017 + }, + { + "epoch": 2.8038063252169048, + "grad_norm": 0.25678219498147636, + "learning_rate": 1.1680576151202717e-06, + "loss": 0.4608, + "step": 10018 + }, + { + "epoch": 2.8040862020710886, + "grad_norm": 0.2531984694325335, + "learning_rate": 1.1647451574418321e-06, + "loss": 0.4462, + "step": 10019 + }, + { + "epoch": 2.804366078925273, + "grad_norm": 0.2519919251645383, + "learning_rate": 1.1614373479302832e-06, + "loss": 0.4429, + "step": 10020 + }, + { + "epoch": 2.804645955779457, + "grad_norm": 0.2537678668261987, + "learning_rate": 1.1581341869004613e-06, + "loss": 0.4659, + "step": 10021 + }, + { + "epoch": 2.804925832633641, + "grad_norm": 0.25675874919280917, + "learning_rate": 1.1548356746667655e-06, + "loss": 0.435, + "step": 10022 + }, + { + "epoch": 2.8052057094878253, + "grad_norm": 0.2579020260027916, + "learning_rate": 1.1515418115431553e-06, + "loss": 0.4521, + "step": 10023 + }, + { + "epoch": 2.8054855863420096, + "grad_norm": 0.2618712561245846, + "learning_rate": 1.1482525978431357e-06, + "loss": 0.4553, + "step": 10024 + }, + { + "epoch": 2.8057654631961935, + "grad_norm": 0.26263634255425594, + "learning_rate": 1.1449680338797786e-06, + "loss": 0.4383, + "step": 10025 + }, + { + "epoch": 2.8060453400503778, + "grad_norm": 0.25277594804249975, + "learning_rate": 1.1416881199657059e-06, + "loss": 0.4436, + "step": 10026 + }, + { + "epoch": 2.806325216904562, + "grad_norm": 0.24431848116031987, + "learning_rate": 1.1384128564131069e-06, + "loss": 0.4566, + "step": 10027 + }, + { + "epoch": 2.806605093758746, + "grad_norm": 0.2609154288450035, + "learning_rate": 1.1351422435337211e-06, + "loss": 0.4437, + "step": 10028 + }, + { + "epoch": 2.80688497061293, + "grad_norm": 0.25695753681581046, + "learning_rate": 1.131876281638844e-06, + "loss": 0.432, + "step": 10029 + }, + { + "epoch": 2.8071648474671145, + "grad_norm": 0.2538585151636701, + "learning_rate": 1.1286149710393323e-06, + "loss": 0.4597, + "step": 10030 + }, + { + "epoch": 2.8074447243212988, + "grad_norm": 0.23442293755687932, + "learning_rate": 1.1253583120456102e-06, + "loss": 0.4393, + "step": 10031 + }, + { + "epoch": 2.807724601175483, + "grad_norm": 0.24571792553483443, + "learning_rate": 1.122106304967635e-06, + "loss": 0.4572, + "step": 10032 + }, + { + "epoch": 2.808004478029667, + "grad_norm": 0.24975815194917134, + "learning_rate": 1.1188589501149427e-06, + "loss": 0.4537, + "step": 10033 + }, + { + "epoch": 2.808284354883851, + "grad_norm": 0.2522156074258436, + "learning_rate": 1.1156162477966137e-06, + "loss": 0.4228, + "step": 10034 + }, + { + "epoch": 2.8085642317380355, + "grad_norm": 0.25234913236963324, + "learning_rate": 1.1123781983212955e-06, + "loss": 0.4632, + "step": 10035 + }, + { + "epoch": 2.8088441085922193, + "grad_norm": 0.2617953931586556, + "learning_rate": 1.1091448019971806e-06, + "loss": 0.4499, + "step": 10036 + }, + { + "epoch": 2.8091239854464036, + "grad_norm": 0.268164279012252, + "learning_rate": 1.1059160591320338e-06, + "loss": 0.4774, + "step": 10037 + }, + { + "epoch": 2.809403862300588, + "grad_norm": 0.2532397829516345, + "learning_rate": 1.1026919700331596e-06, + "loss": 0.4791, + "step": 10038 + }, + { + "epoch": 2.8096837391547718, + "grad_norm": 0.25454581027538686, + "learning_rate": 1.0994725350074342e-06, + "loss": 0.4517, + "step": 10039 + }, + { + "epoch": 2.809963616008956, + "grad_norm": 0.2641984689771092, + "learning_rate": 1.0962577543612795e-06, + "loss": 0.4472, + "step": 10040 + }, + { + "epoch": 2.8102434928631403, + "grad_norm": 0.36658894526412195, + "learning_rate": 1.0930476284006897e-06, + "loss": 0.4572, + "step": 10041 + }, + { + "epoch": 2.810523369717324, + "grad_norm": 0.25492350997962254, + "learning_rate": 1.0898421574312034e-06, + "loss": 0.4352, + "step": 10042 + }, + { + "epoch": 2.8108032465715085, + "grad_norm": 0.2533002255898336, + "learning_rate": 1.0866413417579103e-06, + "loss": 0.4584, + "step": 10043 + }, + { + "epoch": 2.8110831234256928, + "grad_norm": 0.25750211312832716, + "learning_rate": 1.0834451816854775e-06, + "loss": 0.444, + "step": 10044 + }, + { + "epoch": 2.8113630002798766, + "grad_norm": 0.25995239658813213, + "learning_rate": 1.080253677518106e-06, + "loss": 0.4479, + "step": 10045 + }, + { + "epoch": 2.811642877134061, + "grad_norm": 0.25958867059504953, + "learning_rate": 1.0770668295595755e-06, + "loss": 0.4433, + "step": 10046 + }, + { + "epoch": 2.811922753988245, + "grad_norm": 0.26457879133391693, + "learning_rate": 1.0738846381131983e-06, + "loss": 0.4362, + "step": 10047 + }, + { + "epoch": 2.8122026308424295, + "grad_norm": 0.2597805161367181, + "learning_rate": 1.0707071034818773e-06, + "loss": 0.4655, + "step": 10048 + }, + { + "epoch": 2.8124825076966133, + "grad_norm": 0.2573105787220753, + "learning_rate": 1.0675342259680366e-06, + "loss": 0.4514, + "step": 10049 + }, + { + "epoch": 2.8127623845507976, + "grad_norm": 0.2626867445065791, + "learning_rate": 1.0643660058736793e-06, + "loss": 0.4468, + "step": 10050 + }, + { + "epoch": 2.813042261404982, + "grad_norm": 0.25000800106284804, + "learning_rate": 1.0612024435003531e-06, + "loss": 0.4472, + "step": 10051 + }, + { + "epoch": 2.813322138259166, + "grad_norm": 0.2588721745046232, + "learning_rate": 1.0580435391491728e-06, + "loss": 0.4349, + "step": 10052 + }, + { + "epoch": 2.81360201511335, + "grad_norm": 0.25125222756847393, + "learning_rate": 1.0548892931208033e-06, + "loss": 0.4439, + "step": 10053 + }, + { + "epoch": 2.8138818919675344, + "grad_norm": 0.24978774064115405, + "learning_rate": 1.0517397057154654e-06, + "loss": 0.4489, + "step": 10054 + }, + { + "epoch": 2.8141617688217186, + "grad_norm": 0.25557688299734493, + "learning_rate": 1.048594777232942e-06, + "loss": 0.4524, + "step": 10055 + }, + { + "epoch": 2.8144416456759025, + "grad_norm": 0.2553222781900778, + "learning_rate": 1.0454545079725654e-06, + "loss": 0.442, + "step": 10056 + }, + { + "epoch": 2.814721522530087, + "grad_norm": 0.2625996339854685, + "learning_rate": 1.04231889823323e-06, + "loss": 0.4616, + "step": 10057 + }, + { + "epoch": 2.815001399384271, + "grad_norm": 0.26536237523411943, + "learning_rate": 1.0391879483133804e-06, + "loss": 0.4553, + "step": 10058 + }, + { + "epoch": 2.815281276238455, + "grad_norm": 0.2518798493314616, + "learning_rate": 1.0360616585110338e-06, + "loss": 0.444, + "step": 10059 + }, + { + "epoch": 2.815561153092639, + "grad_norm": 0.2674145425133259, + "learning_rate": 1.0329400291237413e-06, + "loss": 0.4639, + "step": 10060 + }, + { + "epoch": 2.8158410299468235, + "grad_norm": 0.25435428497609625, + "learning_rate": 1.0298230604486258e-06, + "loss": 0.449, + "step": 10061 + }, + { + "epoch": 2.8161209068010074, + "grad_norm": 0.2590284413753409, + "learning_rate": 1.0267107527823617e-06, + "loss": 0.4425, + "step": 10062 + }, + { + "epoch": 2.8164007836551916, + "grad_norm": 0.25201643767313303, + "learning_rate": 1.023603106421178e-06, + "loss": 0.4552, + "step": 10063 + }, + { + "epoch": 2.816680660509376, + "grad_norm": 0.26165099284783544, + "learning_rate": 1.0205001216608612e-06, + "loss": 0.4538, + "step": 10064 + }, + { + "epoch": 2.81696053736356, + "grad_norm": 0.27776039496702404, + "learning_rate": 1.0174017987967577e-06, + "loss": 0.4715, + "step": 10065 + }, + { + "epoch": 2.817240414217744, + "grad_norm": 0.25777329208975347, + "learning_rate": 1.0143081381237706e-06, + "loss": 0.4371, + "step": 10066 + }, + { + "epoch": 2.8175202910719284, + "grad_norm": 0.2690546753062122, + "learning_rate": 1.0112191399363534e-06, + "loss": 0.4494, + "step": 10067 + }, + { + "epoch": 2.8178001679261127, + "grad_norm": 0.25587850210427726, + "learning_rate": 1.008134804528521e-06, + "loss": 0.4486, + "step": 10068 + }, + { + "epoch": 2.818080044780297, + "grad_norm": 0.25385824906574944, + "learning_rate": 1.0050551321938384e-06, + "loss": 0.4499, + "step": 10069 + }, + { + "epoch": 2.818359921634481, + "grad_norm": 0.2529017673823629, + "learning_rate": 1.0019801232254322e-06, + "loss": 0.4263, + "step": 10070 + }, + { + "epoch": 2.818639798488665, + "grad_norm": 0.2634481033538776, + "learning_rate": 9.989097779159796e-07, + "loss": 0.4462, + "step": 10071 + }, + { + "epoch": 2.8189196753428494, + "grad_norm": 0.2487106793995691, + "learning_rate": 9.958440965577243e-07, + "loss": 0.4597, + "step": 10072 + }, + { + "epoch": 2.819199552197033, + "grad_norm": 0.2479782160319531, + "learning_rate": 9.927830794424553e-07, + "loss": 0.4496, + "step": 10073 + }, + { + "epoch": 2.8194794290512175, + "grad_norm": 0.2533961878193845, + "learning_rate": 9.897267268615284e-07, + "loss": 0.4307, + "step": 10074 + }, + { + "epoch": 2.819759305905402, + "grad_norm": 0.2466751349058953, + "learning_rate": 9.866750391058389e-07, + "loss": 0.4368, + "step": 10075 + }, + { + "epoch": 2.8200391827595856, + "grad_norm": 0.24374236508360309, + "learning_rate": 9.83628016465854e-07, + "loss": 0.4352, + "step": 10076 + }, + { + "epoch": 2.82031905961377, + "grad_norm": 0.2543213559113662, + "learning_rate": 9.80585659231592e-07, + "loss": 0.4259, + "step": 10077 + }, + { + "epoch": 2.8205989364679542, + "grad_norm": 0.25099459130558754, + "learning_rate": 9.775479676926269e-07, + "loss": 0.4377, + "step": 10078 + }, + { + "epoch": 2.820878813322138, + "grad_norm": 0.2572368803838283, + "learning_rate": 9.745149421380828e-07, + "loss": 0.4492, + "step": 10079 + }, + { + "epoch": 2.8211586901763224, + "grad_norm": 0.2601264172895387, + "learning_rate": 9.714865828566455e-07, + "loss": 0.4614, + "step": 10080 + }, + { + "epoch": 2.8214385670305067, + "grad_norm": 0.24809535239113745, + "learning_rate": 9.684628901365567e-07, + "loss": 0.4686, + "step": 10081 + }, + { + "epoch": 2.8217184438846905, + "grad_norm": 0.2583757231300886, + "learning_rate": 9.654438642656083e-07, + "loss": 0.4374, + "step": 10082 + }, + { + "epoch": 2.821998320738875, + "grad_norm": 0.25979970122250534, + "learning_rate": 9.624295055311704e-07, + "loss": 0.4878, + "step": 10083 + }, + { + "epoch": 2.822278197593059, + "grad_norm": 0.2627535858238464, + "learning_rate": 9.594198142201305e-07, + "loss": 0.4486, + "step": 10084 + }, + { + "epoch": 2.822558074447243, + "grad_norm": 0.24754921991327522, + "learning_rate": 9.564147906189703e-07, + "loss": 0.4519, + "step": 10085 + }, + { + "epoch": 2.8228379513014272, + "grad_norm": 0.27070248197086716, + "learning_rate": 9.534144350136942e-07, + "loss": 0.4693, + "step": 10086 + }, + { + "epoch": 2.8231178281556115, + "grad_norm": 0.24472943267326952, + "learning_rate": 9.504187476898907e-07, + "loss": 0.4334, + "step": 10087 + }, + { + "epoch": 2.823397705009796, + "grad_norm": 0.24744547999958846, + "learning_rate": 9.474277289326817e-07, + "loss": 0.4531, + "step": 10088 + }, + { + "epoch": 2.82367758186398, + "grad_norm": 0.25962567214023186, + "learning_rate": 9.444413790267558e-07, + "loss": 0.4222, + "step": 10089 + }, + { + "epoch": 2.823957458718164, + "grad_norm": 0.25505431645769044, + "learning_rate": 9.41459698256364e-07, + "loss": 0.456, + "step": 10090 + }, + { + "epoch": 2.8242373355723482, + "grad_norm": 0.26849471916892415, + "learning_rate": 9.384826869052898e-07, + "loss": 0.4498, + "step": 10091 + }, + { + "epoch": 2.8245172124265325, + "grad_norm": 0.2554869188266186, + "learning_rate": 9.355103452568958e-07, + "loss": 0.4382, + "step": 10092 + }, + { + "epoch": 2.8247970892807164, + "grad_norm": 0.253352558640311, + "learning_rate": 9.325426735940946e-07, + "loss": 0.4439, + "step": 10093 + }, + { + "epoch": 2.8250769661349007, + "grad_norm": 0.24457205420612252, + "learning_rate": 9.295796721993433e-07, + "loss": 0.4317, + "step": 10094 + }, + { + "epoch": 2.825356842989085, + "grad_norm": 0.2611003726754562, + "learning_rate": 9.266213413546609e-07, + "loss": 0.4707, + "step": 10095 + }, + { + "epoch": 2.825636719843269, + "grad_norm": 0.2572010642600289, + "learning_rate": 9.236676813416334e-07, + "loss": 0.4458, + "step": 10096 + }, + { + "epoch": 2.825916596697453, + "grad_norm": 0.23567654616912856, + "learning_rate": 9.207186924413858e-07, + "loss": 0.4161, + "step": 10097 + }, + { + "epoch": 2.8261964735516374, + "grad_norm": 0.2524806826636959, + "learning_rate": 9.17774374934599e-07, + "loss": 0.4612, + "step": 10098 + }, + { + "epoch": 2.8264763504058212, + "grad_norm": 0.2583721915497328, + "learning_rate": 9.148347291015158e-07, + "loss": 0.4534, + "step": 10099 + }, + { + "epoch": 2.8267562272600055, + "grad_norm": 0.2443844209188402, + "learning_rate": 9.1189975522194e-07, + "loss": 0.4351, + "step": 10100 + }, + { + "epoch": 2.82703610411419, + "grad_norm": 0.2592075075953771, + "learning_rate": 9.089694535752257e-07, + "loss": 0.4483, + "step": 10101 + }, + { + "epoch": 2.8273159809683737, + "grad_norm": 0.2627400968198566, + "learning_rate": 9.060438244402725e-07, + "loss": 0.4623, + "step": 10102 + }, + { + "epoch": 2.827595857822558, + "grad_norm": 0.24670093257172748, + "learning_rate": 9.031228680955517e-07, + "loss": 0.4376, + "step": 10103 + }, + { + "epoch": 2.8278757346767422, + "grad_norm": 0.25509140273022224, + "learning_rate": 9.002065848190744e-07, + "loss": 0.4488, + "step": 10104 + }, + { + "epoch": 2.8281556115309265, + "grad_norm": 0.2513643430278304, + "learning_rate": 8.97294974888413e-07, + "loss": 0.4434, + "step": 10105 + }, + { + "epoch": 2.8284354883851104, + "grad_norm": 0.25221197470587714, + "learning_rate": 8.943880385807068e-07, + "loss": 0.4452, + "step": 10106 + }, + { + "epoch": 2.8287153652392947, + "grad_norm": 0.25070011355054483, + "learning_rate": 8.914857761726292e-07, + "loss": 0.434, + "step": 10107 + }, + { + "epoch": 2.828995242093479, + "grad_norm": 0.2582137023259711, + "learning_rate": 8.885881879404201e-07, + "loss": 0.4528, + "step": 10108 + }, + { + "epoch": 2.8292751189476633, + "grad_norm": 0.2592773347128377, + "learning_rate": 8.856952741598756e-07, + "loss": 0.4697, + "step": 10109 + }, + { + "epoch": 2.829554995801847, + "grad_norm": 0.2629416907627084, + "learning_rate": 8.828070351063533e-07, + "loss": 0.4501, + "step": 10110 + }, + { + "epoch": 2.8298348726560314, + "grad_norm": 0.2631384755840109, + "learning_rate": 8.799234710547444e-07, + "loss": 0.45, + "step": 10111 + }, + { + "epoch": 2.8301147495102157, + "grad_norm": 0.24555445924895378, + "learning_rate": 8.770445822795126e-07, + "loss": 0.4541, + "step": 10112 + }, + { + "epoch": 2.8303946263643995, + "grad_norm": 0.25707171887557456, + "learning_rate": 8.741703690546721e-07, + "loss": 0.4402, + "step": 10113 + }, + { + "epoch": 2.830674503218584, + "grad_norm": 0.2556283650182902, + "learning_rate": 8.713008316537929e-07, + "loss": 0.4572, + "step": 10114 + }, + { + "epoch": 2.830954380072768, + "grad_norm": 0.2600383651816295, + "learning_rate": 8.684359703500067e-07, + "loss": 0.4621, + "step": 10115 + }, + { + "epoch": 2.831234256926952, + "grad_norm": 0.267705736677189, + "learning_rate": 8.65575785415973e-07, + "loss": 0.4598, + "step": 10116 + }, + { + "epoch": 2.8315141337811363, + "grad_norm": 0.24332224527834756, + "learning_rate": 8.627202771239462e-07, + "loss": 0.4593, + "step": 10117 + }, + { + "epoch": 2.8317940106353205, + "grad_norm": 0.2585644170136799, + "learning_rate": 8.598694457457035e-07, + "loss": 0.4578, + "step": 10118 + }, + { + "epoch": 2.8320738874895044, + "grad_norm": 0.2528586860303206, + "learning_rate": 8.570232915525888e-07, + "loss": 0.4631, + "step": 10119 + }, + { + "epoch": 2.8323537643436887, + "grad_norm": 0.26658090330048756, + "learning_rate": 8.541818148155079e-07, + "loss": 0.4456, + "step": 10120 + }, + { + "epoch": 2.832633641197873, + "grad_norm": 0.25805663437445653, + "learning_rate": 8.513450158049108e-07, + "loss": 0.4587, + "step": 10121 + }, + { + "epoch": 2.832913518052057, + "grad_norm": 0.26390238260937576, + "learning_rate": 8.485128947908039e-07, + "loss": 0.4603, + "step": 10122 + }, + { + "epoch": 2.833193394906241, + "grad_norm": 0.2628755722632489, + "learning_rate": 8.456854520427493e-07, + "loss": 0.437, + "step": 10123 + }, + { + "epoch": 2.8334732717604254, + "grad_norm": 0.26083288223354306, + "learning_rate": 8.428626878298707e-07, + "loss": 0.4331, + "step": 10124 + }, + { + "epoch": 2.8337531486146097, + "grad_norm": 0.25799908741638145, + "learning_rate": 8.400446024208309e-07, + "loss": 0.4564, + "step": 10125 + }, + { + "epoch": 2.834033025468794, + "grad_norm": 0.24081806697538238, + "learning_rate": 8.372311960838652e-07, + "loss": 0.4382, + "step": 10126 + }, + { + "epoch": 2.834312902322978, + "grad_norm": 0.2574413268314077, + "learning_rate": 8.344224690867485e-07, + "loss": 0.4434, + "step": 10127 + }, + { + "epoch": 2.834592779177162, + "grad_norm": 0.29761285533028925, + "learning_rate": 8.316184216968226e-07, + "loss": 0.453, + "step": 10128 + }, + { + "epoch": 2.8348726560313464, + "grad_norm": 0.26541255095918426, + "learning_rate": 8.288190541809738e-07, + "loss": 0.446, + "step": 10129 + }, + { + "epoch": 2.8351525328855303, + "grad_norm": 0.2551603304358025, + "learning_rate": 8.260243668056555e-07, + "loss": 0.4409, + "step": 10130 + }, + { + "epoch": 2.8354324097397146, + "grad_norm": 0.2681181178604902, + "learning_rate": 8.232343598368552e-07, + "loss": 0.454, + "step": 10131 + }, + { + "epoch": 2.835712286593899, + "grad_norm": 0.267916569740636, + "learning_rate": 8.204490335401382e-07, + "loss": 0.4453, + "step": 10132 + }, + { + "epoch": 2.8359921634480827, + "grad_norm": 0.25766534832339966, + "learning_rate": 8.17668388180609e-07, + "loss": 0.444, + "step": 10133 + }, + { + "epoch": 2.836272040302267, + "grad_norm": 0.25295408765533794, + "learning_rate": 8.148924240229283e-07, + "loss": 0.4301, + "step": 10134 + }, + { + "epoch": 2.8365519171564513, + "grad_norm": 0.2590464448281375, + "learning_rate": 8.121211413313178e-07, + "loss": 0.4527, + "step": 10135 + }, + { + "epoch": 2.836831794010635, + "grad_norm": 0.27162404825376435, + "learning_rate": 8.093545403695502e-07, + "loss": 0.4445, + "step": 10136 + }, + { + "epoch": 2.8371116708648194, + "grad_norm": 0.26622854836678916, + "learning_rate": 8.065926214009479e-07, + "loss": 0.4467, + "step": 10137 + }, + { + "epoch": 2.8373915477190037, + "grad_norm": 0.2540515828910898, + "learning_rate": 8.038353846884006e-07, + "loss": 0.4431, + "step": 10138 + }, + { + "epoch": 2.8376714245731876, + "grad_norm": 0.25453815423627246, + "learning_rate": 8.010828304943318e-07, + "loss": 0.4441, + "step": 10139 + }, + { + "epoch": 2.837951301427372, + "grad_norm": 0.2685794575098386, + "learning_rate": 7.983349590807376e-07, + "loss": 0.4476, + "step": 10140 + }, + { + "epoch": 2.838231178281556, + "grad_norm": 0.26090741048944566, + "learning_rate": 7.955917707091642e-07, + "loss": 0.444, + "step": 10141 + }, + { + "epoch": 2.8385110551357404, + "grad_norm": 0.24961955398553431, + "learning_rate": 7.928532656407029e-07, + "loss": 0.4578, + "step": 10142 + }, + { + "epoch": 2.8387909319899243, + "grad_norm": 0.261480653970001, + "learning_rate": 7.901194441360116e-07, + "loss": 0.4365, + "step": 10143 + }, + { + "epoch": 2.8390708088441086, + "grad_norm": 0.26065977212184543, + "learning_rate": 7.87390306455299e-07, + "loss": 0.4538, + "step": 10144 + }, + { + "epoch": 2.839350685698293, + "grad_norm": 0.24750876625727614, + "learning_rate": 7.846658528583184e-07, + "loss": 0.4522, + "step": 10145 + }, + { + "epoch": 2.839630562552477, + "grad_norm": 0.2553102416355813, + "learning_rate": 7.819460836043957e-07, + "loss": 0.4559, + "step": 10146 + }, + { + "epoch": 2.839910439406661, + "grad_norm": 0.24769185971067034, + "learning_rate": 7.79230998952385e-07, + "loss": 0.4552, + "step": 10147 + }, + { + "epoch": 2.8401903162608453, + "grad_norm": 0.2672748874110806, + "learning_rate": 7.765205991607238e-07, + "loss": 0.4659, + "step": 10148 + }, + { + "epoch": 2.8404701931150296, + "grad_norm": 0.2604953786766096, + "learning_rate": 7.738148844873783e-07, + "loss": 0.4744, + "step": 10149 + }, + { + "epoch": 2.8407500699692134, + "grad_norm": 0.2583480179782524, + "learning_rate": 7.711138551898867e-07, + "loss": 0.4479, + "step": 10150 + }, + { + "epoch": 2.8410299468233977, + "grad_norm": 0.2546325371505603, + "learning_rate": 7.684175115253378e-07, + "loss": 0.439, + "step": 10151 + }, + { + "epoch": 2.841309823677582, + "grad_norm": 0.26613969797264936, + "learning_rate": 7.657258537503598e-07, + "loss": 0.461, + "step": 10152 + }, + { + "epoch": 2.841589700531766, + "grad_norm": 0.24737645835701677, + "learning_rate": 7.630388821211587e-07, + "loss": 0.4282, + "step": 10153 + }, + { + "epoch": 2.84186957738595, + "grad_norm": 0.2624667092229066, + "learning_rate": 7.603565968934689e-07, + "loss": 0.4708, + "step": 10154 + }, + { + "epoch": 2.8421494542401344, + "grad_norm": 0.2420140631051068, + "learning_rate": 7.576789983226029e-07, + "loss": 0.4299, + "step": 10155 + }, + { + "epoch": 2.8424293310943183, + "grad_norm": 0.25496191355468883, + "learning_rate": 7.550060866634123e-07, + "loss": 0.4377, + "step": 10156 + }, + { + "epoch": 2.8427092079485026, + "grad_norm": 0.24663098820574658, + "learning_rate": 7.523378621703048e-07, + "loss": 0.4334, + "step": 10157 + }, + { + "epoch": 2.842989084802687, + "grad_norm": 0.25869772024576165, + "learning_rate": 7.496743250972494e-07, + "loss": 0.4677, + "step": 10158 + }, + { + "epoch": 2.8432689616568707, + "grad_norm": 0.259342332895863, + "learning_rate": 7.470154756977543e-07, + "loss": 0.4556, + "step": 10159 + }, + { + "epoch": 2.843548838511055, + "grad_norm": 0.25720323808780676, + "learning_rate": 7.443613142248951e-07, + "loss": 0.447, + "step": 10160 + }, + { + "epoch": 2.8438287153652393, + "grad_norm": 0.26139628238782625, + "learning_rate": 7.417118409312918e-07, + "loss": 0.4468, + "step": 10161 + }, + { + "epoch": 2.8441085922194236, + "grad_norm": 0.2630166216465488, + "learning_rate": 7.390670560691315e-07, + "loss": 0.455, + "step": 10162 + }, + { + "epoch": 2.844388469073608, + "grad_norm": 0.2659934121702161, + "learning_rate": 7.364269598901408e-07, + "loss": 0.4334, + "step": 10163 + }, + { + "epoch": 2.8446683459277917, + "grad_norm": 0.25686988035527164, + "learning_rate": 7.337915526456019e-07, + "loss": 0.4514, + "step": 10164 + }, + { + "epoch": 2.844948222781976, + "grad_norm": 0.251348655283329, + "learning_rate": 7.311608345863641e-07, + "loss": 0.4539, + "step": 10165 + }, + { + "epoch": 2.8452280996361603, + "grad_norm": 0.25654330476923476, + "learning_rate": 7.285348059628105e-07, + "loss": 0.4593, + "step": 10166 + }, + { + "epoch": 2.845507976490344, + "grad_norm": 0.24776546205526637, + "learning_rate": 7.259134670248968e-07, + "loss": 0.4672, + "step": 10167 + }, + { + "epoch": 2.8457878533445284, + "grad_norm": 0.2556660013031803, + "learning_rate": 7.232968180221122e-07, + "loss": 0.4325, + "step": 10168 + }, + { + "epoch": 2.8460677301987127, + "grad_norm": 0.2448147767428879, + "learning_rate": 7.206848592035242e-07, + "loss": 0.4377, + "step": 10169 + }, + { + "epoch": 2.8463476070528966, + "grad_norm": 0.25705771035811886, + "learning_rate": 7.180775908177339e-07, + "loss": 0.443, + "step": 10170 + }, + { + "epoch": 2.846627483907081, + "grad_norm": 0.24945762219879117, + "learning_rate": 7.154750131128984e-07, + "loss": 0.4569, + "step": 10171 + }, + { + "epoch": 2.846907360761265, + "grad_norm": 0.2540273156999752, + "learning_rate": 7.128771263367418e-07, + "loss": 0.4303, + "step": 10172 + }, + { + "epoch": 2.847187237615449, + "grad_norm": 0.2580889678828135, + "learning_rate": 7.102839307365272e-07, + "loss": 0.4505, + "step": 10173 + }, + { + "epoch": 2.8474671144696333, + "grad_norm": 0.2603424115721654, + "learning_rate": 7.076954265590741e-07, + "loss": 0.467, + "step": 10174 + }, + { + "epoch": 2.8477469913238176, + "grad_norm": 0.2530692836062023, + "learning_rate": 7.05111614050763e-07, + "loss": 0.4487, + "step": 10175 + }, + { + "epoch": 2.8480268681780014, + "grad_norm": 0.2559129157286122, + "learning_rate": 7.025324934575139e-07, + "loss": 0.4416, + "step": 10176 + }, + { + "epoch": 2.8483067450321857, + "grad_norm": 0.24773205322018566, + "learning_rate": 6.999580650248194e-07, + "loss": 0.4329, + "step": 10177 + }, + { + "epoch": 2.84858662188637, + "grad_norm": 0.25521126431205277, + "learning_rate": 6.973883289977112e-07, + "loss": 0.4543, + "step": 10178 + }, + { + "epoch": 2.8488664987405543, + "grad_norm": 0.2595084443543985, + "learning_rate": 6.948232856207771e-07, + "loss": 0.4529, + "step": 10179 + }, + { + "epoch": 2.849146375594738, + "grad_norm": 0.2606741603953674, + "learning_rate": 6.922629351381604e-07, + "loss": 0.4353, + "step": 10180 + }, + { + "epoch": 2.8494262524489224, + "grad_norm": 0.2557838497999989, + "learning_rate": 6.897072777935609e-07, + "loss": 0.4424, + "step": 10181 + }, + { + "epoch": 2.8497061293031067, + "grad_norm": 0.24571771064985215, + "learning_rate": 6.871563138302173e-07, + "loss": 0.4505, + "step": 10182 + }, + { + "epoch": 2.849986006157291, + "grad_norm": 0.26667683305745005, + "learning_rate": 6.846100434909353e-07, + "loss": 0.4542, + "step": 10183 + }, + { + "epoch": 2.850265883011475, + "grad_norm": 0.2600320263030447, + "learning_rate": 6.820684670180766e-07, + "loss": 0.435, + "step": 10184 + }, + { + "epoch": 2.850545759865659, + "grad_norm": 0.2537212753909362, + "learning_rate": 6.795315846535422e-07, + "loss": 0.4445, + "step": 10185 + }, + { + "epoch": 2.8508256367198435, + "grad_norm": 0.26412510291968333, + "learning_rate": 6.769993966387999e-07, + "loss": 0.4496, + "step": 10186 + }, + { + "epoch": 2.8511055135740273, + "grad_norm": 0.25422280946962045, + "learning_rate": 6.744719032148627e-07, + "loss": 0.45, + "step": 10187 + }, + { + "epoch": 2.8513853904282116, + "grad_norm": 0.2533516046710153, + "learning_rate": 6.719491046222992e-07, + "loss": 0.428, + "step": 10188 + }, + { + "epoch": 2.851665267282396, + "grad_norm": 0.24407088865412283, + "learning_rate": 6.694310011012284e-07, + "loss": 0.4572, + "step": 10189 + }, + { + "epoch": 2.8519451441365797, + "grad_norm": 0.24943438860892966, + "learning_rate": 6.669175928913251e-07, + "loss": 0.4165, + "step": 10190 + }, + { + "epoch": 2.852225020990764, + "grad_norm": 0.25713363585411575, + "learning_rate": 6.644088802318205e-07, + "loss": 0.4453, + "step": 10191 + }, + { + "epoch": 2.8525048978449483, + "grad_norm": 0.26567928250076694, + "learning_rate": 6.619048633614955e-07, + "loss": 0.4505, + "step": 10192 + }, + { + "epoch": 2.852784774699132, + "grad_norm": 0.25818673891369537, + "learning_rate": 6.594055425186763e-07, + "loss": 0.449, + "step": 10193 + }, + { + "epoch": 2.8530646515533165, + "grad_norm": 0.2540198476376862, + "learning_rate": 6.569109179412558e-07, + "loss": 0.4309, + "step": 10194 + }, + { + "epoch": 2.8533445284075007, + "grad_norm": 0.2545216361991795, + "learning_rate": 6.54420989866672e-07, + "loss": 0.4281, + "step": 10195 + }, + { + "epoch": 2.8536244052616846, + "grad_norm": 0.25968360621510583, + "learning_rate": 6.519357585319242e-07, + "loss": 0.468, + "step": 10196 + }, + { + "epoch": 2.853904282115869, + "grad_norm": 0.24687151281696984, + "learning_rate": 6.494552241735452e-07, + "loss": 0.4264, + "step": 10197 + }, + { + "epoch": 2.854184158970053, + "grad_norm": 0.2583821078491018, + "learning_rate": 6.469793870276464e-07, + "loss": 0.4752, + "step": 10198 + }, + { + "epoch": 2.8544640358242375, + "grad_norm": 0.2459137265381298, + "learning_rate": 6.445082473298669e-07, + "loss": 0.4725, + "step": 10199 + }, + { + "epoch": 2.8547439126784218, + "grad_norm": 0.24742088586054187, + "learning_rate": 6.420418053154243e-07, + "loss": 0.4407, + "step": 10200 + }, + { + "epoch": 2.8550237895326056, + "grad_norm": 0.25517969356297493, + "learning_rate": 6.395800612190639e-07, + "loss": 0.4355, + "step": 10201 + }, + { + "epoch": 2.85530366638679, + "grad_norm": 0.2598944885613344, + "learning_rate": 6.37123015275104e-07, + "loss": 0.4621, + "step": 10202 + }, + { + "epoch": 2.855583543240974, + "grad_norm": 0.25983576281556636, + "learning_rate": 6.346706677174075e-07, + "loss": 0.448, + "step": 10203 + }, + { + "epoch": 2.855863420095158, + "grad_norm": 0.2442368834802085, + "learning_rate": 6.322230187793876e-07, + "loss": 0.4305, + "step": 10204 + }, + { + "epoch": 2.8561432969493423, + "grad_norm": 0.25932837759420585, + "learning_rate": 6.297800686940081e-07, + "loss": 0.4534, + "step": 10205 + }, + { + "epoch": 2.8564231738035266, + "grad_norm": 0.26187189125601396, + "learning_rate": 6.273418176937995e-07, + "loss": 0.4604, + "step": 10206 + }, + { + "epoch": 2.8567030506577105, + "grad_norm": 0.2503901417519331, + "learning_rate": 6.249082660108318e-07, + "loss": 0.4152, + "step": 10207 + }, + { + "epoch": 2.8569829275118948, + "grad_norm": 0.2557843101499479, + "learning_rate": 6.224794138767309e-07, + "loss": 0.4565, + "step": 10208 + }, + { + "epoch": 2.857262804366079, + "grad_norm": 0.25384466886807133, + "learning_rate": 6.200552615226784e-07, + "loss": 0.4452, + "step": 10209 + }, + { + "epoch": 2.857542681220263, + "grad_norm": 0.2548841014459113, + "learning_rate": 6.176358091794011e-07, + "loss": 0.4368, + "step": 10210 + }, + { + "epoch": 2.857822558074447, + "grad_norm": 0.25757124576997104, + "learning_rate": 6.152210570771921e-07, + "loss": 0.4439, + "step": 10211 + }, + { + "epoch": 2.8581024349286315, + "grad_norm": 0.2569331306938241, + "learning_rate": 6.128110054458847e-07, + "loss": 0.4612, + "step": 10212 + }, + { + "epoch": 2.8583823117828153, + "grad_norm": 0.25282275686772765, + "learning_rate": 6.104056545148673e-07, + "loss": 0.4577, + "step": 10213 + }, + { + "epoch": 2.8586621886369996, + "grad_norm": 0.25896247226299324, + "learning_rate": 6.080050045130847e-07, + "loss": 0.4509, + "step": 10214 + }, + { + "epoch": 2.858942065491184, + "grad_norm": 0.2576693397622, + "learning_rate": 6.056090556690319e-07, + "loss": 0.4411, + "step": 10215 + }, + { + "epoch": 2.859221942345368, + "grad_norm": 0.2636597329257613, + "learning_rate": 6.032178082107542e-07, + "loss": 0.4533, + "step": 10216 + }, + { + "epoch": 2.859501819199552, + "grad_norm": 0.24697365713431213, + "learning_rate": 6.008312623658586e-07, + "loss": 0.4389, + "step": 10217 + }, + { + "epoch": 2.8597816960537363, + "grad_norm": 0.2500930445428214, + "learning_rate": 5.984494183614909e-07, + "loss": 0.4566, + "step": 10218 + }, + { + "epoch": 2.8600615729079206, + "grad_norm": 0.26216006480801274, + "learning_rate": 5.960722764243587e-07, + "loss": 0.467, + "step": 10219 + }, + { + "epoch": 2.860341449762105, + "grad_norm": 0.24926684599386656, + "learning_rate": 5.936998367807201e-07, + "loss": 0.4326, + "step": 10220 + }, + { + "epoch": 2.8606213266162888, + "grad_norm": 0.2562319805043352, + "learning_rate": 5.913320996563831e-07, + "loss": 0.4631, + "step": 10221 + }, + { + "epoch": 2.860901203470473, + "grad_norm": 0.2457629153440099, + "learning_rate": 5.889690652767177e-07, + "loss": 0.4501, + "step": 10222 + }, + { + "epoch": 2.8611810803246573, + "grad_norm": 0.26272183713329245, + "learning_rate": 5.866107338666271e-07, + "loss": 0.4568, + "step": 10223 + }, + { + "epoch": 2.861460957178841, + "grad_norm": 0.25666367549779606, + "learning_rate": 5.842571056505875e-07, + "loss": 0.4489, + "step": 10224 + }, + { + "epoch": 2.8617408340330255, + "grad_norm": 0.2705683877873388, + "learning_rate": 5.819081808526139e-07, + "loss": 0.4643, + "step": 10225 + }, + { + "epoch": 2.8620207108872098, + "grad_norm": 0.2543820024194871, + "learning_rate": 5.795639596962832e-07, + "loss": 0.4503, + "step": 10226 + }, + { + "epoch": 2.8623005877413936, + "grad_norm": 0.25693972439495794, + "learning_rate": 5.772244424047169e-07, + "loss": 0.4444, + "step": 10227 + }, + { + "epoch": 2.862580464595578, + "grad_norm": 0.25490621766923677, + "learning_rate": 5.748896292005868e-07, + "loss": 0.4486, + "step": 10228 + }, + { + "epoch": 2.862860341449762, + "grad_norm": 0.26485194854054805, + "learning_rate": 5.725595203061318e-07, + "loss": 0.4659, + "step": 10229 + }, + { + "epoch": 2.863140218303946, + "grad_norm": 0.24588014051314325, + "learning_rate": 5.702341159431246e-07, + "loss": 0.4458, + "step": 10230 + }, + { + "epoch": 2.8634200951581303, + "grad_norm": 0.25924629499595886, + "learning_rate": 5.679134163328992e-07, + "loss": 0.4423, + "step": 10231 + }, + { + "epoch": 2.8636999720123146, + "grad_norm": 0.2553099022686705, + "learning_rate": 5.655974216963456e-07, + "loss": 0.447, + "step": 10232 + }, + { + "epoch": 2.8639798488664985, + "grad_norm": 0.259680346674402, + "learning_rate": 5.632861322538985e-07, + "loss": 0.4425, + "step": 10233 + }, + { + "epoch": 2.8642597257206828, + "grad_norm": 0.26740786597677946, + "learning_rate": 5.609795482255486e-07, + "loss": 0.4423, + "step": 10234 + }, + { + "epoch": 2.864539602574867, + "grad_norm": 0.2680068141821788, + "learning_rate": 5.586776698308372e-07, + "loss": 0.4648, + "step": 10235 + }, + { + "epoch": 2.8648194794290514, + "grad_norm": 0.2563372948009287, + "learning_rate": 5.563804972888609e-07, + "loss": 0.4618, + "step": 10236 + }, + { + "epoch": 2.8650993562832356, + "grad_norm": 0.24611658743221432, + "learning_rate": 5.540880308182617e-07, + "loss": 0.4475, + "step": 10237 + }, + { + "epoch": 2.8653792331374195, + "grad_norm": 0.24907400613042632, + "learning_rate": 5.518002706372372e-07, + "loss": 0.4479, + "step": 10238 + }, + { + "epoch": 2.865659109991604, + "grad_norm": 0.2481189654103723, + "learning_rate": 5.495172169635409e-07, + "loss": 0.4535, + "step": 10239 + }, + { + "epoch": 2.865938986845788, + "grad_norm": 0.24972684085267743, + "learning_rate": 5.472388700144771e-07, + "loss": 0.4486, + "step": 10240 + }, + { + "epoch": 2.866218863699972, + "grad_norm": 0.2493928388889635, + "learning_rate": 5.449652300068997e-07, + "loss": 0.442, + "step": 10241 + }, + { + "epoch": 2.866498740554156, + "grad_norm": 0.25502178405778514, + "learning_rate": 5.426962971572081e-07, + "loss": 0.4567, + "step": 10242 + }, + { + "epoch": 2.8667786174083405, + "grad_norm": 0.25717676914814624, + "learning_rate": 5.404320716813683e-07, + "loss": 0.4245, + "step": 10243 + }, + { + "epoch": 2.8670584942625243, + "grad_norm": 0.26027840464450835, + "learning_rate": 5.381725537948856e-07, + "loss": 0.4528, + "step": 10244 + }, + { + "epoch": 2.8673383711167086, + "grad_norm": 0.258521387544155, + "learning_rate": 5.35917743712827e-07, + "loss": 0.4327, + "step": 10245 + }, + { + "epoch": 2.867618247970893, + "grad_norm": 0.25270403591986484, + "learning_rate": 5.336676416498043e-07, + "loss": 0.4459, + "step": 10246 + }, + { + "epoch": 2.8678981248250768, + "grad_norm": 0.2552451258449106, + "learning_rate": 5.314222478199793e-07, + "loss": 0.4565, + "step": 10247 + }, + { + "epoch": 2.868178001679261, + "grad_norm": 0.25016155667035667, + "learning_rate": 5.291815624370755e-07, + "loss": 0.4381, + "step": 10248 + }, + { + "epoch": 2.8684578785334454, + "grad_norm": 0.25687106582087943, + "learning_rate": 5.269455857143613e-07, + "loss": 0.4437, + "step": 10249 + }, + { + "epoch": 2.868737755387629, + "grad_norm": 0.25635311037631414, + "learning_rate": 5.247143178646552e-07, + "loss": 0.4523, + "step": 10250 + }, + { + "epoch": 2.8690176322418135, + "grad_norm": 0.2604944605147867, + "learning_rate": 5.224877591003374e-07, + "loss": 0.4581, + "step": 10251 + }, + { + "epoch": 2.869297509095998, + "grad_norm": 0.25014607563839375, + "learning_rate": 5.202659096333218e-07, + "loss": 0.4462, + "step": 10252 + }, + { + "epoch": 2.869577385950182, + "grad_norm": 0.26158734638337017, + "learning_rate": 5.180487696750946e-07, + "loss": 0.4418, + "step": 10253 + }, + { + "epoch": 2.869857262804366, + "grad_norm": 0.2628142041741432, + "learning_rate": 5.158363394366816e-07, + "loss": 0.4513, + "step": 10254 + }, + { + "epoch": 2.87013713965855, + "grad_norm": 0.25316528938359184, + "learning_rate": 5.136286191286643e-07, + "loss": 0.4393, + "step": 10255 + }, + { + "epoch": 2.8704170165127345, + "grad_norm": 0.2495107263010776, + "learning_rate": 5.114256089611747e-07, + "loss": 0.445, + "step": 10256 + }, + { + "epoch": 2.870696893366919, + "grad_norm": 0.25869314266387045, + "learning_rate": 5.092273091438948e-07, + "loss": 0.4538, + "step": 10257 + }, + { + "epoch": 2.8709767702211026, + "grad_norm": 0.2554676058510657, + "learning_rate": 5.070337198860631e-07, + "loss": 0.4623, + "step": 10258 + }, + { + "epoch": 2.871256647075287, + "grad_norm": 0.24537075832570931, + "learning_rate": 5.048448413964624e-07, + "loss": 0.4546, + "step": 10259 + }, + { + "epoch": 2.8715365239294712, + "grad_norm": 0.2672389569675875, + "learning_rate": 5.026606738834317e-07, + "loss": 0.4557, + "step": 10260 + }, + { + "epoch": 2.871816400783655, + "grad_norm": 0.24970304197948354, + "learning_rate": 5.004812175548656e-07, + "loss": 0.4586, + "step": 10261 + }, + { + "epoch": 2.8720962776378394, + "grad_norm": 0.26349661025087207, + "learning_rate": 4.983064726181986e-07, + "loss": 0.4649, + "step": 10262 + }, + { + "epoch": 2.8723761544920237, + "grad_norm": 0.24778771885993947, + "learning_rate": 4.961364392804313e-07, + "loss": 0.4433, + "step": 10263 + }, + { + "epoch": 2.8726560313462075, + "grad_norm": 0.26672455734691347, + "learning_rate": 4.939711177481099e-07, + "loss": 0.4429, + "step": 10264 + }, + { + "epoch": 2.872935908200392, + "grad_norm": 0.2548923418425755, + "learning_rate": 4.91810508227325e-07, + "loss": 0.4397, + "step": 10265 + }, + { + "epoch": 2.873215785054576, + "grad_norm": 0.25746339083089437, + "learning_rate": 4.896546109237232e-07, + "loss": 0.4322, + "step": 10266 + }, + { + "epoch": 2.87349566190876, + "grad_norm": 0.2571032628499338, + "learning_rate": 4.875034260425126e-07, + "loss": 0.4665, + "step": 10267 + }, + { + "epoch": 2.8737755387629442, + "grad_norm": 0.2611386216444229, + "learning_rate": 4.853569537884406e-07, + "loss": 0.4496, + "step": 10268 + }, + { + "epoch": 2.8740554156171285, + "grad_norm": 0.2554305595408042, + "learning_rate": 4.832151943658048e-07, + "loss": 0.4374, + "step": 10269 + }, + { + "epoch": 2.8743352924713124, + "grad_norm": 0.26483360105662895, + "learning_rate": 4.810781479784588e-07, + "loss": 0.455, + "step": 10270 + }, + { + "epoch": 2.8746151693254967, + "grad_norm": 0.252441584593939, + "learning_rate": 4.789458148298176e-07, + "loss": 0.4432, + "step": 10271 + }, + { + "epoch": 2.874895046179681, + "grad_norm": 0.2603269128030735, + "learning_rate": 4.768181951228301e-07, + "loss": 0.4651, + "step": 10272 + }, + { + "epoch": 2.8751749230338652, + "grad_norm": 0.24957648626480253, + "learning_rate": 4.746952890600065e-07, + "loss": 0.455, + "step": 10273 + }, + { + "epoch": 2.8754547998880495, + "grad_norm": 0.26391183669801865, + "learning_rate": 4.725770968434018e-07, + "loss": 0.4541, + "step": 10274 + }, + { + "epoch": 2.8757346767422334, + "grad_norm": 0.25716393251231623, + "learning_rate": 4.7046361867463807e-07, + "loss": 0.4608, + "step": 10275 + }, + { + "epoch": 2.8760145535964177, + "grad_norm": 0.26668582501615473, + "learning_rate": 4.683548547548655e-07, + "loss": 0.4512, + "step": 10276 + }, + { + "epoch": 2.876294430450602, + "grad_norm": 0.2455419391449943, + "learning_rate": 4.6625080528480137e-07, + "loss": 0.4527, + "step": 10277 + }, + { + "epoch": 2.876574307304786, + "grad_norm": 0.2570455687678307, + "learning_rate": 4.641514704647132e-07, + "loss": 0.457, + "step": 10278 + }, + { + "epoch": 2.87685418415897, + "grad_norm": 0.24449720654527113, + "learning_rate": 4.620568504944134e-07, + "loss": 0.4511, + "step": 10279 + }, + { + "epoch": 2.8771340610131544, + "grad_norm": 0.24351161087254086, + "learning_rate": 4.5996694557327025e-07, + "loss": 0.439, + "step": 10280 + }, + { + "epoch": 2.8774139378673382, + "grad_norm": 0.26099511112058876, + "learning_rate": 4.5788175590020246e-07, + "loss": 0.469, + "step": 10281 + }, + { + "epoch": 2.8776938147215225, + "grad_norm": 0.2582471683662392, + "learning_rate": 4.5580128167367895e-07, + "loss": 0.4309, + "step": 10282 + }, + { + "epoch": 2.877973691575707, + "grad_norm": 0.24795866638590033, + "learning_rate": 4.5372552309171925e-07, + "loss": 0.4668, + "step": 10283 + }, + { + "epoch": 2.8782535684298907, + "grad_norm": 0.25154649095883064, + "learning_rate": 4.516544803518985e-07, + "loss": 0.4477, + "step": 10284 + }, + { + "epoch": 2.878533445284075, + "grad_norm": 0.2505105586047212, + "learning_rate": 4.495881536513369e-07, + "loss": 0.419, + "step": 10285 + }, + { + "epoch": 2.8788133221382592, + "grad_norm": 0.25352983127315026, + "learning_rate": 4.4752654318670485e-07, + "loss": 0.4531, + "step": 10286 + }, + { + "epoch": 2.879093198992443, + "grad_norm": 0.25948613905647183, + "learning_rate": 4.4546964915423986e-07, + "loss": 0.4618, + "step": 10287 + }, + { + "epoch": 2.8793730758466274, + "grad_norm": 0.2582180835960804, + "learning_rate": 4.434174717497075e-07, + "loss": 0.4542, + "step": 10288 + }, + { + "epoch": 2.8796529527008117, + "grad_norm": 0.257995531526806, + "learning_rate": 4.413700111684349e-07, + "loss": 0.4611, + "step": 10289 + }, + { + "epoch": 2.879932829554996, + "grad_norm": 0.2536050936538131, + "learning_rate": 4.3932726760531064e-07, + "loss": 0.4331, + "step": 10290 + }, + { + "epoch": 2.88021270640918, + "grad_norm": 0.26348346728174327, + "learning_rate": 4.3728924125475137e-07, + "loss": 0.4652, + "step": 10291 + }, + { + "epoch": 2.880492583263364, + "grad_norm": 0.25005631660677363, + "learning_rate": 4.3525593231074633e-07, + "loss": 0.4544, + "step": 10292 + }, + { + "epoch": 2.8807724601175484, + "grad_norm": 0.2555331171203246, + "learning_rate": 4.3322734096682417e-07, + "loss": 0.457, + "step": 10293 + }, + { + "epoch": 2.8810523369717327, + "grad_norm": 0.2545421363229972, + "learning_rate": 4.3120346741606367e-07, + "loss": 0.4686, + "step": 10294 + }, + { + "epoch": 2.8813322138259165, + "grad_norm": 0.2617427853569952, + "learning_rate": 4.2918431185110517e-07, + "loss": 0.4435, + "step": 10295 + }, + { + "epoch": 2.881612090680101, + "grad_norm": 0.2611070268049796, + "learning_rate": 4.271698744641339e-07, + "loss": 0.4822, + "step": 10296 + }, + { + "epoch": 2.881891967534285, + "grad_norm": 0.2651306945435216, + "learning_rate": 4.2516015544687426e-07, + "loss": 0.4603, + "step": 10297 + }, + { + "epoch": 2.882171844388469, + "grad_norm": 0.3124262786349235, + "learning_rate": 4.2315515499062317e-07, + "loss": 0.4459, + "step": 10298 + }, + { + "epoch": 2.8824517212426533, + "grad_norm": 0.2685649925102902, + "learning_rate": 4.211548732862114e-07, + "loss": 0.4392, + "step": 10299 + }, + { + "epoch": 2.8827315980968375, + "grad_norm": 0.2532537109973419, + "learning_rate": 4.19159310524031e-07, + "loss": 0.442, + "step": 10300 + }, + { + "epoch": 2.8830114749510214, + "grad_norm": 0.25293340380592855, + "learning_rate": 4.17168466894019e-07, + "loss": 0.4272, + "step": 10301 + }, + { + "epoch": 2.8832913518052057, + "grad_norm": 0.2626769114687486, + "learning_rate": 4.1518234258566824e-07, + "loss": 0.4514, + "step": 10302 + }, + { + "epoch": 2.88357122865939, + "grad_norm": 0.2548871588665236, + "learning_rate": 4.132009377880108e-07, + "loss": 0.4563, + "step": 10303 + }, + { + "epoch": 2.883851105513574, + "grad_norm": 0.2499079155484336, + "learning_rate": 4.1122425268964593e-07, + "loss": 0.4488, + "step": 10304 + }, + { + "epoch": 2.884130982367758, + "grad_norm": 0.2774741682164875, + "learning_rate": 4.092522874787119e-07, + "loss": 0.4595, + "step": 10305 + }, + { + "epoch": 2.8844108592219424, + "grad_norm": 0.2620330159986628, + "learning_rate": 4.072850423428975e-07, + "loss": 0.4553, + "step": 10306 + }, + { + "epoch": 2.8846907360761262, + "grad_norm": 0.24761164989931458, + "learning_rate": 4.053225174694586e-07, + "loss": 0.4532, + "step": 10307 + }, + { + "epoch": 2.8849706129303105, + "grad_norm": 0.24479380194816994, + "learning_rate": 4.0336471304517897e-07, + "loss": 0.4172, + "step": 10308 + }, + { + "epoch": 2.885250489784495, + "grad_norm": 0.2514815231592224, + "learning_rate": 4.014116292564041e-07, + "loss": 0.4459, + "step": 10309 + }, + { + "epoch": 2.885530366638679, + "grad_norm": 0.2923214184067987, + "learning_rate": 3.9946326628903516e-07, + "loss": 0.4461, + "step": 10310 + }, + { + "epoch": 2.8858102434928634, + "grad_norm": 0.24988649212900807, + "learning_rate": 3.975196243285129e-07, + "loss": 0.4624, + "step": 10311 + }, + { + "epoch": 2.8860901203470473, + "grad_norm": 0.2483774134025153, + "learning_rate": 3.9558070355983357e-07, + "loss": 0.447, + "step": 10312 + }, + { + "epoch": 2.8863699972012316, + "grad_norm": 0.2575682884161871, + "learning_rate": 3.9364650416755525e-07, + "loss": 0.4319, + "step": 10313 + }, + { + "epoch": 2.886649874055416, + "grad_norm": 0.2563758815757559, + "learning_rate": 3.9171702633576945e-07, + "loss": 0.423, + "step": 10314 + }, + { + "epoch": 2.8869297509095997, + "grad_norm": 0.25206218781065665, + "learning_rate": 3.8979227024811826e-07, + "loss": 0.4592, + "step": 10315 + }, + { + "epoch": 2.887209627763784, + "grad_norm": 0.25445140057837873, + "learning_rate": 3.878722360878051e-07, + "loss": 0.4538, + "step": 10316 + }, + { + "epoch": 2.8874895046179683, + "grad_norm": 0.25892850164373754, + "learning_rate": 3.8595692403758376e-07, + "loss": 0.4565, + "step": 10317 + }, + { + "epoch": 2.887769381472152, + "grad_norm": 0.2571744291136916, + "learning_rate": 3.8404633427975846e-07, + "loss": 0.4458, + "step": 10318 + }, + { + "epoch": 2.8880492583263364, + "grad_norm": 0.25961005904007983, + "learning_rate": 3.82140466996167e-07, + "loss": 0.4681, + "step": 10319 + }, + { + "epoch": 2.8883291351805207, + "grad_norm": 0.2626162006435372, + "learning_rate": 3.802393223682199e-07, + "loss": 0.4348, + "step": 10320 + }, + { + "epoch": 2.8886090120347045, + "grad_norm": 0.26403517659005993, + "learning_rate": 3.7834290057686684e-07, + "loss": 0.4463, + "step": 10321 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.2640114446640457, + "learning_rate": 3.764512018026134e-07, + "loss": 0.4491, + "step": 10322 + }, + { + "epoch": 2.889168765743073, + "grad_norm": 0.2567861703073457, + "learning_rate": 3.7456422622551e-07, + "loss": 0.4629, + "step": 10323 + }, + { + "epoch": 2.889448642597257, + "grad_norm": 0.26970481734620666, + "learning_rate": 3.726819740251575e-07, + "loss": 0.4425, + "step": 10324 + }, + { + "epoch": 2.8897285194514413, + "grad_norm": 0.2573265949733196, + "learning_rate": 3.70804445380718e-07, + "loss": 0.4339, + "step": 10325 + }, + { + "epoch": 2.8900083963056256, + "grad_norm": 0.2457024445347563, + "learning_rate": 3.68931640470882e-07, + "loss": 0.4542, + "step": 10326 + }, + { + "epoch": 2.8902882731598094, + "grad_norm": 0.2518158589030567, + "learning_rate": 3.67063559473918e-07, + "loss": 0.4413, + "step": 10327 + }, + { + "epoch": 2.8905681500139937, + "grad_norm": 0.2541691212113756, + "learning_rate": 3.6520020256762267e-07, + "loss": 0.4354, + "step": 10328 + }, + { + "epoch": 2.890848026868178, + "grad_norm": 0.26148669070858493, + "learning_rate": 3.6334156992935406e-07, + "loss": 0.4758, + "step": 10329 + }, + { + "epoch": 2.8911279037223623, + "grad_norm": 0.2578137534929587, + "learning_rate": 3.614876617360152e-07, + "loss": 0.4694, + "step": 10330 + }, + { + "epoch": 2.8914077805765466, + "grad_norm": 0.25462776023270506, + "learning_rate": 3.596384781640705e-07, + "loss": 0.4343, + "step": 10331 + }, + { + "epoch": 2.8916876574307304, + "grad_norm": 0.2574136099033175, + "learning_rate": 3.5779401938951794e-07, + "loss": 0.4517, + "step": 10332 + }, + { + "epoch": 2.8919675342849147, + "grad_norm": 0.2528803426108313, + "learning_rate": 3.559542855879172e-07, + "loss": 0.4427, + "step": 10333 + }, + { + "epoch": 2.892247411139099, + "grad_norm": 0.2428749181134538, + "learning_rate": 3.5411927693437265e-07, + "loss": 0.4381, + "step": 10334 + }, + { + "epoch": 2.892527287993283, + "grad_norm": 0.2563684456973611, + "learning_rate": 3.522889936035445e-07, + "loss": 0.4615, + "step": 10335 + }, + { + "epoch": 2.892807164847467, + "grad_norm": 0.2603322799709365, + "learning_rate": 3.504634357696379e-07, + "loss": 0.469, + "step": 10336 + }, + { + "epoch": 2.8930870417016514, + "grad_norm": 0.24520091679492342, + "learning_rate": 3.4864260360641386e-07, + "loss": 0.4374, + "step": 10337 + }, + { + "epoch": 2.8933669185558353, + "grad_norm": 0.265476476947951, + "learning_rate": 3.468264972871782e-07, + "loss": 0.463, + "step": 10338 + }, + { + "epoch": 2.8936467954100196, + "grad_norm": 0.2716861089064996, + "learning_rate": 3.4501511698478705e-07, + "loss": 0.4538, + "step": 10339 + }, + { + "epoch": 2.893926672264204, + "grad_norm": 0.24986600046345175, + "learning_rate": 3.43208462871647e-07, + "loss": 0.4434, + "step": 10340 + }, + { + "epoch": 2.8942065491183877, + "grad_norm": 0.25661580131767164, + "learning_rate": 3.41406535119726e-07, + "loss": 0.4358, + "step": 10341 + }, + { + "epoch": 2.894486425972572, + "grad_norm": 0.26288992965346053, + "learning_rate": 3.396093339005202e-07, + "loss": 0.4532, + "step": 10342 + }, + { + "epoch": 2.8947663028267563, + "grad_norm": 0.2598594542669984, + "learning_rate": 3.378168593850983e-07, + "loss": 0.4471, + "step": 10343 + }, + { + "epoch": 2.89504617968094, + "grad_norm": 0.2620213552419905, + "learning_rate": 3.3602911174406283e-07, + "loss": 0.4515, + "step": 10344 + }, + { + "epoch": 2.8953260565351244, + "grad_norm": 0.33377074231948806, + "learning_rate": 3.34246091147572e-07, + "loss": 0.4552, + "step": 10345 + }, + { + "epoch": 2.8956059333893087, + "grad_norm": 0.2577495637966244, + "learning_rate": 3.324677977653401e-07, + "loss": 0.4417, + "step": 10346 + }, + { + "epoch": 2.895885810243493, + "grad_norm": 0.275292397605857, + "learning_rate": 3.306942317666206e-07, + "loss": 0.462, + "step": 10347 + }, + { + "epoch": 2.8961656870976773, + "grad_norm": 0.24961360541812772, + "learning_rate": 3.289253933202285e-07, + "loss": 0.4533, + "step": 10348 + }, + { + "epoch": 2.896445563951861, + "grad_norm": 0.2661825316055105, + "learning_rate": 3.2716128259451804e-07, + "loss": 0.4429, + "step": 10349 + }, + { + "epoch": 2.8967254408060454, + "grad_norm": 0.25305679025577094, + "learning_rate": 3.254018997574049e-07, + "loss": 0.45, + "step": 10350 + }, + { + "epoch": 2.8970053176602297, + "grad_norm": 0.25119578479911997, + "learning_rate": 3.236472449763384e-07, + "loss": 0.4816, + "step": 10351 + }, + { + "epoch": 2.8972851945144136, + "grad_norm": 0.25395181283716584, + "learning_rate": 3.218973184183349e-07, + "loss": 0.4181, + "step": 10352 + }, + { + "epoch": 2.897565071368598, + "grad_norm": 0.25487831155720125, + "learning_rate": 3.2015212024995024e-07, + "loss": 0.4494, + "step": 10353 + }, + { + "epoch": 2.897844948222782, + "grad_norm": 0.2599533908619156, + "learning_rate": 3.184116506372903e-07, + "loss": 0.4703, + "step": 10354 + }, + { + "epoch": 2.898124825076966, + "grad_norm": 0.25358339034142985, + "learning_rate": 3.1667590974602256e-07, + "loss": 0.4428, + "step": 10355 + }, + { + "epoch": 2.8984047019311503, + "grad_norm": 0.26625769348057865, + "learning_rate": 3.1494489774134273e-07, + "loss": 0.4752, + "step": 10356 + }, + { + "epoch": 2.8986845787853346, + "grad_norm": 0.2534972553995035, + "learning_rate": 3.132186147880245e-07, + "loss": 0.4547, + "step": 10357 + }, + { + "epoch": 2.8989644556395184, + "grad_norm": 0.24764399053567493, + "learning_rate": 3.114970610503698e-07, + "loss": 0.4588, + "step": 10358 + }, + { + "epoch": 2.8992443324937027, + "grad_norm": 0.2565092323150165, + "learning_rate": 3.0978023669223087e-07, + "loss": 0.4616, + "step": 10359 + }, + { + "epoch": 2.899524209347887, + "grad_norm": 0.2525542577336461, + "learning_rate": 3.0806814187702703e-07, + "loss": 0.4575, + "step": 10360 + }, + { + "epoch": 2.899804086202071, + "grad_norm": 0.26140960295508986, + "learning_rate": 3.0636077676770567e-07, + "loss": 0.4497, + "step": 10361 + }, + { + "epoch": 2.900083963056255, + "grad_norm": 0.2679370880273387, + "learning_rate": 3.0465814152678685e-07, + "loss": 0.4545, + "step": 10362 + }, + { + "epoch": 2.9003638399104394, + "grad_norm": 0.2699083430806211, + "learning_rate": 3.0296023631631865e-07, + "loss": 0.4643, + "step": 10363 + }, + { + "epoch": 2.9006437167646233, + "grad_norm": 0.26170266192893454, + "learning_rate": 3.0126706129791073e-07, + "loss": 0.4561, + "step": 10364 + }, + { + "epoch": 2.9009235936188076, + "grad_norm": 0.24858769529730645, + "learning_rate": 2.99578616632723e-07, + "loss": 0.453, + "step": 10365 + }, + { + "epoch": 2.901203470472992, + "grad_norm": 0.2574791076494076, + "learning_rate": 2.9789490248146033e-07, + "loss": 0.4434, + "step": 10366 + }, + { + "epoch": 2.901483347327176, + "grad_norm": 0.26240149623407355, + "learning_rate": 2.9621591900437784e-07, + "loss": 0.4402, + "step": 10367 + }, + { + "epoch": 2.9017632241813605, + "grad_norm": 0.2507582670062776, + "learning_rate": 2.945416663612921e-07, + "loss": 0.4458, + "step": 10368 + }, + { + "epoch": 2.9020431010355443, + "grad_norm": 0.2471406035226927, + "learning_rate": 2.928721447115479e-07, + "loss": 0.4346, + "step": 10369 + }, + { + "epoch": 2.9023229778897286, + "grad_norm": 0.25689886731465, + "learning_rate": 2.91207354214057e-07, + "loss": 0.4527, + "step": 10370 + }, + { + "epoch": 2.902602854743913, + "grad_norm": 0.25295420038947664, + "learning_rate": 2.895472950272704e-07, + "loss": 0.4379, + "step": 10371 + }, + { + "epoch": 2.9028827315980967, + "grad_norm": 0.2560257149664785, + "learning_rate": 2.878919673091951e-07, + "loss": 0.4282, + "step": 10372 + }, + { + "epoch": 2.903162608452281, + "grad_norm": 0.2696114038415163, + "learning_rate": 2.8624137121738836e-07, + "loss": 0.4706, + "step": 10373 + }, + { + "epoch": 2.9034424853064653, + "grad_norm": 0.2500674589002211, + "learning_rate": 2.845955069089579e-07, + "loss": 0.4488, + "step": 10374 + }, + { + "epoch": 2.903722362160649, + "grad_norm": 0.2554546188397463, + "learning_rate": 2.829543745405505e-07, + "loss": 0.4372, + "step": 10375 + }, + { + "epoch": 2.9040022390148335, + "grad_norm": 0.24931022196936325, + "learning_rate": 2.813179742683747e-07, + "loss": 0.4526, + "step": 10376 + }, + { + "epoch": 2.9042821158690177, + "grad_norm": 0.2928468198012615, + "learning_rate": 2.796863062481836e-07, + "loss": 0.4511, + "step": 10377 + }, + { + "epoch": 2.9045619927232016, + "grad_norm": 0.26388584571093515, + "learning_rate": 2.780593706352752e-07, + "loss": 0.4539, + "step": 10378 + }, + { + "epoch": 2.904841869577386, + "grad_norm": 0.2498804927398296, + "learning_rate": 2.7643716758451457e-07, + "loss": 0.4373, + "step": 10379 + }, + { + "epoch": 2.90512174643157, + "grad_norm": 0.26863275546009896, + "learning_rate": 2.748196972502892e-07, + "loss": 0.4611, + "step": 10380 + }, + { + "epoch": 2.905401623285754, + "grad_norm": 0.2517534306363316, + "learning_rate": 2.7320695978655943e-07, + "loss": 0.4529, + "step": 10381 + }, + { + "epoch": 2.9056815001399383, + "grad_norm": 0.2560660861816635, + "learning_rate": 2.71598955346819e-07, + "loss": 0.463, + "step": 10382 + }, + { + "epoch": 2.9059613769941226, + "grad_norm": 0.2605364040440226, + "learning_rate": 2.6999568408413443e-07, + "loss": 0.4483, + "step": 10383 + }, + { + "epoch": 2.906241253848307, + "grad_norm": 0.24860598895959993, + "learning_rate": 2.6839714615108926e-07, + "loss": 0.4366, + "step": 10384 + }, + { + "epoch": 2.9065211307024907, + "grad_norm": 0.2540350770123497, + "learning_rate": 2.6680334169983946e-07, + "loss": 0.4506, + "step": 10385 + }, + { + "epoch": 2.906801007556675, + "grad_norm": 0.26270021658466275, + "learning_rate": 2.652142708820915e-07, + "loss": 0.4541, + "step": 10386 + }, + { + "epoch": 2.9070808844108593, + "grad_norm": 0.2660916319559597, + "learning_rate": 2.6362993384907997e-07, + "loss": 0.4614, + "step": 10387 + }, + { + "epoch": 2.9073607612650436, + "grad_norm": 0.25039301873484276, + "learning_rate": 2.62050330751612e-07, + "loss": 0.4466, + "step": 10388 + }, + { + "epoch": 2.9076406381192275, + "grad_norm": 0.24631662305707414, + "learning_rate": 2.60475461740034e-07, + "loss": 0.4223, + "step": 10389 + }, + { + "epoch": 2.9079205149734118, + "grad_norm": 0.24299943031929547, + "learning_rate": 2.5890532696424274e-07, + "loss": 0.4428, + "step": 10390 + }, + { + "epoch": 2.908200391827596, + "grad_norm": 0.25736270614928747, + "learning_rate": 2.573399265736798e-07, + "loss": 0.4457, + "step": 10391 + }, + { + "epoch": 2.90848026868178, + "grad_norm": 0.25310670202243307, + "learning_rate": 2.557792607173481e-07, + "loss": 0.4447, + "step": 10392 + }, + { + "epoch": 2.908760145535964, + "grad_norm": 0.2601763089070998, + "learning_rate": 2.5422332954379014e-07, + "loss": 0.4497, + "step": 10393 + }, + { + "epoch": 2.9090400223901485, + "grad_norm": 0.25673423214608204, + "learning_rate": 2.5267213320109283e-07, + "loss": 0.4462, + "step": 10394 + }, + { + "epoch": 2.9093198992443323, + "grad_norm": 0.2732253329291374, + "learning_rate": 2.5112567183691595e-07, + "loss": 0.4579, + "step": 10395 + }, + { + "epoch": 2.9095997760985166, + "grad_norm": 0.24489089427254132, + "learning_rate": 2.4958394559843614e-07, + "loss": 0.4637, + "step": 10396 + }, + { + "epoch": 2.909879652952701, + "grad_norm": 0.2597770859891086, + "learning_rate": 2.4804695463240826e-07, + "loss": 0.4524, + "step": 10397 + }, + { + "epoch": 2.9101595298068847, + "grad_norm": 0.25115464762045403, + "learning_rate": 2.4651469908511527e-07, + "loss": 0.4323, + "step": 10398 + }, + { + "epoch": 2.910439406661069, + "grad_norm": 0.25904281835353893, + "learning_rate": 2.4498717910240163e-07, + "loss": 0.4532, + "step": 10399 + }, + { + "epoch": 2.9107192835152533, + "grad_norm": 0.26533887036075104, + "learning_rate": 2.4346439482965664e-07, + "loss": 0.4473, + "step": 10400 + }, + { + "epoch": 2.910999160369437, + "grad_norm": 0.25950334993597235, + "learning_rate": 2.419463464118199e-07, + "loss": 0.4503, + "step": 10401 + }, + { + "epoch": 2.9112790372236215, + "grad_norm": 0.2592923155127461, + "learning_rate": 2.404330339933869e-07, + "loss": 0.4539, + "step": 10402 + }, + { + "epoch": 2.9115589140778058, + "grad_norm": 0.24565959064147494, + "learning_rate": 2.3892445771838134e-07, + "loss": 0.4426, + "step": 10403 + }, + { + "epoch": 2.91183879093199, + "grad_norm": 0.2564125328004262, + "learning_rate": 2.37420617730405e-07, + "loss": 0.4456, + "step": 10404 + }, + { + "epoch": 2.9121186677861743, + "grad_norm": 0.25657770161499827, + "learning_rate": 2.359215141725879e-07, + "loss": 0.4584, + "step": 10405 + }, + { + "epoch": 2.912398544640358, + "grad_norm": 0.2671894407257729, + "learning_rate": 2.3442714718761027e-07, + "loss": 0.4621, + "step": 10406 + }, + { + "epoch": 2.9126784214945425, + "grad_norm": 0.2676332770972902, + "learning_rate": 2.3293751691771394e-07, + "loss": 0.4649, + "step": 10407 + }, + { + "epoch": 2.9129582983487268, + "grad_norm": 0.2594919036123832, + "learning_rate": 2.314526235046799e-07, + "loss": 0.4604, + "step": 10408 + }, + { + "epoch": 2.9132381752029106, + "grad_norm": 0.2593525779189398, + "learning_rate": 2.299724670898451e-07, + "loss": 0.4349, + "step": 10409 + }, + { + "epoch": 2.913518052057095, + "grad_norm": 0.25144203995667563, + "learning_rate": 2.2849704781408577e-07, + "loss": 0.4491, + "step": 10410 + }, + { + "epoch": 2.913797928911279, + "grad_norm": 0.2635956450162497, + "learning_rate": 2.2702636581783399e-07, + "loss": 0.4605, + "step": 10411 + }, + { + "epoch": 2.914077805765463, + "grad_norm": 0.2600994568333365, + "learning_rate": 2.2556042124107223e-07, + "loss": 0.4368, + "step": 10412 + }, + { + "epoch": 2.9143576826196473, + "grad_norm": 0.2564418348360325, + "learning_rate": 2.2409921422333335e-07, + "loss": 0.4497, + "step": 10413 + }, + { + "epoch": 2.9146375594738316, + "grad_norm": 0.249296973255, + "learning_rate": 2.226427449036894e-07, + "loss": 0.4578, + "step": 10414 + }, + { + "epoch": 2.9149174363280155, + "grad_norm": 0.2624082979505958, + "learning_rate": 2.2119101342076841e-07, + "loss": 0.4557, + "step": 10415 + }, + { + "epoch": 2.9151973131821998, + "grad_norm": 0.2482312610438998, + "learning_rate": 2.1974401991274873e-07, + "loss": 0.4452, + "step": 10416 + }, + { + "epoch": 2.915477190036384, + "grad_norm": 0.28259425383700054, + "learning_rate": 2.1830176451735905e-07, + "loss": 0.4542, + "step": 10417 + }, + { + "epoch": 2.915757066890568, + "grad_norm": 0.25328344799056274, + "learning_rate": 2.1686424737187295e-07, + "loss": 0.4447, + "step": 10418 + }, + { + "epoch": 2.916036943744752, + "grad_norm": 0.2536325406702005, + "learning_rate": 2.1543146861311426e-07, + "loss": 0.4425, + "step": 10419 + }, + { + "epoch": 2.9163168205989365, + "grad_norm": 0.24890910384382822, + "learning_rate": 2.1400342837745168e-07, + "loss": 0.4497, + "step": 10420 + }, + { + "epoch": 2.916596697453121, + "grad_norm": 0.2544036580834295, + "learning_rate": 2.125801268008043e-07, + "loss": 0.4449, + "step": 10421 + }, + { + "epoch": 2.9168765743073046, + "grad_norm": 0.25742946778671977, + "learning_rate": 2.1116156401865815e-07, + "loss": 0.4379, + "step": 10422 + }, + { + "epoch": 2.917156451161489, + "grad_norm": 0.24188272688832368, + "learning_rate": 2.0974774016601638e-07, + "loss": 0.4396, + "step": 10423 + }, + { + "epoch": 2.917436328015673, + "grad_norm": 0.25419956448380476, + "learning_rate": 2.0833865537744912e-07, + "loss": 0.4445, + "step": 10424 + }, + { + "epoch": 2.9177162048698575, + "grad_norm": 0.2541023006339829, + "learning_rate": 2.0693430978708795e-07, + "loss": 0.4506, + "step": 10425 + }, + { + "epoch": 2.9179960817240413, + "grad_norm": 0.24882416696091192, + "learning_rate": 2.0553470352858152e-07, + "loss": 0.4491, + "step": 10426 + }, + { + "epoch": 2.9182759585782256, + "grad_norm": 0.2613166798429044, + "learning_rate": 2.0413983673515659e-07, + "loss": 0.4515, + "step": 10427 + }, + { + "epoch": 2.91855583543241, + "grad_norm": 0.2532369966568343, + "learning_rate": 2.027497095395736e-07, + "loss": 0.4486, + "step": 10428 + }, + { + "epoch": 2.9188357122865938, + "grad_norm": 0.26685547084956296, + "learning_rate": 2.0136432207414347e-07, + "loss": 0.4506, + "step": 10429 + }, + { + "epoch": 2.919115589140778, + "grad_norm": 0.2614000751207966, + "learning_rate": 1.9998367447073285e-07, + "loss": 0.4389, + "step": 10430 + }, + { + "epoch": 2.9193954659949624, + "grad_norm": 0.26655927508353255, + "learning_rate": 1.9860776686075332e-07, + "loss": 0.4532, + "step": 10431 + }, + { + "epoch": 2.919675342849146, + "grad_norm": 0.2578419302241896, + "learning_rate": 1.9723659937515572e-07, + "loss": 0.4331, + "step": 10432 + }, + { + "epoch": 2.9199552197033305, + "grad_norm": 0.26151256479973034, + "learning_rate": 1.958701721444578e-07, + "loss": 0.4619, + "step": 10433 + }, + { + "epoch": 2.920235096557515, + "grad_norm": 0.2533984994980884, + "learning_rate": 1.9450848529871114e-07, + "loss": 0.4559, + "step": 10434 + }, + { + "epoch": 2.9205149734116986, + "grad_norm": 0.26181913630805637, + "learning_rate": 1.9315153896752868e-07, + "loss": 0.4638, + "step": 10435 + }, + { + "epoch": 2.920794850265883, + "grad_norm": 0.25441984983057697, + "learning_rate": 1.9179933328005718e-07, + "loss": 0.4444, + "step": 10436 + }, + { + "epoch": 2.921074727120067, + "grad_norm": 0.2568916127723896, + "learning_rate": 1.9045186836500472e-07, + "loss": 0.4368, + "step": 10437 + }, + { + "epoch": 2.921354603974251, + "grad_norm": 0.2611152004511161, + "learning_rate": 1.8910914435062433e-07, + "loss": 0.4619, + "step": 10438 + }, + { + "epoch": 2.9216344808284354, + "grad_norm": 0.25755190571414255, + "learning_rate": 1.8777116136471372e-07, + "loss": 0.4443, + "step": 10439 + }, + { + "epoch": 2.9219143576826196, + "grad_norm": 0.25147781885391063, + "learning_rate": 1.8643791953462664e-07, + "loss": 0.4446, + "step": 10440 + }, + { + "epoch": 2.922194234536804, + "grad_norm": 0.24820029427422324, + "learning_rate": 1.8510941898726153e-07, + "loss": 0.4537, + "step": 10441 + }, + { + "epoch": 2.9224741113909882, + "grad_norm": 0.2638186979540325, + "learning_rate": 1.837856598490617e-07, + "loss": 0.4554, + "step": 10442 + }, + { + "epoch": 2.922753988245172, + "grad_norm": 0.25462418346204635, + "learning_rate": 1.8246664224603193e-07, + "loss": 0.4428, + "step": 10443 + }, + { + "epoch": 2.9230338650993564, + "grad_norm": 0.2594138620877634, + "learning_rate": 1.8115236630370513e-07, + "loss": 0.4358, + "step": 10444 + }, + { + "epoch": 2.9233137419535407, + "grad_norm": 0.25762916254734597, + "learning_rate": 1.7984283214718122e-07, + "loss": 0.4703, + "step": 10445 + }, + { + "epoch": 2.9235936188077245, + "grad_norm": 0.24532429258654168, + "learning_rate": 1.7853803990110495e-07, + "loss": 0.462, + "step": 10446 + }, + { + "epoch": 2.923873495661909, + "grad_norm": 0.2562022449504887, + "learning_rate": 1.7723798968966032e-07, + "loss": 0.4544, + "step": 10447 + }, + { + "epoch": 2.924153372516093, + "grad_norm": 0.25893836124805875, + "learning_rate": 1.7594268163659278e-07, + "loss": 0.4451, + "step": 10448 + }, + { + "epoch": 2.924433249370277, + "grad_norm": 0.24266450193199873, + "learning_rate": 1.7465211586519258e-07, + "loss": 0.4507, + "step": 10449 + }, + { + "epoch": 2.9247131262244612, + "grad_norm": 0.2524898468017339, + "learning_rate": 1.7336629249828372e-07, + "loss": 0.4432, + "step": 10450 + }, + { + "epoch": 2.9249930030786455, + "grad_norm": 0.25327670454061246, + "learning_rate": 1.7208521165826829e-07, + "loss": 0.4703, + "step": 10451 + }, + { + "epoch": 2.9252728799328294, + "grad_norm": 0.2652988133563846, + "learning_rate": 1.7080887346707098e-07, + "loss": 0.4556, + "step": 10452 + }, + { + "epoch": 2.9255527567870137, + "grad_norm": 0.25212927409896774, + "learning_rate": 1.6953727804617237e-07, + "loss": 0.4334, + "step": 10453 + }, + { + "epoch": 2.925832633641198, + "grad_norm": 0.257078013870885, + "learning_rate": 1.6827042551660898e-07, + "loss": 0.4199, + "step": 10454 + }, + { + "epoch": 2.926112510495382, + "grad_norm": 0.26032378169304704, + "learning_rate": 1.670083159989566e-07, + "loss": 0.4716, + "step": 10455 + }, + { + "epoch": 2.926392387349566, + "grad_norm": 0.24862097369804104, + "learning_rate": 1.6575094961335248e-07, + "loss": 0.4319, + "step": 10456 + }, + { + "epoch": 2.9266722642037504, + "grad_norm": 0.24885533344996946, + "learning_rate": 1.644983264794564e-07, + "loss": 0.4413, + "step": 10457 + }, + { + "epoch": 2.9269521410579347, + "grad_norm": 0.261188700334077, + "learning_rate": 1.632504467165119e-07, + "loss": 0.4465, + "step": 10458 + }, + { + "epoch": 2.9272320179121185, + "grad_norm": 0.24969608494229537, + "learning_rate": 1.620073104432851e-07, + "loss": 0.4313, + "step": 10459 + }, + { + "epoch": 2.927511894766303, + "grad_norm": 0.24858008443318744, + "learning_rate": 1.6076891777809245e-07, + "loss": 0.4744, + "step": 10460 + }, + { + "epoch": 2.927791771620487, + "grad_norm": 0.25462668422505136, + "learning_rate": 1.5953526883881188e-07, + "loss": 0.4702, + "step": 10461 + }, + { + "epoch": 2.9280716484746714, + "grad_norm": 0.2523815260372512, + "learning_rate": 1.5830636374286056e-07, + "loss": 0.4711, + "step": 10462 + }, + { + "epoch": 2.9283515253288552, + "grad_norm": 0.25767637274775135, + "learning_rate": 1.5708220260721163e-07, + "loss": 0.4455, + "step": 10463 + }, + { + "epoch": 2.9286314021830395, + "grad_norm": 0.2574606985716124, + "learning_rate": 1.5586278554837187e-07, + "loss": 0.4387, + "step": 10464 + }, + { + "epoch": 2.928911279037224, + "grad_norm": 0.2585395013878928, + "learning_rate": 1.546481126824151e-07, + "loss": 0.4426, + "step": 10465 + }, + { + "epoch": 2.9291911558914077, + "grad_norm": 0.2567128032532917, + "learning_rate": 1.534381841249488e-07, + "loss": 0.4581, + "step": 10466 + }, + { + "epoch": 2.929471032745592, + "grad_norm": 0.2551522452384083, + "learning_rate": 1.5223299999113094e-07, + "loss": 0.4508, + "step": 10467 + }, + { + "epoch": 2.9297509095997762, + "grad_norm": 0.26152786331414646, + "learning_rate": 1.5103256039568635e-07, + "loss": 0.4462, + "step": 10468 + }, + { + "epoch": 2.93003078645396, + "grad_norm": 0.2531210010499005, + "learning_rate": 1.4983686545285701e-07, + "loss": 0.4444, + "step": 10469 + }, + { + "epoch": 2.9303106633081444, + "grad_norm": 0.24830215591602056, + "learning_rate": 1.4864591527646298e-07, + "loss": 0.4222, + "step": 10470 + }, + { + "epoch": 2.9305905401623287, + "grad_norm": 0.2636931535798214, + "learning_rate": 1.4745970997985248e-07, + "loss": 0.456, + "step": 10471 + }, + { + "epoch": 2.9308704170165125, + "grad_norm": 0.2524364263885123, + "learning_rate": 1.4627824967592963e-07, + "loss": 0.4496, + "step": 10472 + }, + { + "epoch": 2.931150293870697, + "grad_norm": 0.2616100064256884, + "learning_rate": 1.451015344771489e-07, + "loss": 0.4612, + "step": 10473 + }, + { + "epoch": 2.931430170724881, + "grad_norm": 0.25763637866034367, + "learning_rate": 1.439295644955041e-07, + "loss": 0.448, + "step": 10474 + }, + { + "epoch": 2.931710047579065, + "grad_norm": 0.2584985683151651, + "learning_rate": 1.4276233984255593e-07, + "loss": 0.4615, + "step": 10475 + }, + { + "epoch": 2.9319899244332492, + "grad_norm": 0.2511241667910256, + "learning_rate": 1.415998606293878e-07, + "loss": 0.4303, + "step": 10476 + }, + { + "epoch": 2.9322698012874335, + "grad_norm": 0.2545468157726461, + "learning_rate": 1.404421269666556e-07, + "loss": 0.4323, + "step": 10477 + }, + { + "epoch": 2.932549678141618, + "grad_norm": 0.24978718053471582, + "learning_rate": 1.3928913896454897e-07, + "loss": 0.4596, + "step": 10478 + }, + { + "epoch": 2.932829554995802, + "grad_norm": 0.24891041253653481, + "learning_rate": 1.381408967328135e-07, + "loss": 0.4408, + "step": 10479 + }, + { + "epoch": 2.933109431849986, + "grad_norm": 0.25436583588653244, + "learning_rate": 1.3699740038073394e-07, + "loss": 0.4401, + "step": 10480 + }, + { + "epoch": 2.9333893087041703, + "grad_norm": 0.25555326435865844, + "learning_rate": 1.3585865001715104e-07, + "loss": 0.4463, + "step": 10481 + }, + { + "epoch": 2.9336691855583545, + "grad_norm": 0.25808759275433096, + "learning_rate": 1.347246457504503e-07, + "loss": 0.4591, + "step": 10482 + }, + { + "epoch": 2.9339490624125384, + "grad_norm": 0.26026257449568424, + "learning_rate": 1.3359538768856762e-07, + "loss": 0.4479, + "step": 10483 + }, + { + "epoch": 2.9342289392667227, + "grad_norm": 0.262707843378836, + "learning_rate": 1.3247087593898922e-07, + "loss": 0.451, + "step": 10484 + }, + { + "epoch": 2.934508816120907, + "grad_norm": 0.2564618605525408, + "learning_rate": 1.3135111060874617e-07, + "loss": 0.4358, + "step": 10485 + }, + { + "epoch": 2.934788692975091, + "grad_norm": 0.2568705199636224, + "learning_rate": 1.3023609180441431e-07, + "loss": 0.4473, + "step": 10486 + }, + { + "epoch": 2.935068569829275, + "grad_norm": 0.25888006691203863, + "learning_rate": 1.2912581963212543e-07, + "loss": 0.4572, + "step": 10487 + }, + { + "epoch": 2.9353484466834594, + "grad_norm": 0.26257361864420226, + "learning_rate": 1.2802029419755613e-07, + "loss": 0.4479, + "step": 10488 + }, + { + "epoch": 2.9356283235376432, + "grad_norm": 0.25256274125923245, + "learning_rate": 1.269195156059333e-07, + "loss": 0.4529, + "step": 10489 + }, + { + "epoch": 2.9359082003918275, + "grad_norm": 0.26555130805791666, + "learning_rate": 1.2582348396202316e-07, + "loss": 0.4576, + "step": 10490 + }, + { + "epoch": 2.936188077246012, + "grad_norm": 0.2657930704720208, + "learning_rate": 1.247321993701478e-07, + "loss": 0.459, + "step": 10491 + }, + { + "epoch": 2.9364679541001957, + "grad_norm": 0.25168542573681696, + "learning_rate": 1.236456619341797e-07, + "loss": 0.4326, + "step": 10492 + }, + { + "epoch": 2.93674783095438, + "grad_norm": 0.2560893374026327, + "learning_rate": 1.225638717575306e-07, + "loss": 0.4643, + "step": 10493 + }, + { + "epoch": 2.9370277078085643, + "grad_norm": 0.24755384682714268, + "learning_rate": 1.214868289431792e-07, + "loss": 0.4479, + "step": 10494 + }, + { + "epoch": 2.9373075846627485, + "grad_norm": 0.26603348984649644, + "learning_rate": 1.2041453359362132e-07, + "loss": 0.4439, + "step": 10495 + }, + { + "epoch": 2.9375874615169324, + "grad_norm": 0.2509154217666826, + "learning_rate": 1.1934698581093086e-07, + "loss": 0.4408, + "step": 10496 + }, + { + "epoch": 2.9378673383711167, + "grad_norm": 0.25680508562855797, + "learning_rate": 1.1828418569670985e-07, + "loss": 0.4351, + "step": 10497 + }, + { + "epoch": 2.938147215225301, + "grad_norm": 0.2585395777558349, + "learning_rate": 1.1722613335212185e-07, + "loss": 0.4556, + "step": 10498 + }, + { + "epoch": 2.9384270920794853, + "grad_norm": 0.24513710160148572, + "learning_rate": 1.1617282887787518e-07, + "loss": 0.4251, + "step": 10499 + }, + { + "epoch": 2.938706968933669, + "grad_norm": 0.2549706151869714, + "learning_rate": 1.1512427237421741e-07, + "loss": 0.4419, + "step": 10500 + }, + { + "epoch": 2.9389868457878534, + "grad_norm": 0.2394454558323718, + "learning_rate": 1.1408046394095206e-07, + "loss": 0.4508, + "step": 10501 + }, + { + "epoch": 2.9392667226420377, + "grad_norm": 0.24841273548598636, + "learning_rate": 1.1304140367742744e-07, + "loss": 0.4425, + "step": 10502 + }, + { + "epoch": 2.9395465994962215, + "grad_norm": 0.26824065946500614, + "learning_rate": 1.1200709168254774e-07, + "loss": 0.4657, + "step": 10503 + }, + { + "epoch": 2.939826476350406, + "grad_norm": 0.25734686141639923, + "learning_rate": 1.1097752805475647e-07, + "loss": 0.4425, + "step": 10504 + }, + { + "epoch": 2.94010635320459, + "grad_norm": 0.25906967232550493, + "learning_rate": 1.0995271289204745e-07, + "loss": 0.4325, + "step": 10505 + }, + { + "epoch": 2.940386230058774, + "grad_norm": 0.25985462367295037, + "learning_rate": 1.0893264629197042e-07, + "loss": 0.4374, + "step": 10506 + }, + { + "epoch": 2.9406661069129583, + "grad_norm": 0.2424408673662371, + "learning_rate": 1.0791732835160328e-07, + "loss": 0.4225, + "step": 10507 + }, + { + "epoch": 2.9409459837671426, + "grad_norm": 0.2572039680041926, + "learning_rate": 1.0690675916759097e-07, + "loss": 0.4438, + "step": 10508 + }, + { + "epoch": 2.9412258606213264, + "grad_norm": 0.25194635813849986, + "learning_rate": 1.0590093883611762e-07, + "loss": 0.4151, + "step": 10509 + }, + { + "epoch": 2.9415057374755107, + "grad_norm": 0.26942250750658214, + "learning_rate": 1.0489986745292335e-07, + "loss": 0.4737, + "step": 10510 + }, + { + "epoch": 2.941785614329695, + "grad_norm": 0.249986867792104, + "learning_rate": 1.0390354511328749e-07, + "loss": 0.454, + "step": 10511 + }, + { + "epoch": 2.942065491183879, + "grad_norm": 0.24610171411008774, + "learning_rate": 1.0291197191203971e-07, + "loss": 0.4396, + "step": 10512 + }, + { + "epoch": 2.942345368038063, + "grad_norm": 0.255544816937748, + "learning_rate": 1.0192514794356012e-07, + "loss": 0.4401, + "step": 10513 + }, + { + "epoch": 2.9426252448922474, + "grad_norm": 0.2455925774087127, + "learning_rate": 1.0094307330177355e-07, + "loss": 0.4536, + "step": 10514 + }, + { + "epoch": 2.9429051217464317, + "grad_norm": 0.2583339543631677, + "learning_rate": 9.99657480801497e-08, + "loss": 0.4508, + "step": 10515 + }, + { + "epoch": 2.943184998600616, + "grad_norm": 0.25668429968223977, + "learning_rate": 9.899317237172523e-08, + "loss": 0.419, + "step": 10516 + }, + { + "epoch": 2.9434648754548, + "grad_norm": 0.24562189192693365, + "learning_rate": 9.80253462690539e-08, + "loss": 0.4555, + "step": 10517 + }, + { + "epoch": 2.943744752308984, + "grad_norm": 0.2622676135147595, + "learning_rate": 9.706226986426203e-08, + "loss": 0.4386, + "step": 10518 + }, + { + "epoch": 2.9440246291631684, + "grad_norm": 0.26528886042279054, + "learning_rate": 9.610394324902073e-08, + "loss": 0.4399, + "step": 10519 + }, + { + "epoch": 2.9443045060173523, + "grad_norm": 0.2687492946457659, + "learning_rate": 9.515036651453479e-08, + "loss": 0.4565, + "step": 10520 + }, + { + "epoch": 2.9445843828715366, + "grad_norm": 0.25029425954088713, + "learning_rate": 9.420153975157053e-08, + "loss": 0.45, + "step": 10521 + }, + { + "epoch": 2.944864259725721, + "grad_norm": 0.2530798242372889, + "learning_rate": 9.325746305043348e-08, + "loss": 0.4337, + "step": 10522 + }, + { + "epoch": 2.9451441365799047, + "grad_norm": 0.25875219638515645, + "learning_rate": 9.231813650099064e-08, + "loss": 0.4519, + "step": 10523 + }, + { + "epoch": 2.945424013434089, + "grad_norm": 0.25682815207388776, + "learning_rate": 9.138356019264271e-08, + "loss": 0.4389, + "step": 10524 + }, + { + "epoch": 2.9457038902882733, + "grad_norm": 0.25104282666120115, + "learning_rate": 9.045373421433523e-08, + "loss": 0.425, + "step": 10525 + }, + { + "epoch": 2.945983767142457, + "grad_norm": 0.26342494863030874, + "learning_rate": 8.952865865458626e-08, + "loss": 0.4703, + "step": 10526 + }, + { + "epoch": 2.9462636439966414, + "grad_norm": 0.26385994849672095, + "learning_rate": 8.860833360143095e-08, + "loss": 0.4356, + "step": 10527 + }, + { + "epoch": 2.9465435208508257, + "grad_norm": 0.251010870200779, + "learning_rate": 8.769275914247143e-08, + "loss": 0.4662, + "step": 10528 + }, + { + "epoch": 2.9468233977050096, + "grad_norm": 0.2471674470410353, + "learning_rate": 8.678193536484914e-08, + "loss": 0.4394, + "step": 10529 + }, + { + "epoch": 2.947103274559194, + "grad_norm": 0.24808625130165926, + "learning_rate": 8.58758623552669e-08, + "loss": 0.4527, + "step": 10530 + }, + { + "epoch": 2.947383151413378, + "grad_norm": 0.2553979940589703, + "learning_rate": 8.497454019995022e-08, + "loss": 0.4581, + "step": 10531 + }, + { + "epoch": 2.9476630282675624, + "grad_norm": 0.27036146065247973, + "learning_rate": 8.407796898470266e-08, + "loss": 0.4543, + "step": 10532 + }, + { + "epoch": 2.9479429051217463, + "grad_norm": 0.26083548862095113, + "learning_rate": 8.318614879485043e-08, + "loss": 0.4528, + "step": 10533 + }, + { + "epoch": 2.9482227819759306, + "grad_norm": 0.25369304134129755, + "learning_rate": 8.229907971528116e-08, + "loss": 0.4301, + "step": 10534 + }, + { + "epoch": 2.948502658830115, + "grad_norm": 0.2587089753348896, + "learning_rate": 8.14167618304218e-08, + "loss": 0.4547, + "step": 10535 + }, + { + "epoch": 2.948782535684299, + "grad_norm": 0.2561463390198463, + "learning_rate": 8.053919522425513e-08, + "loss": 0.4417, + "step": 10536 + }, + { + "epoch": 2.949062412538483, + "grad_norm": 0.2804159977991239, + "learning_rate": 7.966637998031434e-08, + "loss": 0.4435, + "step": 10537 + }, + { + "epoch": 2.9493422893926673, + "grad_norm": 0.2636994221235027, + "learning_rate": 7.879831618166633e-08, + "loss": 0.4542, + "step": 10538 + }, + { + "epoch": 2.9496221662468516, + "grad_norm": 0.25934639410112037, + "learning_rate": 7.793500391093944e-08, + "loss": 0.445, + "step": 10539 + }, + { + "epoch": 2.9499020431010354, + "grad_norm": 0.28139162564843395, + "learning_rate": 7.707644325029572e-08, + "loss": 0.4317, + "step": 10540 + }, + { + "epoch": 2.9501819199552197, + "grad_norm": 0.26420461175765253, + "learning_rate": 7.622263428146426e-08, + "loss": 0.4356, + "step": 10541 + }, + { + "epoch": 2.950461796809404, + "grad_norm": 0.25441472194653697, + "learning_rate": 7.537357708570225e-08, + "loss": 0.446, + "step": 10542 + }, + { + "epoch": 2.950741673663588, + "grad_norm": 0.2671800826870824, + "learning_rate": 7.452927174383396e-08, + "loss": 0.4549, + "step": 10543 + }, + { + "epoch": 2.951021550517772, + "grad_norm": 0.25564475592140923, + "learning_rate": 7.368971833620619e-08, + "loss": 0.4551, + "step": 10544 + }, + { + "epoch": 2.9513014273719564, + "grad_norm": 0.25828170287220853, + "learning_rate": 7.285491694273838e-08, + "loss": 0.445, + "step": 10545 + }, + { + "epoch": 2.9515813042261403, + "grad_norm": 0.24080567738555247, + "learning_rate": 7.202486764288918e-08, + "loss": 0.4379, + "step": 10546 + }, + { + "epoch": 2.9518611810803246, + "grad_norm": 0.25363021744294506, + "learning_rate": 7.119957051565651e-08, + "loss": 0.4627, + "step": 10547 + }, + { + "epoch": 2.952141057934509, + "grad_norm": 0.25326867966327166, + "learning_rate": 7.03790256395942e-08, + "loss": 0.4262, + "step": 10548 + }, + { + "epoch": 2.9524209347886927, + "grad_norm": 0.26917960408980013, + "learning_rate": 6.956323309280089e-08, + "loss": 0.4603, + "step": 10549 + }, + { + "epoch": 2.952700811642877, + "grad_norm": 0.26027908392323174, + "learning_rate": 6.875219295293111e-08, + "loss": 0.461, + "step": 10550 + }, + { + "epoch": 2.9529806884970613, + "grad_norm": 0.2591632379456325, + "learning_rate": 6.794590529717315e-08, + "loss": 0.4536, + "step": 10551 + }, + { + "epoch": 2.9532605653512456, + "grad_norm": 0.2510652354734244, + "learning_rate": 6.714437020227115e-08, + "loss": 0.4235, + "step": 10552 + }, + { + "epoch": 2.95354044220543, + "grad_norm": 0.2526340838303442, + "learning_rate": 6.634758774451966e-08, + "loss": 0.4352, + "step": 10553 + }, + { + "epoch": 2.9538203190596137, + "grad_norm": 0.2575515578872242, + "learning_rate": 6.555555799974689e-08, + "loss": 0.4536, + "step": 10554 + }, + { + "epoch": 2.954100195913798, + "grad_norm": 0.2603208898237062, + "learning_rate": 6.476828104335364e-08, + "loss": 0.4133, + "step": 10555 + }, + { + "epoch": 2.9543800727679823, + "grad_norm": 0.24904628086688163, + "learning_rate": 6.398575695026332e-08, + "loss": 0.431, + "step": 10556 + }, + { + "epoch": 2.954659949622166, + "grad_norm": 0.26696013383377293, + "learning_rate": 6.320798579495524e-08, + "loss": 0.4594, + "step": 10557 + }, + { + "epoch": 2.9549398264763505, + "grad_norm": 0.25246866520967143, + "learning_rate": 6.243496765146461e-08, + "loss": 0.4408, + "step": 10558 + }, + { + "epoch": 2.9552197033305347, + "grad_norm": 0.254437865474292, + "learning_rate": 6.166670259336594e-08, + "loss": 0.4652, + "step": 10559 + }, + { + "epoch": 2.9554995801847186, + "grad_norm": 0.26299613136944583, + "learning_rate": 6.090319069377848e-08, + "loss": 0.4597, + "step": 10560 + }, + { + "epoch": 2.955779457038903, + "grad_norm": 0.2615353362609834, + "learning_rate": 6.0144432025383e-08, + "loss": 0.4629, + "step": 10561 + }, + { + "epoch": 2.956059333893087, + "grad_norm": 0.2512308203604363, + "learning_rate": 5.939042666038841e-08, + "loss": 0.4444, + "step": 10562 + }, + { + "epoch": 2.956339210747271, + "grad_norm": 0.2377900486143696, + "learning_rate": 5.864117467057062e-08, + "loss": 0.4592, + "step": 10563 + }, + { + "epoch": 2.9566190876014553, + "grad_norm": 0.25875069490141756, + "learning_rate": 5.789667612723371e-08, + "loss": 0.4335, + "step": 10564 + }, + { + "epoch": 2.9568989644556396, + "grad_norm": 0.24683365444532673, + "learning_rate": 5.7156931101248755e-08, + "loss": 0.4351, + "step": 10565 + }, + { + "epoch": 2.9571788413098234, + "grad_norm": 0.26012406806987626, + "learning_rate": 5.642193966302056e-08, + "loss": 0.4473, + "step": 10566 + }, + { + "epoch": 2.9574587181640077, + "grad_norm": 0.2542922961412427, + "learning_rate": 5.569170188250983e-08, + "loss": 0.4543, + "step": 10567 + }, + { + "epoch": 2.957738595018192, + "grad_norm": 0.25715451992754895, + "learning_rate": 5.496621782921097e-08, + "loss": 0.443, + "step": 10568 + }, + { + "epoch": 2.9580184718723763, + "grad_norm": 0.2590939919758361, + "learning_rate": 5.4245487572190946e-08, + "loss": 0.4282, + "step": 10569 + }, + { + "epoch": 2.95829834872656, + "grad_norm": 0.24454462290233103, + "learning_rate": 5.35295111800338e-08, + "loss": 0.4258, + "step": 10570 + }, + { + "epoch": 2.9585782255807445, + "grad_norm": 0.24034675492651944, + "learning_rate": 5.281828872089611e-08, + "loss": 0.4318, + "step": 10571 + }, + { + "epoch": 2.9588581024349287, + "grad_norm": 0.2668304845642728, + "learning_rate": 5.2111820262473745e-08, + "loss": 0.4531, + "step": 10572 + }, + { + "epoch": 2.959137979289113, + "grad_norm": 0.24610859132094123, + "learning_rate": 5.141010587200179e-08, + "loss": 0.4518, + "step": 10573 + }, + { + "epoch": 2.959417856143297, + "grad_norm": 0.2525784737926144, + "learning_rate": 5.071314561627682e-08, + "loss": 0.4499, + "step": 10574 + }, + { + "epoch": 2.959697732997481, + "grad_norm": 0.2589959315475041, + "learning_rate": 5.002093956162912e-08, + "loss": 0.4473, + "step": 10575 + }, + { + "epoch": 2.9599776098516655, + "grad_norm": 0.24563499364177488, + "learning_rate": 4.933348777395042e-08, + "loss": 0.4311, + "step": 10576 + }, + { + "epoch": 2.9602574867058493, + "grad_norm": 0.24229642325126785, + "learning_rate": 4.865079031866615e-08, + "loss": 0.4356, + "step": 10577 + }, + { + "epoch": 2.9605373635600336, + "grad_norm": 0.2569134940403133, + "learning_rate": 4.797284726075768e-08, + "loss": 0.4439, + "step": 10578 + }, + { + "epoch": 2.960817240414218, + "grad_norm": 0.2544678366133651, + "learning_rate": 4.729965866475117e-08, + "loss": 0.4644, + "step": 10579 + }, + { + "epoch": 2.9610971172684017, + "grad_norm": 0.25061094843996773, + "learning_rate": 4.663122459472869e-08, + "loss": 0.4305, + "step": 10580 + }, + { + "epoch": 2.961376994122586, + "grad_norm": 0.25322228356666415, + "learning_rate": 4.5967545114306007e-08, + "loss": 0.4518, + "step": 10581 + }, + { + "epoch": 2.9616568709767703, + "grad_norm": 0.24853741746919858, + "learning_rate": 4.530862028664928e-08, + "loss": 0.4444, + "step": 10582 + }, + { + "epoch": 2.961936747830954, + "grad_norm": 0.25579650896961803, + "learning_rate": 4.465445017448056e-08, + "loss": 0.4376, + "step": 10583 + }, + { + "epoch": 2.9622166246851385, + "grad_norm": 0.25032094678732375, + "learning_rate": 4.400503484006113e-08, + "loss": 0.4392, + "step": 10584 + }, + { + "epoch": 2.9624965015393228, + "grad_norm": 0.25834259594052383, + "learning_rate": 4.336037434520823e-08, + "loss": 0.456, + "step": 10585 + }, + { + "epoch": 2.9627763783935066, + "grad_norm": 0.247699967081848, + "learning_rate": 4.272046875127278e-08, + "loss": 0.4352, + "step": 10586 + }, + { + "epoch": 2.963056255247691, + "grad_norm": 0.255500336642797, + "learning_rate": 4.208531811916716e-08, + "loss": 0.4507, + "step": 10587 + }, + { + "epoch": 2.963336132101875, + "grad_norm": 0.2481995662102386, + "learning_rate": 4.1454922509337466e-08, + "loss": 0.4505, + "step": 10588 + }, + { + "epoch": 2.9636160089560595, + "grad_norm": 0.25844158185485583, + "learning_rate": 4.082928198179681e-08, + "loss": 0.4587, + "step": 10589 + }, + { + "epoch": 2.9638958858102438, + "grad_norm": 0.25711496564190917, + "learning_rate": 4.020839659609199e-08, + "loss": 0.4555, + "step": 10590 + }, + { + "epoch": 2.9641757626644276, + "grad_norm": 0.2588625423650436, + "learning_rate": 3.959226641130909e-08, + "loss": 0.4533, + "step": 10591 + }, + { + "epoch": 2.964455639518612, + "grad_norm": 0.24528164675911024, + "learning_rate": 3.8980891486101176e-08, + "loss": 0.4379, + "step": 10592 + }, + { + "epoch": 2.964735516372796, + "grad_norm": 0.25861177214476944, + "learning_rate": 3.837427187866061e-08, + "loss": 0.4488, + "step": 10593 + }, + { + "epoch": 2.96501539322698, + "grad_norm": 0.2520859346888829, + "learning_rate": 3.777240764671342e-08, + "loss": 0.4513, + "step": 10594 + }, + { + "epoch": 2.9652952700811643, + "grad_norm": 0.2604468848180648, + "learning_rate": 3.7175298847558216e-08, + "loss": 0.4585, + "step": 10595 + }, + { + "epoch": 2.9655751469353486, + "grad_norm": 0.2582001394915078, + "learning_rate": 3.6582945538027324e-08, + "loss": 0.4321, + "step": 10596 + }, + { + "epoch": 2.9658550237895325, + "grad_norm": 0.25229843018884607, + "learning_rate": 3.599534777449232e-08, + "loss": 0.463, + "step": 10597 + }, + { + "epoch": 2.9661349006437168, + "grad_norm": 0.25236312467740846, + "learning_rate": 3.5412505612886225e-08, + "loss": 0.4519, + "step": 10598 + }, + { + "epoch": 2.966414777497901, + "grad_norm": 0.25724033603089713, + "learning_rate": 3.4834419108681346e-08, + "loss": 0.4533, + "step": 10599 + }, + { + "epoch": 2.966694654352085, + "grad_norm": 0.2581127353937177, + "learning_rate": 3.426108831691144e-08, + "loss": 0.4595, + "step": 10600 + }, + { + "epoch": 2.966974531206269, + "grad_norm": 0.24611693632209142, + "learning_rate": 3.369251329213285e-08, + "loss": 0.415, + "step": 10601 + }, + { + "epoch": 2.9672544080604535, + "grad_norm": 0.25310950920344844, + "learning_rate": 3.312869408846897e-08, + "loss": 0.429, + "step": 10602 + }, + { + "epoch": 2.9675342849146373, + "grad_norm": 0.24968979714091358, + "learning_rate": 3.2569630759582415e-08, + "loss": 0.4566, + "step": 10603 + }, + { + "epoch": 2.9678141617688216, + "grad_norm": 0.2609776394190351, + "learning_rate": 3.201532335868618e-08, + "loss": 0.4458, + "step": 10604 + }, + { + "epoch": 2.968094038623006, + "grad_norm": 0.25969592415796383, + "learning_rate": 3.146577193854361e-08, + "loss": 0.4498, + "step": 10605 + }, + { + "epoch": 2.9683739154771898, + "grad_norm": 0.24716778897711522, + "learning_rate": 3.092097655145176e-08, + "loss": 0.4332, + "step": 10606 + }, + { + "epoch": 2.968653792331374, + "grad_norm": 0.27328902605297223, + "learning_rate": 3.038093724927471e-08, + "loss": 0.4511, + "step": 10607 + }, + { + "epoch": 2.9689336691855583, + "grad_norm": 0.2507518753167543, + "learning_rate": 2.984565408341022e-08, + "loss": 0.4494, + "step": 10608 + }, + { + "epoch": 2.9692135460397426, + "grad_norm": 0.2609750291364599, + "learning_rate": 2.9315127104800887e-08, + "loss": 0.4491, + "step": 10609 + }, + { + "epoch": 2.969493422893927, + "grad_norm": 0.25655339861321874, + "learning_rate": 2.878935636395075e-08, + "loss": 0.4316, + "step": 10610 + }, + { + "epoch": 2.9697732997481108, + "grad_norm": 0.2530639883196548, + "learning_rate": 2.8268341910903108e-08, + "loss": 0.4525, + "step": 10611 + }, + { + "epoch": 2.970053176602295, + "grad_norm": 0.23225610378688546, + "learning_rate": 2.7752083795240525e-08, + "loss": 0.4282, + "step": 10612 + }, + { + "epoch": 2.9703330534564794, + "grad_norm": 0.2543375083380285, + "learning_rate": 2.7240582066107022e-08, + "loss": 0.4498, + "step": 10613 + }, + { + "epoch": 2.970612930310663, + "grad_norm": 0.2527960226970521, + "learning_rate": 2.6733836772185884e-08, + "loss": 0.462, + "step": 10614 + }, + { + "epoch": 2.9708928071648475, + "grad_norm": 0.2554175886404961, + "learning_rate": 2.623184796170519e-08, + "loss": 0.4319, + "step": 10615 + }, + { + "epoch": 2.971172684019032, + "grad_norm": 0.25718644400633023, + "learning_rate": 2.57346156824545e-08, + "loss": 0.4517, + "step": 10616 + }, + { + "epoch": 2.9714525608732156, + "grad_norm": 0.302200558832404, + "learning_rate": 2.5242139981751513e-08, + "loss": 0.4469, + "step": 10617 + }, + { + "epoch": 2.9717324377274, + "grad_norm": 0.2549185842864372, + "learning_rate": 2.4754420906475396e-08, + "loss": 0.4441, + "step": 10618 + }, + { + "epoch": 2.972012314581584, + "grad_norm": 0.2552989797038095, + "learning_rate": 2.4271458503044576e-08, + "loss": 0.4555, + "step": 10619 + }, + { + "epoch": 2.972292191435768, + "grad_norm": 0.25867881848070834, + "learning_rate": 2.3793252817427836e-08, + "loss": 0.4586, + "step": 10620 + }, + { + "epoch": 2.9725720682899524, + "grad_norm": 0.25767245445537734, + "learning_rate": 2.3319803895144322e-08, + "loss": 0.437, + "step": 10621 + }, + { + "epoch": 2.9728519451441366, + "grad_norm": 0.261765852972024, + "learning_rate": 2.2851111781257983e-08, + "loss": 0.4512, + "step": 10622 + }, + { + "epoch": 2.9731318219983205, + "grad_norm": 0.25522924545223363, + "learning_rate": 2.238717652037203e-08, + "loss": 0.449, + "step": 10623 + }, + { + "epoch": 2.973411698852505, + "grad_norm": 0.24927821271923611, + "learning_rate": 2.1927998156651142e-08, + "loss": 0.4506, + "step": 10624 + }, + { + "epoch": 2.973691575706689, + "grad_norm": 0.25831566762208635, + "learning_rate": 2.1473576733793686e-08, + "loss": 0.4281, + "step": 10625 + }, + { + "epoch": 2.9739714525608734, + "grad_norm": 0.25375287107557304, + "learning_rate": 2.1023912295059512e-08, + "loss": 0.4466, + "step": 10626 + }, + { + "epoch": 2.974251329415057, + "grad_norm": 0.25946073380754525, + "learning_rate": 2.0579004883236608e-08, + "loss": 0.4414, + "step": 10627 + }, + { + "epoch": 2.9745312062692415, + "grad_norm": 0.25171033719930647, + "learning_rate": 2.0138854540685538e-08, + "loss": 0.435, + "step": 10628 + }, + { + "epoch": 2.974811083123426, + "grad_norm": 0.26115332869155145, + "learning_rate": 1.9703461309295013e-08, + "loss": 0.4663, + "step": 10629 + }, + { + "epoch": 2.97509095997761, + "grad_norm": 0.25790911409843703, + "learning_rate": 1.927282523049856e-08, + "loss": 0.4663, + "step": 10630 + }, + { + "epoch": 2.975370836831794, + "grad_norm": 0.2607999674754798, + "learning_rate": 1.884694634529116e-08, + "loss": 0.4595, + "step": 10631 + }, + { + "epoch": 2.975650713685978, + "grad_norm": 0.2534597997717571, + "learning_rate": 1.842582469420706e-08, + "loss": 0.4221, + "step": 10632 + }, + { + "epoch": 2.9759305905401625, + "grad_norm": 0.2503832993380337, + "learning_rate": 1.8009460317330862e-08, + "loss": 0.4242, + "step": 10633 + }, + { + "epoch": 2.9762104673943464, + "grad_norm": 0.25792042031621715, + "learning_rate": 1.7597853254291972e-08, + "loss": 0.4446, + "step": 10634 + }, + { + "epoch": 2.9764903442485307, + "grad_norm": 0.24700530197851792, + "learning_rate": 1.7191003544259064e-08, + "loss": 0.4664, + "step": 10635 + }, + { + "epoch": 2.976770221102715, + "grad_norm": 0.26387028208771174, + "learning_rate": 1.6788911225967817e-08, + "loss": 0.438, + "step": 10636 + }, + { + "epoch": 2.977050097956899, + "grad_norm": 0.2465718699073755, + "learning_rate": 1.639157633768762e-08, + "loss": 0.4598, + "step": 10637 + }, + { + "epoch": 2.977329974811083, + "grad_norm": 0.2632178783866457, + "learning_rate": 1.5998998917227116e-08, + "loss": 0.4506, + "step": 10638 + }, + { + "epoch": 2.9776098516652674, + "grad_norm": 0.25198103145232476, + "learning_rate": 1.5611179001967513e-08, + "loss": 0.4497, + "step": 10639 + }, + { + "epoch": 2.977889728519451, + "grad_norm": 0.2615169372245782, + "learning_rate": 1.5228116628807078e-08, + "loss": 0.4634, + "step": 10640 + }, + { + "epoch": 2.9781696053736355, + "grad_norm": 0.24759420280801528, + "learning_rate": 1.484981183421108e-08, + "loss": 0.4492, + "step": 10641 + }, + { + "epoch": 2.97844948222782, + "grad_norm": 0.2575222446333656, + "learning_rate": 1.447626465419516e-08, + "loss": 0.4414, + "step": 10642 + }, + { + "epoch": 2.9787293590820036, + "grad_norm": 0.2578216050855136, + "learning_rate": 1.4107475124297553e-08, + "loss": 0.4615, + "step": 10643 + }, + { + "epoch": 2.979009235936188, + "grad_norm": 0.24764862867426704, + "learning_rate": 1.3743443279634617e-08, + "loss": 0.4396, + "step": 10644 + }, + { + "epoch": 2.9792891127903722, + "grad_norm": 0.2790608325780094, + "learning_rate": 1.3384169154850856e-08, + "loss": 0.4493, + "step": 10645 + }, + { + "epoch": 2.9795689896445565, + "grad_norm": 0.24885669381579884, + "learning_rate": 1.3029652784135592e-08, + "loss": 0.4471, + "step": 10646 + }, + { + "epoch": 2.979848866498741, + "grad_norm": 0.2516985946364248, + "learning_rate": 1.2679894201239606e-08, + "loss": 0.4282, + "step": 10647 + }, + { + "epoch": 2.9801287433529247, + "grad_norm": 0.2546918838471997, + "learning_rate": 1.2334893439447382e-08, + "loss": 0.4247, + "step": 10648 + }, + { + "epoch": 2.980408620207109, + "grad_norm": 0.26154533642566097, + "learning_rate": 1.1994650531604868e-08, + "loss": 0.4514, + "step": 10649 + }, + { + "epoch": 2.9806884970612932, + "grad_norm": 0.2541071458160839, + "learning_rate": 1.1659165510086167e-08, + "loss": 0.4453, + "step": 10650 + }, + { + "epoch": 2.980968373915477, + "grad_norm": 0.2557609039926244, + "learning_rate": 1.1328438406826847e-08, + "loss": 0.4518, + "step": 10651 + }, + { + "epoch": 2.9812482507696614, + "grad_norm": 0.2603397966746705, + "learning_rate": 1.100246925331283e-08, + "loss": 0.4518, + "step": 10652 + }, + { + "epoch": 2.9815281276238457, + "grad_norm": 0.25378766274204506, + "learning_rate": 1.0681258080558198e-08, + "loss": 0.4632, + "step": 10653 + }, + { + "epoch": 2.9818080044780295, + "grad_norm": 0.2577461103752013, + "learning_rate": 1.0364804919144044e-08, + "loss": 0.4321, + "step": 10654 + }, + { + "epoch": 2.982087881332214, + "grad_norm": 0.2548663096401936, + "learning_rate": 1.0053109799190719e-08, + "loss": 0.4583, + "step": 10655 + }, + { + "epoch": 2.982367758186398, + "grad_norm": 0.2678503092268285, + "learning_rate": 9.74617275035783e-09, + "loss": 0.4446, + "step": 10656 + }, + { + "epoch": 2.982647635040582, + "grad_norm": 0.25644673755161296, + "learning_rate": 9.443993801866447e-09, + "loss": 0.4487, + "step": 10657 + }, + { + "epoch": 2.9829275118947662, + "grad_norm": 0.26481506867740406, + "learning_rate": 9.146572982476897e-09, + "loss": 0.4555, + "step": 10658 + }, + { + "epoch": 2.9832073887489505, + "grad_norm": 0.26361043389181155, + "learning_rate": 8.85391032049987e-09, + "loss": 0.4423, + "step": 10659 + }, + { + "epoch": 2.9834872656031344, + "grad_norm": 0.25615383736953834, + "learning_rate": 8.566005843790858e-09, + "loss": 0.4583, + "step": 10660 + }, + { + "epoch": 2.9837671424573187, + "grad_norm": 0.24828969986997024, + "learning_rate": 8.282859579744617e-09, + "loss": 0.4262, + "step": 10661 + }, + { + "epoch": 2.984047019311503, + "grad_norm": 0.25552999391652786, + "learning_rate": 8.004471555322913e-09, + "loss": 0.4646, + "step": 10662 + }, + { + "epoch": 2.9843268961656872, + "grad_norm": 0.2636652872115676, + "learning_rate": 7.730841797010113e-09, + "loss": 0.4618, + "step": 10663 + }, + { + "epoch": 2.984606773019871, + "grad_norm": 0.2583926307618832, + "learning_rate": 7.461970330863156e-09, + "loss": 0.4385, + "step": 10664 + }, + { + "epoch": 2.9848866498740554, + "grad_norm": 0.2518994518625086, + "learning_rate": 7.197857182467127e-09, + "loss": 0.438, + "step": 10665 + }, + { + "epoch": 2.9851665267282397, + "grad_norm": 0.2549280585315688, + "learning_rate": 6.938502376963029e-09, + "loss": 0.431, + "step": 10666 + }, + { + "epoch": 2.985446403582424, + "grad_norm": 0.26188810220058695, + "learning_rate": 6.683905939031121e-09, + "loss": 0.4571, + "step": 10667 + }, + { + "epoch": 2.985726280436608, + "grad_norm": 0.2629607222117437, + "learning_rate": 6.434067892907569e-09, + "loss": 0.4575, + "step": 10668 + }, + { + "epoch": 2.986006157290792, + "grad_norm": 0.2497045188475996, + "learning_rate": 6.188988262373352e-09, + "loss": 0.4428, + "step": 10669 + }, + { + "epoch": 2.9862860341449764, + "grad_norm": 0.26736706499842067, + "learning_rate": 5.948667070754255e-09, + "loss": 0.4459, + "step": 10670 + }, + { + "epoch": 2.9865659109991602, + "grad_norm": 0.24828775818061782, + "learning_rate": 5.713104340926423e-09, + "loss": 0.4284, + "step": 10671 + }, + { + "epoch": 2.9868457878533445, + "grad_norm": 0.26353052160753476, + "learning_rate": 5.482300095305259e-09, + "loss": 0.4721, + "step": 10672 + }, + { + "epoch": 2.987125664707529, + "grad_norm": 0.25205314276037316, + "learning_rate": 5.256254355862078e-09, + "loss": 0.4565, + "step": 10673 + }, + { + "epoch": 2.9874055415617127, + "grad_norm": 0.2559405262948841, + "learning_rate": 5.034967144113001e-09, + "loss": 0.4575, + "step": 10674 + }, + { + "epoch": 2.987685418415897, + "grad_norm": 0.24907589377603614, + "learning_rate": 4.818438481118959e-09, + "loss": 0.4411, + "step": 10675 + }, + { + "epoch": 2.9879652952700813, + "grad_norm": 0.2549880059005624, + "learning_rate": 4.606668387491242e-09, + "loss": 0.4556, + "step": 10676 + }, + { + "epoch": 2.988245172124265, + "grad_norm": 0.2570026338796731, + "learning_rate": 4.399656883380398e-09, + "loss": 0.4777, + "step": 10677 + }, + { + "epoch": 2.9885250489784494, + "grad_norm": 0.24982827713037273, + "learning_rate": 4.197403988492887e-09, + "loss": 0.4447, + "step": 10678 + }, + { + "epoch": 2.9888049258326337, + "grad_norm": 0.2588687368315132, + "learning_rate": 3.999909722085527e-09, + "loss": 0.4514, + "step": 10679 + }, + { + "epoch": 2.9890848026868175, + "grad_norm": 0.2582412441405769, + "learning_rate": 3.807174102948841e-09, + "loss": 0.4494, + "step": 10680 + }, + { + "epoch": 2.989364679541002, + "grad_norm": 0.2508054374346863, + "learning_rate": 3.6191971494292652e-09, + "loss": 0.4488, + "step": 10681 + }, + { + "epoch": 2.989644556395186, + "grad_norm": 0.24647912747459155, + "learning_rate": 3.435978879418045e-09, + "loss": 0.4673, + "step": 10682 + }, + { + "epoch": 2.9899244332493704, + "grad_norm": 0.2600041063000193, + "learning_rate": 3.2575193103567826e-09, + "loss": 0.4533, + "step": 10683 + }, + { + "epoch": 2.9902043101035547, + "grad_norm": 0.25480134712200003, + "learning_rate": 3.0838184592263396e-09, + "loss": 0.4655, + "step": 10684 + }, + { + "epoch": 2.9904841869577385, + "grad_norm": 0.25467713191116054, + "learning_rate": 2.9148763425634886e-09, + "loss": 0.4577, + "step": 10685 + }, + { + "epoch": 2.990764063811923, + "grad_norm": 0.2425309705348517, + "learning_rate": 2.750692976444258e-09, + "loss": 0.4377, + "step": 10686 + }, + { + "epoch": 2.991043940666107, + "grad_norm": 0.26213250626211815, + "learning_rate": 2.5912683765061398e-09, + "loss": 0.4705, + "step": 10687 + }, + { + "epoch": 2.991323817520291, + "grad_norm": 0.2612357437354752, + "learning_rate": 2.436602557909229e-09, + "loss": 0.4615, + "step": 10688 + }, + { + "epoch": 2.9916036943744753, + "grad_norm": 0.260821699009736, + "learning_rate": 2.286695535386185e-09, + "loss": 0.4519, + "step": 10689 + }, + { + "epoch": 2.9918835712286596, + "grad_norm": 0.2515545516960629, + "learning_rate": 2.1415473231978236e-09, + "loss": 0.4532, + "step": 10690 + }, + { + "epoch": 2.9921634480828434, + "grad_norm": 0.2640350546718974, + "learning_rate": 2.001157935160869e-09, + "loss": 0.4628, + "step": 10691 + }, + { + "epoch": 2.9924433249370277, + "grad_norm": 0.2552477225519158, + "learning_rate": 1.865527384642407e-09, + "loss": 0.443, + "step": 10692 + }, + { + "epoch": 2.992723201791212, + "grad_norm": 0.28604816511846115, + "learning_rate": 1.7346556845432294e-09, + "loss": 0.4417, + "step": 10693 + }, + { + "epoch": 2.993003078645396, + "grad_norm": 0.2607651957497237, + "learning_rate": 1.6085428473311404e-09, + "loss": 0.4655, + "step": 10694 + }, + { + "epoch": 2.99328295549958, + "grad_norm": 0.2537250574074743, + "learning_rate": 1.4871888849965488e-09, + "loss": 0.4519, + "step": 10695 + }, + { + "epoch": 2.9935628323537644, + "grad_norm": 0.2496928430714104, + "learning_rate": 1.3705938091024273e-09, + "loss": 0.4401, + "step": 10696 + }, + { + "epoch": 2.9938427092079483, + "grad_norm": 0.2458011649695888, + "learning_rate": 1.2587576307343528e-09, + "loss": 0.4475, + "step": 10697 + }, + { + "epoch": 2.9941225860621326, + "grad_norm": 0.24453034008079477, + "learning_rate": 1.1516803605504666e-09, + "loss": 0.4365, + "step": 10698 + }, + { + "epoch": 2.994402462916317, + "grad_norm": 0.26694280093260125, + "learning_rate": 1.0493620087315136e-09, + "loss": 0.4531, + "step": 10699 + }, + { + "epoch": 2.994682339770501, + "grad_norm": 0.25832226564127514, + "learning_rate": 9.51802585019701e-10, + "loss": 0.4457, + "step": 10700 + }, + { + "epoch": 2.994962216624685, + "grad_norm": 0.25240393231643893, + "learning_rate": 8.590020987020442e-10, + "loss": 0.4708, + "step": 10701 + }, + { + "epoch": 2.9952420934788693, + "grad_norm": 0.253125299590793, + "learning_rate": 7.709605586103674e-10, + "loss": 0.4476, + "step": 10702 + }, + { + "epoch": 2.9955219703330536, + "grad_norm": 0.2445135905145077, + "learning_rate": 6.876779731213035e-10, + "loss": 0.4462, + "step": 10703 + }, + { + "epoch": 2.995801847187238, + "grad_norm": 0.25008011484227777, + "learning_rate": 6.091543501673958e-10, + "loss": 0.445, + "step": 10704 + }, + { + "epoch": 2.9960817240414217, + "grad_norm": 0.25643300402049357, + "learning_rate": 5.353896972204453e-10, + "loss": 0.4406, + "step": 10705 + }, + { + "epoch": 2.996361600895606, + "grad_norm": 0.24847707003519692, + "learning_rate": 4.663840213026127e-10, + "loss": 0.4677, + "step": 10706 + }, + { + "epoch": 2.9966414777497903, + "grad_norm": 0.2520890475535469, + "learning_rate": 4.0213732897531607e-10, + "loss": 0.449, + "step": 10707 + }, + { + "epoch": 2.996921354603974, + "grad_norm": 0.24197743329090915, + "learning_rate": 3.4264962636143536e-10, + "loss": 0.4382, + "step": 10708 + }, + { + "epoch": 2.9972012314581584, + "grad_norm": 0.2663302724507473, + "learning_rate": 2.8792091912310804e-10, + "loss": 0.4549, + "step": 10709 + }, + { + "epoch": 2.9974811083123427, + "grad_norm": 0.27210717025743686, + "learning_rate": 2.379512124617289e-10, + "loss": 0.4738, + "step": 10710 + }, + { + "epoch": 2.9977609851665266, + "grad_norm": 0.25045671556430316, + "learning_rate": 1.9274051114015478e-10, + "loss": 0.4365, + "step": 10711 + }, + { + "epoch": 2.998040862020711, + "grad_norm": 0.2669187587751103, + "learning_rate": 1.522888194604999e-10, + "loss": 0.4566, + "step": 10712 + }, + { + "epoch": 2.998320738874895, + "grad_norm": 0.2600536100682567, + "learning_rate": 1.16596141269687e-10, + "loss": 0.4347, + "step": 10713 + }, + { + "epoch": 2.998600615729079, + "grad_norm": 0.2541319276216827, + "learning_rate": 8.566247997054966e-11, + "loss": 0.4663, + "step": 10714 + }, + { + "epoch": 2.9988804925832633, + "grad_norm": 0.2576446088985775, + "learning_rate": 5.948783850517892e-11, + "loss": 0.4636, + "step": 10715 + }, + { + "epoch": 2.9991603694374476, + "grad_norm": 0.2536248818180751, + "learning_rate": 3.807221936047434e-11, + "loss": 0.435, + "step": 10716 + }, + { + "epoch": 2.9994402462916314, + "grad_norm": 0.2545279038011797, + "learning_rate": 2.1415624579246284e-11, + "loss": 0.4368, + "step": 10717 + }, + { + "epoch": 2.9997201231458157, + "grad_norm": 0.2604701422540892, + "learning_rate": 9.51805574356257e-12, + "loss": 0.4551, + "step": 10718 + }, + { + "epoch": 3.0, + "grad_norm": 0.2550703778343758, + "learning_rate": 2.3795139914017937e-12, + "loss": 0.4401, + "step": 10719 } ], "logging_steps": 1, @@ -70022,12 +75055,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 2349444853596160.0, + "total_flos": 2518172089335808.0, "train_batch_size": 8, "trial_name": null, "trial_params": null