diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4978 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1410, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004259850905218318, + "grad_norm": 0.80859375, + "learning_rate": 4.225352112676057e-07, + "loss": 1.8638979196548462, + "step": 2 + }, + { + "epoch": 0.008519701810436636, + "grad_norm": 0.69921875, + "learning_rate": 1.267605633802817e-06, + "loss": 1.9382712841033936, + "step": 4 + }, + { + "epoch": 0.012779552715654952, + "grad_norm": 1.09375, + "learning_rate": 2.1126760563380285e-06, + "loss": 1.8919719457626343, + "step": 6 + }, + { + "epoch": 0.01703940362087327, + "grad_norm": 0.6875, + "learning_rate": 2.957746478873239e-06, + "loss": 1.9754539728164673, + "step": 8 + }, + { + "epoch": 0.021299254526091587, + "grad_norm": 0.8046875, + "learning_rate": 3.8028169014084508e-06, + "loss": 1.9735431671142578, + "step": 10 + }, + { + "epoch": 0.025559105431309903, + "grad_norm": 0.490234375, + "learning_rate": 4.6478873239436615e-06, + "loss": 1.962188959121704, + "step": 12 + }, + { + "epoch": 0.029818956336528223, + "grad_norm": 0.796875, + "learning_rate": 5.492957746478874e-06, + "loss": 1.8216444253921509, + "step": 14 + }, + { + "epoch": 0.03407880724174654, + "grad_norm": 0.65234375, + "learning_rate": 6.338028169014085e-06, + "loss": 1.879197359085083, + "step": 16 + }, + { + "epoch": 0.038338658146964855, + "grad_norm": 0.58203125, + "learning_rate": 7.183098591549295e-06, + "loss": 1.9074592590332031, + "step": 18 + }, + { + "epoch": 0.042598509052183174, + "grad_norm": 0.5625, + "learning_rate": 8.028169014084507e-06, + "loss": 1.8535538911819458, + "step": 20 + }, + { + "epoch": 0.046858359957401494, + "grad_norm": 0.70703125, + "learning_rate": 8.873239436619718e-06, + "loss": 1.7652872800827026, + "step": 22 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 0.470703125, + "learning_rate": 9.71830985915493e-06, + "loss": 1.7537739276885986, + "step": 24 + }, + { + "epoch": 0.055378061767838126, + "grad_norm": 0.4375, + "learning_rate": 1.056338028169014e-05, + "loss": 1.6868540048599243, + "step": 26 + }, + { + "epoch": 0.059637912673056445, + "grad_norm": 0.419921875, + "learning_rate": 1.1408450704225351e-05, + "loss": 1.694838523864746, + "step": 28 + }, + { + "epoch": 0.06389776357827476, + "grad_norm": 0.6953125, + "learning_rate": 1.2253521126760564e-05, + "loss": 1.7118496894836426, + "step": 30 + }, + { + "epoch": 0.06815761448349308, + "grad_norm": 0.5, + "learning_rate": 1.3098591549295775e-05, + "loss": 1.7367674112319946, + "step": 32 + }, + { + "epoch": 0.0724174653887114, + "grad_norm": 0.625, + "learning_rate": 1.3943661971830985e-05, + "loss": 1.6931723356246948, + "step": 34 + }, + { + "epoch": 0.07667731629392971, + "grad_norm": 0.87109375, + "learning_rate": 1.4788732394366198e-05, + "loss": 1.8322988748550415, + "step": 36 + }, + { + "epoch": 0.08093716719914804, + "grad_norm": 0.71875, + "learning_rate": 1.563380281690141e-05, + "loss": 1.723473072052002, + "step": 38 + }, + { + "epoch": 0.08519701810436635, + "grad_norm": 0.765625, + "learning_rate": 1.6478873239436623e-05, + "loss": 1.584614872932434, + "step": 40 + }, + { + "epoch": 0.08945686900958466, + "grad_norm": 0.458984375, + "learning_rate": 1.7323943661971833e-05, + "loss": 1.757703423500061, + "step": 42 + }, + { + "epoch": 0.09371671991480299, + "grad_norm": 0.384765625, + "learning_rate": 1.816901408450704e-05, + "loss": 1.5781840085983276, + "step": 44 + }, + { + "epoch": 0.0979765708200213, + "grad_norm": 0.98828125, + "learning_rate": 1.9014084507042255e-05, + "loss": 1.6014955043792725, + "step": 46 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 0.73828125, + "learning_rate": 1.9859154929577465e-05, + "loss": 1.674490213394165, + "step": 48 + }, + { + "epoch": 0.10649627263045794, + "grad_norm": 0.61328125, + "learning_rate": 2.0704225352112676e-05, + "loss": 1.393676519393921, + "step": 50 + }, + { + "epoch": 0.11075612353567625, + "grad_norm": 0.69921875, + "learning_rate": 2.154929577464789e-05, + "loss": 1.495132327079773, + "step": 52 + }, + { + "epoch": 0.11501597444089456, + "grad_norm": 0.5703125, + "learning_rate": 2.23943661971831e-05, + "loss": 1.5442848205566406, + "step": 54 + }, + { + "epoch": 0.11927582534611289, + "grad_norm": 0.396484375, + "learning_rate": 2.3239436619718308e-05, + "loss": 1.459320068359375, + "step": 56 + }, + { + "epoch": 0.1235356762513312, + "grad_norm": 0.7578125, + "learning_rate": 2.4084507042253522e-05, + "loss": 1.4848005771636963, + "step": 58 + }, + { + "epoch": 0.12779552715654952, + "grad_norm": 0.310546875, + "learning_rate": 2.4929577464788733e-05, + "loss": 1.3594304323196411, + "step": 60 + }, + { + "epoch": 0.13205537806176784, + "grad_norm": 0.6640625, + "learning_rate": 2.5774647887323944e-05, + "loss": 1.4234025478363037, + "step": 62 + }, + { + "epoch": 0.13631522896698617, + "grad_norm": 0.27734375, + "learning_rate": 2.6619718309859158e-05, + "loss": 1.3102433681488037, + "step": 64 + }, + { + "epoch": 0.14057507987220447, + "grad_norm": 0.3046875, + "learning_rate": 2.746478873239437e-05, + "loss": 1.3784610033035278, + "step": 66 + }, + { + "epoch": 0.1448349307774228, + "grad_norm": 0.5625, + "learning_rate": 2.8309859154929576e-05, + "loss": 1.325601577758789, + "step": 68 + }, + { + "epoch": 0.14909478168264112, + "grad_norm": 0.36328125, + "learning_rate": 2.915492957746479e-05, + "loss": 1.3749815225601196, + "step": 70 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 0.2890625, + "learning_rate": 3e-05, + "loss": 1.4330412149429321, + "step": 72 + }, + { + "epoch": 0.15761448349307774, + "grad_norm": 1.609375, + "learning_rate": 2.9999867885940888e-05, + "loss": 1.4420013427734375, + "step": 74 + }, + { + "epoch": 0.16187433439829607, + "grad_norm": 0.345703125, + "learning_rate": 2.999947154667255e-05, + "loss": 1.3299309015274048, + "step": 76 + }, + { + "epoch": 0.16613418530351437, + "grad_norm": 0.3125, + "learning_rate": 2.9998810990921997e-05, + "loss": 1.2926124334335327, + "step": 78 + }, + { + "epoch": 0.1703940362087327, + "grad_norm": 0.34765625, + "learning_rate": 2.999788623323402e-05, + "loss": 1.4053008556365967, + "step": 80 + }, + { + "epoch": 0.17465388711395102, + "grad_norm": 0.341796875, + "learning_rate": 2.999669729397085e-05, + "loss": 1.3341435194015503, + "step": 82 + }, + { + "epoch": 0.17891373801916932, + "grad_norm": 1.1171875, + "learning_rate": 2.999524419931176e-05, + "loss": 1.3351154327392578, + "step": 84 + }, + { + "epoch": 0.18317358892438765, + "grad_norm": 0.275390625, + "learning_rate": 2.9993526981252465e-05, + "loss": 1.279821515083313, + "step": 86 + }, + { + "epoch": 0.18743343982960597, + "grad_norm": 0.29296875, + "learning_rate": 2.999154567760439e-05, + "loss": 1.2992417812347412, + "step": 88 + }, + { + "epoch": 0.19169329073482427, + "grad_norm": 0.43359375, + "learning_rate": 2.998930033199389e-05, + "loss": 1.2671334743499756, + "step": 90 + }, + { + "epoch": 0.1959531416400426, + "grad_norm": 0.396484375, + "learning_rate": 2.9986790993861245e-05, + "loss": 1.4086840152740479, + "step": 92 + }, + { + "epoch": 0.20021299254526093, + "grad_norm": 0.4765625, + "learning_rate": 2.9984017718459603e-05, + "loss": 1.260171890258789, + "step": 94 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 0.4453125, + "learning_rate": 2.998098056685374e-05, + "loss": 1.337839126586914, + "step": 96 + }, + { + "epoch": 0.20873269435569755, + "grad_norm": 0.44921875, + "learning_rate": 2.9977679605918732e-05, + "loss": 1.3128660917282104, + "step": 98 + }, + { + "epoch": 0.21299254526091588, + "grad_norm": 0.3515625, + "learning_rate": 2.9974114908338454e-05, + "loss": 1.3010621070861816, + "step": 100 + }, + { + "epoch": 0.21725239616613418, + "grad_norm": 0.2451171875, + "learning_rate": 2.9970286552604036e-05, + "loss": 1.3007886409759521, + "step": 102 + }, + { + "epoch": 0.2215122470713525, + "grad_norm": 0.251953125, + "learning_rate": 2.996619462301207e-05, + "loss": 1.3102291822433472, + "step": 104 + }, + { + "epoch": 0.22577209797657083, + "grad_norm": 0.515625, + "learning_rate": 2.9961839209662808e-05, + "loss": 1.4082542657852173, + "step": 106 + }, + { + "epoch": 0.23003194888178913, + "grad_norm": 0.447265625, + "learning_rate": 2.9957220408458118e-05, + "loss": 1.220694899559021, + "step": 108 + }, + { + "epoch": 0.23429179978700745, + "grad_norm": 0.494140625, + "learning_rate": 2.9952338321099435e-05, + "loss": 1.3067396879196167, + "step": 110 + }, + { + "epoch": 0.23855165069222578, + "grad_norm": 0.34765625, + "learning_rate": 2.9947193055085505e-05, + "loss": 1.283180594444275, + "step": 112 + }, + { + "epoch": 0.24281150159744408, + "grad_norm": 0.255859375, + "learning_rate": 2.9941784723709973e-05, + "loss": 1.273285150527954, + "step": 114 + }, + { + "epoch": 0.2470713525026624, + "grad_norm": 0.4453125, + "learning_rate": 2.993611344605895e-05, + "loss": 1.2464628219604492, + "step": 116 + }, + { + "epoch": 0.25133120340788073, + "grad_norm": 0.255859375, + "learning_rate": 2.9930179347008347e-05, + "loss": 1.2899105548858643, + "step": 118 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 0.71484375, + "learning_rate": 2.9923982557221154e-05, + "loss": 1.166140079498291, + "step": 120 + }, + { + "epoch": 0.2598509052183174, + "grad_norm": 0.359375, + "learning_rate": 2.9917523213144554e-05, + "loss": 1.3073238134384155, + "step": 122 + }, + { + "epoch": 0.2641107561235357, + "grad_norm": 0.251953125, + "learning_rate": 2.9910801457006897e-05, + "loss": 1.2734519243240356, + "step": 124 + }, + { + "epoch": 0.268370607028754, + "grad_norm": 0.2470703125, + "learning_rate": 2.9903817436814603e-05, + "loss": 1.190434455871582, + "step": 126 + }, + { + "epoch": 0.27263045793397234, + "grad_norm": 0.41796875, + "learning_rate": 2.9896571306348874e-05, + "loss": 1.2509433031082153, + "step": 128 + }, + { + "epoch": 0.27689030883919064, + "grad_norm": 0.48828125, + "learning_rate": 2.9889063225162337e-05, + "loss": 1.3253034353256226, + "step": 130 + }, + { + "epoch": 0.28115015974440893, + "grad_norm": 0.318359375, + "learning_rate": 2.98812933585755e-05, + "loss": 1.2004448175430298, + "step": 132 + }, + { + "epoch": 0.2854100106496273, + "grad_norm": 0.2890625, + "learning_rate": 2.9873261877673142e-05, + "loss": 1.1677073240280151, + "step": 134 + }, + { + "epoch": 0.2896698615548456, + "grad_norm": 0.48046875, + "learning_rate": 2.9864968959300505e-05, + "loss": 1.2530547380447388, + "step": 136 + }, + { + "epoch": 0.2939297124600639, + "grad_norm": 0.482421875, + "learning_rate": 2.985641478605945e-05, + "loss": 1.2573705911636353, + "step": 138 + }, + { + "epoch": 0.29818956336528224, + "grad_norm": 0.435546875, + "learning_rate": 2.9847599546304395e-05, + "loss": 1.3057535886764526, + "step": 140 + }, + { + "epoch": 0.30244941427050054, + "grad_norm": 0.2734375, + "learning_rate": 2.9838523434138204e-05, + "loss": 1.2737255096435547, + "step": 142 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 0.419921875, + "learning_rate": 2.982918664940787e-05, + "loss": 1.2411197423934937, + "step": 144 + }, + { + "epoch": 0.3109691160809372, + "grad_norm": 0.3203125, + "learning_rate": 2.9819589397700148e-05, + "loss": 1.2790652513504028, + "step": 146 + }, + { + "epoch": 0.3152289669861555, + "grad_norm": 0.408203125, + "learning_rate": 2.9809731890337017e-05, + "loss": 1.2759779691696167, + "step": 148 + }, + { + "epoch": 0.3194888178913738, + "grad_norm": 0.337890625, + "learning_rate": 2.979961434437103e-05, + "loss": 1.3018522262573242, + "step": 150 + }, + { + "epoch": 0.32374866879659214, + "grad_norm": 0.365234375, + "learning_rate": 2.9789236982580538e-05, + "loss": 1.3175352811813354, + "step": 152 + }, + { + "epoch": 0.32800851970181044, + "grad_norm": 0.32421875, + "learning_rate": 2.9778600033464767e-05, + "loss": 1.291448712348938, + "step": 154 + }, + { + "epoch": 0.33226837060702874, + "grad_norm": 0.275390625, + "learning_rate": 2.97677037312388e-05, + "loss": 1.2841699123382568, + "step": 156 + }, + { + "epoch": 0.3365282215122471, + "grad_norm": 0.33203125, + "learning_rate": 2.975654831582843e-05, + "loss": 1.3081012964248657, + "step": 158 + }, + { + "epoch": 0.3407880724174654, + "grad_norm": 0.2734375, + "learning_rate": 2.9745134032864862e-05, + "loss": 1.2524945735931396, + "step": 160 + }, + { + "epoch": 0.3450479233226837, + "grad_norm": 0.228515625, + "learning_rate": 2.973346113367929e-05, + "loss": 1.1932268142700195, + "step": 162 + }, + { + "epoch": 0.34930777422790205, + "grad_norm": 0.453125, + "learning_rate": 2.972152987529741e-05, + "loss": 1.2276166677474976, + "step": 164 + }, + { + "epoch": 0.35356762513312034, + "grad_norm": 0.2734375, + "learning_rate": 2.9709340520433722e-05, + "loss": 1.2343382835388184, + "step": 166 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 0.96875, + "learning_rate": 2.9696893337485734e-05, + "loss": 1.3475210666656494, + "step": 168 + }, + { + "epoch": 0.362087326943557, + "grad_norm": 0.69921875, + "learning_rate": 2.9684188600528098e-05, + "loss": 1.2921943664550781, + "step": 170 + }, + { + "epoch": 0.3663471778487753, + "grad_norm": 0.515625, + "learning_rate": 2.967122658930654e-05, + "loss": 1.1600617170333862, + "step": 172 + }, + { + "epoch": 0.3706070287539936, + "grad_norm": 0.341796875, + "learning_rate": 2.9658007589231723e-05, + "loss": 1.3178966045379639, + "step": 174 + }, + { + "epoch": 0.37486687965921195, + "grad_norm": 0.33203125, + "learning_rate": 2.9644531891372925e-05, + "loss": 1.3098689317703247, + "step": 176 + }, + { + "epoch": 0.37912673056443025, + "grad_norm": 0.234375, + "learning_rate": 2.9630799792451687e-05, + "loss": 1.1713343858718872, + "step": 178 + }, + { + "epoch": 0.38338658146964855, + "grad_norm": 0.2109375, + "learning_rate": 2.9616811594835214e-05, + "loss": 1.2428940534591675, + "step": 180 + }, + { + "epoch": 0.3876464323748669, + "grad_norm": 0.359375, + "learning_rate": 2.9602567606529776e-05, + "loss": 1.2774041891098022, + "step": 182 + }, + { + "epoch": 0.3919062832800852, + "grad_norm": 0.33984375, + "learning_rate": 2.9588068141173888e-05, + "loss": 1.2816710472106934, + "step": 184 + }, + { + "epoch": 0.3961661341853035, + "grad_norm": 0.349609375, + "learning_rate": 2.9573313518031424e-05, + "loss": 1.1415907144546509, + "step": 186 + }, + { + "epoch": 0.40042598509052185, + "grad_norm": 0.396484375, + "learning_rate": 2.955830406198458e-05, + "loss": 1.232388973236084, + "step": 188 + }, + { + "epoch": 0.40468583599574015, + "grad_norm": 0.3125, + "learning_rate": 2.95430401035267e-05, + "loss": 1.250954031944275, + "step": 190 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 0.310546875, + "learning_rate": 2.9527521978755053e-05, + "loss": 1.205154299736023, + "step": 192 + }, + { + "epoch": 0.4132055378061768, + "grad_norm": 0.26171875, + "learning_rate": 2.9511750029363377e-05, + "loss": 1.2991074323654175, + "step": 194 + }, + { + "epoch": 0.4174653887113951, + "grad_norm": 0.296875, + "learning_rate": 2.949572460263438e-05, + "loss": 1.2244809865951538, + "step": 196 + }, + { + "epoch": 0.4217252396166134, + "grad_norm": 0.41796875, + "learning_rate": 2.947944605143208e-05, + "loss": 1.2750012874603271, + "step": 198 + }, + { + "epoch": 0.42598509052183176, + "grad_norm": 0.376953125, + "learning_rate": 2.9462914734194078e-05, + "loss": 1.3423129320144653, + "step": 200 + }, + { + "epoch": 0.43024494142705005, + "grad_norm": 0.35546875, + "learning_rate": 2.9446131014923593e-05, + "loss": 1.280989646911621, + "step": 202 + }, + { + "epoch": 0.43450479233226835, + "grad_norm": 0.26171875, + "learning_rate": 2.9429095263181514e-05, + "loss": 1.2904020547866821, + "step": 204 + }, + { + "epoch": 0.4387646432374867, + "grad_norm": 0.427734375, + "learning_rate": 2.9411807854078226e-05, + "loss": 1.2392964363098145, + "step": 206 + }, + { + "epoch": 0.443024494142705, + "grad_norm": 0.3046875, + "learning_rate": 2.9394269168265358e-05, + "loss": 1.2662076950073242, + "step": 208 + }, + { + "epoch": 0.4472843450479233, + "grad_norm": 0.3046875, + "learning_rate": 2.9376479591927408e-05, + "loss": 1.2238541841506958, + "step": 210 + }, + { + "epoch": 0.45154419595314166, + "grad_norm": 0.263671875, + "learning_rate": 2.935843951677323e-05, + "loss": 1.2052886486053467, + "step": 212 + }, + { + "epoch": 0.45580404685835996, + "grad_norm": 0.3515625, + "learning_rate": 2.9340149340027412e-05, + "loss": 1.2680332660675049, + "step": 214 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 0.318359375, + "learning_rate": 2.9321609464421546e-05, + "loss": 1.233550786972046, + "step": 216 + }, + { + "epoch": 0.4643237486687966, + "grad_norm": 0.578125, + "learning_rate": 2.930282029818533e-05, + "loss": 1.2138795852661133, + "step": 218 + }, + { + "epoch": 0.4685835995740149, + "grad_norm": 0.2578125, + "learning_rate": 2.92837822550376e-05, + "loss": 1.1213371753692627, + "step": 220 + }, + { + "epoch": 0.4728434504792332, + "grad_norm": 0.404296875, + "learning_rate": 2.9264495754177225e-05, + "loss": 1.2127740383148193, + "step": 222 + }, + { + "epoch": 0.47710330138445156, + "grad_norm": 1.5078125, + "learning_rate": 2.924496122027384e-05, + "loss": 1.3384878635406494, + "step": 224 + }, + { + "epoch": 0.48136315228966986, + "grad_norm": 0.345703125, + "learning_rate": 2.9225179083458555e-05, + "loss": 1.1937229633331299, + "step": 226 + }, + { + "epoch": 0.48562300319488816, + "grad_norm": 0.52734375, + "learning_rate": 2.9205149779314425e-05, + "loss": 1.2608391046524048, + "step": 228 + }, + { + "epoch": 0.4898828541001065, + "grad_norm": 0.31640625, + "learning_rate": 2.918487374886691e-05, + "loss": 1.2325993776321411, + "step": 230 + }, + { + "epoch": 0.4941427050053248, + "grad_norm": 0.287109375, + "learning_rate": 2.91643514385741e-05, + "loss": 1.2050917148590088, + "step": 232 + }, + { + "epoch": 0.4984025559105431, + "grad_norm": 0.296875, + "learning_rate": 2.9143583300316975e-05, + "loss": 1.2299753427505493, + "step": 234 + }, + { + "epoch": 0.5026624068157615, + "grad_norm": 0.322265625, + "learning_rate": 2.9122569791389354e-05, + "loss": 1.2500553131103516, + "step": 236 + }, + { + "epoch": 0.5069222577209798, + "grad_norm": 0.46875, + "learning_rate": 2.9101311374487908e-05, + "loss": 1.3044551610946655, + "step": 238 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 0.28125, + "learning_rate": 2.907980851770193e-05, + "loss": 1.1923537254333496, + "step": 240 + }, + { + "epoch": 0.5154419595314164, + "grad_norm": 0.419921875, + "learning_rate": 2.905806169450303e-05, + "loss": 1.2567352056503296, + "step": 242 + }, + { + "epoch": 0.5197018104366348, + "grad_norm": 0.369140625, + "learning_rate": 2.9036071383734716e-05, + "loss": 1.2812081575393677, + "step": 244 + }, + { + "epoch": 0.5239616613418531, + "grad_norm": 0.4609375, + "learning_rate": 2.9013838069601874e-05, + "loss": 1.2612706422805786, + "step": 246 + }, + { + "epoch": 0.5282215122470714, + "grad_norm": 0.27734375, + "learning_rate": 2.8991362241660053e-05, + "loss": 1.2162076234817505, + "step": 248 + }, + { + "epoch": 0.5324813631522897, + "grad_norm": 0.267578125, + "learning_rate": 2.8968644394804736e-05, + "loss": 1.2357534170150757, + "step": 250 + }, + { + "epoch": 0.536741214057508, + "grad_norm": 0.255859375, + "learning_rate": 2.894568502926042e-05, + "loss": 1.144363284111023, + "step": 252 + }, + { + "epoch": 0.5410010649627263, + "grad_norm": 0.68359375, + "learning_rate": 2.8922484650569597e-05, + "loss": 1.1998339891433716, + "step": 254 + }, + { + "epoch": 0.5452609158679447, + "grad_norm": 0.435546875, + "learning_rate": 2.8899043769581627e-05, + "loss": 1.1842751502990723, + "step": 256 + }, + { + "epoch": 0.549520766773163, + "grad_norm": 0.78515625, + "learning_rate": 2.8875362902441517e-05, + "loss": 1.1901715993881226, + "step": 258 + }, + { + "epoch": 0.5537806176783813, + "grad_norm": 0.447265625, + "learning_rate": 2.885144257057849e-05, + "loss": 1.3038347959518433, + "step": 260 + }, + { + "epoch": 0.5580404685835996, + "grad_norm": 0.46484375, + "learning_rate": 2.8827283300694593e-05, + "loss": 1.2350062131881714, + "step": 262 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 0.443359375, + "learning_rate": 2.8802885624753013e-05, + "loss": 1.2469710111618042, + "step": 264 + }, + { + "epoch": 0.5665601703940362, + "grad_norm": 0.38671875, + "learning_rate": 2.8778250079966417e-05, + "loss": 1.2484819889068604, + "step": 266 + }, + { + "epoch": 0.5708200212992546, + "grad_norm": 0.48046875, + "learning_rate": 2.875337720878512e-05, + "loss": 1.213232159614563, + "step": 268 + }, + { + "epoch": 0.5750798722044729, + "grad_norm": 0.349609375, + "learning_rate": 2.8728267558885102e-05, + "loss": 1.1985093355178833, + "step": 270 + }, + { + "epoch": 0.5793397231096912, + "grad_norm": 0.28125, + "learning_rate": 2.8702921683156e-05, + "loss": 1.2459266185760498, + "step": 272 + }, + { + "epoch": 0.5835995740149095, + "grad_norm": 0.251953125, + "learning_rate": 2.867734013968891e-05, + "loss": 1.3075346946716309, + "step": 274 + }, + { + "epoch": 0.5878594249201278, + "grad_norm": 0.63671875, + "learning_rate": 2.8651523491764074e-05, + "loss": 1.254473090171814, + "step": 276 + }, + { + "epoch": 0.5921192758253461, + "grad_norm": 0.5703125, + "learning_rate": 2.8625472307838518e-05, + "loss": 1.2639200687408447, + "step": 278 + }, + { + "epoch": 0.5963791267305645, + "grad_norm": 0.703125, + "learning_rate": 2.8599187161533533e-05, + "loss": 1.23056161403656, + "step": 280 + }, + { + "epoch": 0.6006389776357828, + "grad_norm": 0.53125, + "learning_rate": 2.8572668631622e-05, + "loss": 1.2778501510620117, + "step": 282 + }, + { + "epoch": 0.6048988285410011, + "grad_norm": 0.376953125, + "learning_rate": 2.8545917302015693e-05, + "loss": 1.240308403968811, + "step": 284 + }, + { + "epoch": 0.6091586794462194, + "grad_norm": 0.2734375, + "learning_rate": 2.851893376175241e-05, + "loss": 1.3061432838439941, + "step": 286 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 0.326171875, + "learning_rate": 2.849171860498298e-05, + "loss": 1.1693536043167114, + "step": 288 + }, + { + "epoch": 0.617678381256656, + "grad_norm": 0.396484375, + "learning_rate": 2.8464272430958208e-05, + "loss": 1.3255276679992676, + "step": 290 + }, + { + "epoch": 0.6219382321618744, + "grad_norm": 0.4140625, + "learning_rate": 2.843659584401568e-05, + "loss": 1.1839312314987183, + "step": 292 + }, + { + "epoch": 0.6261980830670927, + "grad_norm": 0.400390625, + "learning_rate": 2.840868945356643e-05, + "loss": 1.2237545251846313, + "step": 294 + }, + { + "epoch": 0.630457933972311, + "grad_norm": 0.279296875, + "learning_rate": 2.8380553874081544e-05, + "loss": 1.219810962677002, + "step": 296 + }, + { + "epoch": 0.6347177848775293, + "grad_norm": 0.2099609375, + "learning_rate": 2.8352189725078623e-05, + "loss": 1.148103952407837, + "step": 298 + }, + { + "epoch": 0.6389776357827476, + "grad_norm": 0.70703125, + "learning_rate": 2.8323597631108148e-05, + "loss": 1.266182780265808, + "step": 300 + }, + { + "epoch": 0.6432374866879659, + "grad_norm": 0.9765625, + "learning_rate": 2.829477822173972e-05, + "loss": 1.1832197904586792, + "step": 302 + }, + { + "epoch": 0.6474973375931843, + "grad_norm": 0.322265625, + "learning_rate": 2.8265732131548185e-05, + "loss": 1.2743726968765259, + "step": 304 + }, + { + "epoch": 0.6517571884984026, + "grad_norm": 0.2216796875, + "learning_rate": 2.82364600000997e-05, + "loss": 1.2408907413482666, + "step": 306 + }, + { + "epoch": 0.6560170394036209, + "grad_norm": 0.46875, + "learning_rate": 2.8206962471937612e-05, + "loss": 1.2314817905426025, + "step": 308 + }, + { + "epoch": 0.6602768903088392, + "grad_norm": 0.279296875, + "learning_rate": 2.817724019656829e-05, + "loss": 1.0730669498443604, + "step": 310 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 0.3203125, + "learning_rate": 2.81472938284468e-05, + "loss": 1.250943660736084, + "step": 312 + }, + { + "epoch": 0.6687965921192758, + "grad_norm": 0.98046875, + "learning_rate": 2.811712402696252e-05, + "loss": 1.1586111783981323, + "step": 314 + }, + { + "epoch": 0.6730564430244942, + "grad_norm": 0.267578125, + "learning_rate": 2.808673145642461e-05, + "loss": 1.2091357707977295, + "step": 316 + }, + { + "epoch": 0.6773162939297125, + "grad_norm": 0.7890625, + "learning_rate": 2.805611678604737e-05, + "loss": 1.219393253326416, + "step": 318 + }, + { + "epoch": 0.6815761448349308, + "grad_norm": 0.302734375, + "learning_rate": 2.8025280689935538e-05, + "loss": 1.2416179180145264, + "step": 320 + }, + { + "epoch": 0.6858359957401491, + "grad_norm": 0.46875, + "learning_rate": 2.7994223847069417e-05, + "loss": 1.2236298322677612, + "step": 322 + }, + { + "epoch": 0.6900958466453674, + "grad_norm": 1.109375, + "learning_rate": 2.7962946941289932e-05, + "loss": 1.1898835897445679, + "step": 324 + }, + { + "epoch": 0.6943556975505857, + "grad_norm": 0.357421875, + "learning_rate": 2.7931450661283587e-05, + "loss": 1.1595722436904907, + "step": 326 + }, + { + "epoch": 0.6986155484558041, + "grad_norm": 0.53125, + "learning_rate": 2.7899735700567272e-05, + "loss": 1.221711277961731, + "step": 328 + }, + { + "epoch": 0.7028753993610224, + "grad_norm": 0.2275390625, + "learning_rate": 2.7867802757473023e-05, + "loss": 1.2105400562286377, + "step": 330 + }, + { + "epoch": 0.7071352502662407, + "grad_norm": 0.30078125, + "learning_rate": 2.7835652535132635e-05, + "loss": 1.2640867233276367, + "step": 332 + }, + { + "epoch": 0.711395101171459, + "grad_norm": 0.330078125, + "learning_rate": 2.780328574146216e-05, + "loss": 1.259413480758667, + "step": 334 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 0.4453125, + "learning_rate": 2.7770703089146355e-05, + "loss": 1.3237056732177734, + "step": 336 + }, + { + "epoch": 0.7199148029818956, + "grad_norm": 0.400390625, + "learning_rate": 2.7737905295622957e-05, + "loss": 1.2199316024780273, + "step": 338 + }, + { + "epoch": 0.724174653887114, + "grad_norm": 0.51953125, + "learning_rate": 2.7704893083066906e-05, + "loss": 1.1969261169433594, + "step": 340 + }, + { + "epoch": 0.7284345047923323, + "grad_norm": 0.466796875, + "learning_rate": 2.7671667178374443e-05, + "loss": 1.2693402767181396, + "step": 342 + }, + { + "epoch": 0.7326943556975506, + "grad_norm": 0.265625, + "learning_rate": 2.7638228313147083e-05, + "loss": 1.230875015258789, + "step": 344 + }, + { + "epoch": 0.7369542066027689, + "grad_norm": 0.375, + "learning_rate": 2.760457722367553e-05, + "loss": 1.1558018922805786, + "step": 346 + }, + { + "epoch": 0.7412140575079872, + "grad_norm": 2.40625, + "learning_rate": 2.7570714650923446e-05, + "loss": 1.3312543630599976, + "step": 348 + }, + { + "epoch": 0.7454739084132055, + "grad_norm": 0.283203125, + "learning_rate": 2.7536641340511177e-05, + "loss": 1.1423282623291016, + "step": 350 + }, + { + "epoch": 0.7497337593184239, + "grad_norm": 0.41796875, + "learning_rate": 2.7502358042699257e-05, + "loss": 1.1751903295516968, + "step": 352 + }, + { + "epoch": 0.7539936102236422, + "grad_norm": 0.384765625, + "learning_rate": 2.7467865512371974e-05, + "loss": 1.2713823318481445, + "step": 354 + }, + { + "epoch": 0.7582534611288605, + "grad_norm": 0.330078125, + "learning_rate": 2.7433164509020684e-05, + "loss": 1.2887362241744995, + "step": 356 + }, + { + "epoch": 0.7625133120340788, + "grad_norm": 0.341796875, + "learning_rate": 2.7398255796727127e-05, + "loss": 1.2369112968444824, + "step": 358 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 0.34375, + "learning_rate": 2.7363140144146578e-05, + "loss": 1.150454044342041, + "step": 360 + }, + { + "epoch": 0.7710330138445154, + "grad_norm": 0.275390625, + "learning_rate": 2.7327818324490938e-05, + "loss": 1.2185767889022827, + "step": 362 + }, + { + "epoch": 0.7752928647497338, + "grad_norm": 0.6953125, + "learning_rate": 2.729229111551171e-05, + "loss": 1.2292591333389282, + "step": 364 + }, + { + "epoch": 0.7795527156549521, + "grad_norm": 0.236328125, + "learning_rate": 2.725655929948285e-05, + "loss": 1.2185684442520142, + "step": 366 + }, + { + "epoch": 0.7838125665601704, + "grad_norm": 0.30078125, + "learning_rate": 2.722062366318357e-05, + "loss": 1.1981046199798584, + "step": 368 + }, + { + "epoch": 0.7880724174653887, + "grad_norm": 0.2197265625, + "learning_rate": 2.7184484997881e-05, + "loss": 1.1411432027816772, + "step": 370 + }, + { + "epoch": 0.792332268370607, + "grad_norm": 0.85546875, + "learning_rate": 2.7148144099312765e-05, + "loss": 1.2738561630249023, + "step": 372 + }, + { + "epoch": 0.7965921192758253, + "grad_norm": 0.275390625, + "learning_rate": 2.7111601767669473e-05, + "loss": 1.1942780017852783, + "step": 374 + }, + { + "epoch": 0.8008519701810437, + "grad_norm": 0.212890625, + "learning_rate": 2.7074858807577084e-05, + "loss": 1.1684967279434204, + "step": 376 + }, + { + "epoch": 0.805111821086262, + "grad_norm": 0.171875, + "learning_rate": 2.7037916028079198e-05, + "loss": 1.1836313009262085, + "step": 378 + }, + { + "epoch": 0.8093716719914803, + "grad_norm": 0.365234375, + "learning_rate": 2.7000774242619235e-05, + "loss": 1.2047457695007324, + "step": 380 + }, + { + "epoch": 0.8136315228966986, + "grad_norm": 0.2294921875, + "learning_rate": 2.696343426902254e-05, + "loss": 1.186992883682251, + "step": 382 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 0.376953125, + "learning_rate": 2.6925896929478355e-05, + "loss": 1.1887181997299194, + "step": 384 + }, + { + "epoch": 0.8221512247071352, + "grad_norm": 0.328125, + "learning_rate": 2.6888163050521734e-05, + "loss": 1.2181212902069092, + "step": 386 + }, + { + "epoch": 0.8264110756123536, + "grad_norm": 0.55078125, + "learning_rate": 2.6850233463015334e-05, + "loss": 1.1820951700210571, + "step": 388 + }, + { + "epoch": 0.8306709265175719, + "grad_norm": 0.267578125, + "learning_rate": 2.6812109002131106e-05, + "loss": 1.1575113534927368, + "step": 390 + }, + { + "epoch": 0.8349307774227902, + "grad_norm": 0.40625, + "learning_rate": 2.6773790507331936e-05, + "loss": 1.1017088890075684, + "step": 392 + }, + { + "epoch": 0.8391906283280085, + "grad_norm": 0.310546875, + "learning_rate": 2.673527882235314e-05, + "loss": 1.1889958381652832, + "step": 394 + }, + { + "epoch": 0.8434504792332268, + "grad_norm": 0.373046875, + "learning_rate": 2.6696574795183882e-05, + "loss": 1.1406269073486328, + "step": 396 + }, + { + "epoch": 0.8477103301384451, + "grad_norm": 0.453125, + "learning_rate": 2.665767927804852e-05, + "loss": 1.172967791557312, + "step": 398 + }, + { + "epoch": 0.8519701810436635, + "grad_norm": 0.23046875, + "learning_rate": 2.661859312738783e-05, + "loss": 1.2290892601013184, + "step": 400 + }, + { + "epoch": 0.8562300319488818, + "grad_norm": 0.255859375, + "learning_rate": 2.6579317203840154e-05, + "loss": 1.0655782222747803, + "step": 402 + }, + { + "epoch": 0.8604898828541001, + "grad_norm": 0.208984375, + "learning_rate": 2.6539852372222434e-05, + "loss": 1.1730587482452393, + "step": 404 + }, + { + "epoch": 0.8647497337593184, + "grad_norm": 0.283203125, + "learning_rate": 2.6500199501511184e-05, + "loss": 1.2667183876037598, + "step": 406 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 0.3125, + "learning_rate": 2.646035946482336e-05, + "loss": 1.2611602544784546, + "step": 408 + }, + { + "epoch": 0.873269435569755, + "grad_norm": 0.234375, + "learning_rate": 2.6420333139397122e-05, + "loss": 1.2684861421585083, + "step": 410 + }, + { + "epoch": 0.8775292864749734, + "grad_norm": 0.2333984375, + "learning_rate": 2.638012140657252e-05, + "loss": 1.2144488096237183, + "step": 412 + }, + { + "epoch": 0.8817891373801917, + "grad_norm": 0.44921875, + "learning_rate": 2.6339725151772095e-05, + "loss": 1.2024558782577515, + "step": 414 + }, + { + "epoch": 0.88604898828541, + "grad_norm": 0.478515625, + "learning_rate": 2.6299145264481386e-05, + "loss": 1.2472572326660156, + "step": 416 + }, + { + "epoch": 0.8903088391906283, + "grad_norm": 0.310546875, + "learning_rate": 2.625838263822932e-05, + "loss": 1.15989351272583, + "step": 418 + }, + { + "epoch": 0.8945686900958466, + "grad_norm": 0.236328125, + "learning_rate": 2.621743817056858e-05, + "loss": 1.2214092016220093, + "step": 420 + }, + { + "epoch": 0.898828541001065, + "grad_norm": 0.37109375, + "learning_rate": 2.6176312763055795e-05, + "loss": 1.1031744480133057, + "step": 422 + }, + { + "epoch": 0.9030883919062833, + "grad_norm": 0.251953125, + "learning_rate": 2.6135007321231715e-05, + "loss": 1.0990759134292603, + "step": 424 + }, + { + "epoch": 0.9073482428115016, + "grad_norm": 0.41015625, + "learning_rate": 2.6093522754601284e-05, + "loss": 1.180249810218811, + "step": 426 + }, + { + "epoch": 0.9116080937167199, + "grad_norm": 0.185546875, + "learning_rate": 2.6051859976613564e-05, + "loss": 1.1679967641830444, + "step": 428 + }, + { + "epoch": 0.9158679446219382, + "grad_norm": 0.294921875, + "learning_rate": 2.601001990464169e-05, + "loss": 1.1675636768341064, + "step": 430 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 0.93359375, + "learning_rate": 2.5968003459962608e-05, + "loss": 1.187214732170105, + "step": 432 + }, + { + "epoch": 0.9243876464323749, + "grad_norm": 0.23828125, + "learning_rate": 2.592581156773684e-05, + "loss": 1.1574485301971436, + "step": 434 + }, + { + "epoch": 0.9286474973375932, + "grad_norm": 0.33203125, + "learning_rate": 2.588344515698806e-05, + "loss": 1.207824468612671, + "step": 436 + }, + { + "epoch": 0.9329073482428115, + "grad_norm": 0.275390625, + "learning_rate": 2.58409051605827e-05, + "loss": 1.160508155822754, + "step": 438 + }, + { + "epoch": 0.9371671991480298, + "grad_norm": 0.3125, + "learning_rate": 2.5798192515209343e-05, + "loss": 1.1380846500396729, + "step": 440 + }, + { + "epoch": 0.9414270500532481, + "grad_norm": 0.1884765625, + "learning_rate": 2.5755308161358166e-05, + "loss": 1.1430374383926392, + "step": 442 + }, + { + "epoch": 0.9456869009584664, + "grad_norm": 1.0625, + "learning_rate": 2.5712253043300174e-05, + "loss": 1.1965644359588623, + "step": 444 + }, + { + "epoch": 0.9499467518636848, + "grad_norm": 0.302734375, + "learning_rate": 2.5669028109066426e-05, + "loss": 1.2050869464874268, + "step": 446 + }, + { + "epoch": 0.9542066027689031, + "grad_norm": 0.380859375, + "learning_rate": 2.5625634310427188e-05, + "loss": 1.1945817470550537, + "step": 448 + }, + { + "epoch": 0.9584664536741214, + "grad_norm": 0.275390625, + "learning_rate": 2.558207260287093e-05, + "loss": 1.1947966814041138, + "step": 450 + }, + { + "epoch": 0.9627263045793397, + "grad_norm": 0.279296875, + "learning_rate": 2.553834394558332e-05, + "loss": 1.134352445602417, + "step": 452 + }, + { + "epoch": 0.966986155484558, + "grad_norm": 1.0859375, + "learning_rate": 2.5494449301426102e-05, + "loss": 1.2251217365264893, + "step": 454 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 0.2177734375, + "learning_rate": 2.5450389636915867e-05, + "loss": 1.081860899925232, + "step": 456 + }, + { + "epoch": 0.9755058572949947, + "grad_norm": 0.1943359375, + "learning_rate": 2.540616592220281e-05, + "loss": 1.182367205619812, + "step": 458 + }, + { + "epoch": 0.979765708200213, + "grad_norm": 0.2451171875, + "learning_rate": 2.5361779131049344e-05, + "loss": 1.158174991607666, + "step": 460 + }, + { + "epoch": 0.9840255591054313, + "grad_norm": 0.2001953125, + "learning_rate": 2.5317230240808656e-05, + "loss": 1.1436811685562134, + "step": 462 + }, + { + "epoch": 0.9882854100106496, + "grad_norm": 0.31640625, + "learning_rate": 2.527252023240319e-05, + "loss": 1.1009982824325562, + "step": 464 + }, + { + "epoch": 0.9925452609158679, + "grad_norm": 0.39453125, + "learning_rate": 2.5227650090303083e-05, + "loss": 1.2242732048034668, + "step": 466 + }, + { + "epoch": 0.9968051118210862, + "grad_norm": 0.255859375, + "learning_rate": 2.5182620802504415e-05, + "loss": 1.1412031650543213, + "step": 468 + }, + { + "epoch": 1.0, + "grad_norm": 0.453125, + "learning_rate": 2.513743336050753e-05, + "loss": 1.3368866443634033, + "step": 470 + }, + { + "epoch": 1.0042598509052183, + "grad_norm": 0.23828125, + "learning_rate": 2.5092088759295147e-05, + "loss": 0.9358726739883423, + "step": 472 + }, + { + "epoch": 1.0085197018104366, + "grad_norm": 0.189453125, + "learning_rate": 2.5046587997310503e-05, + "loss": 0.9842238426208496, + "step": 474 + }, + { + "epoch": 1.012779552715655, + "grad_norm": 0.2314453125, + "learning_rate": 2.500093207643532e-05, + "loss": 0.909864068031311, + "step": 476 + }, + { + "epoch": 1.0170394036208732, + "grad_norm": 0.248046875, + "learning_rate": 2.4955122001967757e-05, + "loss": 0.8217376470565796, + "step": 478 + }, + { + "epoch": 1.0212992545260915, + "grad_norm": 0.2236328125, + "learning_rate": 2.4909158782600303e-05, + "loss": 0.9412868618965149, + "step": 480 + }, + { + "epoch": 1.0255591054313098, + "grad_norm": 0.423828125, + "learning_rate": 2.4863043430397546e-05, + "loss": 0.9232436418533325, + "step": 482 + }, + { + "epoch": 1.0298189563365283, + "grad_norm": 0.1708984375, + "learning_rate": 2.481677696077387e-05, + "loss": 0.9075867533683777, + "step": 484 + }, + { + "epoch": 1.0340788072417466, + "grad_norm": 0.3203125, + "learning_rate": 2.477036039247113e-05, + "loss": 0.9229554533958435, + "step": 486 + }, + { + "epoch": 1.038338658146965, + "grad_norm": 0.28515625, + "learning_rate": 2.4723794747536204e-05, + "loss": 0.8909753561019897, + "step": 488 + }, + { + "epoch": 1.0425985090521832, + "grad_norm": 0.2578125, + "learning_rate": 2.4677081051298473e-05, + "loss": 0.8516156077384949, + "step": 490 + }, + { + "epoch": 1.0468583599574015, + "grad_norm": 0.4375, + "learning_rate": 2.4630220332347293e-05, + "loss": 0.910189151763916, + "step": 492 + }, + { + "epoch": 1.0511182108626198, + "grad_norm": 0.2314453125, + "learning_rate": 2.458321362250928e-05, + "loss": 0.8809674978256226, + "step": 494 + }, + { + "epoch": 1.055378061767838, + "grad_norm": 0.337890625, + "learning_rate": 2.4536061956825653e-05, + "loss": 0.9545248746871948, + "step": 496 + }, + { + "epoch": 1.0596379126730564, + "grad_norm": 0.359375, + "learning_rate": 2.44887663735294e-05, + "loss": 0.8128166794776917, + "step": 498 + }, + { + "epoch": 1.0638977635782747, + "grad_norm": 0.271484375, + "learning_rate": 2.4441327914022435e-05, + "loss": 0.7933678030967712, + "step": 500 + }, + { + "epoch": 1.068157614483493, + "grad_norm": 0.5859375, + "learning_rate": 2.4393747622852666e-05, + "loss": 0.845329761505127, + "step": 502 + }, + { + "epoch": 1.0724174653887113, + "grad_norm": 0.291015625, + "learning_rate": 2.4346026547690983e-05, + "loss": 0.8825768232345581, + "step": 504 + }, + { + "epoch": 1.0766773162939298, + "grad_norm": 0.322265625, + "learning_rate": 2.4298165739308227e-05, + "loss": 0.9173828959465027, + "step": 506 + }, + { + "epoch": 1.0809371671991481, + "grad_norm": 0.625, + "learning_rate": 2.4250166251551998e-05, + "loss": 0.9571421146392822, + "step": 508 + }, + { + "epoch": 1.0851970181043664, + "grad_norm": 0.29296875, + "learning_rate": 2.4202029141323492e-05, + "loss": 0.8474833369255066, + "step": 510 + }, + { + "epoch": 1.0894568690095847, + "grad_norm": 0.34765625, + "learning_rate": 2.415375546855422e-05, + "loss": 0.8801344633102417, + "step": 512 + }, + { + "epoch": 1.093716719914803, + "grad_norm": 0.33984375, + "learning_rate": 2.4105346296182648e-05, + "loss": 0.8761341571807861, + "step": 514 + }, + { + "epoch": 1.0979765708200213, + "grad_norm": 0.921875, + "learning_rate": 2.4056802690130826e-05, + "loss": 0.8511140942573547, + "step": 516 + }, + { + "epoch": 1.1022364217252396, + "grad_norm": 0.2470703125, + "learning_rate": 2.4008125719280893e-05, + "loss": 0.8243319392204285, + "step": 518 + }, + { + "epoch": 1.106496272630458, + "grad_norm": 0.2353515625, + "learning_rate": 2.395931645545155e-05, + "loss": 0.9023821949958801, + "step": 520 + }, + { + "epoch": 1.1107561235356762, + "grad_norm": 0.5859375, + "learning_rate": 2.391037597337446e-05, + "loss": 0.8977804183959961, + "step": 522 + }, + { + "epoch": 1.1150159744408945, + "grad_norm": 0.271484375, + "learning_rate": 2.3861305350670564e-05, + "loss": 0.8644490242004395, + "step": 524 + }, + { + "epoch": 1.1192758253461128, + "grad_norm": 0.220703125, + "learning_rate": 2.381210566782642e-05, + "loss": 0.8652825951576233, + "step": 526 + }, + { + "epoch": 1.123535676251331, + "grad_norm": 0.48828125, + "learning_rate": 2.3762778008170296e-05, + "loss": 0.9315000176429749, + "step": 528 + }, + { + "epoch": 1.1277955271565494, + "grad_norm": 0.298828125, + "learning_rate": 2.3713323457848425e-05, + "loss": 0.8627546429634094, + "step": 530 + }, + { + "epoch": 1.132055378061768, + "grad_norm": 0.3515625, + "learning_rate": 2.366374310580106e-05, + "loss": 0.8466436266899109, + "step": 532 + }, + { + "epoch": 1.1363152289669862, + "grad_norm": 0.234375, + "learning_rate": 2.3614038043738432e-05, + "loss": 0.8433495163917542, + "step": 534 + }, + { + "epoch": 1.1405750798722045, + "grad_norm": 0.671875, + "learning_rate": 2.35642093661168e-05, + "loss": 0.9653686285018921, + "step": 536 + }, + { + "epoch": 1.1448349307774228, + "grad_norm": 0.46484375, + "learning_rate": 2.351425817011432e-05, + "loss": 0.9155454039573669, + "step": 538 + }, + { + "epoch": 1.1490947816826411, + "grad_norm": 0.2333984375, + "learning_rate": 2.3464185555606854e-05, + "loss": 0.8044310212135315, + "step": 540 + }, + { + "epoch": 1.1533546325878594, + "grad_norm": 0.2451171875, + "learning_rate": 2.3413992625143808e-05, + "loss": 0.8448784947395325, + "step": 542 + }, + { + "epoch": 1.1576144834930777, + "grad_norm": 0.51171875, + "learning_rate": 2.3363680483923794e-05, + "loss": 0.9145954251289368, + "step": 544 + }, + { + "epoch": 1.161874334398296, + "grad_norm": 0.251953125, + "learning_rate": 2.3313250239770364e-05, + "loss": 0.8059402108192444, + "step": 546 + }, + { + "epoch": 1.1661341853035143, + "grad_norm": 0.32421875, + "learning_rate": 2.326270300310756e-05, + "loss": 0.910370945930481, + "step": 548 + }, + { + "epoch": 1.1703940362087326, + "grad_norm": 0.2060546875, + "learning_rate": 2.3212039886935464e-05, + "loss": 0.8459041118621826, + "step": 550 + }, + { + "epoch": 1.174653887113951, + "grad_norm": 0.23046875, + "learning_rate": 2.3161262006805744e-05, + "loss": 0.8679651618003845, + "step": 552 + }, + { + "epoch": 1.1789137380191694, + "grad_norm": 0.353515625, + "learning_rate": 2.3110370480797046e-05, + "loss": 0.8998923897743225, + "step": 554 + }, + { + "epoch": 1.1831735889243877, + "grad_norm": 0.380859375, + "learning_rate": 2.3059366429490382e-05, + "loss": 0.9410486817359924, + "step": 556 + }, + { + "epoch": 1.187433439829606, + "grad_norm": 0.294921875, + "learning_rate": 2.3008250975944458e-05, + "loss": 0.8485605120658875, + "step": 558 + }, + { + "epoch": 1.1916932907348243, + "grad_norm": 0.240234375, + "learning_rate": 2.2957025245670945e-05, + "loss": 0.8777744770050049, + "step": 560 + }, + { + "epoch": 1.1959531416400426, + "grad_norm": 0.40234375, + "learning_rate": 2.2905690366609703e-05, + "loss": 0.9006752967834473, + "step": 562 + }, + { + "epoch": 1.200212992545261, + "grad_norm": 0.3828125, + "learning_rate": 2.2854247469103943e-05, + "loss": 0.8309807181358337, + "step": 564 + }, + { + "epoch": 1.2044728434504792, + "grad_norm": 0.26953125, + "learning_rate": 2.280269768587534e-05, + "loss": 0.9057250618934631, + "step": 566 + }, + { + "epoch": 1.2087326943556975, + "grad_norm": 0.2080078125, + "learning_rate": 2.2751042151999064e-05, + "loss": 0.829549252986908, + "step": 568 + }, + { + "epoch": 1.2129925452609158, + "grad_norm": 0.2734375, + "learning_rate": 2.2699282004878834e-05, + "loss": 0.9091805219650269, + "step": 570 + }, + { + "epoch": 1.2172523961661341, + "grad_norm": 0.2490234375, + "learning_rate": 2.264741838422183e-05, + "loss": 0.8178958296775818, + "step": 572 + }, + { + "epoch": 1.2215122470713524, + "grad_norm": 0.62109375, + "learning_rate": 2.2595452432013637e-05, + "loss": 0.9319694638252258, + "step": 574 + }, + { + "epoch": 1.225772097976571, + "grad_norm": 0.255859375, + "learning_rate": 2.2543385292493068e-05, + "loss": 0.8054318428039551, + "step": 576 + }, + { + "epoch": 1.230031948881789, + "grad_norm": 0.37890625, + "learning_rate": 2.2491218112126974e-05, + "loss": 0.8717759847640991, + "step": 578 + }, + { + "epoch": 1.2342917997870075, + "grad_norm": 0.28515625, + "learning_rate": 2.2438952039585023e-05, + "loss": 0.9084351062774658, + "step": 580 + }, + { + "epoch": 1.2385516506922258, + "grad_norm": 0.25390625, + "learning_rate": 2.238658822571437e-05, + "loss": 0.877246618270874, + "step": 582 + }, + { + "epoch": 1.2428115015974441, + "grad_norm": 0.380859375, + "learning_rate": 2.2334127823514353e-05, + "loss": 0.8917878866195679, + "step": 584 + }, + { + "epoch": 1.2470713525026624, + "grad_norm": 0.2392578125, + "learning_rate": 2.2281571988111087e-05, + "loss": 0.9018102884292603, + "step": 586 + }, + { + "epoch": 1.2513312034078807, + "grad_norm": 0.181640625, + "learning_rate": 2.222892187673203e-05, + "loss": 0.8929234147071838, + "step": 588 + }, + { + "epoch": 1.255591054313099, + "grad_norm": 0.291015625, + "learning_rate": 2.2176178648680504e-05, + "loss": 0.9248031973838806, + "step": 590 + }, + { + "epoch": 1.2598509052183173, + "grad_norm": 0.275390625, + "learning_rate": 2.2123343465310163e-05, + "loss": 0.9204663038253784, + "step": 592 + }, + { + "epoch": 1.2641107561235356, + "grad_norm": 0.2021484375, + "learning_rate": 2.2070417489999427e-05, + "loss": 0.8040061593055725, + "step": 594 + }, + { + "epoch": 1.268370607028754, + "grad_norm": 0.349609375, + "learning_rate": 2.201740188812588e-05, + "loss": 0.9146944880485535, + "step": 596 + }, + { + "epoch": 1.2726304579339724, + "grad_norm": 0.2578125, + "learning_rate": 2.196429782704057e-05, + "loss": 0.8526248931884766, + "step": 598 + }, + { + "epoch": 1.2768903088391905, + "grad_norm": 0.328125, + "learning_rate": 2.191110647604235e-05, + "loss": 0.8366101384162903, + "step": 600 + }, + { + "epoch": 1.281150159744409, + "grad_norm": 0.2333984375, + "learning_rate": 2.1857829006352092e-05, + "loss": 0.8716267347335815, + "step": 602 + }, + { + "epoch": 1.2854100106496273, + "grad_norm": 0.333984375, + "learning_rate": 2.180446659108693e-05, + "loss": 0.9040926694869995, + "step": 604 + }, + { + "epoch": 1.2896698615548456, + "grad_norm": 0.251953125, + "learning_rate": 2.1751020405234427e-05, + "loss": 0.8583382368087769, + "step": 606 + }, + { + "epoch": 1.293929712460064, + "grad_norm": 0.314453125, + "learning_rate": 2.1697491625626652e-05, + "loss": 0.8685941696166992, + "step": 608 + }, + { + "epoch": 1.2981895633652822, + "grad_norm": 0.21484375, + "learning_rate": 2.1643881430914343e-05, + "loss": 0.8654310703277588, + "step": 610 + }, + { + "epoch": 1.3024494142705005, + "grad_norm": 0.2265625, + "learning_rate": 2.1590191001540903e-05, + "loss": 0.8943390846252441, + "step": 612 + }, + { + "epoch": 1.3067092651757188, + "grad_norm": 0.26171875, + "learning_rate": 2.153642151971643e-05, + "loss": 0.8576252460479736, + "step": 614 + }, + { + "epoch": 1.3109691160809371, + "grad_norm": 0.4375, + "learning_rate": 2.1482574169391664e-05, + "loss": 0.8761968612670898, + "step": 616 + }, + { + "epoch": 1.3152289669861554, + "grad_norm": 0.232421875, + "learning_rate": 2.1428650136231948e-05, + "loss": 0.8207455277442932, + "step": 618 + }, + { + "epoch": 1.3194888178913737, + "grad_norm": 0.22265625, + "learning_rate": 2.1374650607591106e-05, + "loss": 0.8694437742233276, + "step": 620 + }, + { + "epoch": 1.323748668796592, + "grad_norm": 0.408203125, + "learning_rate": 2.1320576772485284e-05, + "loss": 0.872995138168335, + "step": 622 + }, + { + "epoch": 1.3280085197018106, + "grad_norm": 0.30859375, + "learning_rate": 2.126642982156679e-05, + "loss": 0.9666632413864136, + "step": 624 + }, + { + "epoch": 1.3322683706070286, + "grad_norm": 0.3828125, + "learning_rate": 2.1212210947097873e-05, + "loss": 0.8025370836257935, + "step": 626 + }, + { + "epoch": 1.3365282215122471, + "grad_norm": 0.349609375, + "learning_rate": 2.1157921342924457e-05, + "loss": 0.8531129956245422, + "step": 628 + }, + { + "epoch": 1.3407880724174654, + "grad_norm": 0.2734375, + "learning_rate": 2.1103562204449876e-05, + "loss": 0.8310921788215637, + "step": 630 + }, + { + "epoch": 1.3450479233226837, + "grad_norm": 0.291015625, + "learning_rate": 2.1049134728608537e-05, + "loss": 0.903289794921875, + "step": 632 + }, + { + "epoch": 1.349307774227902, + "grad_norm": 0.28125, + "learning_rate": 2.0994640113839568e-05, + "loss": 0.8707770705223083, + "step": 634 + }, + { + "epoch": 1.3535676251331203, + "grad_norm": 0.267578125, + "learning_rate": 2.0940079560060427e-05, + "loss": 0.8999609351158142, + "step": 636 + }, + { + "epoch": 1.3578274760383386, + "grad_norm": 0.2119140625, + "learning_rate": 2.088545426864048e-05, + "loss": 0.8670209646224976, + "step": 638 + }, + { + "epoch": 1.362087326943557, + "grad_norm": 0.349609375, + "learning_rate": 2.0830765442374563e-05, + "loss": 0.8102102279663086, + "step": 640 + }, + { + "epoch": 1.3663471778487752, + "grad_norm": 0.2041015625, + "learning_rate": 2.077601428545648e-05, + "loss": 0.8202542662620544, + "step": 642 + }, + { + "epoch": 1.3706070287539935, + "grad_norm": 0.234375, + "learning_rate": 2.0721202003452496e-05, + "loss": 0.8944796323776245, + "step": 644 + }, + { + "epoch": 1.374866879659212, + "grad_norm": 0.208984375, + "learning_rate": 2.066632980327478e-05, + "loss": 0.9467480778694153, + "step": 646 + }, + { + "epoch": 1.3791267305644301, + "grad_norm": 0.482421875, + "learning_rate": 2.061139889315486e-05, + "loss": 0.8729652762413025, + "step": 648 + }, + { + "epoch": 1.3833865814696487, + "grad_norm": 0.275390625, + "learning_rate": 2.0556410482616977e-05, + "loss": 0.8954660892486572, + "step": 650 + }, + { + "epoch": 1.387646432374867, + "grad_norm": 0.2734375, + "learning_rate": 2.050136578245149e-05, + "loss": 0.870725691318512, + "step": 652 + }, + { + "epoch": 1.3919062832800853, + "grad_norm": 0.251953125, + "learning_rate": 2.0446266004688197e-05, + "loss": 0.8651110529899597, + "step": 654 + }, + { + "epoch": 1.3961661341853036, + "grad_norm": 0.240234375, + "learning_rate": 2.039111236256964e-05, + "loss": 0.8937119841575623, + "step": 656 + }, + { + "epoch": 1.4004259850905219, + "grad_norm": 0.2333984375, + "learning_rate": 2.0335906070524416e-05, + "loss": 0.8803120851516724, + "step": 658 + }, + { + "epoch": 1.4046858359957402, + "grad_norm": 0.2236328125, + "learning_rate": 2.02806483441404e-05, + "loss": 0.8514755368232727, + "step": 660 + }, + { + "epoch": 1.4089456869009584, + "grad_norm": 0.1630859375, + "learning_rate": 2.0225340400138033e-05, + "loss": 0.8654860258102417, + "step": 662 + }, + { + "epoch": 1.4132055378061767, + "grad_norm": 0.1904296875, + "learning_rate": 2.0169983456343464e-05, + "loss": 0.861249566078186, + "step": 664 + }, + { + "epoch": 1.417465388711395, + "grad_norm": 0.48046875, + "learning_rate": 2.011457873166179e-05, + "loss": 0.8996407389640808, + "step": 666 + }, + { + "epoch": 1.4217252396166133, + "grad_norm": 0.390625, + "learning_rate": 2.005912744605019e-05, + "loss": 0.822201132774353, + "step": 668 + }, + { + "epoch": 1.4259850905218316, + "grad_norm": 0.294921875, + "learning_rate": 2.0003630820491066e-05, + "loss": 0.8432199358940125, + "step": 670 + }, + { + "epoch": 1.4302449414270502, + "grad_norm": 0.263671875, + "learning_rate": 1.9948090076965163e-05, + "loss": 0.8672274351119995, + "step": 672 + }, + { + "epoch": 1.4345047923322682, + "grad_norm": 0.25, + "learning_rate": 1.9892506438424666e-05, + "loss": 0.8486787676811218, + "step": 674 + }, + { + "epoch": 1.4387646432374868, + "grad_norm": 0.279296875, + "learning_rate": 1.9836881128766248e-05, + "loss": 0.8892148733139038, + "step": 676 + }, + { + "epoch": 1.443024494142705, + "grad_norm": 0.27734375, + "learning_rate": 1.9781215372804158e-05, + "loss": 0.8915472030639648, + "step": 678 + }, + { + "epoch": 1.4472843450479234, + "grad_norm": 0.3203125, + "learning_rate": 1.9725510396243226e-05, + "loss": 0.8767306804656982, + "step": 680 + }, + { + "epoch": 1.4515441959531417, + "grad_norm": 0.421875, + "learning_rate": 1.9669767425651873e-05, + "loss": 1.0251777172088623, + "step": 682 + }, + { + "epoch": 1.45580404685836, + "grad_norm": 0.32421875, + "learning_rate": 1.9613987688435132e-05, + "loss": 0.8821164965629578, + "step": 684 + }, + { + "epoch": 1.4600638977635783, + "grad_norm": 0.2099609375, + "learning_rate": 1.955817241280757e-05, + "loss": 0.8836470246315002, + "step": 686 + }, + { + "epoch": 1.4643237486687966, + "grad_norm": 0.173828125, + "learning_rate": 1.9502322827766297e-05, + "loss": 0.9067674279212952, + "step": 688 + }, + { + "epoch": 1.4685835995740149, + "grad_norm": 0.2294921875, + "learning_rate": 1.9446440163063875e-05, + "loss": 0.9052207469940186, + "step": 690 + }, + { + "epoch": 1.4728434504792332, + "grad_norm": 0.271484375, + "learning_rate": 1.939052564918126e-05, + "loss": 0.8458245396614075, + "step": 692 + }, + { + "epoch": 1.4771033013844517, + "grad_norm": 0.328125, + "learning_rate": 1.9334580517300668e-05, + "loss": 0.9541709423065186, + "step": 694 + }, + { + "epoch": 1.4813631522896697, + "grad_norm": 0.2255859375, + "learning_rate": 1.9278605999278513e-05, + "loss": 0.9391557574272156, + "step": 696 + }, + { + "epoch": 1.4856230031948883, + "grad_norm": 0.18359375, + "learning_rate": 1.922260332761827e-05, + "loss": 0.9119634628295898, + "step": 698 + }, + { + "epoch": 1.4898828541001066, + "grad_norm": 0.3671875, + "learning_rate": 1.9166573735443302e-05, + "loss": 0.872115433216095, + "step": 700 + }, + { + "epoch": 1.4941427050053249, + "grad_norm": 2.609375, + "learning_rate": 1.9110518456469764e-05, + "loss": 0.9491547346115112, + "step": 702 + }, + { + "epoch": 1.4984025559105432, + "grad_norm": 0.1826171875, + "learning_rate": 1.905443872497939e-05, + "loss": 0.8039662837982178, + "step": 704 + }, + { + "epoch": 1.5026624068157615, + "grad_norm": 0.26953125, + "learning_rate": 1.8998335775792343e-05, + "loss": 0.8376708030700684, + "step": 706 + }, + { + "epoch": 1.5069222577209798, + "grad_norm": 0.2275390625, + "learning_rate": 1.894221084424001e-05, + "loss": 0.8669439554214478, + "step": 708 + }, + { + "epoch": 1.511182108626198, + "grad_norm": 0.1962890625, + "learning_rate": 1.888606516613781e-05, + "loss": 0.8526804447174072, + "step": 710 + }, + { + "epoch": 1.5154419595314164, + "grad_norm": 0.2060546875, + "learning_rate": 1.8829899977757996e-05, + "loss": 0.838132381439209, + "step": 712 + }, + { + "epoch": 1.5197018104366347, + "grad_norm": 0.294921875, + "learning_rate": 1.8773716515802387e-05, + "loss": 0.9030261635780334, + "step": 714 + }, + { + "epoch": 1.5239616613418532, + "grad_norm": 0.298828125, + "learning_rate": 1.8717516017375192e-05, + "loss": 0.8684689998626709, + "step": 716 + }, + { + "epoch": 1.5282215122470713, + "grad_norm": 0.208984375, + "learning_rate": 1.866129971995575e-05, + "loss": 0.950151264667511, + "step": 718 + }, + { + "epoch": 1.5324813631522898, + "grad_norm": 0.4765625, + "learning_rate": 1.8605068861371255e-05, + "loss": 0.9864886403083801, + "step": 720 + }, + { + "epoch": 1.5367412140575079, + "grad_norm": 0.2421875, + "learning_rate": 1.8548824679769538e-05, + "loss": 0.9203893542289734, + "step": 722 + }, + { + "epoch": 1.5410010649627264, + "grad_norm": 0.2490234375, + "learning_rate": 1.8492568413591787e-05, + "loss": 0.8589147329330444, + "step": 724 + }, + { + "epoch": 1.5452609158679447, + "grad_norm": 0.2265625, + "learning_rate": 1.8436301301545282e-05, + "loss": 0.7150123119354248, + "step": 726 + }, + { + "epoch": 1.549520766773163, + "grad_norm": 0.1796875, + "learning_rate": 1.8380024582576128e-05, + "loss": 0.843291163444519, + "step": 728 + }, + { + "epoch": 1.5537806176783813, + "grad_norm": 0.267578125, + "learning_rate": 1.8323739495841943e-05, + "loss": 0.8748659491539001, + "step": 730 + }, + { + "epoch": 1.5580404685835996, + "grad_norm": 0.2314453125, + "learning_rate": 1.8267447280684607e-05, + "loss": 0.8816359043121338, + "step": 732 + }, + { + "epoch": 1.5623003194888179, + "grad_norm": 0.1611328125, + "learning_rate": 1.8211149176602964e-05, + "loss": 0.9086512923240662, + "step": 734 + }, + { + "epoch": 1.5665601703940362, + "grad_norm": 0.19140625, + "learning_rate": 1.8154846423225515e-05, + "loss": 0.9282605648040771, + "step": 736 + }, + { + "epoch": 1.5708200212992547, + "grad_norm": 0.486328125, + "learning_rate": 1.8098540260283158e-05, + "loss": 0.8508008122444153, + "step": 738 + }, + { + "epoch": 1.5750798722044728, + "grad_norm": 0.2021484375, + "learning_rate": 1.8042231927581833e-05, + "loss": 0.7999932169914246, + "step": 740 + }, + { + "epoch": 1.5793397231096913, + "grad_norm": 0.341796875, + "learning_rate": 1.7985922664975274e-05, + "loss": 0.9391716718673706, + "step": 742 + }, + { + "epoch": 1.5835995740149094, + "grad_norm": 0.201171875, + "learning_rate": 1.79296137123377e-05, + "loss": 0.8545106649398804, + "step": 744 + }, + { + "epoch": 1.5878594249201279, + "grad_norm": 0.197265625, + "learning_rate": 1.7873306309536485e-05, + "loss": 0.8491992950439453, + "step": 746 + }, + { + "epoch": 1.592119275825346, + "grad_norm": 0.306640625, + "learning_rate": 1.7817001696404894e-05, + "loss": 0.8515585064888, + "step": 748 + }, + { + "epoch": 1.5963791267305645, + "grad_norm": 0.212890625, + "learning_rate": 1.7760701112714742e-05, + "loss": 0.8558241128921509, + "step": 750 + }, + { + "epoch": 1.6006389776357828, + "grad_norm": 0.28125, + "learning_rate": 1.7704405798149154e-05, + "loss": 0.8748922944068909, + "step": 752 + }, + { + "epoch": 1.604898828541001, + "grad_norm": 0.45703125, + "learning_rate": 1.764811699227521e-05, + "loss": 0.881086528301239, + "step": 754 + }, + { + "epoch": 1.6091586794462194, + "grad_norm": 0.2373046875, + "learning_rate": 1.7591835934516677e-05, + "loss": 0.8601434230804443, + "step": 756 + }, + { + "epoch": 1.6134185303514377, + "grad_norm": 0.27734375, + "learning_rate": 1.7535563864126723e-05, + "loss": 0.925481915473938, + "step": 758 + }, + { + "epoch": 1.617678381256656, + "grad_norm": 0.224609375, + "learning_rate": 1.7479302020160627e-05, + "loss": 0.8856874108314514, + "step": 760 + }, + { + "epoch": 1.6219382321618743, + "grad_norm": 0.6875, + "learning_rate": 1.7423051641448478e-05, + "loss": 0.9088162779808044, + "step": 762 + }, + { + "epoch": 1.6261980830670928, + "grad_norm": 0.2734375, + "learning_rate": 1.7366813966567914e-05, + "loss": 0.7893877029418945, + "step": 764 + }, + { + "epoch": 1.6304579339723109, + "grad_norm": 0.3046875, + "learning_rate": 1.7310590233816868e-05, + "loss": 0.8651562929153442, + "step": 766 + }, + { + "epoch": 1.6347177848775294, + "grad_norm": 0.2470703125, + "learning_rate": 1.7254381681186248e-05, + "loss": 0.8518175482749939, + "step": 768 + }, + { + "epoch": 1.6389776357827475, + "grad_norm": 0.306640625, + "learning_rate": 1.7198189546332738e-05, + "loss": 0.8798878192901611, + "step": 770 + }, + { + "epoch": 1.643237486687966, + "grad_norm": 0.248046875, + "learning_rate": 1.7142015066551515e-05, + "loss": 0.815255343914032, + "step": 772 + }, + { + "epoch": 1.6474973375931843, + "grad_norm": 0.2373046875, + "learning_rate": 1.7085859478748988e-05, + "loss": 0.936029314994812, + "step": 774 + }, + { + "epoch": 1.6517571884984026, + "grad_norm": 0.2060546875, + "learning_rate": 1.7029724019415604e-05, + "loss": 0.9097844362258911, + "step": 776 + }, + { + "epoch": 1.6560170394036209, + "grad_norm": 0.29296875, + "learning_rate": 1.6973609924598605e-05, + "loss": 0.8360726833343506, + "step": 778 + }, + { + "epoch": 1.6602768903088392, + "grad_norm": 0.31640625, + "learning_rate": 1.691751842987478e-05, + "loss": 0.7691276669502258, + "step": 780 + }, + { + "epoch": 1.6645367412140575, + "grad_norm": 0.412109375, + "learning_rate": 1.6861450770323317e-05, + "loss": 0.9032488465309143, + "step": 782 + }, + { + "epoch": 1.6687965921192758, + "grad_norm": 0.30859375, + "learning_rate": 1.680540818049856e-05, + "loss": 0.8317678570747375, + "step": 784 + }, + { + "epoch": 1.6730564430244943, + "grad_norm": 0.455078125, + "learning_rate": 1.674939189440285e-05, + "loss": 0.8583813905715942, + "step": 786 + }, + { + "epoch": 1.6773162939297124, + "grad_norm": 0.2314453125, + "learning_rate": 1.6693403145459335e-05, + "loss": 0.8612514138221741, + "step": 788 + }, + { + "epoch": 1.681576144834931, + "grad_norm": 0.19921875, + "learning_rate": 1.6637443166484836e-05, + "loss": 0.8975757360458374, + "step": 790 + }, + { + "epoch": 1.685835995740149, + "grad_norm": 0.2734375, + "learning_rate": 1.6581513189662684e-05, + "loss": 0.8868735432624817, + "step": 792 + }, + { + "epoch": 1.6900958466453675, + "grad_norm": 0.251953125, + "learning_rate": 1.652561444651558e-05, + "loss": 0.887550950050354, + "step": 794 + }, + { + "epoch": 1.6943556975505856, + "grad_norm": 0.25, + "learning_rate": 1.6469748167878502e-05, + "loss": 0.8832526803016663, + "step": 796 + }, + { + "epoch": 1.698615548455804, + "grad_norm": 0.203125, + "learning_rate": 1.64139155838716e-05, + "loss": 0.8911911845207214, + "step": 798 + }, + { + "epoch": 1.7028753993610224, + "grad_norm": 0.181640625, + "learning_rate": 1.635811792387308e-05, + "loss": 0.8105019927024841, + "step": 800 + }, + { + "epoch": 1.7071352502662407, + "grad_norm": 0.337890625, + "learning_rate": 1.630235641649217e-05, + "loss": 0.8116901516914368, + "step": 802 + }, + { + "epoch": 1.711395101171459, + "grad_norm": 0.2353515625, + "learning_rate": 1.6246632289542054e-05, + "loss": 0.936326801776886, + "step": 804 + }, + { + "epoch": 1.7156549520766773, + "grad_norm": 0.251953125, + "learning_rate": 1.6190946770012838e-05, + "loss": 0.7342237234115601, + "step": 806 + }, + { + "epoch": 1.7199148029818956, + "grad_norm": 0.314453125, + "learning_rate": 1.613530108404451e-05, + "loss": 0.8804312944412231, + "step": 808 + }, + { + "epoch": 1.7241746538871139, + "grad_norm": 0.255859375, + "learning_rate": 1.6079696456899987e-05, + "loss": 0.900128960609436, + "step": 810 + }, + { + "epoch": 1.7284345047923324, + "grad_norm": 0.2353515625, + "learning_rate": 1.6024134112938102e-05, + "loss": 0.9259054660797119, + "step": 812 + }, + { + "epoch": 1.7326943556975505, + "grad_norm": 0.353515625, + "learning_rate": 1.5968615275586648e-05, + "loss": 0.7679681777954102, + "step": 814 + }, + { + "epoch": 1.736954206602769, + "grad_norm": 0.267578125, + "learning_rate": 1.5913141167315455e-05, + "loss": 0.8207501173019409, + "step": 816 + }, + { + "epoch": 1.741214057507987, + "grad_norm": 0.69140625, + "learning_rate": 1.5857713009609468e-05, + "loss": 0.8840711116790771, + "step": 818 + }, + { + "epoch": 1.7454739084132056, + "grad_norm": 1.03125, + "learning_rate": 1.5802332022941827e-05, + "loss": 0.87161785364151, + "step": 820 + }, + { + "epoch": 1.749733759318424, + "grad_norm": 0.1552734375, + "learning_rate": 1.5746999426747028e-05, + "loss": 0.8653435111045837, + "step": 822 + }, + { + "epoch": 1.7539936102236422, + "grad_norm": 0.39453125, + "learning_rate": 1.5691716439394043e-05, + "loss": 0.8810278177261353, + "step": 824 + }, + { + "epoch": 1.7582534611288605, + "grad_norm": 0.29296875, + "learning_rate": 1.563648427815953e-05, + "loss": 0.8902249336242676, + "step": 826 + }, + { + "epoch": 1.7625133120340788, + "grad_norm": 0.32421875, + "learning_rate": 1.558130415920098e-05, + "loss": 0.8972048163414001, + "step": 828 + }, + { + "epoch": 1.766773162939297, + "grad_norm": 0.2412109375, + "learning_rate": 1.552617729752998e-05, + "loss": 0.8320347666740417, + "step": 830 + }, + { + "epoch": 1.7710330138445154, + "grad_norm": 0.1982421875, + "learning_rate": 1.5471104906985447e-05, + "loss": 0.8805668354034424, + "step": 832 + }, + { + "epoch": 1.775292864749734, + "grad_norm": 0.146484375, + "learning_rate": 1.5416088200206873e-05, + "loss": 0.8669639229774475, + "step": 834 + }, + { + "epoch": 1.779552715654952, + "grad_norm": 0.2412109375, + "learning_rate": 1.5361128388607685e-05, + "loss": 0.8641019463539124, + "step": 836 + }, + { + "epoch": 1.7838125665601705, + "grad_norm": 0.2080078125, + "learning_rate": 1.5306226682348513e-05, + "loss": 0.8257539868354797, + "step": 838 + }, + { + "epoch": 1.7880724174653886, + "grad_norm": 0.2353515625, + "learning_rate": 1.525138429031056e-05, + "loss": 0.8225594758987427, + "step": 840 + }, + { + "epoch": 1.792332268370607, + "grad_norm": 0.2734375, + "learning_rate": 1.5196602420068995e-05, + "loss": 0.8701678514480591, + "step": 842 + }, + { + "epoch": 1.7965921192758252, + "grad_norm": 0.28515625, + "learning_rate": 1.514188227786637e-05, + "loss": 0.8979432582855225, + "step": 844 + }, + { + "epoch": 1.8008519701810437, + "grad_norm": 0.26171875, + "learning_rate": 1.5087225068586032e-05, + "loss": 0.8577451109886169, + "step": 846 + }, + { + "epoch": 1.805111821086262, + "grad_norm": 0.375, + "learning_rate": 1.5032631995725602e-05, + "loss": 0.7677904367446899, + "step": 848 + }, + { + "epoch": 1.8093716719914803, + "grad_norm": 0.2138671875, + "learning_rate": 1.4978104261370499e-05, + "loss": 0.8740429878234863, + "step": 850 + }, + { + "epoch": 1.8136315228966986, + "grad_norm": 0.2080078125, + "learning_rate": 1.4923643066167442e-05, + "loss": 0.8772373795509338, + "step": 852 + }, + { + "epoch": 1.817891373801917, + "grad_norm": 0.1923828125, + "learning_rate": 1.4869249609298016e-05, + "loss": 0.8475224375724792, + "step": 854 + }, + { + "epoch": 1.8221512247071352, + "grad_norm": 0.2294921875, + "learning_rate": 1.4814925088452294e-05, + "loss": 0.8336386680603027, + "step": 856 + }, + { + "epoch": 1.8264110756123535, + "grad_norm": 0.26171875, + "learning_rate": 1.4760670699802433e-05, + "loss": 0.8594604730606079, + "step": 858 + }, + { + "epoch": 1.830670926517572, + "grad_norm": 0.345703125, + "learning_rate": 1.4706487637976349e-05, + "loss": 0.8947794437408447, + "step": 860 + }, + { + "epoch": 1.83493077742279, + "grad_norm": 0.181640625, + "learning_rate": 1.4652377096031413e-05, + "loss": 0.8090410828590393, + "step": 862 + }, + { + "epoch": 1.8391906283280086, + "grad_norm": 0.255859375, + "learning_rate": 1.4598340265428186e-05, + "loss": 0.8447999954223633, + "step": 864 + }, + { + "epoch": 1.8434504792332267, + "grad_norm": 0.1728515625, + "learning_rate": 1.4544378336004174e-05, + "loss": 0.8753990530967712, + "step": 866 + }, + { + "epoch": 1.8477103301384452, + "grad_norm": 0.291015625, + "learning_rate": 1.4490492495947626e-05, + "loss": 0.8337631225585938, + "step": 868 + }, + { + "epoch": 1.8519701810436635, + "grad_norm": 0.515625, + "learning_rate": 1.4436683931771386e-05, + "loss": 0.8855006098747253, + "step": 870 + }, + { + "epoch": 1.8562300319488818, + "grad_norm": 0.25, + "learning_rate": 1.4382953828286769e-05, + "loss": 0.8446431756019592, + "step": 872 + }, + { + "epoch": 1.8604898828541, + "grad_norm": 0.1845703125, + "learning_rate": 1.4329303368577442e-05, + "loss": 0.9195294976234436, + "step": 874 + }, + { + "epoch": 1.8647497337593184, + "grad_norm": 0.2197265625, + "learning_rate": 1.4275733733973408e-05, + "loss": 0.8846089243888855, + "step": 876 + }, + { + "epoch": 1.8690095846645367, + "grad_norm": 0.310546875, + "learning_rate": 1.4222246104024985e-05, + "loss": 0.8711283802986145, + "step": 878 + }, + { + "epoch": 1.873269435569755, + "grad_norm": 0.2421875, + "learning_rate": 1.4168841656476817e-05, + "loss": 0.8777478337287903, + "step": 880 + }, + { + "epoch": 1.8775292864749735, + "grad_norm": 0.2060546875, + "learning_rate": 1.411552156724196e-05, + "loss": 0.9211516976356506, + "step": 882 + }, + { + "epoch": 1.8817891373801916, + "grad_norm": 0.2060546875, + "learning_rate": 1.4062287010375991e-05, + "loss": 0.7991109490394592, + "step": 884 + }, + { + "epoch": 1.8860489882854101, + "grad_norm": 0.267578125, + "learning_rate": 1.4009139158051142e-05, + "loss": 0.7523772120475769, + "step": 886 + }, + { + "epoch": 1.8903088391906282, + "grad_norm": 0.306640625, + "learning_rate": 1.3956079180530488e-05, + "loss": 0.8029102087020874, + "step": 888 + }, + { + "epoch": 1.8945686900958467, + "grad_norm": 1.09375, + "learning_rate": 1.3903108246142204e-05, + "loss": 0.9185020923614502, + "step": 890 + }, + { + "epoch": 1.898828541001065, + "grad_norm": 0.185546875, + "learning_rate": 1.3850227521253819e-05, + "loss": 0.8490954041481018, + "step": 892 + }, + { + "epoch": 1.9030883919062833, + "grad_norm": 0.2490234375, + "learning_rate": 1.379743817024653e-05, + "loss": 0.9293335676193237, + "step": 894 + }, + { + "epoch": 1.9073482428115016, + "grad_norm": 0.26953125, + "learning_rate": 1.3744741355489573e-05, + "loss": 0.83982253074646, + "step": 896 + }, + { + "epoch": 1.91160809371672, + "grad_norm": 1.0703125, + "learning_rate": 1.3692138237314642e-05, + "loss": 0.8462101817131042, + "step": 898 + }, + { + "epoch": 1.9158679446219382, + "grad_norm": 0.296875, + "learning_rate": 1.3639629973990308e-05, + "loss": 0.8812525272369385, + "step": 900 + }, + { + "epoch": 1.9201277955271565, + "grad_norm": 0.2021484375, + "learning_rate": 1.3587217721696534e-05, + "loss": 0.8216854929924011, + "step": 902 + }, + { + "epoch": 1.924387646432375, + "grad_norm": 0.267578125, + "learning_rate": 1.3534902634499233e-05, + "loss": 0.8462478518486023, + "step": 904 + }, + { + "epoch": 1.928647497337593, + "grad_norm": 0.2099609375, + "learning_rate": 1.3482685864324816e-05, + "loss": 0.8769442439079285, + "step": 906 + }, + { + "epoch": 1.9329073482428116, + "grad_norm": 0.2890625, + "learning_rate": 1.3430568560934854e-05, + "loss": 0.8453910946846008, + "step": 908 + }, + { + "epoch": 1.9371671991480297, + "grad_norm": 0.236328125, + "learning_rate": 1.3378551871900778e-05, + "loss": 0.7549237012863159, + "step": 910 + }, + { + "epoch": 1.9414270500532482, + "grad_norm": 0.2734375, + "learning_rate": 1.332663694257857e-05, + "loss": 0.8484979867935181, + "step": 912 + }, + { + "epoch": 1.9456869009584663, + "grad_norm": 0.33203125, + "learning_rate": 1.3274824916083569e-05, + "loss": 0.8290879130363464, + "step": 914 + }, + { + "epoch": 1.9499467518636848, + "grad_norm": 0.314453125, + "learning_rate": 1.3223116933265295e-05, + "loss": 0.880619466304779, + "step": 916 + }, + { + "epoch": 1.9542066027689031, + "grad_norm": 0.205078125, + "learning_rate": 1.3171514132682338e-05, + "loss": 0.8705392479896545, + "step": 918 + }, + { + "epoch": 1.9584664536741214, + "grad_norm": 0.2333984375, + "learning_rate": 1.3120017650577267e-05, + "loss": 0.849368691444397, + "step": 920 + }, + { + "epoch": 1.9627263045793397, + "grad_norm": 1.046875, + "learning_rate": 1.3068628620851627e-05, + "loss": 0.8190315961837769, + "step": 922 + }, + { + "epoch": 1.966986155484558, + "grad_norm": 0.390625, + "learning_rate": 1.3017348175040983e-05, + "loss": 0.8338907361030579, + "step": 924 + }, + { + "epoch": 1.9712460063897763, + "grad_norm": 0.2294921875, + "learning_rate": 1.2966177442289958e-05, + "loss": 0.783728837966919, + "step": 926 + }, + { + "epoch": 1.9755058572949946, + "grad_norm": 0.16796875, + "learning_rate": 1.2915117549327428e-05, + "loss": 0.8934606313705444, + "step": 928 + }, + { + "epoch": 1.9797657082002131, + "grad_norm": 0.458984375, + "learning_rate": 1.2864169620441688e-05, + "loss": 0.8038821220397949, + "step": 930 + }, + { + "epoch": 1.9840255591054312, + "grad_norm": 0.349609375, + "learning_rate": 1.2813334777455677e-05, + "loss": 0.9299109578132629, + "step": 932 + }, + { + "epoch": 1.9882854100106497, + "grad_norm": 0.1748046875, + "learning_rate": 1.27626141397023e-05, + "loss": 0.7765668034553528, + "step": 934 + }, + { + "epoch": 1.9925452609158678, + "grad_norm": 0.224609375, + "learning_rate": 1.2712008823999787e-05, + "loss": 0.8893784284591675, + "step": 936 + }, + { + "epoch": 1.9968051118210863, + "grad_norm": 0.2275390625, + "learning_rate": 1.2661519944627085e-05, + "loss": 0.8529191017150879, + "step": 938 + }, + { + "epoch": 2.0, + "grad_norm": 0.365234375, + "learning_rate": 1.2611148613299316e-05, + "loss": 0.8112186789512634, + "step": 940 + }, + { + "epoch": 2.0042598509052185, + "grad_norm": 0.1748046875, + "learning_rate": 1.2560895939143335e-05, + "loss": 0.6377139687538147, + "step": 942 + }, + { + "epoch": 2.0085197018104366, + "grad_norm": 0.15625, + "learning_rate": 1.2510763028673259e-05, + "loss": 0.5881322026252747, + "step": 944 + }, + { + "epoch": 2.012779552715655, + "grad_norm": 0.16015625, + "learning_rate": 1.2460750985766133e-05, + "loss": 0.5497787594795227, + "step": 946 + }, + { + "epoch": 2.017039403620873, + "grad_norm": 0.27734375, + "learning_rate": 1.2410860911637633e-05, + "loss": 0.6513974070549011, + "step": 948 + }, + { + "epoch": 2.0212992545260917, + "grad_norm": 0.16796875, + "learning_rate": 1.2361093904817794e-05, + "loss": 0.6880634427070618, + "step": 950 + }, + { + "epoch": 2.02555910543131, + "grad_norm": 0.2470703125, + "learning_rate": 1.2311451061126825e-05, + "loss": 0.669802188873291, + "step": 952 + }, + { + "epoch": 2.0298189563365283, + "grad_norm": 0.2451171875, + "learning_rate": 1.2261933473650986e-05, + "loss": 0.6532925963401794, + "step": 954 + }, + { + "epoch": 2.0340788072417464, + "grad_norm": 0.216796875, + "learning_rate": 1.2212542232718526e-05, + "loss": 0.6424761414527893, + "step": 956 + }, + { + "epoch": 2.038338658146965, + "grad_norm": 0.3046875, + "learning_rate": 1.2163278425875673e-05, + "loss": 0.599922776222229, + "step": 958 + }, + { + "epoch": 2.042598509052183, + "grad_norm": 0.361328125, + "learning_rate": 1.211414313786267e-05, + "loss": 0.5999573469161987, + "step": 960 + }, + { + "epoch": 2.0468583599574015, + "grad_norm": 0.341796875, + "learning_rate": 1.2065137450589902e-05, + "loss": 0.5664547681808472, + "step": 962 + }, + { + "epoch": 2.0511182108626196, + "grad_norm": 0.27734375, + "learning_rate": 1.2016262443114092e-05, + "loss": 0.6771121025085449, + "step": 964 + }, + { + "epoch": 2.055378061767838, + "grad_norm": 0.251953125, + "learning_rate": 1.19675191916145e-05, + "loss": 0.6011976003646851, + "step": 966 + }, + { + "epoch": 2.0596379126730566, + "grad_norm": 0.45703125, + "learning_rate": 1.1918908769369263e-05, + "loss": 0.624125599861145, + "step": 968 + }, + { + "epoch": 2.0638977635782747, + "grad_norm": 0.52734375, + "learning_rate": 1.187043224673176e-05, + "loss": 0.5838209390640259, + "step": 970 + }, + { + "epoch": 2.0681576144834932, + "grad_norm": 0.4765625, + "learning_rate": 1.1822090691107007e-05, + "loss": 0.6163349151611328, + "step": 972 + }, + { + "epoch": 2.0724174653887113, + "grad_norm": 0.2578125, + "learning_rate": 1.1773885166928193e-05, + "loss": 0.6664748787879944, + "step": 974 + }, + { + "epoch": 2.07667731629393, + "grad_norm": 0.26171875, + "learning_rate": 1.1725816735633235e-05, + "loss": 0.6090631484985352, + "step": 976 + }, + { + "epoch": 2.080937167199148, + "grad_norm": 0.2734375, + "learning_rate": 1.1677886455641398e-05, + "loss": 0.6150251030921936, + "step": 978 + }, + { + "epoch": 2.0851970181043664, + "grad_norm": 0.59375, + "learning_rate": 1.1630095382329988e-05, + "loss": 0.6834192872047424, + "step": 980 + }, + { + "epoch": 2.0894568690095845, + "grad_norm": 0.263671875, + "learning_rate": 1.158244456801111e-05, + "loss": 0.5855680108070374, + "step": 982 + }, + { + "epoch": 2.093716719914803, + "grad_norm": 0.26171875, + "learning_rate": 1.1534935061908528e-05, + "loss": 0.6290924549102783, + "step": 984 + }, + { + "epoch": 2.097976570820021, + "grad_norm": 0.27734375, + "learning_rate": 1.1487567910134513e-05, + "loss": 0.5710505247116089, + "step": 986 + }, + { + "epoch": 2.1022364217252396, + "grad_norm": 0.3203125, + "learning_rate": 1.1440344155666851e-05, + "loss": 0.6610984802246094, + "step": 988 + }, + { + "epoch": 2.106496272630458, + "grad_norm": 0.2138671875, + "learning_rate": 1.1393264838325865e-05, + "loss": 0.6294957995414734, + "step": 990 + }, + { + "epoch": 2.110756123535676, + "grad_norm": 0.46484375, + "learning_rate": 1.1346330994751497e-05, + "loss": 0.6489307880401611, + "step": 992 + }, + { + "epoch": 2.1150159744408947, + "grad_norm": 0.287109375, + "learning_rate": 1.1299543658380509e-05, + "loss": 0.5717250108718872, + "step": 994 + }, + { + "epoch": 2.119275825346113, + "grad_norm": 0.236328125, + "learning_rate": 1.1252903859423728e-05, + "loss": 0.5853033065795898, + "step": 996 + }, + { + "epoch": 2.1235356762513313, + "grad_norm": 0.2216796875, + "learning_rate": 1.120641262484335e-05, + "loss": 0.608925461769104, + "step": 998 + }, + { + "epoch": 2.1277955271565494, + "grad_norm": 0.216796875, + "learning_rate": 1.1160070978330323e-05, + "loss": 0.6262862086296082, + "step": 1000 + }, + { + "epoch": 2.132055378061768, + "grad_norm": 0.30078125, + "learning_rate": 1.1113879940281813e-05, + "loss": 0.5531333088874817, + "step": 1002 + }, + { + "epoch": 2.136315228966986, + "grad_norm": 0.302734375, + "learning_rate": 1.1067840527778752e-05, + "loss": 0.6142609119415283, + "step": 1004 + }, + { + "epoch": 2.1405750798722045, + "grad_norm": 0.271484375, + "learning_rate": 1.1021953754563406e-05, + "loss": 0.6254585981369019, + "step": 1006 + }, + { + "epoch": 2.1448349307774226, + "grad_norm": 0.224609375, + "learning_rate": 1.0976220631017094e-05, + "loss": 0.648613691329956, + "step": 1008 + }, + { + "epoch": 2.149094781682641, + "grad_norm": 0.400390625, + "learning_rate": 1.0930642164137922e-05, + "loss": 0.4957270324230194, + "step": 1010 + }, + { + "epoch": 2.1533546325878596, + "grad_norm": 0.2353515625, + "learning_rate": 1.0885219357518583e-05, + "loss": 0.6625660061836243, + "step": 1012 + }, + { + "epoch": 2.1576144834930777, + "grad_norm": 0.21875, + "learning_rate": 1.0839953211324313e-05, + "loss": 0.6448312401771545, + "step": 1014 + }, + { + "epoch": 2.1618743343982962, + "grad_norm": 0.310546875, + "learning_rate": 1.0794844722270831e-05, + "loss": 0.6265139579772949, + "step": 1016 + }, + { + "epoch": 2.1661341853035143, + "grad_norm": 0.3515625, + "learning_rate": 1.0749894883602406e-05, + "loss": 0.58893221616745, + "step": 1018 + }, + { + "epoch": 2.170394036208733, + "grad_norm": 0.5234375, + "learning_rate": 1.0705104685069973e-05, + "loss": 0.524358332157135, + "step": 1020 + }, + { + "epoch": 2.174653887113951, + "grad_norm": 0.392578125, + "learning_rate": 1.0660475112909354e-05, + "loss": 0.6041074395179749, + "step": 1022 + }, + { + "epoch": 2.1789137380191694, + "grad_norm": 0.25, + "learning_rate": 1.0616007149819543e-05, + "loss": 0.6296215653419495, + "step": 1024 + }, + { + "epoch": 2.1831735889243875, + "grad_norm": 0.37890625, + "learning_rate": 1.057170177494105e-05, + "loss": 0.6504489779472351, + "step": 1026 + }, + { + "epoch": 2.187433439829606, + "grad_norm": 0.361328125, + "learning_rate": 1.052755996383437e-05, + "loss": 0.6803485155105591, + "step": 1028 + }, + { + "epoch": 2.191693290734824, + "grad_norm": 0.2333984375, + "learning_rate": 1.0483582688458472e-05, + "loss": 0.6579641699790955, + "step": 1030 + }, + { + "epoch": 2.1959531416400426, + "grad_norm": 0.5234375, + "learning_rate": 1.0439770917149414e-05, + "loss": 0.6605786085128784, + "step": 1032 + }, + { + "epoch": 2.2002129925452607, + "grad_norm": 0.189453125, + "learning_rate": 1.0396125614599018e-05, + "loss": 0.6570585370063782, + "step": 1034 + }, + { + "epoch": 2.2044728434504792, + "grad_norm": 0.337890625, + "learning_rate": 1.0352647741833637e-05, + "loss": 0.6363896131515503, + "step": 1036 + }, + { + "epoch": 2.2087326943556977, + "grad_norm": 0.296875, + "learning_rate": 1.0309338256192982e-05, + "loss": 0.6393426656723022, + "step": 1038 + }, + { + "epoch": 2.212992545260916, + "grad_norm": 0.345703125, + "learning_rate": 1.0266198111309041e-05, + "loss": 0.7091052532196045, + "step": 1040 + }, + { + "epoch": 2.2172523961661343, + "grad_norm": 0.75, + "learning_rate": 1.0223228257085083e-05, + "loss": 0.6515456438064575, + "step": 1042 + }, + { + "epoch": 2.2215122470713524, + "grad_norm": 0.244140625, + "learning_rate": 1.0180429639674761e-05, + "loss": 0.6235453486442566, + "step": 1044 + }, + { + "epoch": 2.225772097976571, + "grad_norm": 0.263671875, + "learning_rate": 1.0137803201461248e-05, + "loss": 0.5850796699523926, + "step": 1046 + }, + { + "epoch": 2.230031948881789, + "grad_norm": 0.443359375, + "learning_rate": 1.0095349881036508e-05, + "loss": 0.5203170776367188, + "step": 1048 + }, + { + "epoch": 2.2342917997870075, + "grad_norm": 0.72265625, + "learning_rate": 1.0053070613180625e-05, + "loss": 0.6159985065460205, + "step": 1050 + }, + { + "epoch": 2.2385516506922256, + "grad_norm": 0.203125, + "learning_rate": 1.0010966328841206e-05, + "loss": 0.6239602565765381, + "step": 1052 + }, + { + "epoch": 2.242811501597444, + "grad_norm": 0.265625, + "learning_rate": 9.969037955112908e-06, + "loss": 0.6027981042861938, + "step": 1054 + }, + { + "epoch": 2.247071352502662, + "grad_norm": 0.30078125, + "learning_rate": 9.927286415217005e-06, + "loss": 0.591469407081604, + "step": 1056 + }, + { + "epoch": 2.2513312034078807, + "grad_norm": 0.1982421875, + "learning_rate": 9.88571262848107e-06, + "loss": 0.5683766007423401, + "step": 1058 + }, + { + "epoch": 2.255591054313099, + "grad_norm": 0.35546875, + "learning_rate": 9.844317510318719e-06, + "loss": 0.6158217191696167, + "step": 1060 + }, + { + "epoch": 2.2598509052183173, + "grad_norm": 0.60546875, + "learning_rate": 9.803101972209462e-06, + "loss": 0.5769312381744385, + "step": 1062 + }, + { + "epoch": 2.264110756123536, + "grad_norm": 0.24609375, + "learning_rate": 9.762066921678647e-06, + "loss": 0.5810741186141968, + "step": 1064 + }, + { + "epoch": 2.268370607028754, + "grad_norm": 0.28125, + "learning_rate": 9.721213262277447e-06, + "loss": 0.5853366255760193, + "step": 1066 + }, + { + "epoch": 2.2726304579339724, + "grad_norm": 0.2314453125, + "learning_rate": 9.680541893563e-06, + "loss": 0.5754764676094055, + "step": 1068 + }, + { + "epoch": 2.2768903088391905, + "grad_norm": 0.376953125, + "learning_rate": 9.640053711078571e-06, + "loss": 0.6414265632629395, + "step": 1070 + }, + { + "epoch": 2.281150159744409, + "grad_norm": 0.703125, + "learning_rate": 9.599749606333844e-06, + "loss": 0.5730122327804565, + "step": 1072 + }, + { + "epoch": 2.285410010649627, + "grad_norm": 0.224609375, + "learning_rate": 9.559630466785301e-06, + "loss": 0.6548243761062622, + "step": 1074 + }, + { + "epoch": 2.2896698615548456, + "grad_norm": 0.359375, + "learning_rate": 9.519697175816675e-06, + "loss": 0.6757615804672241, + "step": 1076 + }, + { + "epoch": 2.2939297124600637, + "grad_norm": 0.271484375, + "learning_rate": 9.4799506127195e-06, + "loss": 0.6540831923484802, + "step": 1078 + }, + { + "epoch": 2.2981895633652822, + "grad_norm": 0.5390625, + "learning_rate": 9.44039165267372e-06, + "loss": 0.5985897779464722, + "step": 1080 + }, + { + "epoch": 2.3024494142705008, + "grad_norm": 0.318359375, + "learning_rate": 9.40102116672848e-06, + "loss": 0.6373129487037659, + "step": 1082 + }, + { + "epoch": 2.306709265175719, + "grad_norm": 0.32421875, + "learning_rate": 9.361840021782899e-06, + "loss": 0.5798696279525757, + "step": 1084 + }, + { + "epoch": 2.3109691160809374, + "grad_norm": 0.53125, + "learning_rate": 9.322849080566986e-06, + "loss": 0.6472339034080505, + "step": 1086 + }, + { + "epoch": 2.3152289669861554, + "grad_norm": 0.2236328125, + "learning_rate": 9.284049201622668e-06, + "loss": 0.5931280851364136, + "step": 1088 + }, + { + "epoch": 2.319488817891374, + "grad_norm": 0.26171875, + "learning_rate": 9.245441239284858e-06, + "loss": 0.6150895953178406, + "step": 1090 + }, + { + "epoch": 2.323748668796592, + "grad_norm": 0.37890625, + "learning_rate": 9.207026043662654e-06, + "loss": 0.5743486285209656, + "step": 1092 + }, + { + "epoch": 2.3280085197018106, + "grad_norm": 0.302734375, + "learning_rate": 9.168804460620634e-06, + "loss": 0.6586934328079224, + "step": 1094 + }, + { + "epoch": 2.3322683706070286, + "grad_norm": 0.333984375, + "learning_rate": 9.130777331760208e-06, + "loss": 0.581457793712616, + "step": 1096 + }, + { + "epoch": 2.336528221512247, + "grad_norm": 0.236328125, + "learning_rate": 9.092945494401107e-06, + "loss": 0.602104663848877, + "step": 1098 + }, + { + "epoch": 2.3407880724174652, + "grad_norm": 0.318359375, + "learning_rate": 9.055309781562922e-06, + "loss": 0.5987313985824585, + "step": 1100 + }, + { + "epoch": 2.3450479233226837, + "grad_norm": 0.212890625, + "learning_rate": 9.017871021946787e-06, + "loss": 0.5123194456100464, + "step": 1102 + }, + { + "epoch": 2.349307774227902, + "grad_norm": 0.494140625, + "learning_rate": 8.980630039917124e-06, + "loss": 0.5810441374778748, + "step": 1104 + }, + { + "epoch": 2.3535676251331203, + "grad_norm": 0.5390625, + "learning_rate": 8.943587655483478e-06, + "loss": 0.5871768593788147, + "step": 1106 + }, + { + "epoch": 2.357827476038339, + "grad_norm": 0.2412109375, + "learning_rate": 8.906744684282483e-06, + "loss": 0.6104775667190552, + "step": 1108 + }, + { + "epoch": 2.362087326943557, + "grad_norm": 0.296875, + "learning_rate": 8.870101937559877e-06, + "loss": 0.6351394653320312, + "step": 1110 + }, + { + "epoch": 2.3663471778487755, + "grad_norm": 0.52734375, + "learning_rate": 8.833660222152663e-06, + "loss": 0.6355900168418884, + "step": 1112 + }, + { + "epoch": 2.3706070287539935, + "grad_norm": 0.2294921875, + "learning_rate": 8.797420340471334e-06, + "loss": 0.5833765268325806, + "step": 1114 + }, + { + "epoch": 2.374866879659212, + "grad_norm": 0.2216796875, + "learning_rate": 8.761383090482205e-06, + "loss": 0.6019313931465149, + "step": 1116 + }, + { + "epoch": 2.37912673056443, + "grad_norm": 0.2265625, + "learning_rate": 8.725549265689833e-06, + "loss": 0.5999468564987183, + "step": 1118 + }, + { + "epoch": 2.3833865814696487, + "grad_norm": 0.484375, + "learning_rate": 8.689919655119559e-06, + "loss": 0.6521666646003723, + "step": 1120 + }, + { + "epoch": 2.3876464323748667, + "grad_norm": 0.224609375, + "learning_rate": 8.654495043300129e-06, + "loss": 0.612395703792572, + "step": 1122 + }, + { + "epoch": 2.3919062832800853, + "grad_norm": 0.466796875, + "learning_rate": 8.619276210246427e-06, + "loss": 0.5964239239692688, + "step": 1124 + }, + { + "epoch": 2.3961661341853033, + "grad_norm": 0.26171875, + "learning_rate": 8.584263931442275e-06, + "loss": 0.6384221911430359, + "step": 1126 + }, + { + "epoch": 2.400425985090522, + "grad_norm": 0.28125, + "learning_rate": 8.549458977823395e-06, + "loss": 0.6933798789978027, + "step": 1128 + }, + { + "epoch": 2.40468583599574, + "grad_norm": 0.1962890625, + "learning_rate": 8.514862115760396e-06, + "loss": 0.5889874696731567, + "step": 1130 + }, + { + "epoch": 2.4089456869009584, + "grad_norm": 0.30859375, + "learning_rate": 8.480474107041925e-06, + "loss": 0.6254542469978333, + "step": 1132 + }, + { + "epoch": 2.413205537806177, + "grad_norm": 0.314453125, + "learning_rate": 8.446295708857888e-06, + "loss": 0.6616327166557312, + "step": 1134 + }, + { + "epoch": 2.417465388711395, + "grad_norm": 0.375, + "learning_rate": 8.412327673782774e-06, + "loss": 0.6202198266983032, + "step": 1136 + }, + { + "epoch": 2.4217252396166136, + "grad_norm": 0.267578125, + "learning_rate": 8.378570749759076e-06, + "loss": 0.6176246404647827, + "step": 1138 + }, + { + "epoch": 2.4259850905218316, + "grad_norm": 0.369140625, + "learning_rate": 8.345025680080836e-06, + "loss": 0.5884604454040527, + "step": 1140 + }, + { + "epoch": 2.43024494142705, + "grad_norm": 0.4453125, + "learning_rate": 8.311693203377277e-06, + "loss": 0.5704495906829834, + "step": 1142 + }, + { + "epoch": 2.4345047923322682, + "grad_norm": 0.357421875, + "learning_rate": 8.278574053596534e-06, + "loss": 0.5104537606239319, + "step": 1144 + }, + { + "epoch": 2.4387646432374868, + "grad_norm": 0.375, + "learning_rate": 8.245668959989489e-06, + "loss": 0.6920484900474548, + "step": 1146 + }, + { + "epoch": 2.443024494142705, + "grad_norm": 0.3359375, + "learning_rate": 8.212978647093724e-06, + "loss": 0.605790376663208, + "step": 1148 + }, + { + "epoch": 2.4472843450479234, + "grad_norm": 0.431640625, + "learning_rate": 8.180503834717563e-06, + "loss": 0.6005589962005615, + "step": 1150 + }, + { + "epoch": 2.451544195953142, + "grad_norm": 0.30078125, + "learning_rate": 8.148245237924212e-06, + "loss": 0.6908122301101685, + "step": 1152 + }, + { + "epoch": 2.45580404685836, + "grad_norm": 0.392578125, + "learning_rate": 8.116203567016035e-06, + "loss": 0.5939027667045593, + "step": 1154 + }, + { + "epoch": 2.460063897763578, + "grad_norm": 0.357421875, + "learning_rate": 8.084379527518908e-06, + "loss": 0.6245042681694031, + "step": 1156 + }, + { + "epoch": 2.4643237486687966, + "grad_norm": 0.25390625, + "learning_rate": 8.05277382016666e-06, + "loss": 0.5638337731361389, + "step": 1158 + }, + { + "epoch": 2.468583599574015, + "grad_norm": 0.2890625, + "learning_rate": 8.021387140885672e-06, + "loss": 0.665945291519165, + "step": 1160 + }, + { + "epoch": 2.472843450479233, + "grad_norm": 0.2255859375, + "learning_rate": 7.99022018077955e-06, + "loss": 0.5603002309799194, + "step": 1162 + }, + { + "epoch": 2.4771033013844517, + "grad_norm": 0.287109375, + "learning_rate": 7.959273626113896e-06, + "loss": 0.5992410182952881, + "step": 1164 + }, + { + "epoch": 2.4813631522896697, + "grad_norm": 0.216796875, + "learning_rate": 7.9285481583012e-06, + "loss": 0.6628497242927551, + "step": 1166 + }, + { + "epoch": 2.4856230031948883, + "grad_norm": 0.984375, + "learning_rate": 7.898044453885837e-06, + "loss": 0.5260273218154907, + "step": 1168 + }, + { + "epoch": 2.4898828541001063, + "grad_norm": 0.265625, + "learning_rate": 7.867763184529182e-06, + "loss": 0.6244964599609375, + "step": 1170 + }, + { + "epoch": 2.494142705005325, + "grad_norm": 0.369140625, + "learning_rate": 7.837705016994796e-06, + "loss": 0.6657370328903198, + "step": 1172 + }, + { + "epoch": 2.498402555910543, + "grad_norm": 0.333984375, + "learning_rate": 7.80787061313377e-06, + "loss": 0.6410002708435059, + "step": 1174 + }, + { + "epoch": 2.5026624068157615, + "grad_norm": 0.1953125, + "learning_rate": 7.77826062987014e-06, + "loss": 0.5408449769020081, + "step": 1176 + }, + { + "epoch": 2.50692225772098, + "grad_norm": 0.35546875, + "learning_rate": 7.748875719186413e-06, + "loss": 0.5735031962394714, + "step": 1178 + }, + { + "epoch": 2.511182108626198, + "grad_norm": 0.322265625, + "learning_rate": 7.71971652810923e-06, + "loss": 0.6153873801231384, + "step": 1180 + }, + { + "epoch": 2.515441959531416, + "grad_norm": 0.33984375, + "learning_rate": 7.690783698695106e-06, + "loss": 0.5873544216156006, + "step": 1182 + }, + { + "epoch": 2.5197018104366347, + "grad_norm": 0.259765625, + "learning_rate": 7.662077868016297e-06, + "loss": 0.6717422604560852, + "step": 1184 + }, + { + "epoch": 2.523961661341853, + "grad_norm": 0.94921875, + "learning_rate": 7.633599668146775e-06, + "loss": 0.6083505153656006, + "step": 1186 + }, + { + "epoch": 2.5282215122470713, + "grad_norm": 0.30078125, + "learning_rate": 7.605349726148296e-06, + "loss": 0.6134154200553894, + "step": 1188 + }, + { + "epoch": 2.5324813631522898, + "grad_norm": 0.275390625, + "learning_rate": 7.577328664056617e-06, + "loss": 0.589963972568512, + "step": 1190 + }, + { + "epoch": 2.536741214057508, + "grad_norm": 0.51171875, + "learning_rate": 7.549537098867776e-06, + "loss": 0.5288025140762329, + "step": 1192 + }, + { + "epoch": 2.5410010649627264, + "grad_norm": 0.2451171875, + "learning_rate": 7.521975642524525e-06, + "loss": 0.616111159324646, + "step": 1194 + }, + { + "epoch": 2.545260915867945, + "grad_norm": 0.3046875, + "learning_rate": 7.494644901902843e-06, + "loss": 0.6015118360519409, + "step": 1196 + }, + { + "epoch": 2.549520766773163, + "grad_norm": 0.30078125, + "learning_rate": 7.467545478798574e-06, + "loss": 0.5770639777183533, + "step": 1198 + }, + { + "epoch": 2.553780617678381, + "grad_norm": 0.365234375, + "learning_rate": 7.440677969914182e-06, + "loss": 0.6590741872787476, + "step": 1200 + }, + { + "epoch": 2.5580404685835996, + "grad_norm": 0.2392578125, + "learning_rate": 7.4140429668456115e-06, + "loss": 0.47983720898628235, + "step": 1202 + }, + { + "epoch": 2.562300319488818, + "grad_norm": 0.31640625, + "learning_rate": 7.38764105606926e-06, + "loss": 0.549656093120575, + "step": 1204 + }, + { + "epoch": 2.566560170394036, + "grad_norm": 1.1640625, + "learning_rate": 7.361472818929058e-06, + "loss": 0.5793447494506836, + "step": 1206 + }, + { + "epoch": 2.5708200212992547, + "grad_norm": 0.251953125, + "learning_rate": 7.335538831623676e-06, + "loss": 0.637956976890564, + "step": 1208 + }, + { + "epoch": 2.5750798722044728, + "grad_norm": 0.5625, + "learning_rate": 7.309839665193839e-06, + "loss": 0.5784144401550293, + "step": 1210 + }, + { + "epoch": 2.5793397231096913, + "grad_norm": 0.2275390625, + "learning_rate": 7.284375885509741e-06, + "loss": 0.6299670338630676, + "step": 1212 + }, + { + "epoch": 2.5835995740149094, + "grad_norm": 0.322265625, + "learning_rate": 7.259148053258603e-06, + "loss": 0.674586296081543, + "step": 1214 + }, + { + "epoch": 2.587859424920128, + "grad_norm": 0.3671875, + "learning_rate": 7.234156723932312e-06, + "loss": 0.6188330054283142, + "step": 1216 + }, + { + "epoch": 2.592119275825346, + "grad_norm": 0.271484375, + "learning_rate": 7.20940244781519e-06, + "loss": 0.6208375096321106, + "step": 1218 + }, + { + "epoch": 2.5963791267305645, + "grad_norm": 0.388671875, + "learning_rate": 7.184885769971888e-06, + "loss": 0.6017476916313171, + "step": 1220 + }, + { + "epoch": 2.600638977635783, + "grad_norm": 0.333984375, + "learning_rate": 7.160607230235378e-06, + "loss": 0.6354559659957886, + "step": 1222 + }, + { + "epoch": 2.604898828541001, + "grad_norm": 0.31640625, + "learning_rate": 7.136567363195069e-06, + "loss": 0.6745753884315491, + "step": 1224 + }, + { + "epoch": 2.609158679446219, + "grad_norm": 0.427734375, + "learning_rate": 7.112766698185027e-06, + "loss": 0.5988171100616455, + "step": 1226 + }, + { + "epoch": 2.6134185303514377, + "grad_norm": 0.25390625, + "learning_rate": 7.089205759272327e-06, + "loss": 0.6004793643951416, + "step": 1228 + }, + { + "epoch": 2.617678381256656, + "grad_norm": 0.31640625, + "learning_rate": 7.06588506524552e-06, + "loss": 0.5850980877876282, + "step": 1230 + }, + { + "epoch": 2.6219382321618743, + "grad_norm": 0.33203125, + "learning_rate": 7.042805129603193e-06, + "loss": 0.5615159869194031, + "step": 1232 + }, + { + "epoch": 2.626198083067093, + "grad_norm": 0.333984375, + "learning_rate": 7.019966460542681e-06, + "loss": 0.6120025515556335, + "step": 1234 + }, + { + "epoch": 2.630457933972311, + "grad_norm": 0.35546875, + "learning_rate": 6.997369560948859e-06, + "loss": 0.6796953082084656, + "step": 1236 + }, + { + "epoch": 2.6347177848775294, + "grad_norm": 0.306640625, + "learning_rate": 6.975014928383083e-06, + "loss": 0.5794081091880798, + "step": 1238 + }, + { + "epoch": 2.6389776357827475, + "grad_norm": 0.28515625, + "learning_rate": 6.952903055072226e-06, + "loss": 0.5920906066894531, + "step": 1240 + }, + { + "epoch": 2.643237486687966, + "grad_norm": 0.2412109375, + "learning_rate": 6.9310344278978505e-06, + "loss": 0.5745714902877808, + "step": 1242 + }, + { + "epoch": 2.647497337593184, + "grad_norm": 0.2578125, + "learning_rate": 6.909409528385466e-06, + "loss": 0.5876143574714661, + "step": 1244 + }, + { + "epoch": 2.6517571884984026, + "grad_norm": 0.2734375, + "learning_rate": 6.888028832693953e-06, + "loss": 0.586786150932312, + "step": 1246 + }, + { + "epoch": 2.656017039403621, + "grad_norm": 0.380859375, + "learning_rate": 6.86689281160506e-06, + "loss": 0.5594542622566223, + "step": 1248 + }, + { + "epoch": 2.660276890308839, + "grad_norm": 0.23046875, + "learning_rate": 6.846001930513041e-06, + "loss": 0.6434107422828674, + "step": 1250 + }, + { + "epoch": 2.6645367412140573, + "grad_norm": 0.361328125, + "learning_rate": 6.825356649414415e-06, + "loss": 0.6385661959648132, + "step": 1252 + }, + { + "epoch": 2.668796592119276, + "grad_norm": 0.291015625, + "learning_rate": 6.80495742289783e-06, + "loss": 0.6039466261863708, + "step": 1254 + }, + { + "epoch": 2.6730564430244943, + "grad_norm": 0.298828125, + "learning_rate": 6.784804700134056e-06, + "loss": 0.6025973558425903, + "step": 1256 + }, + { + "epoch": 2.6773162939297124, + "grad_norm": 0.296875, + "learning_rate": 6.764898924866091e-06, + "loss": 0.6119323372840881, + "step": 1258 + }, + { + "epoch": 2.681576144834931, + "grad_norm": 0.32421875, + "learning_rate": 6.7452405353993985e-06, + "loss": 0.617369532585144, + "step": 1260 + }, + { + "epoch": 2.685835995740149, + "grad_norm": 0.330078125, + "learning_rate": 6.72582996459225e-06, + "loss": 0.6640692949295044, + "step": 1262 + }, + { + "epoch": 2.6900958466453675, + "grad_norm": 0.3359375, + "learning_rate": 6.706667639846196e-06, + "loss": 0.6609706282615662, + "step": 1264 + }, + { + "epoch": 2.6943556975505856, + "grad_norm": 0.2490234375, + "learning_rate": 6.687753983096654e-06, + "loss": 0.53211909532547, + "step": 1266 + }, + { + "epoch": 2.698615548455804, + "grad_norm": 0.455078125, + "learning_rate": 6.669089410803617e-06, + "loss": 0.667971134185791, + "step": 1268 + }, + { + "epoch": 2.702875399361022, + "grad_norm": 0.265625, + "learning_rate": 6.650674333942487e-06, + "loss": 0.5798393487930298, + "step": 1270 + }, + { + "epoch": 2.7071352502662407, + "grad_norm": 0.439453125, + "learning_rate": 6.632509157995023e-06, + "loss": 0.6258153915405273, + "step": 1272 + }, + { + "epoch": 2.711395101171459, + "grad_norm": 0.392578125, + "learning_rate": 6.614594282940414e-06, + "loss": 0.624832272529602, + "step": 1274 + }, + { + "epoch": 2.7156549520766773, + "grad_norm": 0.349609375, + "learning_rate": 6.596930103246468e-06, + "loss": 0.5772223472595215, + "step": 1276 + }, + { + "epoch": 2.7199148029818954, + "grad_norm": 0.61328125, + "learning_rate": 6.579517007860933e-06, + "loss": 0.5936267971992493, + "step": 1278 + }, + { + "epoch": 2.724174653887114, + "grad_norm": 0.2197265625, + "learning_rate": 6.562355380202927e-06, + "loss": 0.668041467666626, + "step": 1280 + }, + { + "epoch": 2.7284345047923324, + "grad_norm": 0.25, + "learning_rate": 6.5454455981545e-06, + "loss": 0.5487772226333618, + "step": 1282 + }, + { + "epoch": 2.7326943556975505, + "grad_norm": 0.205078125, + "learning_rate": 6.528788034052311e-06, + "loss": 0.6349499225616455, + "step": 1284 + }, + { + "epoch": 2.736954206602769, + "grad_norm": 0.23828125, + "learning_rate": 6.512383054679422e-06, + "loss": 0.5938593149185181, + "step": 1286 + }, + { + "epoch": 2.741214057507987, + "grad_norm": 0.263671875, + "learning_rate": 6.496231021257242e-06, + "loss": 0.6245843172073364, + "step": 1288 + }, + { + "epoch": 2.7454739084132056, + "grad_norm": 0.357421875, + "learning_rate": 6.480332289437552e-06, + "loss": 0.5823163390159607, + "step": 1290 + }, + { + "epoch": 2.749733759318424, + "grad_norm": 0.3359375, + "learning_rate": 6.464687209294682e-06, + "loss": 0.5846402049064636, + "step": 1292 + }, + { + "epoch": 2.753993610223642, + "grad_norm": 0.412109375, + "learning_rate": 6.44929612531781e-06, + "loss": 0.6277037262916565, + "step": 1294 + }, + { + "epoch": 2.7582534611288603, + "grad_norm": 0.20703125, + "learning_rate": 6.434159376403363e-06, + "loss": 0.6208704113960266, + "step": 1296 + }, + { + "epoch": 2.762513312034079, + "grad_norm": 0.32421875, + "learning_rate": 6.419277295847563e-06, + "loss": 0.5632691979408264, + "step": 1298 + }, + { + "epoch": 2.7667731629392973, + "grad_norm": 0.296875, + "learning_rate": 6.404650211339093e-06, + "loss": 0.6156328320503235, + "step": 1300 + }, + { + "epoch": 2.7710330138445154, + "grad_norm": 0.259765625, + "learning_rate": 6.390278444951868e-06, + "loss": 0.6689990758895874, + "step": 1302 + }, + { + "epoch": 2.775292864749734, + "grad_norm": 0.25, + "learning_rate": 6.376162313137955e-06, + "loss": 0.6374217867851257, + "step": 1304 + }, + { + "epoch": 2.779552715654952, + "grad_norm": 0.439453125, + "learning_rate": 6.3623021267205975e-06, + "loss": 0.6087695360183716, + "step": 1306 + }, + { + "epoch": 2.7838125665601705, + "grad_norm": 0.2734375, + "learning_rate": 6.348698190887377e-06, + "loss": 0.5766043066978455, + "step": 1308 + }, + { + "epoch": 2.7880724174653886, + "grad_norm": 0.359375, + "learning_rate": 6.3353508051834924e-06, + "loss": 0.6857935786247253, + "step": 1310 + }, + { + "epoch": 2.792332268370607, + "grad_norm": 0.259765625, + "learning_rate": 6.322260263505159e-06, + "loss": 0.6080771684646606, + "step": 1312 + }, + { + "epoch": 2.796592119275825, + "grad_norm": 0.232421875, + "learning_rate": 6.309426854093147e-06, + "loss": 0.5428948402404785, + "step": 1314 + }, + { + "epoch": 2.8008519701810437, + "grad_norm": 0.609375, + "learning_rate": 6.2968508595264195e-06, + "loss": 0.6500948667526245, + "step": 1316 + }, + { + "epoch": 2.8051118210862622, + "grad_norm": 0.306640625, + "learning_rate": 6.284532556715927e-06, + "loss": 0.6038864850997925, + "step": 1318 + }, + { + "epoch": 2.8093716719914803, + "grad_norm": 0.404296875, + "learning_rate": 6.272472216898501e-06, + "loss": 0.6369448304176331, + "step": 1320 + }, + { + "epoch": 2.8136315228966984, + "grad_norm": 0.32421875, + "learning_rate": 6.260670105630885e-06, + "loss": 0.6288717985153198, + "step": 1322 + }, + { + "epoch": 2.817891373801917, + "grad_norm": 0.3984375, + "learning_rate": 6.2491264827838775e-06, + "loss": 0.6535931825637817, + "step": 1324 + }, + { + "epoch": 2.8221512247071354, + "grad_norm": 0.28515625, + "learning_rate": 6.237841602536627e-06, + "loss": 0.6414341330528259, + "step": 1326 + }, + { + "epoch": 2.8264110756123535, + "grad_norm": 0.2578125, + "learning_rate": 6.226815713371023e-06, + "loss": 0.5740489959716797, + "step": 1328 + }, + { + "epoch": 2.830670926517572, + "grad_norm": 0.2890625, + "learning_rate": 6.216049058066229e-06, + "loss": 0.5453130602836609, + "step": 1330 + }, + { + "epoch": 2.83493077742279, + "grad_norm": 0.228515625, + "learning_rate": 6.205541873693331e-06, + "loss": 0.531428873538971, + "step": 1332 + }, + { + "epoch": 2.8391906283280086, + "grad_norm": 0.3046875, + "learning_rate": 6.195294391610128e-06, + "loss": 0.6185562014579773, + "step": 1334 + }, + { + "epoch": 2.8434504792332267, + "grad_norm": 0.3671875, + "learning_rate": 6.185306837456027e-06, + "loss": 0.6069992184638977, + "step": 1336 + }, + { + "epoch": 2.847710330138445, + "grad_norm": 0.26953125, + "learning_rate": 6.1755794311470824e-06, + "loss": 0.5699736475944519, + "step": 1338 + }, + { + "epoch": 2.8519701810436633, + "grad_norm": 0.875, + "learning_rate": 6.166112386871149e-06, + "loss": 0.5937331318855286, + "step": 1340 + }, + { + "epoch": 2.856230031948882, + "grad_norm": 0.30078125, + "learning_rate": 6.15690591308317e-06, + "loss": 0.5383535623550415, + "step": 1342 + }, + { + "epoch": 2.8604898828541003, + "grad_norm": 0.2119140625, + "learning_rate": 6.14796021250058e-06, + "loss": 0.5439456701278687, + "step": 1344 + }, + { + "epoch": 2.8647497337593184, + "grad_norm": 0.2578125, + "learning_rate": 6.139275482098847e-06, + "loss": 0.5950272083282471, + "step": 1346 + }, + { + "epoch": 2.8690095846645365, + "grad_norm": 0.255859375, + "learning_rate": 6.130851913107137e-06, + "loss": 0.5372447967529297, + "step": 1348 + }, + { + "epoch": 2.873269435569755, + "grad_norm": 0.2294921875, + "learning_rate": 6.122689691004103e-06, + "loss": 0.5755343437194824, + "step": 1350 + }, + { + "epoch": 2.8775292864749735, + "grad_norm": 0.28515625, + "learning_rate": 6.114788995513787e-06, + "loss": 0.6370142102241516, + "step": 1352 + }, + { + "epoch": 2.8817891373801916, + "grad_norm": 0.28125, + "learning_rate": 6.107150000601684e-06, + "loss": 0.5815765261650085, + "step": 1354 + }, + { + "epoch": 2.88604898828541, + "grad_norm": 0.2490234375, + "learning_rate": 6.099772874470899e-06, + "loss": 0.6185727715492249, + "step": 1356 + }, + { + "epoch": 2.890308839190628, + "grad_norm": 0.36328125, + "learning_rate": 6.092657779558442e-06, + "loss": 0.5713162422180176, + "step": 1358 + }, + { + "epoch": 2.8945686900958467, + "grad_norm": 0.283203125, + "learning_rate": 6.08580487253166e-06, + "loss": 0.6169173121452332, + "step": 1360 + }, + { + "epoch": 2.8988285410010652, + "grad_norm": 0.34375, + "learning_rate": 6.079214304284781e-06, + "loss": 0.5929686427116394, + "step": 1362 + }, + { + "epoch": 2.9030883919062833, + "grad_norm": 0.294921875, + "learning_rate": 6.072886219935593e-06, + "loss": 0.5761704444885254, + "step": 1364 + }, + { + "epoch": 2.9073482428115014, + "grad_norm": 0.25390625, + "learning_rate": 6.066820758822244e-06, + "loss": 0.5787940621376038, + "step": 1366 + }, + { + "epoch": 2.91160809371672, + "grad_norm": 0.26171875, + "learning_rate": 6.0610180545001845e-06, + "loss": 0.5501613020896912, + "step": 1368 + }, + { + "epoch": 2.9158679446219384, + "grad_norm": 0.1943359375, + "learning_rate": 6.055478234739217e-06, + "loss": 0.5612152218818665, + "step": 1370 + }, + { + "epoch": 2.9201277955271565, + "grad_norm": 0.255859375, + "learning_rate": 6.050201421520689e-06, + "loss": 0.6078463792800903, + "step": 1372 + }, + { + "epoch": 2.924387646432375, + "grad_norm": 0.1953125, + "learning_rate": 6.045187731034801e-06, + "loss": 0.5890936255455017, + "step": 1374 + }, + { + "epoch": 2.928647497337593, + "grad_norm": 0.302734375, + "learning_rate": 6.040437273678055e-06, + "loss": 0.6533024311065674, + "step": 1376 + }, + { + "epoch": 2.9329073482428116, + "grad_norm": 4.3125, + "learning_rate": 6.0359501540508174e-06, + "loss": 0.6827770471572876, + "step": 1378 + }, + { + "epoch": 2.9371671991480297, + "grad_norm": 0.3359375, + "learning_rate": 6.0317264709550185e-06, + "loss": 0.6418617963790894, + "step": 1380 + }, + { + "epoch": 2.9414270500532482, + "grad_norm": 0.2431640625, + "learning_rate": 6.02776631739198e-06, + "loss": 0.5774567127227783, + "step": 1382 + }, + { + "epoch": 2.9456869009584663, + "grad_norm": 0.38671875, + "learning_rate": 6.0240697805603594e-06, + "loss": 0.6014460921287537, + "step": 1384 + }, + { + "epoch": 2.949946751863685, + "grad_norm": 0.51171875, + "learning_rate": 6.020636941854242e-06, + "loss": 0.5642235279083252, + "step": 1386 + }, + { + "epoch": 2.9542066027689033, + "grad_norm": 0.333984375, + "learning_rate": 6.017467876861333e-06, + "loss": 0.5891353487968445, + "step": 1388 + }, + { + "epoch": 2.9584664536741214, + "grad_norm": 0.244140625, + "learning_rate": 6.014562655361307e-06, + "loss": 0.5744375586509705, + "step": 1390 + }, + { + "epoch": 2.9627263045793395, + "grad_norm": 0.255859375, + "learning_rate": 6.011921341324265e-06, + "loss": 0.5458447933197021, + "step": 1392 + }, + { + "epoch": 2.966986155484558, + "grad_norm": 0.23828125, + "learning_rate": 6.009543992909327e-06, + "loss": 0.6621728539466858, + "step": 1394 + }, + { + "epoch": 2.9712460063897765, + "grad_norm": 0.236328125, + "learning_rate": 6.007430662463352e-06, + "loss": 0.5778822898864746, + "step": 1396 + }, + { + "epoch": 2.9755058572949946, + "grad_norm": 0.5390625, + "learning_rate": 6.005581396519782e-06, + "loss": 0.5913535952568054, + "step": 1398 + }, + { + "epoch": 2.979765708200213, + "grad_norm": 1.1328125, + "learning_rate": 6.0039962357976234e-06, + "loss": 0.5911454558372498, + "step": 1400 + }, + { + "epoch": 2.984025559105431, + "grad_norm": 0.2197265625, + "learning_rate": 6.002675215200546e-06, + "loss": 0.5291861295700073, + "step": 1402 + }, + { + "epoch": 2.9882854100106497, + "grad_norm": 0.29296875, + "learning_rate": 6.001618363816112e-06, + "loss": 0.577559232711792, + "step": 1404 + }, + { + "epoch": 2.992545260915868, + "grad_norm": 0.28125, + "learning_rate": 6.000825704915147e-06, + "loss": 0.5995616912841797, + "step": 1406 + }, + { + "epoch": 2.9968051118210863, + "grad_norm": 0.251953125, + "learning_rate": 6.000297255951213e-06, + "loss": 0.5123644471168518, + "step": 1408 + }, + { + "epoch": 3.0, + "grad_norm": 0.2890625, + "learning_rate": 6.000033028560234e-06, + "loss": 0.584560215473175, + "step": 1410 + }, + { + "epoch": 3.0, + "step": 1410, + "total_flos": 3.6491913262740275e+18, + "train_loss": 0.9255026633857836, + "train_runtime": 18814.6678, + "train_samples_per_second": 2.396, + "train_steps_per_second": 0.075 + } + ], + "logging_steps": 2, + "max_steps": 1410, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 99999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.6491913262740275e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}